radeonsi: make get_indirect_index globally visible
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
index 593383e3da1065dbe1eb76d6bc2d806de3417423..bf66879571a1c08451413e27c5dd8af610f47e9e 100644 (file)
@@ -46,6 +46,7 @@
 #include "si_pipe.h"
 #include "sid.h"
 
+#include "compiler/nir/nir.h"
 
 static const char *scratch_rsrc_dword0_symbol =
        "SCRATCH_RSRC_DWORD0";
@@ -61,6 +62,22 @@ struct si_shader_output_values
        ubyte vertex_stream[4];
 };
 
+/**
+ * Collects the types (and optional per-argument pointers) of the arguments
+ * of an LLVM function before that function is created.
+ */
+struct si_function_info {
+       LLVMTypeRef types[100]; /* type of each argument, indexed by argument slot */
+       LLVMValueRef *assign[100]; /* pointer given to add_arg_assign(), NULL via add_arg(); NOTE(review): consumer not visible here */
+       unsigned num_sgpr_params; /* number of leading arguments added as ARG_SGPR */
+       unsigned num_params; /* total number of arguments added so far */
+};
+
+enum si_arg_regfile { /* register file selector passed to the add_arg*() helpers */
+       ARG_SGPR, /* add_arg_assign() asserts that no ARG_VGPR argument was added before one of these */
+       ARG_VGPR
+};
+
 static void si_init_shader_ctx(struct si_shader_context *ctx,
                               struct si_screen *sscreen,
                               LLVMTargetMachineRef tm);
@@ -69,15 +86,13 @@ static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
                                 struct lp_build_tgsi_context *bld_base,
                                 struct lp_build_emit_data *emit_data);
 
-static void si_dump_shader_key(unsigned processor, struct si_shader *shader,
+static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
                               FILE *f);
 
 static unsigned llvm_get_type_size(LLVMTypeRef type);
 
 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
                                        union si_shader_part_key *key);
-static void si_build_vs_epilog_function(struct si_shader_context *ctx,
-                                       union si_shader_part_key *key);
 static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
                                         union si_shader_part_key *key);
 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
@@ -90,44 +105,73 @@ static void si_build_ps_epilog_function(struct si_shader_context *ctx,
  */
 #define PS_EPILOG_SAMPLEMASK_MIN_LOC 13
 
-/* The VS location of the PrimitiveID input is the same in the epilog,
- * so that the main shader part doesn't have to move it.
- */
-#define VS_EPILOG_PRIMID_LOC 2
-
 enum {
        CONST_ADDR_SPACE = 2,
        LOCAL_ADDR_SPACE = 3,
 };
 
+static bool is_merged_shader(struct si_shader *shader)
+{
+       if (shader->selector->screen->b.chip_class <= VI) /* VI and older: never a merged shader */
+               return false;
+
+       return shader->key.as_ls || /* newer chips: merged if compiled as LS or ES, ... */
+              shader->key.as_es ||
+              shader->selector->type == PIPE_SHADER_TESS_CTRL || /* ... or if it is a TCS or a GS */
+              shader->selector->type == PIPE_SHADER_GEOMETRY;
+}
+
+static void si_init_function_info(struct si_function_info *fninfo)
+{
+       fninfo->num_params = 0; /* start with an empty argument list */
+       fninfo->num_sgpr_params = 0; /* types[] and assign[] are left untouched */
+}
+
+static unsigned add_arg_assign(struct si_function_info *fninfo,
+                       enum si_arg_regfile regfile, LLVMTypeRef type,
+                       LLVMValueRef *assign)
+{
+       assert(regfile != ARG_SGPR || fninfo->num_sgpr_params == fninfo->num_params); /* an SGPR may not follow a VGPR */
+
+       unsigned idx = fninfo->num_params++; /* slot of the new argument; also the return value */
+       assert(idx < ARRAY_SIZE(fninfo->types)); /* capacity is only checked by this assert */
+
+       if (regfile == ARG_SGPR)
+               fninfo->num_sgpr_params = fninfo->num_params; /* every argument so far is an SGPR */
+
+       fninfo->types[idx] = type;
+       fninfo->assign[idx] = assign; /* stored as given; add_arg() passes NULL */
+       return idx;
+}
+
+static unsigned add_arg(struct si_function_info *fninfo,
+                       enum si_arg_regfile regfile, LLVMTypeRef type)
+{
+       return add_arg_assign(fninfo, regfile, type, NULL); /* same as add_arg_assign() without an assign pointer */
+}
+
+static void add_arg_checked(struct si_function_info *fninfo,
+                           enum si_arg_regfile regfile, LLVMTypeRef type,
+                           unsigned idx)
+{
+       MAYBE_UNUSED unsigned actual = add_arg(fninfo, regfile, type); /* 'actual' is only read by the assert below */
+       assert(actual == idx); /* the argument must land in the slot the caller expects */
+}
+
 /**
- * Returns a unique index for a semantic name and index. The index must be
- * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
- * calculated.
+ * Returns a unique index for a per-patch semantic name and index. The index
+ * must be less than 32, so that a 32-bit bitmask of used inputs or outputs
+ * can be calculated.
  */
-unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
+unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index)
 {
        switch (semantic_name) {
-       case TGSI_SEMANTIC_POSITION:
-               return 0;
-       case TGSI_SEMANTIC_PSIZE:
-               return 1;
-       case TGSI_SEMANTIC_CLIPDIST:
-               assert(index <= 1);
-               return 2 + index;
-       case TGSI_SEMANTIC_GENERIC:
-               if (index <= 63-4)
-                       return 4 + index;
-
-               assert(!"invalid generic index");
-               return 0;
-
-       /* patch indices are completely separate and thus start from 0 */
        case TGSI_SEMANTIC_TESSOUTER:
                return 0;
        case TGSI_SEMANTIC_TESSINNER:
                return 1;
        case TGSI_SEMANTIC_PATCH:
+               assert(index < 30);
                return 2 + index;
 
        default:
@@ -136,28 +180,68 @@ unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
        }
 }
 
-unsigned si_shader_io_get_unique_index2(unsigned name, unsigned index)
+/**
+ * Returns a unique index for a semantic name and index. The index must be
+ * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
+ * calculated.
+ */
+unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
 {
-       switch (name) {
-       case TGSI_SEMANTIC_FOG:
+       switch (semantic_name) {
+       case TGSI_SEMANTIC_POSITION:
+               return 0;
+       case TGSI_SEMANTIC_GENERIC:
+               /* Some shader stages use the highest used IO index to
+                * determine the size to allocate for inputs/outputs (in LDS,
+                * tess and GS rings), so GENERIC should be placed right after
+                * POSITION to make that size as small as possible.
+                */
+               if (index < SI_MAX_IO_GENERIC)
+                       return 1 + index;
+
+               assert(!"invalid generic index");
                return 0;
+       case TGSI_SEMANTIC_PSIZE:
+               return SI_MAX_IO_GENERIC + 1;
+       case TGSI_SEMANTIC_CLIPDIST:
+               assert(index <= 1);
+               return SI_MAX_IO_GENERIC + 2 + index;
+       case TGSI_SEMANTIC_FOG:
+               return SI_MAX_IO_GENERIC + 4;
        case TGSI_SEMANTIC_LAYER:
-               return 1;
+               return SI_MAX_IO_GENERIC + 5;
        case TGSI_SEMANTIC_VIEWPORT_INDEX:
-               return 2;
+               return SI_MAX_IO_GENERIC + 6;
        case TGSI_SEMANTIC_PRIMID:
-               return 3;
+               return SI_MAX_IO_GENERIC + 7;
        case TGSI_SEMANTIC_COLOR: /* these alias */
        case TGSI_SEMANTIC_BCOLOR:
-               return 4 + index;
+               assert(index < 2);
+               return SI_MAX_IO_GENERIC + 8 + index;
        case TGSI_SEMANTIC_TEXCOORD:
-               return 6 + index;
+               assert(index < 8);
+               assert(SI_MAX_IO_GENERIC + 10 + index < 64);
+               return SI_MAX_IO_GENERIC + 10 + index;
        default:
                assert(!"invalid semantic name");
                return 0;
        }
 }
 
+/**
+ * Build an LLVM IR PHI node of the given type and immediately add
+ * count_incoming incoming edges, pairing values[i] with blocks[i].
+ */
+static LLVMValueRef
+build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type,
+         unsigned count_incoming, LLVMValueRef *values,
+         LLVMBasicBlockRef *blocks)
+{
+       LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, ""); /* unnamed PHI */
+       LLVMAddIncoming(phi, values, blocks, count_incoming);
+       return phi;
+}
+
 /**
  * Get the value of a shader input parameter and extract a bitfield.
  */
@@ -296,17 +380,15 @@ get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
 
 static LLVMValueRef get_instance_index_for_fetch(
        struct si_shader_context *ctx,
-       unsigned param_start_instance, unsigned divisor)
+       unsigned param_start_instance, LLVMValueRef divisor)
 {
        struct gallivm_state *gallivm = &ctx->gallivm;
 
-       LLVMValueRef result = LLVMGetParam(ctx->main_fn,
-                                          ctx->param_instance_id);
+       LLVMValueRef result = ctx->abi.instance_id;
 
        /* The division must be done before START_INSTANCE is added. */
-       if (divisor > 1)
-               result = LLVMBuildUDiv(gallivm->builder, result,
-                               LLVMConstInt(ctx->i32, divisor, 0), "");
+       if (divisor != ctx->i32_1)
+               result = LLVMBuildUDiv(gallivm->builder, result, divisor, "");
 
        return LLVMBuildAdd(gallivm->builder, result,
                            LLVMGetParam(ctx->main_fn, param_start_instance), "");
@@ -327,10 +409,9 @@ static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
        return LLVMBuildFPTrunc(builder, value, ctx->f32, "");
 }
 
-static void declare_input_vs(
+void si_llvm_load_input_vs(
        struct si_shader_context *ctx,
        unsigned input_index,
-       const struct tgsi_full_declaration *decl,
        LLVMValueRef out[4])
 {
        struct gallivm_state *gallivm = &ctx->gallivm;
@@ -533,11 +614,18 @@ static void declare_input_vs(
        }
 }
 
-static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
-                                    unsigned swizzle)
+static void declare_input_vs(
+       struct si_shader_context *ctx,
+       unsigned input_index,
+       const struct tgsi_full_declaration *decl,
+       LLVMValueRef out[4])
 {
-       struct si_shader_context *ctx = si_shader_context(bld_base);
+       si_llvm_load_input_vs(ctx, input_index, out);
+}
 
+static LLVMValueRef get_primitive_id(struct si_shader_context *ctx,
+                                    unsigned swizzle)
+{
        if (swizzle > 0)
                return ctx->i32_0;
 
@@ -564,9 +652,9 @@ static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
  * Return the value of tgsi_ind_register for indexing.
  * This is the indirect index with the constant offset added to it.
  */
-static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
-                                      const struct tgsi_ind_register *ind,
-                                      int rel_index)
+LLVMValueRef si_get_indirect_index(struct si_shader_context *ctx,
+                                  const struct tgsi_ind_register *ind,
+                                  int rel_index)
 {
        struct gallivm_state *gallivm = &ctx->gallivm;
        LLVMValueRef result;
@@ -579,21 +667,14 @@ static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
 }
 
 /**
- * Like get_indirect_index, but restricts the return value to a (possibly
+ * Like si_get_indirect_index, but restricts the return value to a (possibly
  * undefined) value inside [0..num).
  */
-static LLVMValueRef get_bounded_indirect_index(struct si_shader_context *ctx,
-                                              const struct tgsi_ind_register *ind,
-                                              int rel_index, unsigned num)
+LLVMValueRef si_get_bounded_indirect_index(struct si_shader_context *ctx,
+                                          const struct tgsi_ind_register *ind,
+                                          int rel_index, unsigned num)
 {
-       LLVMValueRef result = get_indirect_index(ctx, ind, rel_index);
-
-       /* LLVM 3.8: If indirect resource indexing is used:
-        * - SI & CIK hang
-        * - VI crashes
-        */
-       if (HAVE_LLVM == 0x0308)
-               return LLVMGetUndef(ctx->i32);
+       LLVMValueRef result = si_get_indirect_index(ctx, ind, rel_index);
 
        return si_llvm_bound_index(ctx, result, num);
 }
@@ -633,7 +714,7 @@ static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
                LLVMValueRef index;
 
                if (reg.Dimension.Indirect)
-                       index = get_indirect_index(ctx, &reg.DimIndirect,
+                       index = si_get_indirect_index(ctx, &reg.DimIndirect,
                                                   reg.Dimension.Index);
                else
                        index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
@@ -666,17 +747,22 @@ static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
                else
                        first = reg.Register.Index;
 
-               ind_index = get_indirect_index(ctx, &reg.Indirect,
+               ind_index = si_get_indirect_index(ctx, &reg.Indirect,
                                           reg.Register.Index - first);
 
                base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
                                    LLVMBuildMul(gallivm->builder, ind_index,
                                                 LLVMConstInt(ctx->i32, 4, 0), ""), "");
 
-               param = si_shader_io_get_unique_index(name[first], index[first]);
+               param = reg.Register.Dimension ?
+                       si_shader_io_get_unique_index(name[first], index[first]) :
+                       si_shader_io_get_unique_index_patch(name[first], index[first]);
        } else {
-               param = si_shader_io_get_unique_index(name[reg.Register.Index],
-                                                     index[reg.Register.Index]);
+               param = reg.Register.Dimension ?
+                       si_shader_io_get_unique_index(name[reg.Register.Index],
+                                                     index[reg.Register.Index]) :
+                       si_shader_io_get_unique_index_patch(name[reg.Register.Index],
+                                                           index[reg.Register.Index]);
        }
 
        /* Add the base address of the element. */
@@ -711,8 +797,8 @@ static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
        LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
        LLVMValueRef param_stride, constant16;
 
-       vertices_per_patch = unpack_param(ctx, ctx->param_tcs_offchip_layout, 9, 6);
-       num_patches = unpack_param(ctx, ctx->param_tcs_offchip_layout, 0, 9);
+       vertices_per_patch = unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
+       num_patches = unpack_param(ctx, ctx->param_tcs_offchip_layout, 0, 6);
        total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
                                      num_patches, "");
 
@@ -738,7 +824,7 @@ static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
 
        if (!vertex_index) {
                LLVMValueRef patch_data_offset =
-                          unpack_param(ctx, ctx->param_tcs_offchip_layout, 16, 16);
+                          unpack_param(ctx, ctx->param_tcs_offchip_layout, 12, 20);
 
                base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
                                         patch_data_offset, "");
@@ -764,7 +850,7 @@ static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
        if (reg.Register.Dimension) {
 
                if (reg.Dimension.Indirect)
-                       vertex_index = get_indirect_index(ctx, &reg.DimIndirect,
+                       vertex_index = si_get_indirect_index(ctx, &reg.DimIndirect,
                                                          reg.Dimension.Index);
                else
                        vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
@@ -790,7 +876,7 @@ static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
                else
                        param_base = reg.Register.Index;
 
-               param_index = get_indirect_index(ctx, &reg.Indirect,
+               param_index = si_get_indirect_index(ctx, &reg.Indirect,
                                                 reg.Register.Index - param_base);
 
        } else {
@@ -798,8 +884,9 @@ static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
                param_index = ctx->i32_0;
        }
 
-       param_index_base = si_shader_io_get_unique_index(name[param_base],
-                                                        index[param_base]);
+       param_index_base = reg.Register.Dimension ?
+               si_shader_io_get_unique_index(name[param_base], index[param_base]) :
+               si_shader_io_get_unique_index_patch(name[param_base], index[param_base]);
 
        param_index = LLVMBuildAdd(gallivm->builder, param_index,
                                   LLVMConstInt(ctx->i32, param_index_base, 0),
@@ -812,7 +899,7 @@ static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
 static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
                                 enum tgsi_opcode_type type, unsigned swizzle,
                                 LLVMValueRef buffer, LLVMValueRef offset,
-                                LLVMValueRef base, bool readonly_memory)
+                                LLVMValueRef base, bool can_speculate)
 {
        struct si_shader_context *ctx = si_shader_context(bld_base);
        struct gallivm_state *gallivm = &ctx->gallivm;
@@ -822,14 +909,14 @@ static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
 
        if (swizzle == ~0) {
                value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
-                                            0, 1, 0, readonly_memory);
+                                            0, 1, 0, can_speculate, false);
 
                return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
        }
 
        if (!tgsi_type_is_64bit(type)) {
                value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
-                                            0, 1, 0, readonly_memory);
+                                            0, 1, 0, can_speculate, false);
 
                value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
                return LLVMBuildExtractElement(gallivm->builder, value,
@@ -837,10 +924,10 @@ static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
        }
 
        value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
-                                 swizzle * 4, 1, 0, readonly_memory);
+                                 swizzle * 4, 1, 0, can_speculate, false);
 
        value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
-                                  swizzle * 4 + 4, 1, 0, readonly_memory);
+                                  swizzle * 4 + 4, 1, 0, can_speculate, false);
 
        return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
 }
@@ -894,20 +981,44 @@ static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
  * \param value                value to store
  */
 static void lds_store(struct lp_build_tgsi_context *bld_base,
-                     unsigned swizzle, LLVMValueRef dw_addr,
+                     unsigned dw_offset_imm, LLVMValueRef dw_addr,
                      LLVMValueRef value)
 {
        struct si_shader_context *ctx = si_shader_context(bld_base);
        struct gallivm_state *gallivm = &ctx->gallivm;
 
        dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
-                           LLVMConstInt(ctx->i32, swizzle, 0));
+                           LLVMConstInt(ctx->i32, dw_offset_imm, 0));
 
        value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
        ac_build_indexed_store(&ctx->ac, ctx->lds,
                               dw_addr, value);
 }
 
+static LLVMValueRef desc_from_addr_base64k(struct si_shader_context *ctx,
+                                                 unsigned param)
+{
+       LLVMBuilderRef builder = ctx->gallivm.builder;
+
+       LLVMValueRef addr = LLVMGetParam(ctx->main_fn, param); /* main-function parameter number 'param' */
+       addr = LLVMBuildZExt(builder, addr, ctx->i64, ""); /* widen before shifting so no bits are lost */
+       addr = LLVMBuildShl(builder, addr, LLVMConstInt(ctx->i64, 16, 0), ""); /* << 16: the parameter is in units of 64K */
+
+       uint64_t desc2 = 0xffffffff; /* NOTE(review): presumably the size field set to its maximum - confirm against the descriptor layout */
+       uint64_t desc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+                        S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+                        S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+                        S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+                        S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+                        S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+       LLVMValueRef hi = LLVMConstInt(ctx->i64, desc2 | (desc3 << 32), 0); /* desc2 in the low 32 bits, desc3 in the high 32 bits */
+
+       LLVMValueRef desc = LLVMGetUndef(LLVMVectorType(ctx->i64, 2)); /* <2 x i64> filled as {addr, hi} */
+       desc = LLVMBuildInsertElement(builder, desc, addr, ctx->i32_0, "");
+       desc = LLVMBuildInsertElement(builder, desc, hi, ctx->i32_1, "");
+       return LLVMBuildBitCast(builder, desc, ctx->v4i32, ""); /* same bits, returned as ctx->v4i32 */
+}
+
 static LLVMValueRef fetch_input_tcs(
        struct lp_build_tgsi_context *bld_base,
        const struct tgsi_full_src_register *reg,
@@ -949,12 +1060,9 @@ static LLVMValueRef fetch_input_tes(
        enum tgsi_opcode_type type, unsigned swizzle)
 {
        struct si_shader_context *ctx = si_shader_context(bld_base);
-       LLVMValueRef rw_buffers, buffer, base, addr;
+       LLVMValueRef buffer, base, addr;
 
-       rw_buffers = LLVMGetParam(ctx->main_fn,
-                                 ctx->param_rw_buffers);
-       buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
-                       LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));
+       buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
 
        base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
        addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);
@@ -973,7 +1081,7 @@ static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
        const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info;
        unsigned chan_index;
        LLVMValueRef dw_addr, stride;
-       LLVMValueRef rw_buffers, buffer, base, buf_addr;
+       LLVMValueRef buffer, base, buf_addr;
        LLVMValueRef values[4];
        bool skip_lds_store;
        bool is_tess_factor = false;
@@ -1009,10 +1117,7 @@ static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
                }
        }
 
-       rw_buffers = LLVMGetParam(ctx->main_fn,
-                                 ctx->param_rw_buffers);
-       buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
-                       LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));
+       buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
 
        base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
        buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);
@@ -1057,7 +1162,6 @@ static LLVMValueRef fetch_input_gs(
        struct lp_build_context *uint = &ctx->bld_base.uint_bld;
        struct gallivm_state *gallivm = &ctx->gallivm;
        LLVMValueRef vtx_offset, soffset;
-       unsigned vtx_offset_param;
        struct tgsi_shader_info *info = &shader->selector->info;
        unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
        unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
@@ -1065,11 +1169,41 @@ static LLVMValueRef fetch_input_gs(
        LLVMValueRef value;
 
        if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
-               return get_primitive_id(bld_base, swizzle);
+               return get_primitive_id(ctx, swizzle);
 
        if (!reg->Register.Dimension)
                return NULL;
 
+       param = si_shader_io_get_unique_index(semantic_name, semantic_index);
+
+       /* GFX9 has the ESGS ring in LDS. */
+       if (ctx->screen->b.chip_class >= GFX9) {
+               unsigned index = reg->Dimension.Index;
+
+               switch (index / 2) {
+               case 0:
+                       vtx_offset = unpack_param(ctx, ctx->param_gs_vtx01_offset,
+                                                 index % 2 ? 16 : 0, 16);
+                       break;
+               case 1:
+                       vtx_offset = unpack_param(ctx, ctx->param_gs_vtx23_offset,
+                                                 index % 2 ? 16 : 0, 16);
+                       break;
+               case 2:
+                       vtx_offset = unpack_param(ctx, ctx->param_gs_vtx45_offset,
+                                                 index % 2 ? 16 : 0, 16);
+                       break;
+               default:
+                       assert(0);
+                       return NULL;
+               }
+
+               vtx_offset = LLVMBuildAdd(gallivm->builder, vtx_offset,
+                                         LLVMConstInt(ctx->i32, param * 4, 0), "");
+               return lds_load(bld_base, type, swizzle, vtx_offset);
+       }
+
+       /* GFX6: input load from the ESGS ring in memory. */
        if (swizzle == ~0) {
                LLVMValueRef values[TGSI_NUM_CHANNELS];
                unsigned chan;
@@ -1080,8 +1214,8 @@ static LLVMValueRef fetch_input_gs(
                                              TGSI_NUM_CHANNELS);
        }
 
-       /* Get the vertex offset parameter */
-       vtx_offset_param = reg->Dimension.Index;
+       /* Get the vertex offset parameter on GFX6. */
+       unsigned vtx_offset_param = reg->Dimension.Index;
        if (vtx_offset_param < 2) {
                vtx_offset_param += ctx->param_gs_vtx0_offset;
        } else {
@@ -1093,18 +1227,17 @@ static LLVMValueRef fetch_input_gs(
                                                   vtx_offset_param),
                                      4);
 
-       param = si_shader_io_get_unique_index(semantic_name, semantic_index);
        soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);
 
        value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0,
-                                    vtx_offset, soffset, 0, 1, 0, true);
+                                    vtx_offset, soffset, 0, 1, 0, true, false);
        if (tgsi_type_is_64bit(type)) {
                LLVMValueRef value2;
                soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0);
 
                value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
                                              ctx->i32_0, vtx_offset, soffset,
-                                             0, 1, 0, true);
+                                             0, 1, 0, true, false);
                return si_llvm_emit_fetch_64bit(bld_base, type,
                                                value, value2);
        }
@@ -1142,6 +1275,24 @@ static int lookup_interp_param_index(unsigned interpolate, unsigned location)
        }
 }
 
+static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx,
+                                      unsigned attr_index, unsigned chan,
+                                      LLVMValueRef prim_mask,
+                                      LLVMValueRef i, LLVMValueRef j)
+{
+       if (i || j) { /* at least one of i/j is non-NULL: interpolate with (i, j) */
+               return ac_build_fs_interp(&ctx->ac,
+                                         LLVMConstInt(ctx->i32, chan, 0),
+                                         LLVMConstInt(ctx->i32, attr_index, 0),
+                                         prim_mask, i, j);
+       }
+       return ac_build_fs_interp_mov(&ctx->ac, /* both NULL: use the interp_mov path instead */
+                                     LLVMConstInt(ctx->i32, 2, 0), /* P0 */
+                                     LLVMConstInt(ctx->i32, chan, 0),
+                                     LLVMConstInt(ctx->i32, attr_index, 0),
+                                     prim_mask);
+}
+
 /**
  * Interpolate a fragment shader input.
  *
@@ -1168,9 +1319,7 @@ static void interp_fs_input(struct si_shader_context *ctx,
                            LLVMValueRef result[4])
 {
        struct gallivm_state *gallivm = &ctx->gallivm;
-       LLVMValueRef attr_number;
-       LLVMValueRef i, j;
-
+       LLVMValueRef i = NULL, j = NULL;
        unsigned chan;
 
        /* fs.constant returns the param from the middle vertex, so it's not
@@ -1188,8 +1337,6 @@ static void interp_fs_input(struct si_shader_context *ctx,
         */
        bool interp = interp_param != NULL;
 
-       attr_number = LLVMConstInt(ctx->i32, input_index, 0);
-
        if (interp) {
                interp_param = LLVMBuildBitCast(gallivm->builder, interp_param,
                                                LLVMVectorType(ctx->f32, 2), "");
@@ -1203,7 +1350,6 @@ static void interp_fs_input(struct si_shader_context *ctx,
        if (semantic_name == TGSI_SEMANTIC_COLOR &&
            ctx->shader->key.part.ps.prolog.color_two_side) {
                LLVMValueRef is_face_positive;
-               LLVMValueRef back_attr_number;
 
                /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
                 * otherwise it's at offset "num_inputs".
@@ -1212,30 +1358,18 @@ static void interp_fs_input(struct si_shader_context *ctx,
                if (semantic_index == 1 && colors_read_mask & 0xf)
                        back_attr_offset += 1;
 
-               back_attr_number = LLVMConstInt(ctx->i32, back_attr_offset, 0);
-
                is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
                                                 face, ctx->i32_0, "");
 
                for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
-                       LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
                        LLVMValueRef front, back;
 
-                       if (interp) {
-                               front = ac_build_fs_interp(&ctx->ac, llvm_chan,
-                                                       attr_number, prim_mask,
-                                                       i, j);
-                               back = ac_build_fs_interp(&ctx->ac, llvm_chan,
-                                                       back_attr_number, prim_mask,
-                                                       i, j);
-                       } else {
-                               front = ac_build_fs_interp_mov(&ctx->ac,
-                                       LLVMConstInt(ctx->i32, 2, 0), /* P0 */
-                                       llvm_chan, attr_number, prim_mask);
-                               back = ac_build_fs_interp_mov(&ctx->ac,
-                                       LLVMConstInt(ctx->i32, 2, 0), /* P0 */
-                                       llvm_chan, back_attr_number, prim_mask);
-                       }
+                       front = si_build_fs_interp(ctx,
+                                                  input_index, chan,
+                                                  prim_mask, i, j);
+                       back = si_build_fs_interp(ctx,
+                                                 back_attr_offset, chan,
+                                                 prim_mask, i, j);
 
                        result[chan] = LLVMBuildSelect(gallivm->builder,
                                                is_face_positive,
@@ -1244,52 +1378,42 @@ static void interp_fs_input(struct si_shader_context *ctx,
                                                "");
                }
        } else if (semantic_name == TGSI_SEMANTIC_FOG) {
-               if (interp) {
-                       result[0] = ac_build_fs_interp(&ctx->ac, ctx->i32_0,
-                                                      attr_number, prim_mask, i, j);
-               } else {
-                       result[0] = ac_build_fs_interp_mov(&ctx->ac, ctx->i32_0,
-                                                          LLVMConstInt(ctx->i32, 2, 0), /* P0 */
-                                                          attr_number, prim_mask);
-               }
+               result[0] = si_build_fs_interp(ctx, input_index,
+                                              0, prim_mask, i, j);
                result[1] =
                result[2] = LLVMConstReal(ctx->f32, 0.0f);
                result[3] = LLVMConstReal(ctx->f32, 1.0f);
        } else {
                for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
-                       LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
-
-                       if (interp) {
-                               result[chan] = ac_build_fs_interp(&ctx->ac,
-                                       llvm_chan, attr_number, prim_mask, i, j);
-                       } else {
-                               result[chan] = ac_build_fs_interp_mov(&ctx->ac,
-                                       LLVMConstInt(ctx->i32, 2, 0), /* P0 */
-                                       llvm_chan, attr_number, prim_mask);
-                       }
+                       result[chan] = si_build_fs_interp(ctx,
+                                                         input_index, chan,
+                                                         prim_mask, i, j);
                }
        }
 }
 
-static void declare_input_fs(
+void si_llvm_load_input_fs(
        struct si_shader_context *ctx,
        unsigned input_index,
-       const struct tgsi_full_declaration *decl,
        LLVMValueRef out[4])
 {
        struct lp_build_context *base = &ctx->bld_base.base;
        struct si_shader *shader = ctx->shader;
+       struct tgsi_shader_info *info = &shader->selector->info;
        LLVMValueRef main_fn = ctx->main_fn;
        LLVMValueRef interp_param = NULL;
        int interp_param_idx;
+       enum tgsi_semantic semantic_name = info->input_semantic_name[input_index];
+       unsigned semantic_index = info->input_semantic_index[input_index];
+       enum tgsi_interpolate_mode interp_mode = info->input_interpolate[input_index];
+       enum tgsi_interpolate_loc interp_loc = info->input_interpolate_loc[input_index];
 
        /* Get colors from input VGPRs (set by the prolog). */
-       if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
-               unsigned i = decl->Semantic.Index;
+       if (semantic_name == TGSI_SEMANTIC_COLOR) {
                unsigned colors_read = shader->selector->info.colors_read;
-               unsigned mask = colors_read >> (i * 4);
+               unsigned mask = colors_read >> (semantic_index * 4);
                unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
-                                 (i ? util_bitcount(colors_read & 0xf) : 0);
+                                 (semantic_index ? util_bitcount(colors_read & 0xf) : 0);
 
                out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
                out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
@@ -1298,27 +1422,30 @@ static void declare_input_fs(
                return;
        }
 
-       interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
-                                                    decl->Interp.Location);
+       interp_param_idx = lookup_interp_param_index(interp_mode, interp_loc);
        if (interp_param_idx == -1)
                return;
        else if (interp_param_idx) {
                interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
        }
 
-       if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR &&
-           decl->Interp.Interpolate == TGSI_INTERPOLATE_COLOR &&
-           ctx->shader->key.part.ps.prolog.flatshade_colors)
-               interp_param = NULL; /* load the constant color */
-
-       interp_fs_input(ctx, input_index, decl->Semantic.Name,
-                       decl->Semantic.Index, shader->selector->info.num_inputs,
+       interp_fs_input(ctx, input_index, semantic_name,
+                       semantic_index, 0, /* this param is unused */
                        shader->selector->info.colors_read, interp_param,
                        LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
                        LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
                        &out[0]);
 }
 
+static void declare_input_fs(
+       struct si_shader_context *ctx,
+       unsigned input_index,
+       const struct tgsi_full_declaration *decl,
+       LLVMValueRef out[4])
+{
+       si_llvm_load_input_fs(ctx, input_index, out);
+}
+
 static LLVMValueRef get_sample_id(struct si_shader_context *ctx)
 {
        return unpack_param(ctx, SI_PARAM_ANCILLARY, 8, 4);
@@ -1332,12 +1459,8 @@ static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
                                      LLVMValueRef resource,
                                      LLVMValueRef offset)
 {
-       LLVMBuilderRef builder = ctx->gallivm.builder;
-       LLVMValueRef args[2] = {resource, offset};
-
-       return lp_build_intrinsic(builder, "llvm.SI.load.const", ctx->f32, args, 2,
-                                 LP_FUNC_ATTR_READNONE |
-                                 LP_FUNC_ATTR_LEGACY);
+       return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL,
+                                   0, 0, 0, true, true);
 }
 
 static LLVMValueRef load_sample_position(struct si_shader_context *ctx, LLVMValueRef sample_id)
@@ -1375,16 +1498,13 @@ static void declare_system_value(struct si_shader_context *ctx,
 
        switch (decl->Semantic.Name) {
        case TGSI_SEMANTIC_INSTANCEID:
-               value = LLVMGetParam(ctx->main_fn,
-                                    ctx->param_instance_id);
+               value = ctx->abi.instance_id;
                break;
 
        case TGSI_SEMANTIC_VERTEXID:
                value = LLVMBuildAdd(gallivm->builder,
-                                    LLVMGetParam(ctx->main_fn,
-                                                 ctx->param_vertex_id),
-                                    LLVMGetParam(ctx->main_fn,
-                                                 ctx->param_base_vertex), "");
+                                    ctx->abi.vertex_id,
+                                    ctx->abi.base_vertex, "");
                break;
 
        case TGSI_SEMANTIC_VERTEXID_NOBASE:
@@ -1406,17 +1526,16 @@ static void declare_system_value(struct si_shader_context *ctx,
                indexed = LLVMBuildTrunc(gallivm->builder, indexed, ctx->i1, "");
 
                value = LLVMBuildSelect(gallivm->builder, indexed,
-                                       LLVMGetParam(ctx->main_fn, ctx->param_base_vertex),
-                                       ctx->i32_0, "");
+                                       ctx->abi.base_vertex, ctx->i32_0, "");
                break;
        }
 
        case TGSI_SEMANTIC_BASEINSTANCE:
-               value = LLVMGetParam(ctx->main_fn, ctx->param_start_instance);
+               value = ctx->abi.start_instance;
                break;
 
        case TGSI_SEMANTIC_DRAWID:
-               value = LLVMGetParam(ctx->main_fn, ctx->param_draw_id);
+               value = ctx->abi.draw_id;
                break;
 
        case TGSI_SEMANTIC_INVOCATIONID:
@@ -1496,7 +1615,7 @@ static void declare_system_value(struct si_shader_context *ctx,
                if (ctx->type == PIPE_SHADER_TESS_CTRL)
                        value = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 26, 6);
                else if (ctx->type == PIPE_SHADER_TESS_EVAL)
-                       value = unpack_param(ctx, ctx->param_tcs_offchip_layout, 9, 7);
+                       value = unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
                else
                        assert(!"invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
                break;
@@ -1504,13 +1623,10 @@ static void declare_system_value(struct si_shader_context *ctx,
        case TGSI_SEMANTIC_TESSINNER:
        case TGSI_SEMANTIC_TESSOUTER:
        {
-               LLVMValueRef rw_buffers, buffer, base, addr;
-               int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0);
+               LLVMValueRef buffer, base, addr;
+               int param = si_shader_io_get_unique_index_patch(decl->Semantic.Name, 0);
 
-               rw_buffers = LLVMGetParam(ctx->main_fn,
-                                         ctx->param_rw_buffers);
-               buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
-                       LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));
+               buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
 
                base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
                addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
@@ -1541,11 +1657,11 @@ static void declare_system_value(struct si_shader_context *ctx,
        }
 
        case TGSI_SEMANTIC_PRIMID:
-               value = get_primitive_id(&ctx->bld_base, 0);
+               value = get_primitive_id(ctx, 0);
                break;
 
        case TGSI_SEMANTIC_GRID_SIZE:
-               value = LLVMGetParam(ctx->main_fn, SI_PARAM_GRID_SIZE);
+               value = LLVMGetParam(ctx->main_fn, ctx->param_grid_size);
                break;
 
        case TGSI_SEMANTIC_BLOCK_SIZE:
@@ -1566,31 +1682,37 @@ static void declare_system_value(struct si_shader_context *ctx,
 
                        value = lp_build_gather_values(gallivm, values, 3);
                } else {
-                       value = LLVMGetParam(ctx->main_fn, SI_PARAM_BLOCK_SIZE);
+                       value = LLVMGetParam(ctx->main_fn, ctx->param_block_size);
                }
                break;
        }
 
        case TGSI_SEMANTIC_BLOCK_ID:
-               value = LLVMGetParam(ctx->main_fn, SI_PARAM_BLOCK_ID);
+       {
+               LLVMValueRef values[3];
+
+               for (int i = 0; i < 3; i++) {
+                       values[i] = ctx->i32_0;
+                       if (ctx->param_block_id[i] >= 0) {
+                               values[i] = LLVMGetParam(ctx->main_fn,
+                                                        ctx->param_block_id[i]);
+                       }
+               }
+               value = lp_build_gather_values(gallivm, values, 3);
                break;
+       }
 
        case TGSI_SEMANTIC_THREAD_ID:
-               value = LLVMGetParam(ctx->main_fn, SI_PARAM_THREAD_ID);
+               value = LLVMGetParam(ctx->main_fn, ctx->param_thread_id);
                break;
 
        case TGSI_SEMANTIC_HELPER_INVOCATION:
-               if (HAVE_LLVM >= 0x0309) {
-                       value = lp_build_intrinsic(gallivm->builder,
-                                                  "llvm.amdgcn.ps.live",
-                                                  ctx->i1, NULL, 0,
-                                                  LP_FUNC_ATTR_READNONE);
-                       value = LLVMBuildNot(gallivm->builder, value, "");
-                       value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
-               } else {
-                       assert(!"TGSI_SEMANTIC_HELPER_INVOCATION unsupported");
-                       return;
-               }
+               value = lp_build_intrinsic(gallivm->builder,
+                                          "llvm.amdgcn.ps.live",
+                                          ctx->i1, NULL, 0,
+                                          LP_FUNC_ATTR_READNONE);
+               value = LLVMBuildNot(gallivm->builder, value, "");
+               value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
                break;
 
        case TGSI_SEMANTIC_SUBGROUP_SIZE:
@@ -1666,10 +1788,22 @@ static void declare_compute_memory(struct si_shader_context *ctx,
 static LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i)
 {
        LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn,
-                                            ctx->param_const_buffers);
+                                            ctx->param_const_and_shader_buffers);
 
        return ac_build_indexed_load_const(&ctx->ac, list_ptr,
-                                       LLVMConstInt(ctx->i32, i, 0));
+                       LLVMConstInt(ctx->i32, si_get_constbuf_slot(i), 0));
+}
+
+static LLVMValueRef load_ubo(struct ac_shader_abi *abi, LLVMValueRef index)
+{
+       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+       LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
+
+       index = si_llvm_bound_index(ctx, index, ctx->num_const_buffers);
+       index = LLVMBuildAdd(ctx->gallivm.builder, index,
+                            LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), "");
+
+       return ac_build_indexed_load_const(&ctx->ac, ptr, index);
 }
 
 static LLVMValueRef fetch_constant(
@@ -1699,11 +1833,13 @@ static LLVMValueRef fetch_constant(
        idx = reg->Register.Index * 4 + swizzle;
 
        if (reg->Register.Dimension && reg->Dimension.Indirect) {
-               LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_buffers);
+               LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
                LLVMValueRef index;
-               index = get_bounded_indirect_index(ctx, &reg->DimIndirect,
-                                                  reg->Dimension.Index,
-                                                  SI_NUM_CONST_BUFFERS);
+               index = si_get_bounded_indirect_index(ctx, &reg->DimIndirect,
+                                                     reg->Dimension.Index,
+                                                     ctx->num_const_buffers);
+               index = LLVMBuildAdd(ctx->gallivm.builder, index,
+                                    LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), "");
                bufp = ac_build_indexed_load_const(&ctx->ac, ptr, index);
        } else
                bufp = load_const_buffer_desc(ctx, buf);
@@ -2198,122 +2334,111 @@ static void si_llvm_emit_streamout(struct si_shader_context *ctx,
        lp_build_endif(&if_ctx);
 }
 
+static void si_export_param(struct si_shader_context *ctx, unsigned index,
+                           LLVMValueRef *values)
+{
+       struct ac_export_args args;
 
-/* Generate export instructions for hardware VS shader stage */
-static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
-                             struct si_shader_output_values *outputs,
-                             unsigned noutput)
+       si_llvm_init_export_args(&ctx->bld_base, values,
+                                V_008DFC_SQ_EXP_PARAM + index, &args);
+       ac_build_export(&ctx->ac, &args);
+}
+
+static void si_build_param_exports(struct si_shader_context *ctx,
+                                  struct si_shader_output_values *outputs,
+                                  unsigned noutput)
 {
-       struct si_shader_context *ctx = si_shader_context(bld_base);
        struct si_shader *shader = ctx->shader;
-       struct lp_build_context *base = &bld_base->base;
-       struct ac_export_args args, pos_args[4] = {};
-       LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
-       unsigned semantic_name, semantic_index;
-       unsigned target;
        unsigned param_count = 0;
-       unsigned pos_idx;
-       int i;
-
-       for (i = 0; i < noutput; i++) {
-               semantic_name = outputs[i].semantic_name;
-               semantic_index = outputs[i].semantic_index;
-               bool export_param = true;
 
-               switch (semantic_name) {
-               case TGSI_SEMANTIC_POSITION: /* ignore these */
-               case TGSI_SEMANTIC_PSIZE:
-               case TGSI_SEMANTIC_CLIPVERTEX:
-               case TGSI_SEMANTIC_EDGEFLAG:
-                       break;
-               case TGSI_SEMANTIC_GENERIC:
-               case TGSI_SEMANTIC_CLIPDIST:
-                       if (shader->key.opt.hw_vs.kill_outputs &
-                           (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index)))
-                               export_param = false;
-                       break;
-               default:
-                       if (shader->key.opt.hw_vs.kill_outputs2 &
-                           (1u << si_shader_io_get_unique_index2(semantic_name, semantic_index)))
-                               export_param = false;
-                       break;
-               }
+       for (unsigned i = 0; i < noutput; i++) {
+               unsigned semantic_name = outputs[i].semantic_name;
+               unsigned semantic_index = outputs[i].semantic_index;
 
                if (outputs[i].vertex_stream[0] != 0 &&
                    outputs[i].vertex_stream[1] != 0 &&
                    outputs[i].vertex_stream[2] != 0 &&
                    outputs[i].vertex_stream[3] != 0)
-                       export_param = false;
-
-handle_semantic:
-               /* Select the correct target */
-               switch(semantic_name) {
-               case TGSI_SEMANTIC_PSIZE:
-                       psize_value = outputs[i].values[0];
-                       continue;
-               case TGSI_SEMANTIC_EDGEFLAG:
-                       edgeflag_value = outputs[i].values[0];
                        continue;
+
+               switch (semantic_name) {
                case TGSI_SEMANTIC_LAYER:
-                       layer_value = outputs[i].values[0];
-                       semantic_name = TGSI_SEMANTIC_GENERIC;
-                       goto handle_semantic;
                case TGSI_SEMANTIC_VIEWPORT_INDEX:
-                       viewport_index_value = outputs[i].values[0];
-                       semantic_name = TGSI_SEMANTIC_GENERIC;
-                       goto handle_semantic;
-               case TGSI_SEMANTIC_POSITION:
-                       target = V_008DFC_SQ_EXP_POS;
-                       break;
                case TGSI_SEMANTIC_CLIPDIST:
-                       if (shader->key.opt.hw_vs.clip_disable) {
-                               semantic_name = TGSI_SEMANTIC_GENERIC;
-                               goto handle_semantic;
-                       }
-                       target = V_008DFC_SQ_EXP_POS + 2 + semantic_index;
-                       break;
-               case TGSI_SEMANTIC_CLIPVERTEX:
-                       if (shader->key.opt.hw_vs.clip_disable)
-                               continue;
-                       si_llvm_emit_clipvertex(bld_base, pos_args, outputs[i].values);
-                       continue;
                case TGSI_SEMANTIC_COLOR:
                case TGSI_SEMANTIC_BCOLOR:
                case TGSI_SEMANTIC_PRIMID:
                case TGSI_SEMANTIC_FOG:
                case TGSI_SEMANTIC_TEXCOORD:
                case TGSI_SEMANTIC_GENERIC:
-                       if (!export_param)
-                               continue;
-                       target = V_008DFC_SQ_EXP_PARAM + param_count;
-                       assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
-                       shader->info.vs_output_param_offset[i] = param_count;
-                       param_count++;
                        break;
                default:
-                       target = 0;
-                       fprintf(stderr,
-                               "Warning: SI unhandled vs output type:%d\n",
-                               semantic_name);
+                       continue;
                }
 
-               si_llvm_init_export_args(bld_base, outputs[i].values, target, &args);
+               if ((semantic_name != TGSI_SEMANTIC_GENERIC ||
+                    semantic_index < SI_MAX_IO_GENERIC) &&
+                   shader->key.opt.kill_outputs &
+                   (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index)))
+                       continue;
 
-               if (target >= V_008DFC_SQ_EXP_POS &&
-                   target <= (V_008DFC_SQ_EXP_POS + 3)) {
-                       memcpy(&pos_args[target - V_008DFC_SQ_EXP_POS],
-                              &args, sizeof(args));
-               } else {
-                       ac_build_export(&ctx->ac, &args);
-               }
+               si_export_param(ctx, param_count, outputs[i].values);
 
-               if (semantic_name == TGSI_SEMANTIC_CLIPDIST) {
-                       semantic_name = TGSI_SEMANTIC_GENERIC;
-                       goto handle_semantic;
-               }
+               assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
+               shader->info.vs_output_param_offset[i] = param_count++;
        }
 
        shader->info.nr_param_exports = param_count;
+}
+
+/* Generate export instructions for hardware VS shader stage */
+static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
+                             struct si_shader_output_values *outputs,
+                             unsigned noutput)
+{
+       struct si_shader_context *ctx = si_shader_context(bld_base);
+       struct si_shader *shader = ctx->shader;
+       struct lp_build_context *base = &bld_base->base;
+       struct ac_export_args pos_args[4] = {};
+       LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
+       unsigned pos_idx;
+       int i;
+
+       /* Build position exports. */
+       for (i = 0; i < noutput; i++) {
+               switch (outputs[i].semantic_name) {
+               case TGSI_SEMANTIC_POSITION:
+                       si_llvm_init_export_args(bld_base, outputs[i].values,
+                                                V_008DFC_SQ_EXP_POS, &pos_args[0]);
+                       break;
+               case TGSI_SEMANTIC_PSIZE:
+                       psize_value = outputs[i].values[0];
+                       break;
+               case TGSI_SEMANTIC_LAYER:
+                       layer_value = outputs[i].values[0];
+                       break;
+               case TGSI_SEMANTIC_VIEWPORT_INDEX:
+                       viewport_index_value = outputs[i].values[0];
+                       break;
+               case TGSI_SEMANTIC_EDGEFLAG:
+                       edgeflag_value = outputs[i].values[0];
+                       break;
+               case TGSI_SEMANTIC_CLIPDIST:
+                       if (!shader->key.opt.clip_disable) {
+                               unsigned index = 2 + outputs[i].semantic_index;
+                               si_llvm_init_export_args(bld_base, outputs[i].values,
+                                                        V_008DFC_SQ_EXP_POS + index,
+                                                        &pos_args[index]);
+                       }
+                       break;
+               case TGSI_SEMANTIC_CLIPVERTEX:
+                       if (!shader->key.opt.clip_disable) {
+                               si_llvm_emit_clipvertex(bld_base, pos_args,
+                                                       outputs[i].values);
+                       }
+                       break;
+               }
+       }
 
        /* We need to add the position output manually if it's missing. */
        if (!pos_args[0].out[0]) {
@@ -2335,8 +2460,8 @@ handle_semantic:
            shader->selector->info.writes_layer) {
                pos_args[1].enabled_channels = shader->selector->info.writes_psize |
                                               (shader->selector->info.writes_edgeflag << 1) |
-                                              (shader->selector->info.writes_layer << 2) |
-                                              (shader->selector->info.writes_viewport_index << 3);
+                                              (shader->selector->info.writes_layer << 2);
+
                pos_args[1].valid_mask = 0; /* EXEC mask */
                pos_args[1].done = 0; /* last export? */
                pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
@@ -2365,11 +2490,34 @@ handle_semantic:
                                                          ctx->f32, "");
                }
 
-               if (shader->selector->info.writes_layer)
-                       pos_args[1].out[2] = layer_value;
+               if (ctx->screen->b.chip_class >= GFX9) {
+                       /* GFX9 has the layer in out.z[10:0] and the viewport
+                        * index in out.z[19:16].
+                        */
+                       if (shader->selector->info.writes_layer)
+                               pos_args[1].out[2] = layer_value;
+
+                       if (shader->selector->info.writes_viewport_index) {
+                               LLVMValueRef v = viewport_index_value;
+
+                               v = bitcast(bld_base, TGSI_TYPE_UNSIGNED, v);
+                               v = LLVMBuildShl(ctx->gallivm.builder, v,
+                                                LLVMConstInt(ctx->i32, 16, 0), "");
+                               v = LLVMBuildOr(ctx->gallivm.builder, v,
+                                               bitcast(bld_base, TGSI_TYPE_UNSIGNED,
+                                                       pos_args[1].out[2]), "");
+                               pos_args[1].out[2] = bitcast(bld_base, TGSI_TYPE_FLOAT, v);
+                               pos_args[1].enabled_channels |= 1 << 2;
+                       }
+               } else {
+                       if (shader->selector->info.writes_layer)
+                               pos_args[1].out[2] = layer_value;
 
-               if (shader->selector->info.writes_viewport_index)
-                       pos_args[1].out[3] = viewport_index_value;
+                       if (shader->selector->info.writes_viewport_index) {
+                               pos_args[1].out[3] = viewport_index_value;
+                               pos_args[1].enabled_channels |= 1 << 3;
+                       }
+               }
        }
 
        for (i = 0; i < 4; i++)
@@ -2390,6 +2538,9 @@ handle_semantic:
 
                ac_build_export(&ctx->ac, &pos_args[i]);
        }
+
+       /* Build parameter exports. */
+       si_build_param_exports(ctx, outputs, noutput);
 }
 
 /**
@@ -2400,16 +2551,12 @@ static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
 {
        struct si_shader_context *ctx = si_shader_context(bld_base);
        struct gallivm_state *gallivm = &ctx->gallivm;
-       LLVMValueRef invocation_id, rw_buffers, buffer, buffer_offset;
+       LLVMValueRef invocation_id, buffer, buffer_offset;
        LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
        uint64_t inputs;
 
        invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
-
-       rw_buffers = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
-       buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
-                       LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));
-
+       buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
        buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
 
        lds_vertex_stride = unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
@@ -2418,7 +2565,7 @@ static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
        lds_base = get_tcs_in_current_patch_offset(ctx);
        lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, "");
 
-       inputs = ctx->shader->key.mono.ff_tcs_inputs_to_copy;
+       inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy;
        while (inputs) {
                unsigned i = u_bit_scan64(&inputs);
 
@@ -2449,8 +2596,8 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
        struct si_shader *shader = ctx->shader;
        unsigned tess_inner_index, tess_outer_index;
        LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
-       LLVMValueRef out[6], vec0, vec1, rw_buffers, tf_base, inner[4], outer[4];
-       unsigned stride, outer_comps, inner_comps, i;
+       LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
+       unsigned stride, outer_comps, inner_comps, i, offset;
        struct lp_build_if_state if_ctx, inner_if_ctx;
 
        si_llvm_emit_barrier(NULL, bld_base, NULL);
@@ -2490,8 +2637,8 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
        /* Load tess_inner and tess_outer from LDS.
         * Any invocation can write them, so we can't get them from a temporary.
         */
-       tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0);
-       tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0);
+       tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
+       tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);
 
        lds_base = tcs_out_current_patch_data_offset;
        lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
@@ -2531,10 +2678,7 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
                vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);
 
        /* Get the buffer. */
-       rw_buffers = LLVMGetParam(ctx->main_fn,
-                                 ctx->param_rw_buffers);
-       buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
-                       LLVMConstInt(ctx->i32, SI_HS_RING_TESS_FACTOR, 0));
+       buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_factor_addr_base64k);
 
        /* Get the offset. */
        tf_base = LLVMGetParam(ctx->main_fn,
@@ -2547,21 +2691,26 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
                                  rel_patch_id, ctx->i32_0, ""));
 
        /* Store the dynamic HS control word. */
-       ac_build_buffer_store_dword(&ctx->ac, buffer,
-                                   LLVMConstInt(ctx->i32, 0x80000000, 0),
-                                   1, ctx->i32_0, tf_base,
-                                   0, 1, 0, true, false);
+       offset = 0;
+       if (ctx->screen->b.chip_class <= VI) {
+               ac_build_buffer_store_dword(&ctx->ac, buffer,
+                                           LLVMConstInt(ctx->i32, 0x80000000, 0),
+                                           1, ctx->i32_0, tf_base,
+                                           offset, 1, 0, true, false);
+               offset += 4;
+       }
 
        lp_build_endif(&inner_if_ctx);
 
        /* Store the tessellation factors. */
        ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
                                    MIN2(stride, 4), byteoffset, tf_base,
-                                   4, 1, 0, true, false);
+                                   offset, 1, 0, true, false);
+       offset += 16;
        if (vec1)
                ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
                                            stride - 4, byteoffset, tf_base,
-                                           20, 1, 0, true, false);
+                                           offset, 1, 0, true, false);
 
        /* Store the tess factors into the offchip buffer if TES reads them. */
        if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
@@ -2569,11 +2718,10 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
                LLVMValueRef tf_inner_offset;
                unsigned param_outer, param_inner;
 
-               buf = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
-                               LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));
+               buf = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
                base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
 
-               param_outer = si_shader_io_get_unique_index(
+               param_outer = si_shader_io_get_unique_index_patch(
                                      TGSI_SEMANTIC_TESSOUTER, 0);
                tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
                                        LLVMConstInt(ctx->i32, param_outer, 0));
@@ -2585,7 +2733,7 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
                                            outer_comps, tf_outer_offset,
                                            base, 0, 1, 0, true, false);
                if (inner_comps) {
-                       param_inner = si_shader_io_get_unique_index(
+                       param_inner = si_shader_io_get_unique_index_patch(
                                              TGSI_SEMANTIC_TESSINNER, 0);
                        tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
                                        LLVMConstInt(ctx->i32, param_inner, 0));
@@ -2601,63 +2749,188 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
        lp_build_endif(&if_ctx);
 }
 
-/* This only writes the tessellation factor levels. */
-static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
+static LLVMValueRef
+si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
+                   unsigned param, unsigned return_index)
 {
-       struct si_shader_context *ctx = si_shader_context(bld_base);
-       LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
-       LLVMValueRef offchip_soffset, offchip_layout;
+       return LLVMBuildInsertValue(ctx->gallivm.builder, ret,
+                                   LLVMGetParam(ctx->main_fn, param),
+                                   return_index, "");
+}
 
-       si_copy_tcs_inputs(bld_base);
+static LLVMValueRef
+si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
+                         unsigned param, unsigned return_index)
+{
+       LLVMBuilderRef builder = ctx->gallivm.builder;
+       LLVMValueRef p = LLVMGetParam(ctx->main_fn, param);
+
+       return LLVMBuildInsertValue(builder, ret,
+                                   LLVMBuildBitCast(builder, p, ctx->f32, ""),
+                                   return_index, "");
+}
+
+static LLVMValueRef
+si_insert_input_ptr_as_2xi32(struct si_shader_context *ctx, LLVMValueRef ret,
+                            unsigned param, unsigned return_index)
+{
+       LLVMBuilderRef builder = ctx->gallivm.builder;
+       LLVMValueRef ptr, lo, hi;
+
+       ptr = LLVMGetParam(ctx->main_fn, param);
+       ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i64, "");
+       ptr = LLVMBuildBitCast(builder, ptr, ctx->v2i32, "");
+       lo = LLVMBuildExtractElement(builder, ptr, ctx->i32_0, "");
+       hi = LLVMBuildExtractElement(builder, ptr, ctx->i32_1, "");
+       ret = LLVMBuildInsertValue(builder, ret, lo, return_index, "");
+       return LLVMBuildInsertValue(builder, ret, hi, return_index + 1, "");
+}
+
+/* This only writes the tessellation factor levels. */
+static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
+{
+       struct si_shader_context *ctx = si_shader_context(bld_base);
+       LLVMBuilderRef builder = ctx->gallivm.builder;
+       LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
+
+       si_copy_tcs_inputs(bld_base);
 
        rel_patch_id = get_rel_patch_id(ctx);
        invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
        tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
 
+       if (ctx->screen->b.chip_class >= GFX9) {
+               LLVMBasicBlockRef blocks[2] = {
+                       LLVMGetInsertBlock(builder),
+                       ctx->merged_wrap_if_state.entry_block
+               };
+               LLVMValueRef values[2];
+
+               lp_build_endif(&ctx->merged_wrap_if_state);
+
+               values[0] = rel_patch_id;
+               values[1] = LLVMGetUndef(ctx->i32);
+               rel_patch_id = build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
+
+               values[0] = tf_lds_offset;
+               values[1] = LLVMGetUndef(ctx->i32);
+               tf_lds_offset = build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
+
+               values[0] = invocation_id;
+               values[1] = ctx->i32_1; /* cause the epilog to skip threads */
+               invocation_id = build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
+       }
+
        /* Return epilog parameters from this function. */
-       LLVMBuilderRef builder = ctx->gallivm.builder;
        LLVMValueRef ret = ctx->return_value;
-       LLVMValueRef rw_buffers, rw0, rw1, tf_soffset;
        unsigned vgpr;
 
-       /* RW_BUFFERS pointer */
-       rw_buffers = LLVMGetParam(ctx->main_fn,
-                                 ctx->param_rw_buffers);
-       rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, "");
-       rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, "");
-       rw0 = LLVMBuildExtractElement(builder, rw_buffers,
-                                     ctx->i32_0, "");
-       rw1 = LLVMBuildExtractElement(builder, rw_buffers,
-                                     ctx->i32_1, "");
-       ret = LLVMBuildInsertValue(builder, ret, rw0, 0, "");
-       ret = LLVMBuildInsertValue(builder, ret, rw1, 1, "");
-
-       /* Tess offchip and factor buffer soffset are after user SGPRs. */
-       offchip_layout = LLVMGetParam(ctx->main_fn,
-                                     ctx->param_tcs_offchip_layout);
-       offchip_soffset = LLVMGetParam(ctx->main_fn,
-                                      ctx->param_tcs_offchip_offset);
-       tf_soffset = LLVMGetParam(ctx->main_fn,
-                                 ctx->param_tcs_factor_offset);
-       ret = LLVMBuildInsertValue(builder, ret, offchip_layout,
-                                  GFX6_SGPR_TCS_OFFCHIP_LAYOUT, "");
-       ret = LLVMBuildInsertValue(builder, ret, offchip_soffset,
-                                  GFX6_TCS_NUM_USER_SGPR, "");
-       ret = LLVMBuildInsertValue(builder, ret, tf_soffset,
-                                  GFX6_TCS_NUM_USER_SGPR + 1, "");
+       if (ctx->screen->b.chip_class >= GFX9) {
+               ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
+                                         8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
+               ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
+                                         8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
+               ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
+                                         8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K);
+               /* Tess offchip and tess factor offsets are at the beginning. */
+               ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
+               ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
+               vgpr = 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K + 1;
+       } else {
+               ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
+                                         GFX6_SGPR_TCS_OFFCHIP_LAYOUT);
+               ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
+                                         GFX6_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
+               ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
+                                         GFX6_SGPR_TCS_FACTOR_ADDR_BASE64K);
+               /* Tess offchip and tess factor offsets are after user SGPRs. */
+               ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset,
+                                         GFX6_TCS_NUM_USER_SGPR);
+               ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset,
+                                         GFX6_TCS_NUM_USER_SGPR + 1);
+               vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
+       }
 
        /* VGPRs */
        rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
        invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
        tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);
 
-       vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
+       /* Leave a hole corresponding to the two input VGPRs. This ensures that
+        * the invocation_id output does not alias the param_tcs_rel_ids input,
+        * which saves a V_MOV on gfx9.
+        */
+       vgpr += 2;
+
        ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
        ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
        ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
        ctx->return_value = ret;
 }
 
+/* Pass TCS inputs from LS to TCS on GFX9. */
+static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
+{
+       LLVMValueRef ret = ctx->return_value;
+
+       ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
+       ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
+       ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
+       ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
+       ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
+
+       ret = si_insert_input_ret(ctx, ret, ctx->param_vs_state_bits,
+                                 8 + SI_SGPR_VS_STATE_BITS);
+       ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
+                                 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
+       ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_offsets,
+                                 8 + GFX9_SGPR_TCS_OUT_OFFSETS);
+       ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout,
+                                 8 + GFX9_SGPR_TCS_OUT_LAYOUT);
+       ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
+                                 8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
+       ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
+                                 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K);
+
+       unsigned desc_param = ctx->param_tcs_factor_addr_base64k + 2;
+       ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
+                                          8 + GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS);
+       ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
+                                          8 + GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES);
+
+       unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
+       ret = si_insert_input_ret_float(ctx, ret,
+                                       ctx->param_tcs_patch_id, vgpr++);
+       ret = si_insert_input_ret_float(ctx, ret,
+                                       ctx->param_tcs_rel_ids, vgpr++);
+       ctx->return_value = ret;
+}
+
+/* Pass GS inputs from ES to GS on GFX9. */
+static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
+{
+       LLVMValueRef ret = ctx->return_value;
+
+       ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
+       ret = si_insert_input_ret(ctx, ret, ctx->param_gs2vs_offset, 2);
+       ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
+
+       ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
+
+       unsigned desc_param = ctx->param_vs_state_bits + 1;
+       ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
+                                          8 + GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS);
+       ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
+                                          8 + GFX9_SGPR_GS_SAMPLERS_AND_IMAGES);
+
+       unsigned vgpr = 8 + GFX9_GS_NUM_USER_SGPR;
+       for (unsigned i = 0; i < 5; i++) {
+               unsigned param = ctx->param_gs_vtx01_offset + i;
+               ret = si_insert_input_ret_float(ctx, ret, param, vgpr++);
+       }
+       ctx->return_value = ret;
+}
+
 static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
 {
        struct si_shader_context *ctx = si_shader_context(bld_base);
@@ -2707,6 +2980,9 @@ static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
                                  LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
                }
        }
+
+       if (ctx->screen->b.chip_class >= GFX9)
+               si_set_ls_return_value_for_tcs(ctx);
 }
 
 static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
@@ -2717,31 +2993,60 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
        struct tgsi_shader_info *info = &es->selector->info;
        LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
                                            ctx->param_es2gs_offset);
+       LLVMValueRef lds_base = NULL;
        unsigned chan;
        int i;
 
+       if (ctx->screen->b.chip_class >= GFX9 && info->num_outputs) {
+               unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
+               LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
+               LLVMValueRef wave_idx = unpack_param(ctx, ctx->param_merged_wave_info, 24, 4);
+               vertex_idx = LLVMBuildOr(gallivm->builder, vertex_idx,
+                                        LLVMBuildMul(gallivm->builder, wave_idx,
+                                                     LLVMConstInt(ctx->i32, 64, false), ""), "");
+               lds_base = LLVMBuildMul(gallivm->builder, vertex_idx,
+                                       LLVMConstInt(ctx->i32, itemsize_dw, 0), "");
+       }
+
        for (i = 0; i < info->num_outputs; i++) {
                LLVMValueRef *out_ptr = ctx->outputs[i];
-               int param_index;
+               int param;
 
                if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
                    info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
                        continue;
 
-               param_index = si_shader_io_get_unique_index(info->output_semantic_name[i],
-                                                           info->output_semantic_index[i]);
+               param = si_shader_io_get_unique_index(info->output_semantic_name[i],
+                                                     info->output_semantic_index[i]);
 
                for (chan = 0; chan < 4; chan++) {
                        LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
                        out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
 
+                       /* GFX9 has the ESGS ring in LDS. */
+                       if (ctx->screen->b.chip_class >= GFX9) {
+                               lds_store(bld_base, param * 4 + chan, lds_base, out_val);
+                               continue;
+                       }
+
                        ac_build_buffer_store_dword(&ctx->ac,
                                                    ctx->esgs_ring,
                                                    out_val, 1, NULL, soffset,
-                                                   (4 * param_index + chan) * 4,
+                                                   (4 * param + chan) * 4,
                                                    1, 1, true, true);
                }
        }
+
+       if (ctx->screen->b.chip_class >= GFX9)
+               si_set_es_return_value_for_gs(ctx);
+}
+
+static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
+{
+       if (ctx->screen->b.chip_class >= GFX9)
+               return unpack_param(ctx, ctx->param_merged_wave_info, 16, 8);
+       else
+               return LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id);
 }
 
 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
@@ -2749,18 +3054,24 @@ static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
        struct si_shader_context *ctx = si_shader_context(bld_base);
 
        ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
-                        LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id));
+                        si_get_gs_wave_id(ctx));
+
+       if (ctx->screen->b.chip_class >= GFX9)
+               lp_build_endif(&ctx->merged_wrap_if_state);
 }
 
-static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
+static void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi,
+                                    unsigned max_outputs,
+                                    LLVMValueRef *addrs)
 {
-       struct si_shader_context *ctx = si_shader_context(bld_base);
+       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
        struct gallivm_state *gallivm = &ctx->gallivm;
        struct tgsi_shader_info *info = &ctx->shader->selector->info;
        struct si_shader_output_values *outputs = NULL;
        int i,j;
 
        assert(!ctx->shader->is_gs_copy_shader);
+       assert(info->num_outputs <= max_outputs);
 
        outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
 
@@ -2791,7 +3102,7 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
                        }
 
                        for (j = 0; j < 4; j++) {
-                               addr = ctx->outputs[i][j];
+                               addr = addrs[4 * i + j];
                                val = LLVMBuildLoad(gallivm->builder, addr, "");
                                val = ac_build_clamp(&ctx->ac, val);
                                LLVMBuildStore(gallivm->builder, val, addr);
@@ -2809,28 +3120,42 @@ static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
                for (j = 0; j < 4; j++) {
                        outputs[i].values[j] =
                                LLVMBuildLoad(gallivm->builder,
-                                             ctx->outputs[i][j],
+                                             addrs[4 * i + j],
                                              "");
                        outputs[i].vertex_stream[j] =
                                (info->output_streams[i] >> (2 * j)) & 3;
                }
-
        }
 
-       /* Return the primitive ID from the LLVM function. */
-       ctx->return_value =
-               LLVMBuildInsertValue(gallivm->builder,
-                                    ctx->return_value,
-                                    bitcast(bld_base, TGSI_TYPE_FLOAT,
-                                            get_primitive_id(bld_base, 0)),
-                                    VS_EPILOG_PRIMID_LOC, "");
-
        if (ctx->shader->selector->so.num_outputs)
                si_llvm_emit_streamout(ctx, outputs, i, 0);
-       si_llvm_export_vs(bld_base, outputs, i);
+
+       /* Export PrimitiveID. */
+       if (ctx->shader->key.mono.u.vs_export_prim_id) {
+               outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
+               outputs[i].semantic_index = 0;
+               outputs[i].values[0] = LLVMBuildBitCast(gallivm->builder,
+                               get_primitive_id(ctx, 0), ctx->f32, "");
+               for (j = 1; j < 4; j++)
+                       outputs[i].values[j] = LLVMConstReal(ctx->f32, 0);
+
+               memset(outputs[i].vertex_stream, 0,
+                      sizeof(outputs[i].vertex_stream));
+               i++;
+       }
+
+       si_llvm_export_vs(&ctx->bld_base, outputs, i);
        FREE(outputs);
 }
 
+static void si_tgsi_emit_epilogue(struct lp_build_tgsi_context *bld_base)
+{
+       struct si_shader_context *ctx = si_shader_context(bld_base);
+
+       ctx->abi.emit_outputs(&ctx->abi, RADEON_LLVM_MAX_OUTPUTS,
+                             &ctx->outputs[0][0]);
+}
+
 struct si_ps_exports {
        unsigned num;
        struct ac_export_args args[10];
@@ -2975,2039 +3300,249 @@ static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
                        } else if (!args[c].enabled_channels)
                                continue; /* unnecessary NULL export */
 
-                       memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c]));
-               }
-       } else {
-               struct ac_export_args args;
-
-               /* Export */
-               si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
-                                        &args);
-               if (is_last) {
-                       args.valid_mask = 1; /* whether the EXEC mask is valid */
-                       args.done = 1; /* DONE bit */
-               } else if (!args.enabled_channels)
-                       return; /* unnecessary NULL export */
-
-               memcpy(&exp->args[exp->num++], &args, sizeof(args));
-       }
-}
-
-static void si_emit_ps_exports(struct si_shader_context *ctx,
-                              struct si_ps_exports *exp)
-{
-       for (unsigned i = 0; i < exp->num; i++)
-               ac_build_export(&ctx->ac, &exp->args[i]);
-}
-
-static void si_export_null(struct lp_build_tgsi_context *bld_base)
-{
-       struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct lp_build_context *base = &bld_base->base;
-       struct ac_export_args args;
-
-       args.enabled_channels = 0x0; /* enabled channels */
-       args.valid_mask = 1; /* whether the EXEC mask is valid */
-       args.done = 1; /* DONE bit */
-       args.target = V_008DFC_SQ_EXP_NULL;
-       args.compr = 0; /* COMPR flag (0 = 32-bit export) */
-       args.out[0] = base->undef; /* R */
-       args.out[1] = base->undef; /* G */
-       args.out[2] = base->undef; /* B */
-       args.out[3] = base->undef; /* A */
-
-       ac_build_export(&ctx->ac, &args);
-}
-
-/**
- * Return PS outputs in this order:
- *
- * v[0:3] = color0.xyzw
- * v[4:7] = color1.xyzw
- * ...
- * vN+0 = Depth
- * vN+1 = Stencil
- * vN+2 = SampleMask
- * vN+3 = SampleMaskIn (used for OpenGL smoothing)
- *
- * The alpha-ref SGPR is returned via its original location.
- */
-static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
-{
-       struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct si_shader *shader = ctx->shader;
-       struct tgsi_shader_info *info = &shader->selector->info;
-       LLVMBuilderRef builder = ctx->gallivm.builder;
-       unsigned i, j, first_vgpr, vgpr;
-
-       LLVMValueRef color[8][4] = {};
-       LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
-       LLVMValueRef ret;
-
-       /* Read the output values. */
-       for (i = 0; i < info->num_outputs; i++) {
-               unsigned semantic_name = info->output_semantic_name[i];
-               unsigned semantic_index = info->output_semantic_index[i];
-
-               switch (semantic_name) {
-               case TGSI_SEMANTIC_COLOR:
-                       assert(semantic_index < 8);
-                       for (j = 0; j < 4; j++) {
-                               LLVMValueRef ptr = ctx->outputs[i][j];
-                               LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
-                               color[semantic_index][j] = result;
-                       }
-                       break;
-               case TGSI_SEMANTIC_POSITION:
-                       depth = LLVMBuildLoad(builder,
-                                             ctx->outputs[i][2], "");
-                       break;
-               case TGSI_SEMANTIC_STENCIL:
-                       stencil = LLVMBuildLoad(builder,
-                                               ctx->outputs[i][1], "");
-                       break;
-               case TGSI_SEMANTIC_SAMPLEMASK:
-                       samplemask = LLVMBuildLoad(builder,
-                                                  ctx->outputs[i][0], "");
-                       break;
-               default:
-                       fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
-                               semantic_name);
-               }
-       }
-
-       /* Fill the return structure. */
-       ret = ctx->return_value;
-
-       /* Set SGPRs. */
-       ret = LLVMBuildInsertValue(builder, ret,
-                                  bitcast(bld_base, TGSI_TYPE_SIGNED,
-                                          LLVMGetParam(ctx->main_fn,
-                                                       SI_PARAM_ALPHA_REF)),
-                                  SI_SGPR_ALPHA_REF, "");
-
-       /* Set VGPRs */
-       first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
-       for (i = 0; i < ARRAY_SIZE(color); i++) {
-               if (!color[i][0])
-                       continue;
-
-               for (j = 0; j < 4; j++)
-                       ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
-       }
-       if (depth)
-               ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
-       if (stencil)
-               ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
-       if (samplemask)
-               ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
-
-       /* Add the input sample mask for smoothing at the end. */
-       if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
-               vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
-       ret = LLVMBuildInsertValue(builder, ret,
-                                  LLVMGetParam(ctx->main_fn,
-                                               SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
-
-       ctx->return_value = ret;
-}
-
-/**
- * Given a v8i32 resource descriptor for a buffer, extract the size of the
- * buffer in number of elements and return it as an i32.
- */
-static LLVMValueRef get_buffer_size(
-       struct lp_build_tgsi_context *bld_base,
-       LLVMValueRef descriptor)
-{
-       struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = &ctx->gallivm;
-       LLVMBuilderRef builder = gallivm->builder;
-       LLVMValueRef size =
-               LLVMBuildExtractElement(builder, descriptor,
-                                       LLVMConstInt(ctx->i32, 2, 0), "");
-
-       if (ctx->screen->b.chip_class == VI) {
-               /* On VI, the descriptor contains the size in bytes,
-                * but TXQ must return the size in elements.
-                * The stride is always non-zero for resources using TXQ.
-                */
-               LLVMValueRef stride =
-                       LLVMBuildExtractElement(builder, descriptor,
-                                               ctx->i32_1, "");
-               stride = LLVMBuildLShr(builder, stride,
-                                      LLVMConstInt(ctx->i32, 16, 0), "");
-               stride = LLVMBuildAnd(builder, stride,
-                                     LLVMConstInt(ctx->i32, 0x3FFF, 0), "");
-
-               size = LLVMBuildUDiv(builder, size, stride, "");
-       }
-
-       return size;
-}
-
-static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
-                               struct lp_build_tgsi_context *bld_base,
-                               struct lp_build_emit_data *emit_data);
-
-/* Prevent optimizations (at least of memory accesses) across the current
- * point in the program by emitting empty inline assembly that is marked as
- * having side effects.
- *
- * Optionally, a value can be passed through the inline assembly to prevent
- * LLVM from hoisting calls to ReadNone functions.
- */
-static void emit_optimization_barrier(struct si_shader_context *ctx,
-                                     LLVMValueRef *pvgpr)
-{
-       static int counter = 0;
-
-       LLVMBuilderRef builder = ctx->gallivm.builder;
-       char code[16];
-
-       snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));
-
-       if (!pvgpr) {
-               LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
-               LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
-               LLVMBuildCall(builder, inlineasm, NULL, 0, "");
-       } else {
-               LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
-               LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
-               LLVMValueRef vgpr = *pvgpr;
-               LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr);
-               unsigned vgpr_size = llvm_get_type_size(vgpr_type);
-               LLVMValueRef vgpr0;
-
-               assert(vgpr_size % 4 == 0);
-
-               vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
-               vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
-               vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
-               vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
-               vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");
-
-               *pvgpr = vgpr;
-       }
-}
-
-/* Combine these with & instead of |. */
-#define NOOP_WAITCNT 0xf7f
-#define LGKM_CNT 0x07f
-#define VM_CNT 0xf70
-
-static void emit_waitcnt(struct si_shader_context *ctx, unsigned simm16)
-{
-       struct gallivm_state *gallivm = &ctx->gallivm;
-       LLVMBuilderRef builder = gallivm->builder;
-       LLVMValueRef args[1] = {
-               LLVMConstInt(ctx->i32, simm16, 0)
-       };
-       lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt",
-                          ctx->voidt, args, 1, 0);
-}
-
-static void membar_emit(
-               const struct lp_build_tgsi_action *action,
-               struct lp_build_tgsi_context *bld_base,
-               struct lp_build_emit_data *emit_data)
-{
-       struct si_shader_context *ctx = si_shader_context(bld_base);
-       LLVMValueRef src0 = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0);
-       unsigned flags = LLVMConstIntGetZExtValue(src0);
-       unsigned waitcnt = NOOP_WAITCNT;
-
-       if (flags & TGSI_MEMBAR_THREAD_GROUP)
-               waitcnt &= VM_CNT & LGKM_CNT;
-
-       if (flags & (TGSI_MEMBAR_ATOMIC_BUFFER |
-                    TGSI_MEMBAR_SHADER_BUFFER |
-                    TGSI_MEMBAR_SHADER_IMAGE))
-               waitcnt &= VM_CNT;
-
-       if (flags & TGSI_MEMBAR_SHARED)
-               waitcnt &= LGKM_CNT;
-
-       if (waitcnt != NOOP_WAITCNT)
-               emit_waitcnt(ctx, waitcnt);
-}
-
-static void clock_emit(
-               const struct lp_build_tgsi_action *action,
-               struct lp_build_tgsi_context *bld_base,
-               struct lp_build_emit_data *emit_data)
-{
-       struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = &ctx->gallivm;
-       LLVMValueRef tmp;
-
-       tmp = lp_build_intrinsic(gallivm->builder, "llvm.readcyclecounter",
-                                ctx->i64, NULL, 0, 0);
-       tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->v2i32, "");
-
-       emit_data->output[0] =
-               LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_0, "");
-       emit_data->output[1] =
-               LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_1, "");
-}
-
-static LLVMValueRef
-shader_buffer_fetch_rsrc(struct si_shader_context *ctx,
-                        const struct tgsi_full_src_register *reg)
-{
-       LLVMValueRef index;
-       LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
-                                            ctx->param_shader_buffers);
-
-       if (!reg->Register.Indirect)
-               index = LLVMConstInt(ctx->i32, reg->Register.Index, 0);
-       else
-               index = get_bounded_indirect_index(ctx, &reg->Indirect,
-                                                  reg->Register.Index,
-                                                  SI_NUM_SHADER_BUFFERS);
-
-       return ac_build_indexed_load_const(&ctx->ac, rsrc_ptr, index);
-}
-
-static bool tgsi_is_array_sampler(unsigned target)
-{
-       return target == TGSI_TEXTURE_1D_ARRAY ||
-              target == TGSI_TEXTURE_SHADOW1D_ARRAY ||
-              target == TGSI_TEXTURE_2D_ARRAY ||
-              target == TGSI_TEXTURE_SHADOW2D_ARRAY ||
-              target == TGSI_TEXTURE_CUBE_ARRAY ||
-              target == TGSI_TEXTURE_SHADOWCUBE_ARRAY ||
-              target == TGSI_TEXTURE_2D_ARRAY_MSAA;
-}
-
-static bool tgsi_is_array_image(unsigned target)
-{
-       return target == TGSI_TEXTURE_3D ||
-              target == TGSI_TEXTURE_CUBE ||
-              target == TGSI_TEXTURE_1D_ARRAY ||
-              target == TGSI_TEXTURE_2D_ARRAY ||
-              target == TGSI_TEXTURE_CUBE_ARRAY ||
-              target == TGSI_TEXTURE_2D_ARRAY_MSAA;
-}
-
-/**
- * Given a 256-bit resource descriptor, force the DCC enable bit to off.
- *
- * At least on Tonga, executing image stores on images with DCC enabled and
- * non-trivial can eventually lead to lockups. This can occur when an
- * application binds an image as read-only but then uses a shader that writes
- * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
- * program termination) in this case, but it doesn't cost much to be a bit
- * nicer: disabling DCC in the shader still leads to undefined results but
- * avoids the lockup.
- */
-static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
-                                 LLVMValueRef rsrc)
-{
-       if (ctx->screen->b.chip_class <= CIK) {
-               return rsrc;
-       } else {
-               LLVMBuilderRef builder = ctx->gallivm.builder;
-               LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
-               LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
-               LLVMValueRef tmp;
-
-               tmp = LLVMBuildExtractElement(builder, rsrc, i32_6, "");
-               tmp = LLVMBuildAnd(builder, tmp, i32_C, "");
-               return LLVMBuildInsertElement(builder, rsrc, tmp, i32_6, "");
-       }
-}
-
-static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
-{
-       return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
-                              CONST_ADDR_SPACE);
-}
-
-static LLVMValueRef load_image_desc(struct si_shader_context *ctx,
-                                   LLVMValueRef list, LLVMValueRef index,
-                                   unsigned target)
-{
-       LLVMBuilderRef builder = ctx->gallivm.builder;
-
-       if (target == TGSI_TEXTURE_BUFFER) {
-               index = LLVMBuildMul(builder, index,
-                                    LLVMConstInt(ctx->i32, 2, 0), "");
-               index = LLVMBuildAdd(builder, index,
-                                    ctx->i32_1, "");
-               list = LLVMBuildPointerCast(builder, list,
-                                           const_array(ctx->v4i32, 0), "");
-       }
-
-       return ac_build_indexed_load_const(&ctx->ac, list, index);
-}
-
-/**
- * Load the resource descriptor for \p image.
- */
-static void
-image_fetch_rsrc(
-       struct lp_build_tgsi_context *bld_base,
-       const struct tgsi_full_src_register *image,
-       bool is_store, unsigned target,
-       LLVMValueRef *rsrc)
-{
-       struct si_shader_context *ctx = si_shader_context(bld_base);
-       LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
-                                            ctx->param_images);
-       LLVMValueRef index;
-       bool dcc_off = is_store;
-
-       assert(image->Register.File == TGSI_FILE_IMAGE);
-
-       if (!image->Register.Indirect) {
-               const struct tgsi_shader_info *info = bld_base->info;
-               unsigned images_writemask = info->images_store |
-                                           info->images_atomic;
-
-               index = LLVMConstInt(ctx->i32, image->Register.Index, 0);
-
-               if (images_writemask & (1 << image->Register.Index))
-                       dcc_off = true;
-       } else {
-               /* From the GL_ARB_shader_image_load_store extension spec:
-                *
-                *    If a shader performs an image load, store, or atomic
-                *    operation using an image variable declared as an array,
-                *    and if the index used to select an individual element is
-                *    negative or greater than or equal to the size of the
-                *    array, the results of the operation are undefined but may
-                *    not lead to termination.
-                */
-               index = get_bounded_indirect_index(ctx, &image->Indirect,
-                                                  image->Register.Index,
-                                                  SI_NUM_IMAGES);
-       }
-
-       *rsrc = load_image_desc(ctx, rsrc_ptr, index, target);
-       if (dcc_off && target != TGSI_TEXTURE_BUFFER)
-               *rsrc = force_dcc_off(ctx, *rsrc);
-}
-
-static LLVMValueRef image_fetch_coords(
-               struct lp_build_tgsi_context *bld_base,
-               const struct tgsi_full_instruction *inst,
-               unsigned src, LLVMValueRef desc)
-{
-       struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = &ctx->gallivm;
-       LLVMBuilderRef builder = gallivm->builder;
-       unsigned target = inst->Memory.Texture;
-       unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
-       LLVMValueRef coords[4];
-       LLVMValueRef tmp;
-       int chan;
-
-       for (chan = 0; chan < num_coords; ++chan) {
-               tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
-               tmp = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
-               coords[chan] = tmp;
-       }
-
-       if (ctx->screen->b.chip_class >= GFX9) {
-               /* 1D textures are allocated and used as 2D on GFX9. */
-               if (target == TGSI_TEXTURE_1D) {
-                       coords[1] = ctx->i32_0;
-                       num_coords++;
-               } else if (target == TGSI_TEXTURE_1D_ARRAY) {
-                       coords[2] = coords[1];
-                       coords[1] = ctx->i32_0;
-                       num_coords++;
-               } else if (target == TGSI_TEXTURE_2D) {
-                       /* The hw can't bind a slice of a 3D image as a 2D
-                        * image, because it ignores BASE_ARRAY if the target
-                        * is 3D. The workaround is to read BASE_ARRAY and set
-                        * it as the 3rd address operand for all 2D images.
-                        */
-                       LLVMValueRef first_layer, const5, mask;
-
-                       const5 = LLVMConstInt(ctx->i32, 5, 0);
-                       mask = LLVMConstInt(ctx->i32, S_008F24_BASE_ARRAY(~0), 0);
-                       first_layer = LLVMBuildExtractElement(builder, desc, const5, "");
-                       first_layer = LLVMBuildAnd(builder, first_layer, mask, "");
-
-                       coords[2] = first_layer;
-                       num_coords++;
-               }
-       }
-
-       if (num_coords == 1)
-               return coords[0];
-
-       if (num_coords == 3) {
-               /* LLVM has difficulties lowering 3-element vectors. */
-               coords[3] = bld_base->uint_bld.undef;
-               num_coords = 4;
-       }
-
-       return lp_build_gather_values(gallivm, coords, num_coords);
-}
-
-/**
- * Append the extra mode bits that are used by image load and store.
- */
-static void image_append_args(
-               struct si_shader_context *ctx,
-               struct lp_build_emit_data * emit_data,
-               unsigned target,
-               bool atomic,
-               bool force_glc)
-{
-       const struct tgsi_full_instruction *inst = emit_data->inst;
-       LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
-       LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
-       LLVMValueRef r128 = i1false;
-       LLVMValueRef da = tgsi_is_array_image(target) ? i1true : i1false;
-       LLVMValueRef glc =
-               force_glc ||
-               inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
-               i1true : i1false;
-       LLVMValueRef slc = i1false;
-       LLVMValueRef lwe = i1false;
-
-       if (atomic || (HAVE_LLVM <= 0x0309)) {
-               emit_data->args[emit_data->arg_count++] = r128;
-               emit_data->args[emit_data->arg_count++] = da;
-               if (!atomic) {
-                       emit_data->args[emit_data->arg_count++] = glc;
-               }
-               emit_data->args[emit_data->arg_count++] = slc;
-               return;
-       }
-
-       /* HAVE_LLVM >= 0x0400 */
-       emit_data->args[emit_data->arg_count++] = glc;
-       emit_data->args[emit_data->arg_count++] = slc;
-       emit_data->args[emit_data->arg_count++] = lwe;
-       emit_data->args[emit_data->arg_count++] = da;
-}
-
-/**
- * Append the resource and indexing arguments for buffer intrinsics.
- *
- * \param rsrc the v4i32 buffer resource
- * \param index index into the buffer (stride-based)
- * \param offset byte offset into the buffer
- */
-static void buffer_append_args(
-               struct si_shader_context *ctx,
-               struct lp_build_emit_data *emit_data,
-               LLVMValueRef rsrc,
-               LLVMValueRef index,
-               LLVMValueRef offset,
-               bool atomic,
-               bool force_glc)
-{
-       const struct tgsi_full_instruction *inst = emit_data->inst;
-       LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
-       LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
-
-       emit_data->args[emit_data->arg_count++] = rsrc;
-       emit_data->args[emit_data->arg_count++] = index; /* vindex */
-       emit_data->args[emit_data->arg_count++] = offset; /* voffset */
-       if (!atomic) {
-               emit_data->args[emit_data->arg_count++] =
-                       force_glc ||
-                       inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
-                       i1true : i1false; /* glc */
-       }
-       emit_data->args[emit_data->arg_count++] = i1false; /* slc */
-}
-
-static void load_fetch_args(
-               struct lp_build_tgsi_context * bld_base,
-               struct lp_build_emit_data * emit_data)
-{
-       struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = &ctx->gallivm;
-       const struct tgsi_full_instruction * inst = emit_data->inst;
-       unsigned target = inst->Memory.Texture;
-       LLVMValueRef rsrc;
-
-       emit_data->dst_type = ctx->v4f32;
-
-       if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
-               LLVMBuilderRef builder = gallivm->builder;
-               LLVMValueRef offset;
-               LLVMValueRef tmp;
-
-               rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);
-
-               tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
-               offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
-
-               buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
-                                  offset, false, false);
-       } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
-               LLVMValueRef coords;
-
-               image_fetch_rsrc(bld_base, &inst->Src[0], false, target, &rsrc);
-               coords = image_fetch_coords(bld_base, inst, 1, rsrc);
-
-               if (target == TGSI_TEXTURE_BUFFER) {
-                       buffer_append_args(ctx, emit_data, rsrc, coords,
-                                          ctx->i32_0, false, false);
-               } else {
-                       emit_data->args[0] = coords;
-                       emit_data->args[1] = rsrc;
-                       emit_data->args[2] = LLVMConstInt(ctx->i32, 15, 0); /* dmask */
-                       emit_data->arg_count = 3;
-
-                       image_append_args(ctx, emit_data, target, false, false);
-               }
-       }
-}
-
-static unsigned get_load_intr_attribs(bool readonly_memory)
-{
-       /* READNONE means writes can't affect it, while READONLY means that
-        * writes can affect it. */
-       return readonly_memory && HAVE_LLVM >= 0x0400 ?
-                                LP_FUNC_ATTR_READNONE :
-                                LP_FUNC_ATTR_READONLY;
-}
-
-static unsigned get_store_intr_attribs(bool writeonly_memory)
-{
-       return writeonly_memory && HAVE_LLVM >= 0x0400 ?
-                                 LP_FUNC_ATTR_INACCESSIBLE_MEM_ONLY :
-                                 LP_FUNC_ATTR_WRITEONLY;
-}
-
-static void load_emit_buffer(struct si_shader_context *ctx,
-                            struct lp_build_emit_data *emit_data,
-                            bool readonly_memory)
-{
-       const struct tgsi_full_instruction *inst = emit_data->inst;
-       struct gallivm_state *gallivm = &ctx->gallivm;
-       LLVMBuilderRef builder = gallivm->builder;
-       uint writemask = inst->Dst[0].Register.WriteMask;
-       uint count = util_last_bit(writemask);
-       const char *intrinsic_name;
-       LLVMTypeRef dst_type;
-
-       switch (count) {
-       case 1:
-               intrinsic_name = "llvm.amdgcn.buffer.load.f32";
-               dst_type = ctx->f32;
-               break;
-       case 2:
-               intrinsic_name = "llvm.amdgcn.buffer.load.v2f32";
-               dst_type = LLVMVectorType(ctx->f32, 2);
-               break;
-       default: // 3 & 4
-               intrinsic_name = "llvm.amdgcn.buffer.load.v4f32";
-               dst_type = ctx->v4f32;
-               count = 4;
-       }
-
-       emit_data->output[emit_data->chan] = lp_build_intrinsic(
-                       builder, intrinsic_name, dst_type,
-                       emit_data->args, emit_data->arg_count,
-                       get_load_intr_attribs(readonly_memory));
-}
-
-static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx,
-                                   const struct tgsi_full_instruction *inst,
-                                   LLVMTypeRef type, int arg)
-{
-       struct gallivm_state *gallivm = &ctx->gallivm;
-       LLVMBuilderRef builder = gallivm->builder;
-       LLVMValueRef offset, ptr;
-       int addr_space;
-
-       offset = lp_build_emit_fetch(&ctx->bld_base, inst, arg, 0);
-       offset = LLVMBuildBitCast(builder, offset, ctx->i32, "");
-
-       ptr = ctx->shared_memory;
-       ptr = LLVMBuildGEP(builder, ptr, &offset, 1, "");
-       addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
-       ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), "");
-
-       return ptr;
-}
-
-static void load_emit_memory(
-               struct si_shader_context *ctx,
-               struct lp_build_emit_data *emit_data)
-{
-       const struct tgsi_full_instruction *inst = emit_data->inst;
-       struct gallivm_state *gallivm = &ctx->gallivm;
-       LLVMBuilderRef builder = gallivm->builder;
-       unsigned writemask = inst->Dst[0].Register.WriteMask;
-       LLVMValueRef channels[4], ptr, derived_ptr, index;
-       int chan;
-
-       ptr = get_memory_ptr(ctx, inst, ctx->f32, 1);
-
-       for (chan = 0; chan < 4; ++chan) {
-               if (!(writemask & (1 << chan))) {
-                       channels[chan] = LLVMGetUndef(ctx->f32);
-                       continue;
-               }
-
-               index = LLVMConstInt(ctx->i32, chan, 0);
-               derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
-               channels[chan] = LLVMBuildLoad(builder, derived_ptr, "");
-       }
-       emit_data->output[emit_data->chan] = lp_build_gather_values(gallivm, channels, 4);
-}
-
-/**
- * Return true if the memory accessed by a LOAD or STORE instruction is
- * read-only or write-only, respectively.
- *
- * \param shader_buffers_reverse_access_mask
- *     For LOAD, set this to (store | atomic) slot usage in the shader.
- *     For STORE, set this to (load | atomic) slot usage in the shader.
- * \param images_reverse_access_mask  Same as above, but for images.
- */
-static bool is_oneway_access_only(const struct tgsi_full_instruction *inst,
-                                 const struct tgsi_shader_info *info,
-                                 unsigned shader_buffers_reverse_access_mask,
-                                 unsigned images_reverse_access_mask)
-{
-       /* RESTRICT means NOALIAS.
-        * If there are no writes, we can assume the accessed memory is read-only.
-        * If there are no reads, we can assume the accessed memory is write-only.
-        */
-       if (inst->Memory.Qualifier & TGSI_MEMORY_RESTRICT) {
-               unsigned reverse_access_mask;
-
-               if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
-                       reverse_access_mask = shader_buffers_reverse_access_mask;
-               } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
-                       reverse_access_mask = info->images_buffers &
-                                             images_reverse_access_mask;
-               } else {
-                       reverse_access_mask = ~info->images_buffers &
-                                             images_reverse_access_mask;
-               }
-
-               if (inst->Src[0].Register.Indirect) {
-                       if (!reverse_access_mask)
-                               return true;
-               } else {
-                       if (!(reverse_access_mask &
-                             (1u << inst->Src[0].Register.Index)))
-                               return true;
-               }
-       }
-
-       /* If there are no buffer writes (for both shader buffers & image
-        * buffers), it implies that buffer memory is read-only.
-        * If there are no buffer reads (for both shader buffers & image
-        * buffers), it implies that buffer memory is write-only.
-        *
-        * Same for the case when there are no writes/reads for non-buffer
-        * images.
-        */
-       if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
-           (inst->Src[0].Register.File == TGSI_FILE_IMAGE &&
-            inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) {
-               if (!shader_buffers_reverse_access_mask &&
-                   !(info->images_buffers & images_reverse_access_mask))
-                       return true;
-       } else {
-               if (!(~info->images_buffers & images_reverse_access_mask))
-                       return true;
-       }
-       return false;
-}
-
-static void load_emit(
-               const struct lp_build_tgsi_action *action,
-               struct lp_build_tgsi_context *bld_base,
-               struct lp_build_emit_data *emit_data)
-{
-       struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = &ctx->gallivm;
-       LLVMBuilderRef builder = gallivm->builder;
-       const struct tgsi_full_instruction * inst = emit_data->inst;
-       const struct tgsi_shader_info *info = &ctx->shader->selector->info;
-       char intrinsic_name[64];
-       bool readonly_memory = false;
-
-       if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
-               load_emit_memory(ctx, emit_data);
-               return;
-       }
-
-       if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
-               emit_waitcnt(ctx, VM_CNT);
-
-       readonly_memory = !(inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE) &&
-                         is_oneway_access_only(inst, info,
-                                               info->shader_buffers_store |
-                                               info->shader_buffers_atomic,
-                                               info->images_store |
-                                               info->images_atomic);
-
-       if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
-               load_emit_buffer(ctx, emit_data, readonly_memory);
-               return;
-       }
-
-       if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
-               emit_data->output[emit_data->chan] =
-                       lp_build_intrinsic(
-                               builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type,
-                               emit_data->args, emit_data->arg_count,
-                               get_load_intr_attribs(readonly_memory));
-       } else {
-               ac_get_image_intr_name("llvm.amdgcn.image.load",
-                                      emit_data->dst_type,             /* vdata */
-                                      LLVMTypeOf(emit_data->args[0]), /* coords */
-                                      LLVMTypeOf(emit_data->args[1]), /* rsrc */
-                                      intrinsic_name, sizeof(intrinsic_name));
-
-               emit_data->output[emit_data->chan] =
-                       lp_build_intrinsic(
-                               builder, intrinsic_name, emit_data->dst_type,
-                               emit_data->args, emit_data->arg_count,
-                               get_load_intr_attribs(readonly_memory));
-       }
-}
-
-static void store_fetch_args(
-               struct lp_build_tgsi_context * bld_base,
-               struct lp_build_emit_data * emit_data)
-{
-       struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = &ctx->gallivm;
-       LLVMBuilderRef builder = gallivm->builder;
-       const struct tgsi_full_instruction * inst = emit_data->inst;
-       struct tgsi_full_src_register memory;
-       LLVMValueRef chans[4];
-       LLVMValueRef data;
-       LLVMValueRef rsrc;
-       unsigned chan;
-
-       emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context);
-
-       for (chan = 0; chan < 4; ++chan) {
-               chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
-       }
-       data = lp_build_gather_values(gallivm, chans, 4);
-
-       emit_data->args[emit_data->arg_count++] = data;
-
-       memory = tgsi_full_src_register_from_dst(&inst->Dst[0]);
-
-       if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
-               LLVMValueRef offset;
-               LLVMValueRef tmp;
-
-               rsrc = shader_buffer_fetch_rsrc(ctx, &memory);
-
-               tmp = lp_build_emit_fetch(bld_base, inst, 0, 0);
-               offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
-
-               buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
-                                  offset, false, false);
-       } else if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE) {
-               unsigned target = inst->Memory.Texture;
-               LLVMValueRef coords;
-
-               /* 8bit/16bit TC L1 write corruption bug on SI.
-                * All store opcodes not aligned to a dword are affected.
-                *
-                * The only way to get unaligned stores in radeonsi is through
-                * shader images.
-                */
-               bool force_glc = ctx->screen->b.chip_class == SI;
-
-               image_fetch_rsrc(bld_base, &memory, true, target, &rsrc);
-               coords = image_fetch_coords(bld_base, inst, 0, rsrc);
-
-               if (target == TGSI_TEXTURE_BUFFER) {
-                       buffer_append_args(ctx, emit_data, rsrc, coords,
-                                          ctx->i32_0, false, force_glc);
-               } else {
-                       emit_data->args[1] = coords;
-                       emit_data->args[2] = rsrc;
-                       emit_data->args[3] = LLVMConstInt(ctx->i32, 15, 0); /* dmask */
-                       emit_data->arg_count = 4;
-
-                       image_append_args(ctx, emit_data, target, false, force_glc);
-               }
-       }
-}
-
-static void store_emit_buffer(
-               struct si_shader_context *ctx,
-               struct lp_build_emit_data *emit_data,
-               bool writeonly_memory)
-{
-       const struct tgsi_full_instruction *inst = emit_data->inst;
-       struct gallivm_state *gallivm = &ctx->gallivm;
-       LLVMBuilderRef builder = gallivm->builder;
-       LLVMValueRef base_data = emit_data->args[0];
-       LLVMValueRef base_offset = emit_data->args[3];
-       unsigned writemask = inst->Dst[0].Register.WriteMask;
-
-       while (writemask) {
-               int start, count;
-               const char *intrinsic_name;
-               LLVMValueRef data;
-               LLVMValueRef offset;
-               LLVMValueRef tmp;
-
-               u_bit_scan_consecutive_range(&writemask, &start, &count);
-
-               /* Due to an LLVM limitation, split 3-element writes
-                * into a 2-element and a 1-element write. */
-               if (count == 3) {
-                       writemask |= 1 << (start + 2);
-                       count = 2;
-               }
-
-               if (count == 4) {
-                       data = base_data;
-                       intrinsic_name = "llvm.amdgcn.buffer.store.v4f32";
-               } else if (count == 2) {
-                       LLVMTypeRef v2f32 = LLVMVectorType(ctx->f32, 2);
-
-                       tmp = LLVMBuildExtractElement(
-                               builder, base_data,
-                               LLVMConstInt(ctx->i32, start, 0), "");
-                       data = LLVMBuildInsertElement(
-                               builder, LLVMGetUndef(v2f32), tmp,
-                               ctx->i32_0, "");
-
-                       tmp = LLVMBuildExtractElement(
-                               builder, base_data,
-                               LLVMConstInt(ctx->i32, start + 1, 0), "");
-                       data = LLVMBuildInsertElement(
-                               builder, data, tmp, ctx->i32_1, "");
-
-                       intrinsic_name = "llvm.amdgcn.buffer.store.v2f32";
-               } else {
-                       assert(count == 1);
-                       data = LLVMBuildExtractElement(
-                               builder, base_data,
-                               LLVMConstInt(ctx->i32, start, 0), "");
-                       intrinsic_name = "llvm.amdgcn.buffer.store.f32";
-               }
-
-               offset = base_offset;
-               if (start != 0) {
-                       offset = LLVMBuildAdd(
-                               builder, offset,
-                               LLVMConstInt(ctx->i32, start * 4, 0), "");
-               }
-
-               emit_data->args[0] = data;
-               emit_data->args[3] = offset;
-
-               lp_build_intrinsic(
-                       builder, intrinsic_name, emit_data->dst_type,
-                       emit_data->args, emit_data->arg_count,
-                       get_store_intr_attribs(writeonly_memory));
-       }
-}
-
-static void store_emit_memory(
-               struct si_shader_context *ctx,
-               struct lp_build_emit_data *emit_data)
-{
-       const struct tgsi_full_instruction *inst = emit_data->inst;
-       struct gallivm_state *gallivm = &ctx->gallivm;
-       LLVMBuilderRef builder = gallivm->builder;
-       unsigned writemask = inst->Dst[0].Register.WriteMask;
-       LLVMValueRef ptr, derived_ptr, data, index;
-       int chan;
-
-       ptr = get_memory_ptr(ctx, inst, ctx->f32, 0);
-
-       for (chan = 0; chan < 4; ++chan) {
-               if (!(writemask & (1 << chan))) {
-                       continue;
-               }
-               data = lp_build_emit_fetch(&ctx->bld_base, inst, 1, chan);
-               index = LLVMConstInt(ctx->i32, chan, 0);
-               derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
-               LLVMBuildStore(builder, data, derived_ptr);
-       }
-}
-
-static void store_emit(
-               const struct lp_build_tgsi_action *action,
-               struct lp_build_tgsi_context *bld_base,
-               struct lp_build_emit_data *emit_data)
-{
-       struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = &ctx->gallivm;
-       LLVMBuilderRef builder = gallivm->builder;
-       const struct tgsi_full_instruction * inst = emit_data->inst;
-       const struct tgsi_shader_info *info = &ctx->shader->selector->info;
-       unsigned target = inst->Memory.Texture;
-       char intrinsic_name[64];
-       bool writeonly_memory = false;
-
-       if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) {
-               store_emit_memory(ctx, emit_data);
-               return;
-       }
-
-       if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
-               emit_waitcnt(ctx, VM_CNT);
-
-       writeonly_memory = is_oneway_access_only(inst, info,
-                                                info->shader_buffers_load |
-                                                info->shader_buffers_atomic,
-                                                info->images_load |
-                                                info->images_atomic);
-
-       if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
-               store_emit_buffer(ctx, emit_data, writeonly_memory);
-               return;
-       }
-
-       if (target == TGSI_TEXTURE_BUFFER) {
-               emit_data->output[emit_data->chan] = lp_build_intrinsic(
-                       builder, "llvm.amdgcn.buffer.store.format.v4f32",
-                       emit_data->dst_type, emit_data->args,
-                       emit_data->arg_count,
-                       get_store_intr_attribs(writeonly_memory));
-       } else {
-               ac_get_image_intr_name("llvm.amdgcn.image.store",
-                                      LLVMTypeOf(emit_data->args[0]), /* vdata */
-                                      LLVMTypeOf(emit_data->args[1]), /* coords */
-                                      LLVMTypeOf(emit_data->args[2]), /* rsrc */
-                                      intrinsic_name, sizeof(intrinsic_name));
-
-               emit_data->output[emit_data->chan] =
-                       lp_build_intrinsic(
-                               builder, intrinsic_name, emit_data->dst_type,
-                               emit_data->args, emit_data->arg_count,
-                               get_store_intr_attribs(writeonly_memory));
-       }
-}
-
-static void atomic_fetch_args(
-               struct lp_build_tgsi_context * bld_base,
-               struct lp_build_emit_data * emit_data)
-{
-       struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = &ctx->gallivm;
-       LLVMBuilderRef builder = gallivm->builder;
-       const struct tgsi_full_instruction * inst = emit_data->inst;
-       LLVMValueRef data1, data2;
-       LLVMValueRef rsrc;
-       LLVMValueRef tmp;
-
-       emit_data->dst_type = ctx->f32;
-
-       tmp = lp_build_emit_fetch(bld_base, inst, 2, 0);
-       data1 = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
-
-       if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
-               tmp = lp_build_emit_fetch(bld_base, inst, 3, 0);
-               data2 = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
-       }
-
-       /* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order
-        * of arguments, which is reversed relative to TGSI (and GLSL)
-        */
-       if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
-               emit_data->args[emit_data->arg_count++] = data2;
-       emit_data->args[emit_data->arg_count++] = data1;
-
-       if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
-               LLVMValueRef offset;
-
-               rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);
-
-               tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
-               offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
-
-               buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
-                                  offset, true, false);
-       } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
-               unsigned target = inst->Memory.Texture;
-               LLVMValueRef coords;
-
-               image_fetch_rsrc(bld_base, &inst->Src[0], true, target, &rsrc);
-               coords = image_fetch_coords(bld_base, inst, 1, rsrc);
-
-               if (target == TGSI_TEXTURE_BUFFER) {
-                       buffer_append_args(ctx, emit_data, rsrc, coords,
-                                          ctx->i32_0, true, false);
-               } else {
-                       emit_data->args[emit_data->arg_count++] = coords;
-                       emit_data->args[emit_data->arg_count++] = rsrc;
-
-                       image_append_args(ctx, emit_data, target, true, false);
-               }
-       }
-}
-
-static void atomic_emit_memory(struct si_shader_context *ctx,
-                               struct lp_build_emit_data *emit_data) {
-       struct gallivm_state *gallivm = &ctx->gallivm;
-       LLVMBuilderRef builder = gallivm->builder;
-       const struct tgsi_full_instruction * inst = emit_data->inst;
-       LLVMValueRef ptr, result, arg;
-
-       ptr = get_memory_ptr(ctx, inst, ctx->i32, 1);
-
-       arg = lp_build_emit_fetch(&ctx->bld_base, inst, 2, 0);
-       arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
-
-       if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
-               LLVMValueRef new_data;
-               new_data = lp_build_emit_fetch(&ctx->bld_base,
-                                              inst, 3, 0);
-
-               new_data = LLVMBuildBitCast(builder, new_data, ctx->i32, "");
-
-#if HAVE_LLVM >= 0x309
-               result = LLVMBuildAtomicCmpXchg(builder, ptr, arg, new_data,
-                                      LLVMAtomicOrderingSequentiallyConsistent,
-                                      LLVMAtomicOrderingSequentiallyConsistent,
-                                      false);
-#endif
-
-               result = LLVMBuildExtractValue(builder, result, 0, "");
-       } else {
-               LLVMAtomicRMWBinOp op;
-
-               switch(inst->Instruction.Opcode) {
-                       case TGSI_OPCODE_ATOMUADD:
-                               op = LLVMAtomicRMWBinOpAdd;
-                               break;
-                       case TGSI_OPCODE_ATOMXCHG:
-                               op = LLVMAtomicRMWBinOpXchg;
-                               break;
-                       case TGSI_OPCODE_ATOMAND:
-                               op = LLVMAtomicRMWBinOpAnd;
-                               break;
-                       case TGSI_OPCODE_ATOMOR:
-                               op = LLVMAtomicRMWBinOpOr;
-                               break;
-                       case TGSI_OPCODE_ATOMXOR:
-                               op = LLVMAtomicRMWBinOpXor;
-                               break;
-                       case TGSI_OPCODE_ATOMUMIN:
-                               op = LLVMAtomicRMWBinOpUMin;
-                               break;
-                       case TGSI_OPCODE_ATOMUMAX:
-                               op = LLVMAtomicRMWBinOpUMax;
-                               break;
-                       case TGSI_OPCODE_ATOMIMIN:
-                               op = LLVMAtomicRMWBinOpMin;
-                               break;
-                       case TGSI_OPCODE_ATOMIMAX:
-                               op = LLVMAtomicRMWBinOpMax;
-                               break;
-                       default:
-                               unreachable("unknown atomic opcode");
-               }
-
-               result = LLVMBuildAtomicRMW(builder, op, ptr, arg,
-                                      LLVMAtomicOrderingSequentiallyConsistent,
-                                      false);
-       }
-       emit_data->output[emit_data->chan] = LLVMBuildBitCast(builder, result, emit_data->dst_type, "");
-}
-
-static void atomic_emit(
-               const struct lp_build_tgsi_action *action,
-               struct lp_build_tgsi_context *bld_base,
-               struct lp_build_emit_data *emit_data)
-{
-       struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = &ctx->gallivm;
-       LLVMBuilderRef builder = gallivm->builder;
-       const struct tgsi_full_instruction * inst = emit_data->inst;
-       char intrinsic_name[40];
-       LLVMValueRef tmp;
-
-       if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
-               atomic_emit_memory(ctx, emit_data);
-               return;
-       }
-
-       if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
-           inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
-               snprintf(intrinsic_name, sizeof(intrinsic_name),
-                        "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
-       } else {
-               LLVMValueRef coords;
-               char coords_type[8];
-
-               if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
-                       coords = emit_data->args[2];
-               else
-                       coords = emit_data->args[1];
-
-               ac_build_type_name_for_intr(LLVMTypeOf(coords), coords_type, sizeof(coords_type));
-               snprintf(intrinsic_name, sizeof(intrinsic_name),
-                        "llvm.amdgcn.image.atomic.%s.%s",
-                        action->intr_name, coords_type);
-       }
-
-       tmp = lp_build_intrinsic(
-               builder, intrinsic_name, ctx->i32,
-               emit_data->args, emit_data->arg_count, 0);
-       emit_data->output[emit_data->chan] =
-               LLVMBuildBitCast(builder, tmp, ctx->f32, "");
-}
-
-static void set_tex_fetch_args(struct si_shader_context *ctx,
-                              struct lp_build_emit_data *emit_data,
-                              unsigned target,
-                              LLVMValueRef res_ptr, LLVMValueRef samp_ptr,
-                              LLVMValueRef *param, unsigned count,
-                              unsigned dmask)
-{
-       struct gallivm_state *gallivm = &ctx->gallivm;
-       struct ac_image_args args = {};
-
-       /* Pad to power of two vector */
-       while (count < util_next_power_of_two(count))
-               param[count++] = LLVMGetUndef(ctx->i32);
-
-       if (count > 1)
-               args.addr = lp_build_gather_values(gallivm, param, count);
-       else
-               args.addr = param[0];
-
-       args.resource = res_ptr;
-       args.sampler = samp_ptr;
-       args.dmask = dmask;
-       args.unorm = target == TGSI_TEXTURE_RECT ||
-                    target == TGSI_TEXTURE_SHADOWRECT;
-       args.da = tgsi_is_array_sampler(target);
-
-       /* Ugly, but we seem to have no other choice right now. */
-       STATIC_ASSERT(sizeof(args) <= sizeof(emit_data->args));
-       memcpy(emit_data->args, &args, sizeof(args));
-}
-
-static LLVMValueRef fix_resinfo(struct si_shader_context *ctx,
-                               unsigned target, LLVMValueRef out)
-{
-       LLVMBuilderRef builder = ctx->gallivm.builder;
-
-       /* 1D textures are allocated and used as 2D on GFX9. */
-        if (ctx->screen->b.chip_class >= GFX9 &&
-           (target == TGSI_TEXTURE_1D_ARRAY ||
-            target == TGSI_TEXTURE_SHADOW1D_ARRAY)) {
-               LLVMValueRef layers =
-                       LLVMBuildExtractElement(builder, out,
-                                               LLVMConstInt(ctx->i32, 2, 0), "");
-               out = LLVMBuildInsertElement(builder, out, layers,
-                                            ctx->i32_1, "");
-       }
-
-       /* Divide the number of layers by 6 to get the number of cubes. */
-       if (target == TGSI_TEXTURE_CUBE_ARRAY ||
-           target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
-               LLVMValueRef imm2 = LLVMConstInt(ctx->i32, 2, 0);
-
-               LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
-               z = LLVMBuildSDiv(builder, z, LLVMConstInt(ctx->i32, 6, 0), "");
-
-               out = LLVMBuildInsertElement(builder, out, z, imm2, "");
-       }
-       return out;
-}
-
-static void resq_fetch_args(
-               struct lp_build_tgsi_context * bld_base,
-               struct lp_build_emit_data * emit_data)
-{
-       struct si_shader_context *ctx = si_shader_context(bld_base);
-       const struct tgsi_full_instruction *inst = emit_data->inst;
-       const struct tgsi_full_src_register *reg = &inst->Src[0];
-
-       emit_data->dst_type = ctx->v4i32;
-
-       if (reg->Register.File == TGSI_FILE_BUFFER) {
-               emit_data->args[0] = shader_buffer_fetch_rsrc(ctx, reg);
-               emit_data->arg_count = 1;
-       } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
-               image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture,
-                                &emit_data->args[0]);
-               emit_data->arg_count = 1;
-       } else {
-               LLVMValueRef res_ptr;
-               unsigned image_target;
-
-               if (inst->Memory.Texture == TGSI_TEXTURE_3D)
-                       image_target = TGSI_TEXTURE_2D_ARRAY;
-               else
-                       image_target = inst->Memory.Texture;
-
-               image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture,
-                                &res_ptr);
-               set_tex_fetch_args(ctx, emit_data, image_target,
-                                  res_ptr, NULL, &ctx->i32_0, 1,
-                                  0xf);
-       }
-}
-
-static void resq_emit(
-               const struct lp_build_tgsi_action *action,
-               struct lp_build_tgsi_context *bld_base,
-               struct lp_build_emit_data *emit_data)
-{
-       struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = &ctx->gallivm;
-       LLVMBuilderRef builder = gallivm->builder;
-       const struct tgsi_full_instruction *inst = emit_data->inst;
-       LLVMValueRef out;
-
-       if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
-               out = LLVMBuildExtractElement(builder, emit_data->args[0],
-                                             LLVMConstInt(ctx->i32, 2, 0), "");
-       } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
-               out = get_buffer_size(bld_base, emit_data->args[0]);
-       } else {
-               struct ac_image_args args;
-
-               memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
-               args.opcode = ac_image_get_resinfo;
-               out = ac_build_image_opcode(&ctx->ac, &args);
-
-               out = fix_resinfo(ctx, inst->Memory.Texture, out);
-       }
-
-       emit_data->output[emit_data->chan] = out;
-}
-
-static const struct lp_build_tgsi_action tex_action;
-
-enum desc_type {
-       DESC_IMAGE,
-       DESC_BUFFER,
-       DESC_FMASK,
-       DESC_SAMPLER,
-};
-
-/**
- * Load an image view, fmask view. or sampler state descriptor.
- */
-static LLVMValueRef load_sampler_desc(struct si_shader_context *ctx,
-                                     LLVMValueRef list, LLVMValueRef index,
-                                     enum desc_type type)
-{
-       struct gallivm_state *gallivm = &ctx->gallivm;
-       LLVMBuilderRef builder = gallivm->builder;
-
-       switch (type) {
-       case DESC_IMAGE:
-               /* The image is at [0:7]. */
-               index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
-               break;
-       case DESC_BUFFER:
-               /* The buffer is in [4:7]. */
-               index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
-               index = LLVMBuildAdd(builder, index, ctx->i32_1, "");
-               list = LLVMBuildPointerCast(builder, list,
-                                           const_array(ctx->v4i32, 0), "");
-               break;
-       case DESC_FMASK:
-               /* The FMASK is at [8:15]. */
-               index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
-               index = LLVMBuildAdd(builder, index, ctx->i32_1, "");
-               break;
-       case DESC_SAMPLER:
-               /* The sampler state is at [12:15]. */
-               index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
-               index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), "");
-               list = LLVMBuildPointerCast(builder, list,
-                                           const_array(ctx->v4i32, 0), "");
-               break;
-       }
-
-       return ac_build_indexed_load_const(&ctx->ac, list, index);
-}
-
-/* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
- *
- * SI-CI:
- *   If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
- *   filtering manually. The driver sets img7 to a mask clearing
- *   MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
- *     s_and_b32 samp0, samp0, img7
- *
- * VI:
- *   The ANISO_OVERRIDE sampler field enables this fix in TA.
- */
-static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx,
-                                          LLVMValueRef res, LLVMValueRef samp)
-{
-       LLVMBuilderRef builder = ctx->gallivm.builder;
-       LLVMValueRef img7, samp0;
-
-       if (ctx->screen->b.chip_class >= VI)
-               return samp;
-
-       img7 = LLVMBuildExtractElement(builder, res,
-                                      LLVMConstInt(ctx->i32, 7, 0), "");
-       samp0 = LLVMBuildExtractElement(builder, samp,
-                                       ctx->i32_0, "");
-       samp0 = LLVMBuildAnd(builder, samp0, img7, "");
-       return LLVMBuildInsertElement(builder, samp, samp0,
-                                     ctx->i32_0, "");
-}
-
-static void tex_fetch_ptrs(
-       struct lp_build_tgsi_context *bld_base,
-       struct lp_build_emit_data *emit_data,
-       LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr)
-{
-       struct si_shader_context *ctx = si_shader_context(bld_base);
-       LLVMValueRef list = LLVMGetParam(ctx->main_fn, ctx->param_samplers);
-       const struct tgsi_full_instruction *inst = emit_data->inst;
-       const struct tgsi_full_src_register *reg;
-       unsigned target = inst->Texture.Texture;
-       unsigned sampler_src;
-       LLVMValueRef index;
-
-       sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
-       reg = &emit_data->inst->Src[sampler_src];
-
-       if (reg->Register.Indirect) {
-               index = get_bounded_indirect_index(ctx,
-                                                  &reg->Indirect,
-                                                  reg->Register.Index,
-                                                  SI_NUM_SAMPLERS);
-       } else {
-               index = LLVMConstInt(ctx->i32, reg->Register.Index, 0);
-       }
-
-       if (target == TGSI_TEXTURE_BUFFER)
-               *res_ptr = load_sampler_desc(ctx, list, index, DESC_BUFFER);
-       else
-               *res_ptr = load_sampler_desc(ctx, list, index, DESC_IMAGE);
-
-       if (samp_ptr)
-               *samp_ptr = NULL;
-       if (fmask_ptr)
-               *fmask_ptr = NULL;
-
-       if (target == TGSI_TEXTURE_2D_MSAA ||
-           target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
-               if (fmask_ptr)
-                       *fmask_ptr = load_sampler_desc(ctx, list, index,
-                                                      DESC_FMASK);
-       } else if (target != TGSI_TEXTURE_BUFFER) {
-               if (samp_ptr) {
-                       *samp_ptr = load_sampler_desc(ctx, list, index,
-                                                     DESC_SAMPLER);
-                       *samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
-               }
-       }
-}
-
-static void txq_fetch_args(
-       struct lp_build_tgsi_context *bld_base,
-       struct lp_build_emit_data *emit_data)
-{
-       struct si_shader_context *ctx = si_shader_context(bld_base);
-       const struct tgsi_full_instruction *inst = emit_data->inst;
-       unsigned target = inst->Texture.Texture;
-       LLVMValueRef res_ptr;
-       LLVMValueRef address;
-
-       tex_fetch_ptrs(bld_base, emit_data, &res_ptr, NULL, NULL);
-
-       if (target == TGSI_TEXTURE_BUFFER) {
-               /* Read the size from the buffer descriptor directly. */
-               emit_data->args[0] = get_buffer_size(bld_base, res_ptr);
-               return;
-       }
-
-       /* Textures - set the mip level. */
-       address = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);
-
-       set_tex_fetch_args(ctx, emit_data, target, res_ptr,
-                          NULL, &address, 1, 0xf);
-}
-
-static void txq_emit(const struct lp_build_tgsi_action *action,
-                    struct lp_build_tgsi_context *bld_base,
-                    struct lp_build_emit_data *emit_data)
-{
-       struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct ac_image_args args;
-       unsigned target = emit_data->inst->Texture.Texture;
-
-       if (target == TGSI_TEXTURE_BUFFER) {
-               /* Just return the buffer size. */
-               emit_data->output[emit_data->chan] = emit_data->args[0];
-               return;
-       }
-
-       memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
-
-       args.opcode = ac_image_get_resinfo;
-       LLVMValueRef result = ac_build_image_opcode(&ctx->ac, &args);
-
-       emit_data->output[emit_data->chan] = fix_resinfo(ctx, target, result);
-}
-
-static void tex_fetch_args(
-       struct lp_build_tgsi_context *bld_base,
-       struct lp_build_emit_data *emit_data)
-{
-       struct si_shader_context *ctx = si_shader_context(bld_base);
-       struct gallivm_state *gallivm = &ctx->gallivm;
-       const struct tgsi_full_instruction *inst = emit_data->inst;
-       unsigned opcode = inst->Instruction.Opcode;
-       unsigned target = inst->Texture.Texture;
-       LLVMValueRef coords[5], derivs[6];
-       LLVMValueRef address[16];
-       unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
-       int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
-       unsigned count = 0;
-       unsigned chan;
-       unsigned num_deriv_channels = 0;
-       bool has_offset = inst->Texture.NumOffsets > 0;
-       LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
-       unsigned dmask = 0xf;
-
-       tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
-
-       if (target == TGSI_TEXTURE_BUFFER) {
-               emit_data->dst_type = ctx->v4f32;
-               emit_data->args[0] = LLVMBuildBitCast(gallivm->builder, res_ptr,
-                                                     ctx->v16i8, "");
-               emit_data->args[1] = ctx->i32_0;
-               emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
-               emit_data->arg_count = 3;
-               return;
-       }
-
-       /* Fetch and project texture coordinates */
-       coords[3] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
-       for (chan = 0; chan < 3; chan++ ) {
-               coords[chan] = lp_build_emit_fetch(bld_base,
-                                                  emit_data->inst, 0,
-                                                  chan);
-               if (opcode == TGSI_OPCODE_TXP)
-                       coords[chan] = lp_build_emit_llvm_binary(bld_base,
-                                                                TGSI_OPCODE_DIV,
-                                                                coords[chan],
-                                                                coords[3]);
-       }
-
-       if (opcode == TGSI_OPCODE_TXP)
-               coords[3] = bld_base->base.one;
-
-       /* Pack offsets. */
-       if (has_offset &&
-           opcode != TGSI_OPCODE_TXF &&
-           opcode != TGSI_OPCODE_TXF_LZ) {
-               /* The offsets are six-bit signed integers packed like this:
-                *   X=[5:0], Y=[13:8], and Z=[21:16].
-                */
-               LLVMValueRef offset[3], pack;
-
-               assert(inst->Texture.NumOffsets == 1);
-
-               for (chan = 0; chan < 3; chan++) {
-                       offset[chan] = lp_build_emit_fetch_texoffset(bld_base,
-                                                                    emit_data->inst, 0, chan);
-                       offset[chan] = LLVMBuildAnd(gallivm->builder, offset[chan],
-                                                   LLVMConstInt(ctx->i32, 0x3f, 0), "");
-                       if (chan)
-                               offset[chan] = LLVMBuildShl(gallivm->builder, offset[chan],
-                                                           LLVMConstInt(ctx->i32, chan*8, 0), "");
-               }
-
-               pack = LLVMBuildOr(gallivm->builder, offset[0], offset[1], "");
-               pack = LLVMBuildOr(gallivm->builder, pack, offset[2], "");
-               address[count++] = pack;
-       }
-
-       /* Pack LOD bias value */
-       if (opcode == TGSI_OPCODE_TXB)
-               address[count++] = coords[3];
-       if (opcode == TGSI_OPCODE_TXB2)
-               address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
-
-       /* Pack depth comparison value */
-       if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
-               LLVMValueRef z;
-
-               if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
-                       z = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
-               } else {
-                       assert(ref_pos >= 0);
-                       z = coords[ref_pos];
-               }
-
-               /* TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT,
-                * so the depth comparison value isn't clamped for Z16 and
-                * Z24 anymore. Do it manually here.
-                *
-                * It's unnecessary if the original texture format was
-                * Z32_FLOAT, but we don't know that here.
-                */
-               if (ctx->screen->b.chip_class == VI)
-                       z = ac_build_clamp(&ctx->ac, z);
-
-               address[count++] = z;
-       }
-
-       /* Pack user derivatives */
-       if (opcode == TGSI_OPCODE_TXD) {
-               int param, num_src_deriv_channels, num_dst_deriv_channels;
-
-               switch (target) {
-               case TGSI_TEXTURE_3D:
-                       num_src_deriv_channels = 3;
-                       num_dst_deriv_channels = 3;
-                       num_deriv_channels = 3;
-                       break;
-               case TGSI_TEXTURE_2D:
-               case TGSI_TEXTURE_SHADOW2D:
-               case TGSI_TEXTURE_RECT:
-               case TGSI_TEXTURE_SHADOWRECT:
-               case TGSI_TEXTURE_2D_ARRAY:
-               case TGSI_TEXTURE_SHADOW2D_ARRAY:
-                       num_src_deriv_channels = 2;
-                       num_dst_deriv_channels = 2;
-                       num_deriv_channels = 2;
-                       break;
-               case TGSI_TEXTURE_CUBE:
-               case TGSI_TEXTURE_SHADOWCUBE:
-               case TGSI_TEXTURE_CUBE_ARRAY:
-               case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
-                       /* Cube derivatives will be converted to 2D. */
-                       num_src_deriv_channels = 3;
-                       num_dst_deriv_channels = 3;
-                       num_deriv_channels = 2;
-                       break;
-               case TGSI_TEXTURE_1D:
-               case TGSI_TEXTURE_SHADOW1D:
-               case TGSI_TEXTURE_1D_ARRAY:
-               case TGSI_TEXTURE_SHADOW1D_ARRAY:
-                       num_src_deriv_channels = 1;
-
-                       /* 1D textures are allocated and used as 2D on GFX9. */
-                       if (ctx->screen->b.chip_class >= GFX9) {
-                               num_dst_deriv_channels = 2;
-                               num_deriv_channels = 2;
-                       } else {
-                               num_dst_deriv_channels = 1;
-                               num_deriv_channels = 1;
-                       }
-                       break;
-               default:
-                       unreachable("invalid target");
-               }
-
-               for (param = 0; param < 2; param++) {
-                       for (chan = 0; chan < num_src_deriv_channels; chan++)
-                               derivs[param * num_dst_deriv_channels + chan] =
-                                       lp_build_emit_fetch(bld_base, inst, param+1, chan);
-
-                       /* Fill in the rest with zeros. */
-                       for (chan = num_src_deriv_channels;
-                            chan < num_dst_deriv_channels; chan++)
-                               derivs[param * num_dst_deriv_channels + chan] =
-                                       bld_base->base.zero;
-               }
-       }
-
-       if (target == TGSI_TEXTURE_CUBE ||
-           target == TGSI_TEXTURE_CUBE_ARRAY ||
-           target == TGSI_TEXTURE_SHADOWCUBE ||
-           target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
-               ac_prepare_cube_coords(&ctx->ac,
-                                      opcode == TGSI_OPCODE_TXD,
-                                      target == TGSI_TEXTURE_CUBE_ARRAY ||
-                                      target == TGSI_TEXTURE_SHADOWCUBE_ARRAY,
-                                      coords, derivs);
-
-       if (opcode == TGSI_OPCODE_TXD)
-               for (int i = 0; i < num_deriv_channels * 2; i++)
-                       address[count++] = derivs[i];
-
-       /* Pack texture coordinates */
-       address[count++] = coords[0];
-       if (num_coords > 1)
-               address[count++] = coords[1];
-       if (num_coords > 2)
-               address[count++] = coords[2];
-
-       /* 1D textures are allocated and used as 2D on GFX9. */
-       if (ctx->screen->b.chip_class >= GFX9) {
-               LLVMValueRef filler;
-
-               /* Use 0.5, so that we don't sample the border color. */
-               if (opcode == TGSI_OPCODE_TXF)
-                       filler = ctx->i32_0;
-               else
-                       filler = LLVMConstReal(ctx->f32, 0.5);
-
-               if (target == TGSI_TEXTURE_1D ||
-                   target == TGSI_TEXTURE_SHADOW1D) {
-                       address[count++] = filler;
-               } else if (target == TGSI_TEXTURE_1D_ARRAY ||
-                          target == TGSI_TEXTURE_SHADOW1D_ARRAY) {
-                       address[count] = address[count - 1];
-                       address[count - 1] = filler;
-                       count++;
+                       memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c]));
                }
-       }
+       } else {
+               struct ac_export_args args;
 
-       /* Pack LOD or sample index */
-       if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF)
-               address[count++] = coords[3];
-       else if (opcode == TGSI_OPCODE_TXL2)
-               address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
+               /* Export */
+               si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
+                                        &args);
+               if (is_last) {
+                       args.valid_mask = 1; /* whether the EXEC mask is valid */
+                       args.done = 1; /* DONE bit */
+               } else if (!args.enabled_channels)
+                       return; /* unnecessary NULL export */
 
-       if (count > 16) {
-               assert(!"Cannot handle more than 16 texture address parameters");
-               count = 16;
+               memcpy(&exp->args[exp->num++], &args, sizeof(args));
        }
+}
 
-       for (chan = 0; chan < count; chan++ ) {
-               address[chan] = LLVMBuildBitCast(gallivm->builder,
-                                                address[chan], ctx->i32, "");
-       }
+static void si_emit_ps_exports(struct si_shader_context *ctx,
+                              struct si_ps_exports *exp)
+{
+       for (unsigned i = 0; i < exp->num; i++)
+               ac_build_export(&ctx->ac, &exp->args[i]);
+}
 
-       /* Adjust the sample index according to FMASK.
-        *
-        * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
-        * which is the identity mapping. Each nibble says which physical sample
-        * should be fetched to get that sample.
-        *
-        * For example, 0x11111100 means there are only 2 samples stored and
-        * the second sample covers 3/4 of the pixel. When reading samples 0
-        * and 1, return physical sample 0 (determined by the first two 0s
-        * in FMASK), otherwise return physical sample 1.
-        *
-        * The sample index should be adjusted as follows:
-        *   sample_index = (fmask >> (sample_index * 4)) & 0xF;
-        */
-       if (target == TGSI_TEXTURE_2D_MSAA ||
-           target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
-               struct lp_build_emit_data txf_emit_data = *emit_data;
-               LLVMValueRef txf_address[4];
-               /* We only need .xy for non-arrays, and .xyz for arrays. */
-               unsigned txf_count = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3;
-               struct tgsi_full_instruction inst = {};
-
-               memcpy(txf_address, address, sizeof(txf_address));
-
-               /* Read FMASK using TXF_LZ. */
-               inst.Instruction.Opcode = TGSI_OPCODE_TXF_LZ;
-               inst.Texture.Texture = target;
-               txf_emit_data.inst = &inst;
-               txf_emit_data.chan = 0;
-               set_tex_fetch_args(ctx, &txf_emit_data,
-                                  target, fmask_ptr, NULL,
-                                  txf_address, txf_count, 0xf);
-               build_tex_intrinsic(&tex_action, bld_base, &txf_emit_data);
-
-               /* Initialize some constants. */
-               LLVMValueRef four = LLVMConstInt(ctx->i32, 4, 0);
-               LLVMValueRef F = LLVMConstInt(ctx->i32, 0xF, 0);
-
-               /* Apply the formula. */
-               LLVMValueRef fmask =
-                       LLVMBuildExtractElement(gallivm->builder,
-                                               txf_emit_data.output[0],
-                                               ctx->i32_0, "");
+static void si_export_null(struct lp_build_tgsi_context *bld_base)
+{
+       struct si_shader_context *ctx = si_shader_context(bld_base);
+       struct lp_build_context *base = &bld_base->base;
+       struct ac_export_args args;
 
-               unsigned sample_chan = txf_count; /* the sample index is last */
+       args.enabled_channels = 0x0; /* enabled channels */
+       args.valid_mask = 1; /* whether the EXEC mask is valid */
+       args.done = 1; /* DONE bit */
+       args.target = V_008DFC_SQ_EXP_NULL;
+       args.compr = 0; /* COMPR flag (0 = 32-bit export) */
+       args.out[0] = base->undef; /* R */
+       args.out[1] = base->undef; /* G */
+       args.out[2] = base->undef; /* B */
+       args.out[3] = base->undef; /* A */
 
-               LLVMValueRef sample_index4 =
-                       LLVMBuildMul(gallivm->builder, address[sample_chan], four, "");
+       ac_build_export(&ctx->ac, &args);
+}
 
-               LLVMValueRef shifted_fmask =
-                       LLVMBuildLShr(gallivm->builder, fmask, sample_index4, "");
+/**
+ * Return PS outputs in this order:
+ *
+ * v[0:3] = color0.xyzw
+ * v[4:7] = color1.xyzw
+ * ...
+ * vN+0 = Depth
+ * vN+1 = Stencil
+ * vN+2 = SampleMask
+ * vN+3 = SampleMaskIn (used for OpenGL smoothing)
+ *
+ * The alpha-ref SGPR is returned via its original location.
+ */
+static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi,
+                                     unsigned max_outputs,
+                                     LLVMValueRef *addrs)
+{
+       struct si_shader_context *ctx = si_shader_context_from_abi(abi);
+       struct si_shader *shader = ctx->shader;
+       struct tgsi_shader_info *info = &shader->selector->info;
+       LLVMBuilderRef builder = ctx->gallivm.builder;
+       unsigned i, j, first_vgpr, vgpr;
 
-               LLVMValueRef final_sample =
-                       LLVMBuildAnd(gallivm->builder, shifted_fmask, F, "");
+       LLVMValueRef color[8][4] = {};
+       LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
+       LLVMValueRef ret;
 
-               /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
-                * resource descriptor is 0 (invalid),
-                */
-               LLVMValueRef fmask_desc =
-                       LLVMBuildBitCast(gallivm->builder, fmask_ptr,
-                                        ctx->v8i32, "");
+       if (ctx->postponed_kill)
+               ac_build_kill(&ctx->ac, LLVMBuildLoad(builder, ctx->postponed_kill, ""));
 
-               LLVMValueRef fmask_word1 =
-                       LLVMBuildExtractElement(gallivm->builder, fmask_desc,
-                                               ctx->i32_1, "");
+       /* Read the output values. */
+       for (i = 0; i < info->num_outputs; i++) {
+               unsigned semantic_name = info->output_semantic_name[i];
+               unsigned semantic_index = info->output_semantic_index[i];
 
-               LLVMValueRef word1_is_nonzero =
-                       LLVMBuildICmp(gallivm->builder, LLVMIntNE,
-                                     fmask_word1, ctx->i32_0, "");
-
-               /* Replace the MSAA sample index. */
-               address[sample_chan] =
-                       LLVMBuildSelect(gallivm->builder, word1_is_nonzero,
-                                       final_sample, address[sample_chan], "");
-       }
-
-       if (opcode == TGSI_OPCODE_TXF ||
-           opcode == TGSI_OPCODE_TXF_LZ) {
-               /* add tex offsets */
-               if (inst->Texture.NumOffsets) {
-                       struct lp_build_context *uint_bld = &bld_base->uint_bld;
-                       const struct tgsi_texture_offset *off = inst->TexOffsets;
-
-                       assert(inst->Texture.NumOffsets == 1);
-
-                       switch (target) {
-                       case TGSI_TEXTURE_3D:
-                               address[2] = lp_build_add(uint_bld, address[2],
-                                               ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleZ]);
-                               /* fall through */
-                       case TGSI_TEXTURE_2D:
-                       case TGSI_TEXTURE_SHADOW2D:
-                       case TGSI_TEXTURE_RECT:
-                       case TGSI_TEXTURE_SHADOWRECT:
-                       case TGSI_TEXTURE_2D_ARRAY:
-                       case TGSI_TEXTURE_SHADOW2D_ARRAY:
-                               address[1] =
-                                       lp_build_add(uint_bld, address[1],
-                                               ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleY]);
-                               /* fall through */
-                       case TGSI_TEXTURE_1D:
-                       case TGSI_TEXTURE_SHADOW1D:
-                       case TGSI_TEXTURE_1D_ARRAY:
-                       case TGSI_TEXTURE_SHADOW1D_ARRAY:
-                               address[0] =
-                                       lp_build_add(uint_bld, address[0],
-                                               ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleX]);
-                               break;
-                               /* texture offsets do not apply to other texture targets */
+               switch (semantic_name) {
+               case TGSI_SEMANTIC_COLOR:
+                       assert(semantic_index < 8);
+                       for (j = 0; j < 4; j++) {
+                               LLVMValueRef ptr = addrs[4 * i + j];
+                               LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
+                               color[semantic_index][j] = result;
                        }
+                       break;
+               case TGSI_SEMANTIC_POSITION:
+                       depth = LLVMBuildLoad(builder,
+                                             addrs[4 * i + 2], "");
+                       break;
+               case TGSI_SEMANTIC_STENCIL:
+                       stencil = LLVMBuildLoad(builder,
+                                               addrs[4 * i + 1], "");
+                       break;
+               case TGSI_SEMANTIC_SAMPLEMASK:
+                       samplemask = LLVMBuildLoad(builder,
+                                                  addrs[4 * i + 0], "");
+                       break;
+               default:
+                       fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
+                               semantic_name);
                }
        }
 
-       if (opcode == TGSI_OPCODE_TG4) {
-               unsigned gather_comp = 0;
-
-               /* DMASK was repurposed for GATHER4. 4 components are always
-                * returned and DMASK works like a swizzle - it selects
-                * the component to fetch. The only valid DMASK values are
-                * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
-                * (red,red,red,red) etc.) The ISA document doesn't mention
-                * this.
-                */
-
-               /* Get the component index from src1.x for Gather4. */
-               if (!tgsi_is_shadow_target(target)) {
-                       LLVMValueRef comp_imm;
-                       struct tgsi_src_register src1 = inst->Src[1].Register;
+       /* Fill the return structure. */
+       ret = ctx->return_value;
 
-                       assert(src1.File == TGSI_FILE_IMMEDIATE);
+       /* Set SGPRs. */
+       ret = LLVMBuildInsertValue(builder, ret,
+                                  LLVMBuildBitCast(ctx->ac.builder,
+                                               LLVMGetParam(ctx->main_fn,
+                                                       SI_PARAM_ALPHA_REF),
+                                               ctx->i32, ""),
+                                  SI_SGPR_ALPHA_REF, "");
 
-                       comp_imm = ctx->imms[src1.Index * TGSI_NUM_CHANNELS + src1.SwizzleX];
-                       gather_comp = LLVMConstIntGetZExtValue(comp_imm);
-                       gather_comp = CLAMP(gather_comp, 0, 3);
-               }
+       /* Set VGPRs */
+       first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
+       for (i = 0; i < ARRAY_SIZE(color); i++) {
+               if (!color[i][0])
+                       continue;
 
-               dmask = 1 << gather_comp;
+               for (j = 0; j < 4; j++)
+                       ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
        }
+       if (depth)
+               ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
+       if (stencil)
+               ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
+       if (samplemask)
+               ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
 
-       set_tex_fetch_args(ctx, emit_data, target, res_ptr,
-                          samp_ptr, address, count, dmask);
+       /* Add the input sample mask for smoothing at the end. */
+       if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
+               vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
+       ret = LLVMBuildInsertValue(builder, ret,
+                                  LLVMGetParam(ctx->main_fn,
+                                               SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
+
+       ctx->return_value = ret;
 }
 
-/* Gather4 should follow the same rules as bilinear filtering, but the hardware
- * incorrectly forces nearest filtering if the texture format is integer.
- * The only effect it has on Gather4, which always returns 4 texels for
- * bilinear filtering, is that the final coordinates are off by 0.5 of
- * the texel size.
+/* Prevent optimizations (at least of memory accesses) across the current
+ * point in the program by emitting empty inline assembly that is marked as
+ * having side effects.
  *
- * The workaround is to subtract 0.5 from the unnormalized coordinates,
- * or (0.5 / size) from the normalized coordinates.
+ * Optionally, a value can be passed through the inline assembly to prevent
+ * LLVM from hoisting calls to ReadNone functions.
  */
-static void si_lower_gather4_integer(struct si_shader_context *ctx,
-                                    struct ac_image_args *args,
-                                    unsigned target)
+static void emit_optimization_barrier(struct si_shader_context *ctx,
+                                     LLVMValueRef *pvgpr)
 {
+       static int counter = 0;
+
        LLVMBuilderRef builder = ctx->gallivm.builder;
-       LLVMValueRef coord = args->addr;
-       LLVMValueRef half_texel[2];
-       /* Texture coordinates start after:
-        *   {offset, bias, z-compare, derivatives}
-        * Only the offset and z-compare can occur here.
-        */
-       unsigned coord_vgpr_index = (int)args->offset + (int)args->compare;
-       int c;
+       char code[16];
+
+       snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));
 
-       if (target == TGSI_TEXTURE_RECT ||
-           target == TGSI_TEXTURE_SHADOWRECT) {
-               half_texel[0] = half_texel[1] = LLVMConstReal(ctx->f32, -0.5);
+       if (!pvgpr) {
+               LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
+               LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
+               LLVMBuildCall(builder, inlineasm, NULL, 0, "");
        } else {
-               struct tgsi_full_instruction txq_inst = {};
-               struct lp_build_emit_data txq_emit_data = {};
-
-               /* Query the texture size. */
-               txq_inst.Texture.Texture = target;
-               txq_emit_data.inst = &txq_inst;
-               txq_emit_data.dst_type = ctx->v4i32;
-               set_tex_fetch_args(ctx, &txq_emit_data, target,
-                                  args->resource, NULL, &ctx->i32_0,
-                                  1, 0xf);
-               txq_emit(NULL, &ctx->bld_base, &txq_emit_data);
-
-               /* Compute -0.5 / size. */
-               for (c = 0; c < 2; c++) {
-                       half_texel[c] =
-                               LLVMBuildExtractElement(builder, txq_emit_data.output[0],
-                                                       LLVMConstInt(ctx->i32, c, 0), "");
-                       half_texel[c] = LLVMBuildUIToFP(builder, half_texel[c], ctx->f32, "");
-                       half_texel[c] =
-                               lp_build_emit_llvm_unary(&ctx->bld_base,
-                                                        TGSI_OPCODE_RCP, half_texel[c]);
-                       half_texel[c] = LLVMBuildFMul(builder, half_texel[c],
-                                                     LLVMConstReal(ctx->f32, -0.5), "");
-               }
-       }
+               LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
+               LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
+               LLVMValueRef vgpr = *pvgpr;
+               LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr);
+               unsigned vgpr_size = llvm_get_type_size(vgpr_type);
+               LLVMValueRef vgpr0;
+
+               assert(vgpr_size % 4 == 0);
 
-       for (c = 0; c < 2; c++) {
-               LLVMValueRef tmp;
-               LLVMValueRef index = LLVMConstInt(ctx->i32, coord_vgpr_index + c, 0);
+               vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
+               vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
+               vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
+               vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
+               vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");
 
-               tmp = LLVMBuildExtractElement(builder, coord, index, "");
-               tmp = LLVMBuildBitCast(builder, tmp, ctx->f32, "");
-               tmp = LLVMBuildFAdd(builder, tmp, half_texel[c], "");
-               tmp = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
-               coord = LLVMBuildInsertElement(builder, coord, tmp, index, "");
+               *pvgpr = vgpr;
        }
+}
 
-       args->addr = coord;
+void si_emit_waitcnt(struct si_shader_context *ctx, unsigned simm16)
+{
+       struct gallivm_state *gallivm = &ctx->gallivm;
+       LLVMBuilderRef builder = gallivm->builder;
+       LLVMValueRef args[1] = {
+               LLVMConstInt(ctx->i32, simm16, 0)
+       };
+       lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt",
+                          ctx->voidt, args, 1, 0);
 }
 
-static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
-                               struct lp_build_tgsi_context *bld_base,
-                               struct lp_build_emit_data *emit_data)
+static void membar_emit(
+               const struct lp_build_tgsi_action *action,
+               struct lp_build_tgsi_context *bld_base,
+               struct lp_build_emit_data *emit_data)
 {
        struct si_shader_context *ctx = si_shader_context(bld_base);
-       const struct tgsi_full_instruction *inst = emit_data->inst;
-       struct ac_image_args args;
-       unsigned opcode = inst->Instruction.Opcode;
-       unsigned target = inst->Texture.Texture;
-
-       if (target == TGSI_TEXTURE_BUFFER) {
-               emit_data->output[emit_data->chan] =
-                       ac_build_buffer_load_format(&ctx->ac,
-                                                   emit_data->args[0],
-                                                   emit_data->args[2],
-                                                   emit_data->args[1],
-                                                   true);
-               return;
-       }
-
-       memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
-
-       args.opcode = ac_image_sample;
-       args.compare = tgsi_is_shadow_target(target);
-       args.offset = inst->Texture.NumOffsets > 0;
-
-       switch (opcode) {
-       case TGSI_OPCODE_TXF:
-       case TGSI_OPCODE_TXF_LZ:
-               args.opcode = opcode == TGSI_OPCODE_TXF_LZ ||
-                             target == TGSI_TEXTURE_2D_MSAA ||
-                             target == TGSI_TEXTURE_2D_ARRAY_MSAA ?
-                                     ac_image_load : ac_image_load_mip;
-               args.compare = false;
-               args.offset = false;
-               break;
-       case TGSI_OPCODE_LODQ:
-               args.opcode = ac_image_get_lod;
-               args.compare = false;
-               args.offset = false;
-               break;
-       case TGSI_OPCODE_TEX:
-       case TGSI_OPCODE_TEX2:
-       case TGSI_OPCODE_TXP:
-               if (ctx->type != PIPE_SHADER_FRAGMENT)
-                       args.level_zero = true;
-               break;
-       case TGSI_OPCODE_TEX_LZ:
-               args.level_zero = true;
-               break;
-       case TGSI_OPCODE_TXB:
-       case TGSI_OPCODE_TXB2:
-               assert(ctx->type == PIPE_SHADER_FRAGMENT);
-               args.bias = true;
-               break;
-       case TGSI_OPCODE_TXL:
-       case TGSI_OPCODE_TXL2:
-               args.lod = true;
-               break;
-       case TGSI_OPCODE_TXD:
-               args.deriv = true;
-               break;
-       case TGSI_OPCODE_TG4:
-               args.opcode = ac_image_gather4;
-               args.level_zero = true;
-               break;
-       default:
-               assert(0);
-               return;
-       }
+       LLVMValueRef src0 = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0);
+       unsigned flags = LLVMConstIntGetZExtValue(src0);
+       unsigned waitcnt = NOOP_WAITCNT;
 
-       /* The hardware needs special lowering for Gather4 with integer formats. */
-       if (ctx->screen->b.chip_class <= VI &&
-           opcode == TGSI_OPCODE_TG4) {
-               struct tgsi_shader_info *info = &ctx->shader->selector->info;
-               /* This will also work with non-constant indexing because of how
-                * glsl_to_tgsi works and we intent to preserve that behavior.
-                */
-               const unsigned src_idx = 2;
-               unsigned sampler = inst->Src[src_idx].Register.Index;
+       if (flags & TGSI_MEMBAR_THREAD_GROUP)
+               waitcnt &= VM_CNT & LGKM_CNT;
 
-               assert(inst->Src[src_idx].Register.File == TGSI_FILE_SAMPLER);
+       if (flags & (TGSI_MEMBAR_ATOMIC_BUFFER |
+                    TGSI_MEMBAR_SHADER_BUFFER |
+                    TGSI_MEMBAR_SHADER_IMAGE))
+               waitcnt &= VM_CNT;
 
-               if (info->sampler_type[sampler] == TGSI_RETURN_TYPE_SINT ||
-                   info->sampler_type[sampler] == TGSI_RETURN_TYPE_UINT)
-                       si_lower_gather4_integer(ctx, &args, target);
-       }
+       if (flags & TGSI_MEMBAR_SHARED)
+               waitcnt &= LGKM_CNT;
 
-       emit_data->output[emit_data->chan] =
-               ac_build_image_opcode(&ctx->ac, &args);
+       if (waitcnt != NOOP_WAITCNT)
+               si_emit_waitcnt(ctx, waitcnt);
 }
 
-static void si_llvm_emit_txqs(
-       const struct lp_build_tgsi_action *action,
-       struct lp_build_tgsi_context *bld_base,
-       struct lp_build_emit_data *emit_data)
+static void clock_emit(
+               const struct lp_build_tgsi_action *action,
+               struct lp_build_tgsi_context *bld_base,
+               struct lp_build_emit_data *emit_data)
 {
        struct si_shader_context *ctx = si_shader_context(bld_base);
        struct gallivm_state *gallivm = &ctx->gallivm;
-       LLVMBuilderRef builder = gallivm->builder;
-       LLVMValueRef res, samples;
-       LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
-
-       tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
+       LLVMValueRef tmp;
 
+       tmp = lp_build_intrinsic(gallivm->builder, "llvm.readcyclecounter",
+                                ctx->i64, NULL, 0, 0);
+       tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->v2i32, "");
 
-       /* Read the samples from the descriptor directly. */
-       res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
-       samples = LLVMBuildExtractElement(
-               builder, res,
-               LLVMConstInt(ctx->i32, 3, 0), "");
-       samples = LLVMBuildLShr(builder, samples,
-                               LLVMConstInt(ctx->i32, 16, 0), "");
-       samples = LLVMBuildAnd(builder, samples,
-                              LLVMConstInt(ctx->i32, 0xf, 0), "");
-       samples = LLVMBuildShl(builder, ctx->i32_1,
-                              samples, "");
+       emit_data->output[0] =
+               LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_0, "");
+       emit_data->output[1] =
+               LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_1, "");
+}
 
-       emit_data->output[emit_data->chan] = samples;
+LLVMTypeRef si_const_array(LLVMTypeRef elem_type, int num_elements)
+{
+       return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
+                              CONST_ADDR_SPACE);
 }
 
 static void si_llvm_emit_ddxy(
@@ -5113,18 +3648,41 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
        struct si_shader_context *ctx = si_shader_context(bld_base);
        struct si_shader *shader = ctx->shader;
        struct gallivm_state *gallivm = &ctx->gallivm;
+       const struct tgsi_shader_info *info = &shader->selector->info;
        LLVMValueRef interp_param;
        const struct tgsi_full_instruction *inst = emit_data->inst;
-       int input_index = inst->Src[0].Register.Index;
+       const struct tgsi_full_src_register *input = &inst->Src[0];
+       int input_base, input_array_size;
        int chan;
        int i;
-       LLVMValueRef attr_number;
-       LLVMValueRef params = LLVMGetParam(ctx->main_fn, SI_PARAM_PRIM_MASK);
+       LLVMValueRef prim_mask = LLVMGetParam(ctx->main_fn, SI_PARAM_PRIM_MASK);
+       LLVMValueRef array_idx;
        int interp_param_idx;
-       unsigned interp = shader->selector->info.input_interpolate[input_index];
+       unsigned interp;
        unsigned location;
 
-       assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
+       assert(input->Register.File == TGSI_FILE_INPUT);
+
+       if (input->Register.Indirect) {
+               unsigned array_id = input->Indirect.ArrayID;
+
+               if (array_id) {
+                       input_base = info->input_array_first[array_id];
+                       input_array_size = info->input_array_last[array_id] - input_base + 1;
+               } else {
+                       input_base = inst->Src[0].Register.Index;
+                       input_array_size = info->num_inputs - input_base;
+               }
+
+               array_idx = si_get_indirect_index(ctx, &input->Indirect,
+                                              input->Register.Index - input_base);
+       } else {
+               input_base = inst->Src[0].Register.Index;
+               input_array_size = 1;
+               array_idx = ctx->i32_0;
+       }
+
+       interp = shader->selector->info.input_interpolate[input_base];
 
        if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
            inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
@@ -5140,8 +3698,6 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
        else
                interp_param = NULL;
 
-       attr_number = LLVMConstInt(ctx->i32, input_index, 0);
-
        if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
            inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
                LLVMValueRef ij_out[2];
@@ -5180,28 +3736,35 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
                interp_param = lp_build_gather_values(gallivm, ij_out, 2);
        }
 
+       if (interp_param) {
+               interp_param = LLVMBuildBitCast(gallivm->builder,
+                       interp_param, LLVMVectorType(ctx->f32, 2), "");
+       }
+
        for (chan = 0; chan < 4; chan++) {
-               LLVMValueRef llvm_chan;
-               unsigned schan;
-
-               schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
-               llvm_chan = LLVMConstInt(ctx->i32, schan, 0);
-
-               if (interp_param) {
-                       interp_param = LLVMBuildBitCast(gallivm->builder,
-                               interp_param, LLVMVectorType(ctx->f32, 2), "");
-                       LLVMValueRef i = LLVMBuildExtractElement(
-                               gallivm->builder, interp_param, ctx->i32_0, "");
-                       LLVMValueRef j = LLVMBuildExtractElement(
-                               gallivm->builder, interp_param, ctx->i32_1, "");
-                       emit_data->output[chan] = ac_build_fs_interp(&ctx->ac,
-                               llvm_chan, attr_number, params,
-                               i, j);
-               } else {
-                       emit_data->output[chan] = ac_build_fs_interp_mov(&ctx->ac,
-                               LLVMConstInt(ctx->i32, 2, 0), /* P0 */
-                               llvm_chan, attr_number, params);
+               LLVMValueRef gather = LLVMGetUndef(LLVMVectorType(ctx->f32, input_array_size));
+               unsigned schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
+
+               for (unsigned idx = 0; idx < input_array_size; ++idx) {
+                       LLVMValueRef v, i = NULL, j = NULL;
+
+                       if (interp_param) {
+                               interp_param = LLVMBuildBitCast(gallivm->builder,
+                                       interp_param, LLVMVectorType(ctx->f32, 2), "");
+                               i = LLVMBuildExtractElement(
+                                       gallivm->builder, interp_param, ctx->i32_0, "");
+                               j = LLVMBuildExtractElement(
+                                       gallivm->builder, interp_param, ctx->i32_1, "");
+                       }
+                       v = si_build_fs_interp(ctx, input_base + idx, schan,
+                                              prim_mask, i, j);
+
+                       gather = LLVMBuildInsertElement(gallivm->builder,
+                               gather, v, LLVMConstInt(ctx->i32, idx, false), "");
                }
+
+               emit_data->output[chan] = LLVMBuildExtractElement(
+                       gallivm->builder, gather, array_idx, "");
        }
 }
 
@@ -5443,7 +4006,7 @@ static void si_llvm_emit_vertex(
 
        /* Signal vertex emission */
        ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
-                        LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id));
+                        si_get_gs_wave_id(ctx));
        if (!use_kill)
                lp_build_endif(&if_state);
 }
@@ -5460,7 +4023,7 @@ static void si_llvm_emit_primitive(
        /* Signal primitive cut */
        stream = si_llvm_get_stream(bld_base, emit_data);
        ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
-                        LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id));
+                        si_get_gs_wave_id(ctx));
 }
 
 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
@@ -5474,24 +4037,17 @@ static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
         * The real barrier instruction isn’t needed, because an entire patch
         * always fits into a single wave.
         */
-       if (HAVE_LLVM >= 0x0309 &&
-           ctx->screen->b.chip_class == SI &&
+       if (ctx->screen->b.chip_class == SI &&
            ctx->type == PIPE_SHADER_TESS_CTRL) {
-               emit_waitcnt(ctx, LGKM_CNT & VM_CNT);
+               si_emit_waitcnt(ctx, LGKM_CNT & VM_CNT);
                return;
        }
 
        lp_build_intrinsic(gallivm->builder,
-                          HAVE_LLVM >= 0x0309 ? "llvm.amdgcn.s.barrier"
-                                              : "llvm.AMDGPU.barrier.local",
+                          "llvm.amdgcn.s.barrier",
                           ctx->voidt, NULL, 0, LP_FUNC_ATTR_CONVERGENT);
 }
 
-static const struct lp_build_tgsi_action tex_action = {
-       .fetch_args = tex_fetch_args,
-       .emit = build_tex_intrinsic,
-};
-
 static const struct lp_build_tgsi_action interp_action = {
        .fetch_args = interp_fetch_args,
        .emit = build_interp_intrinsic,
@@ -5500,17 +4056,16 @@ static const struct lp_build_tgsi_action interp_action = {
 static void si_create_function(struct si_shader_context *ctx,
                               const char *name,
                               LLVMTypeRef *returns, unsigned num_returns,
-                              LLVMTypeRef *params, unsigned num_params,
-                              int last_sgpr)
+                              struct si_function_info *fninfo,
+                              unsigned max_workgroup_size)
 {
        int i;
 
        si_llvm_create_func(ctx, name, returns, num_returns,
-                           params, num_params);
-       si_llvm_shader_type(ctx->main_fn, ctx->type);
+                           fninfo->types, fninfo->num_params);
        ctx->return_value = LLVMGetUndef(ctx->return_type);
 
-       for (i = 0; i <= last_sgpr; ++i) {
+       for (i = 0; i < fninfo->num_sgpr_params; ++i) {
                LLVMValueRef P = LLVMGetParam(ctx->main_fn, i);
 
                /* The combination of:
@@ -5528,6 +4083,15 @@ static void si_create_function(struct si_shader_context *ctx,
                        lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG);
        }
 
+       for (i = 0; i < fninfo->num_params; ++i) {
+               if (fninfo->assign[i])
+                       *fninfo->assign[i] = LLVMGetParam(ctx->main_fn, i);
+       }
+
+       if (max_workgroup_size) {
+               si_llvm_add_attribute(ctx->main_fn, "amdgpu-max-work-group-size",
+                                     max_workgroup_size);
+       }
        LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
                                           "no-signed-zeros-fp-math",
                                           "true");
@@ -5551,26 +4115,25 @@ static void si_create_function(struct si_shader_context *ctx,
 
 static void declare_streamout_params(struct si_shader_context *ctx,
                                     struct pipe_stream_output_info *so,
-                                    LLVMTypeRef *params, LLVMTypeRef i32,
-                                    unsigned *num_params)
+                                    struct si_function_info *fninfo)
 {
        int i;
 
        /* Streamout SGPRs. */
        if (so->num_outputs) {
                if (ctx->type != PIPE_SHADER_TESS_EVAL)
-                       params[ctx->param_streamout_config = (*num_params)++] = i32;
+                       ctx->param_streamout_config = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
                else
-                       ctx->param_streamout_config = *num_params - 1;
+                       ctx->param_streamout_config = fninfo->num_params - 1;
 
-               params[ctx->param_streamout_write_index = (*num_params)++] = i32;
+               ctx->param_streamout_write_index = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
        }
        /* A streamout buffer offset is loaded if the stride is non-zero. */
        for (i = 0; i < 4; i++) {
                if (!so->stride[i])
                        continue;
 
-               params[ctx->param_streamout_offset[i] = (*num_params)++] = i32;
+               ctx->param_streamout_offset[i] = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
        }
 }
 
@@ -5597,18 +4160,34 @@ static unsigned llvm_get_type_size(LLVMTypeRef type)
        }
 }
 
-static void declare_tess_lds(struct si_shader_context *ctx)
+static void declare_lds_as_pointer(struct si_shader_context *ctx)
 {
        struct gallivm_state *gallivm = &ctx->gallivm;
 
-       unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
-       ctx->lds = LLVMBuildIntToPtr(gallivm->builder, ctx->i32_0,
-               LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), LOCAL_ADDR_SPACE),
-               "tess_lds");
-}
+       unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
+       ctx->lds = LLVMBuildIntToPtr(gallivm->builder, ctx->i32_0,
+               LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), LOCAL_ADDR_SPACE),
+               "lds");
+}
+
+static unsigned si_get_max_workgroup_size(const struct si_shader *shader)
+{
+       switch (shader->selector->type) {
+       case PIPE_SHADER_TESS_CTRL:
+               /* Return this so that LLVM doesn't remove s_barrier
+                * instructions on chips where we use s_barrier. */
+               return shader->selector->screen->b.chip_class >= CIK ? 128 : 64;
+
+       case PIPE_SHADER_GEOMETRY:
+               return shader->selector->screen->b.chip_class >= GFX9 ? 128 : 64;
+
+       case PIPE_SHADER_COMPUTE:
+               break; /* see below */
+
+       default:
+               return 0;
+       }
 
-static unsigned si_get_max_workgroup_size(struct si_shader *shader)
-{
        const unsigned *properties = shader->selector->info.properties;
        unsigned max_work_group_size =
                       properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
@@ -5625,64 +4204,77 @@ static unsigned si_get_max_workgroup_size(struct si_shader *shader)
 }
 
 static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
-                                           LLVMTypeRef *params,
-                                           unsigned *num_params,
+                                           struct si_function_info *fninfo, /* replaces the params/num_params pair of the removed signature */
                                            bool assign_params)
 {
-       params[(*num_params)++] = const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS);
-       params[(*num_params)++] = const_array(ctx->v8i32, SI_NUM_SAMPLERS);
-       params[(*num_params)++] = const_array(ctx->v8i32, SI_NUM_IMAGES);
-       params[(*num_params)++] = const_array(ctx->v4i32, SI_NUM_SHADER_BUFFERS);
+       unsigned const_and_shader_buffers = /* value returned by add_arg; NOTE(review): presumably the argument index - confirm */
+               add_arg(fninfo, ARG_SGPR,
+                       si_const_array(ctx->v4i32,
+                                      SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS)); /* one array sized for shader buffers plus constant buffers */
+       unsigned samplers_and_images =
+               add_arg(fninfo, ARG_SGPR,
+                       si_const_array(ctx->v8i32,
+                                      SI_NUM_IMAGES + SI_NUM_SAMPLERS * 2)); /* one array for images and samplers; NOTE(review): reason for the * 2 is not visible here */
 
        if (assign_params) {
-               ctx->param_const_buffers  = *num_params - 4;
-               ctx->param_samplers       = *num_params - 3;
-               ctx->param_images         = *num_params - 2;
-               ctx->param_shader_buffers = *num_params - 1;
+               ctx->param_const_and_shader_buffers = const_and_shader_buffers; /* both arguments are always declared; ctx is only updated when assign_params is true */
+               ctx->param_samplers_and_images = samplers_and_images;
        }
 }
 
 static void declare_default_desc_pointers(struct si_shader_context *ctx,
-                                         LLVMTypeRef *params,
-                                         unsigned *num_params)
+                                         struct si_function_info *fninfo)
 {
-       params[ctx->param_rw_buffers = (*num_params)++] =
-               const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
-       declare_per_stage_desc_pointers(ctx, params, num_params, true);
+       ctx->param_rw_buffers = add_arg(fninfo, ARG_SGPR, /* declared before the per-stage descriptor pointers */
+               si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS)); /* element type is ctx->v4i32; the removed code used ctx->v16i8 */
+       declare_per_stage_desc_pointers(ctx, fninfo, true); /* true: also store the per-stage results in ctx */
 }
 
 static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx,
-                                           LLVMTypeRef *params,
-                                           unsigned *num_params)
+                                           struct si_function_info *fninfo)
 {
-       params[ctx->param_vertex_buffers = (*num_params)++] =
-               const_array(ctx->v16i8, SI_NUM_VERTEX_BUFFERS);
-       params[ctx->param_base_vertex = (*num_params)++] = ctx->i32;
-       params[ctx->param_start_instance = (*num_params)++] = ctx->i32;
-       params[ctx->param_draw_id = (*num_params)++] = ctx->i32;
-       params[ctx->param_vs_state_bits = (*num_params)++] = ctx->i32;
+       ctx->param_vertex_buffers = add_arg(fninfo, ARG_SGPR, /* every argument declared in this function is ARG_SGPR */
+               si_const_array(ctx->v4i32, SI_NUM_VERTEX_BUFFERS));
+       add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.base_vertex); /* passes the address of a ctx->abi field instead of storing an index in ctx; NOTE(review): presumably filled in once the LLVM function is created - confirm */
+       add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.start_instance);
+       add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.draw_id);
+       ctx->param_vs_state_bits = add_arg(fninfo, ARG_SGPR, ctx->i32); /* still recorded in ctx via add_arg's return value */
 }
 
 static void declare_vs_input_vgprs(struct si_shader_context *ctx,
-                                  LLVMTypeRef *params, unsigned *num_params,
+                                  struct si_function_info *fninfo,
                                   unsigned *num_prolog_vgprs)
 {
        struct si_shader *shader = ctx->shader;
 
-       params[ctx->param_vertex_id = (*num_params)++] = ctx->i32;
-       params[ctx->param_rel_auto_id = (*num_params)++] = ctx->i32;
-       params[ctx->param_vs_prim_id = (*num_params)++] = ctx->i32;
-       params[ctx->param_instance_id = (*num_params)++] = ctx->i32;
+       add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.vertex_id); /* first VGPR argument declared here */
+       if (shader->key.as_ls) { /* as_ls: rel_auto_id is declared before instance_id */
+               ctx->param_rel_auto_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
+               add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id);
+       } else { /* otherwise: instance_id is declared before vs_prim_id */
+               add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id);
+               ctx->param_vs_prim_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
+       }
+       add_arg(fninfo, ARG_VGPR, ctx->i32); /* unused */
 
        if (!shader->is_gs_copy_shader) {
                /* Vertex load indices. */
-               ctx->param_vertex_index0 = (*num_params);
+               ctx->param_vertex_index0 = fninfo->num_params; /* argument count before the loop below adds the vertex load indices */
                for (unsigned i = 0; i < shader->selector->info.num_inputs; i++)
-                       params[(*num_params)++] = ctx->i32;
+                       add_arg(fninfo, ARG_VGPR, ctx->i32); /* one i32 VGPR argument per shader input */
                *num_prolog_vgprs += shader->selector->info.num_inputs;
        }
 }
 
+static void declare_tes_input_vgprs(struct si_shader_context *ctx,
+                                   struct si_function_info *fninfo)
+{ /* declares four ARG_VGPR arguments in fixed order and stores each add_arg result in ctx */
+       ctx->param_tes_u = add_arg(fninfo, ARG_VGPR, ctx->f32); /* u and v use ctx->f32; the two IDs below use ctx->i32 */
+       ctx->param_tes_v = add_arg(fninfo, ARG_VGPR, ctx->f32);
+       ctx->param_tes_rel_patch_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
+       ctx->param_tes_patch_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
+}
+
 enum {
        /* Convenient merged shader definitions. */
        SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES,
@@ -5694,13 +4286,15 @@ static void create_function(struct si_shader_context *ctx)
        struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
        struct gallivm_state *gallivm = &ctx->gallivm;
        struct si_shader *shader = ctx->shader;
-       LLVMTypeRef params[100]; /* just make it large enough */
+       struct si_function_info fninfo;
        LLVMTypeRef returns[16+32*4];
-       unsigned i, last_sgpr, num_params = 0, num_return_sgprs;
+       unsigned i, num_return_sgprs;
        unsigned num_returns = 0;
        unsigned num_prolog_vgprs = 0;
        unsigned type = ctx->type;
 
+       si_init_function_info(&fninfo);
+
        /* Set MERGED shaders. */
        if (ctx->screen->b.chip_class >= GFX9) {
                if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
@@ -5713,91 +4307,85 @@ static void create_function(struct si_shader_context *ctx)
 
        switch (type) {
        case PIPE_SHADER_VERTEX:
-               declare_default_desc_pointers(ctx, params, &num_params);
-               declare_vs_specific_input_sgprs(ctx, params, &num_params);
+               declare_default_desc_pointers(ctx, &fninfo);
+               declare_vs_specific_input_sgprs(ctx, &fninfo);
 
                if (shader->key.as_es) {
-                       params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
+                       ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
                } else if (shader->key.as_ls) {
                        /* no extra parameters */
                } else {
-                       if (shader->is_gs_copy_shader)
-                               num_params = ctx->param_rw_buffers + 1;
+                       if (shader->is_gs_copy_shader) {
+                               fninfo.num_params = ctx->param_rw_buffers + 1;
+                               fninfo.num_sgpr_params = fninfo.num_params;
+                       }
 
                        /* The locations of the other parameters are assigned dynamically. */
                        declare_streamout_params(ctx, &shader->selector->so,
-                                                params, ctx->i32, &num_params);
+                                                &fninfo);
                }
 
-               last_sgpr = num_params-1;
-
                /* VGPRs */
-               declare_vs_input_vgprs(ctx, params, &num_params,
-                                      &num_prolog_vgprs);
-
-               /* PrimitiveID output. */
-               if (!shader->is_gs_copy_shader &&
-                   !shader->key.as_es && !shader->key.as_ls) {
-                       for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
-                               returns[num_returns++] = ctx->f32;
-               }
+               declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs);
                break;
 
        case PIPE_SHADER_TESS_CTRL: /* SI-CI-VI */
-               declare_default_desc_pointers(ctx, params, &num_params);
-               params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
-               params[ctx->param_tcs_out_lds_offsets = num_params++] = ctx->i32;
-               params[ctx->param_tcs_out_lds_layout = num_params++] = ctx->i32;
-               params[ctx->param_vs_state_bits = num_params++] = ctx->i32;
-               params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
-               params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
-               last_sgpr = num_params - 1;
+               declare_default_desc_pointers(ctx, &fninfo);
+               ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
 
                /* VGPRs */
-               params[ctx->param_tcs_patch_id = num_params++] = ctx->i32;
-               params[ctx->param_tcs_rel_ids = num_params++] = ctx->i32;
+               ctx->param_tcs_patch_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
+               ctx->param_tcs_rel_ids = add_arg(&fninfo, ARG_VGPR, ctx->i32);
 
                /* param_tcs_offchip_offset and param_tcs_factor_offset are
                 * placed after the user SGPRs.
                 */
                for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
                        returns[num_returns++] = ctx->i32; /* SGPRs */
-               for (i = 0; i < 3; i++)
+               for (i = 0; i < 5; i++)
                        returns[num_returns++] = ctx->f32; /* VGPRs */
                break;
 
        case SI_SHADER_MERGED_VERTEX_TESSCTRL:
                /* Merged stages have 8 system SGPRs at the beginning. */
-               params[num_params++] = ctx->i32; /* unused */
-               params[num_params++] = ctx->i32; /* unused */
-               params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
-               params[num_params++] = ctx->i32; /* wave thread counts for LS and HS */
-               params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
-               params[num_params++] = ctx->i32; /* scratch wave offset */
-               params[num_params++] = ctx->i32; /* unused */
-               params[num_params++] = ctx->i32; /* unused */
-
-               params[ctx->param_rw_buffers = num_params++] =
-                       const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
-               declare_per_stage_desc_pointers(ctx, params, &num_params,
+               ctx->param_rw_buffers = /* SPI_SHADER_USER_DATA_ADDR_LO_HS */
+                       add_arg(&fninfo, ARG_SGPR, si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS));
+               ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
+               add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
+
+               add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
+               add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
+               declare_per_stage_desc_pointers(ctx, &fninfo,
                                                ctx->type == PIPE_SHADER_VERTEX);
-               declare_vs_specific_input_sgprs(ctx, params, &num_params);
+               declare_vs_specific_input_sgprs(ctx, &fninfo);
 
-               params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
-               params[ctx->param_tcs_out_lds_offsets = num_params++] = ctx->i32;
-               params[ctx->param_tcs_out_lds_layout = num_params++] = ctx->i32;
-               params[num_params++] = ctx->i32; /* unused */
+               ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
 
-               declare_per_stage_desc_pointers(ctx, params, &num_params,
+               declare_per_stage_desc_pointers(ctx, &fninfo,
                                                ctx->type == PIPE_SHADER_TESS_CTRL);
-               last_sgpr = num_params - 1;
 
                /* VGPRs (first TCS, then VS) */
-               params[ctx->param_tcs_patch_id = num_params++] = ctx->i32;
-               params[ctx->param_tcs_rel_ids = num_params++] = ctx->i32;
+               ctx->param_tcs_patch_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
+               ctx->param_tcs_rel_ids = add_arg(&fninfo, ARG_VGPR, ctx->i32);
 
                if (ctx->type == PIPE_SHADER_VERTEX) {
-                       declare_vs_input_vgprs(ctx, params, &num_params,
+                       declare_vs_input_vgprs(ctx, &fninfo,
                                               &num_prolog_vgprs);
 
                        /* LS return values are inputs to the TCS main shader part. */
@@ -5808,97 +4396,140 @@ static void create_function(struct si_shader_context *ctx)
                } else {
                        /* TCS return values are inputs to the TCS epilog.
                         *
-                        * param_tcs_offchip_offset and param_tcs_factor_offset
+                        * param_tcs_offchip_offset, param_tcs_factor_offset,
+                        * param_tcs_offchip_layout, and param_rw_buffers
                         * should be passed to the epilog.
                         */
-                       for (i = 0; i <= ctx->param_tcs_factor_offset; i++)
+                       for (i = 0; i <= 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K; i++)
                                returns[num_returns++] = ctx->i32; /* SGPRs */
-                       for (i = 0; i < 3; i++)
+                       for (i = 0; i < 5; i++)
                                returns[num_returns++] = ctx->f32; /* VGPRs */
                }
                break;
 
        case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
-               assert(!"unimplemented merged ES-GS shader");
+               /* Merged stages have 8 system SGPRs at the beginning. */
+               ctx->param_rw_buffers = /* SPI_SHADER_USER_DATA_ADDR_LO_GS */
+                       add_arg(&fninfo, ARG_SGPR, si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS));
+               ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
+               add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
+
+               add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
+               add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
+               declare_per_stage_desc_pointers(ctx, &fninfo,
+                                               (ctx->type == PIPE_SHADER_VERTEX ||
+                                                ctx->type == PIPE_SHADER_TESS_EVAL));
+               if (ctx->type == PIPE_SHADER_VERTEX) {
+                       declare_vs_specific_input_sgprs(ctx, &fninfo);
+               } else {
+                       /* TESS_EVAL (and also GEOMETRY):
+                        * Declare as many input SGPRs as the VS has. */
+                       ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+                       ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+                       add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
+                       add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
+                       add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
+                       ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
+               }
+
+               declare_per_stage_desc_pointers(ctx, &fninfo,
+                                               ctx->type == PIPE_SHADER_GEOMETRY);
+
+               /* VGPRs (first GS, then VS/TES) */
+               ctx->param_gs_vtx01_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
+               ctx->param_gs_vtx23_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
+               ctx->param_gs_prim_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
+               ctx->param_gs_instance_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
+               ctx->param_gs_vtx45_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
+
+               if (ctx->type == PIPE_SHADER_VERTEX) {
+                       declare_vs_input_vgprs(ctx, &fninfo,
+                                              &num_prolog_vgprs);
+               } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
+                       declare_tes_input_vgprs(ctx, &fninfo);
+               }
+
+               if (ctx->type == PIPE_SHADER_VERTEX ||
+                   ctx->type == PIPE_SHADER_TESS_EVAL) {
+                       /* ES return values are inputs to GS. */
+                       for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++)
+                               returns[num_returns++] = ctx->i32; /* SGPRs */
+                       for (i = 0; i < 5; i++)
+                               returns[num_returns++] = ctx->f32; /* VGPRs */
+               }
                break;
 
        case PIPE_SHADER_TESS_EVAL:
-               declare_default_desc_pointers(ctx, params, &num_params);
-               params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
+               declare_default_desc_pointers(ctx, &fninfo);
+               ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
 
                if (shader->key.as_es) {
-                       params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
-                       params[num_params++] = ctx->i32;
-                       params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
+                       ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+                       add_arg(&fninfo, ARG_SGPR, ctx->i32);
+                       ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
                } else {
-                       params[num_params++] = ctx->i32;
+                       add_arg(&fninfo, ARG_SGPR, ctx->i32);
                        declare_streamout_params(ctx, &shader->selector->so,
-                                                params, ctx->i32, &num_params);
-                       params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
+                                                &fninfo);
+                       ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
                }
-               last_sgpr = num_params - 1;
 
                /* VGPRs */
-               params[ctx->param_tes_u = num_params++] = ctx->f32;
-               params[ctx->param_tes_v = num_params++] = ctx->f32;
-               params[ctx->param_tes_rel_patch_id = num_params++] = ctx->i32;
-               params[ctx->param_tes_patch_id = num_params++] = ctx->i32;
-
-               /* PrimitiveID output. */
-               if (!shader->key.as_es)
-                       for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
-                               returns[num_returns++] = ctx->f32;
+               declare_tes_input_vgprs(ctx, &fninfo);
                break;
 
        case PIPE_SHADER_GEOMETRY:
-               declare_default_desc_pointers(ctx, params, &num_params);
-               params[ctx->param_gs2vs_offset = num_params++] = ctx->i32;
-               params[ctx->param_gs_wave_id = num_params++] = ctx->i32;
-               last_sgpr = num_params - 1;
+               declare_default_desc_pointers(ctx, &fninfo);
+               ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               ctx->param_gs_wave_id = add_arg(&fninfo, ARG_SGPR, ctx->i32);
 
                /* VGPRs */
-               params[ctx->param_gs_vtx0_offset = num_params++] = ctx->i32;
-               params[ctx->param_gs_vtx1_offset = num_params++] = ctx->i32;
-               params[ctx->param_gs_prim_id = num_params++] = ctx->i32;
-               params[ctx->param_gs_vtx2_offset = num_params++] = ctx->i32;
-               params[ctx->param_gs_vtx3_offset = num_params++] = ctx->i32;
-               params[ctx->param_gs_vtx4_offset = num_params++] = ctx->i32;
-               params[ctx->param_gs_vtx5_offset = num_params++] = ctx->i32;
-               params[ctx->param_gs_instance_id = num_params++] = ctx->i32;
+               ctx->param_gs_vtx0_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
+               ctx->param_gs_vtx1_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
+               ctx->param_gs_prim_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
+               ctx->param_gs_vtx2_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
+               ctx->param_gs_vtx3_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
+               ctx->param_gs_vtx4_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
+               ctx->param_gs_vtx5_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
+               ctx->param_gs_instance_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
                break;
 
        case PIPE_SHADER_FRAGMENT:
-               declare_default_desc_pointers(ctx, params, &num_params);
-               params[SI_PARAM_ALPHA_REF] = ctx->f32;
-               params[SI_PARAM_PRIM_MASK] = ctx->i32;
-               last_sgpr = SI_PARAM_PRIM_MASK;
-               params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32;
-               params[SI_PARAM_PERSP_CENTER] = ctx->v2i32;
-               params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32;
-               params[SI_PARAM_PERSP_PULL_MODEL] = v3i32;
-               params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32;
-               params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32;
-               params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32;
-               params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32;
-               params[SI_PARAM_POS_X_FLOAT] = ctx->f32;
-               params[SI_PARAM_POS_Y_FLOAT] = ctx->f32;
-               params[SI_PARAM_POS_Z_FLOAT] = ctx->f32;
-               params[SI_PARAM_POS_W_FLOAT] = ctx->f32;
-               params[SI_PARAM_FRONT_FACE] = ctx->i32;
+               declare_default_desc_pointers(ctx, &fninfo);
+               add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF);
+               add_arg_checked(&fninfo, ARG_SGPR, ctx->i32, SI_PARAM_PRIM_MASK);
+
+               add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_SAMPLE);
+               add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTER);
+               add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTROID);
+               add_arg_checked(&fninfo, ARG_VGPR, v3i32, SI_PARAM_PERSP_PULL_MODEL);
+               add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_SAMPLE);
+               add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTER);
+               add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTROID);
+               add_arg_checked(&fninfo, ARG_VGPR, ctx->f32, SI_PARAM_LINE_STIPPLE_TEX);
+               add_arg_checked(&fninfo, ARG_VGPR, ctx->f32, SI_PARAM_POS_X_FLOAT);
+               add_arg_checked(&fninfo, ARG_VGPR, ctx->f32, SI_PARAM_POS_Y_FLOAT);
+               add_arg_checked(&fninfo, ARG_VGPR, ctx->f32, SI_PARAM_POS_Z_FLOAT);
+               add_arg_checked(&fninfo, ARG_VGPR, ctx->f32, SI_PARAM_POS_W_FLOAT);
+               add_arg_checked(&fninfo, ARG_VGPR, ctx->i32, SI_PARAM_FRONT_FACE);
                shader->info.face_vgpr_index = 20;
-               params[SI_PARAM_ANCILLARY] = ctx->i32;
-               params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
-               params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
-               num_params = SI_PARAM_POS_FIXED_PT+1;
+               add_arg_checked(&fninfo, ARG_VGPR, ctx->i32, SI_PARAM_ANCILLARY);
+               add_arg_checked(&fninfo, ARG_VGPR, ctx->f32, SI_PARAM_SAMPLE_COVERAGE);
+               add_arg_checked(&fninfo, ARG_VGPR, ctx->i32, SI_PARAM_POS_FIXED_PT);
 
                /* Color inputs from the prolog. */
                if (shader->selector->info.colors_read) {
                        unsigned num_color_elements =
                                util_bitcount(shader->selector->info.colors_read);
 
-                       assert(num_params + num_color_elements <= ARRAY_SIZE(params));
+                       assert(fninfo.num_params + num_color_elements <= ARRAY_SIZE(fninfo.types));
                        for (i = 0; i < num_color_elements; i++)
-                               params[num_params++] = ctx->f32;
+                               add_arg(&fninfo, ARG_VGPR, ctx->f32);
 
                        num_prolog_vgprs += num_color_elements;
                }
@@ -5924,24 +4555,27 @@ static void create_function(struct si_shader_context *ctx)
                break;
 
        case PIPE_SHADER_COMPUTE:
-               declare_default_desc_pointers(ctx, params, &num_params);
-               params[SI_PARAM_GRID_SIZE] = v3i32;
-               params[SI_PARAM_BLOCK_SIZE] = v3i32;
-               params[SI_PARAM_BLOCK_ID] = v3i32;
-               last_sgpr = SI_PARAM_BLOCK_ID;
-
-               params[SI_PARAM_THREAD_ID] = v3i32;
-               num_params = SI_PARAM_THREAD_ID + 1;
+               declare_default_desc_pointers(ctx, &fninfo);
+               if (shader->selector->info.uses_grid_size)
+                       ctx->param_grid_size = add_arg(&fninfo, ARG_SGPR, v3i32);
+               if (shader->selector->info.uses_block_size)
+                       ctx->param_block_size = add_arg(&fninfo, ARG_SGPR, v3i32);
+
+               for (i = 0; i < 3; i++) {
+                       ctx->param_block_id[i] = -1;
+                       if (shader->selector->info.uses_block_id[i])
+                               ctx->param_block_id[i] = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               }
+
+               ctx->param_thread_id = add_arg(&fninfo, ARG_VGPR, v3i32);
                break;
        default:
                assert(0 && "unimplemented shader");
                return;
        }
 
-       assert(num_params <= ARRAY_SIZE(params));
-
-       si_create_function(ctx, "main", returns, num_returns, params,
-                          num_params, last_sgpr);
+       si_create_function(ctx, "main", returns, num_returns, &fninfo,
+                          si_get_max_workgroup_size(shader));
 
        /* Reserve register locations for VGPR inputs the PS prolog may need. */
        if (ctx->type == PIPE_SHADER_FRAGMENT &&
@@ -5956,20 +4590,16 @@ static void create_function(struct si_shader_context *ctx)
                                      S_0286D0_LINEAR_CENTROID_ENA(1) |
                                      S_0286D0_FRONT_FACE_ENA(1) |
                                      S_0286D0_POS_FIXED_PT_ENA(1));
-       } else if (ctx->type == PIPE_SHADER_COMPUTE) {
-               si_llvm_add_attribute(ctx->main_fn,
-                                     "amdgpu-max-work-group-size",
-                                     si_get_max_workgroup_size(shader));
        }
 
        shader->info.num_input_sgprs = 0;
        shader->info.num_input_vgprs = 0;
 
-       for (i = 0; i <= last_sgpr; ++i)
-               shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;
+       for (i = 0; i < fninfo.num_sgpr_params; ++i)
+               shader->info.num_input_sgprs += llvm_get_type_size(fninfo.types[i]) / 4;
 
-       for (; i < num_params; ++i)
-               shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;
+       for (; i < fninfo.num_params; ++i)
+               shader->info.num_input_vgprs += llvm_get_type_size(fninfo.types[i]) / 4;
 
        assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
        shader->info.num_input_vgprs -= num_prolog_vgprs;
@@ -5988,9 +4618,13 @@ static void create_function(struct si_shader_context *ctx)
                                                    "ddxy_lds",
                                                    LOCAL_ADDR_SPACE);
 
-       if ((ctx->type == PIPE_SHADER_VERTEX && shader->key.as_ls) ||
-           ctx->type == PIPE_SHADER_TESS_CTRL)
-               declare_tess_lds(ctx);
+       if (shader->key.as_ls ||
+           ctx->type == PIPE_SHADER_TESS_CTRL ||
+           /* GFX9 has the ESGS ring buffer in LDS. */
+           (ctx->screen->b.chip_class >= GFX9 &&
+            (shader->key.as_es ||
+             ctx->type == PIPE_SHADER_GEOMETRY)))
+               declare_lds_as_pointer(ctx);
 }
 
 /**
@@ -6005,11 +4639,8 @@ static void preload_ring_buffers(struct si_shader_context *ctx)
        LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
                                            ctx->param_rw_buffers);
 
-       if ((ctx->type == PIPE_SHADER_VERTEX &&
-            ctx->shader->key.as_es) ||
-           (ctx->type == PIPE_SHADER_TESS_EVAL &&
-            ctx->shader->key.as_es) ||
-           ctx->type == PIPE_SHADER_GEOMETRY) {
+       if (ctx->screen->b.chip_class <= VI &&
+           (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)) {
                unsigned ring =
                        ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
                                                             : SI_ES_RING_ESGS;
@@ -6090,7 +4721,6 @@ static void preload_ring_buffers(struct si_shader_context *ctx)
                                             S_008F0C_ADD_TID_ENABLE(1),
                                             0),
                                LLVMConstInt(ctx->i32, 3, 0), "");
-                       ring = LLVMBuildBitCast(builder, ring, ctx->v16i8, "");
 
                        ctx->gsvs_ring[stream] = ring;
                }
@@ -6163,6 +4793,7 @@ void si_shader_binary_read_config(struct ac_shader_binary *binary,
                case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
                case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
                case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
+               case R_00B428_SPI_SHADER_PGM_RSRC1_HS:
                case R_00B848_COMPUTE_PGM_RSRC1:
                        conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
                        conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
@@ -6213,24 +4844,16 @@ void si_shader_binary_read_config(struct ac_shader_binary *binary,
                conf->spi_ps_input_addr = conf->spi_ps_input_ena;
 }
 
-void si_shader_apply_scratch_relocs(struct si_context *sctx,
-                       struct si_shader *shader,
-                       struct si_shader_config *config,
-                       uint64_t scratch_va)
+void si_shader_apply_scratch_relocs(struct si_shader *shader,
+                                   uint64_t scratch_va)
 {
        unsigned i;
        uint32_t scratch_rsrc_dword0 = scratch_va;
        uint32_t scratch_rsrc_dword1 =
                S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
 
-       /* Enable scratch coalescing if LLVM sets ELEMENT_SIZE & INDEX_STRIDE
-        * correctly.
-        */
-       if (HAVE_LLVM >= 0x0309)
-               scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
-       else
-               scratch_rsrc_dword1 |=
-                       S_008F04_STRIDE(config->scratch_bytes_per_wave / 64);
+       /* Enable scratch coalescing. */
+       scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
 
        for (i = 0 ; i < shader->binary.reloc_count; i++) {
                const struct ac_shader_reloc *reloc =
@@ -6245,7 +4868,7 @@ void si_shader_apply_scratch_relocs(struct si_context *sctx,
        }
 }
 
-static unsigned si_get_shader_binary_size(struct si_shader *shader)
+static unsigned si_get_shader_binary_size(const struct si_shader *shader)
 {
        unsigned size = shader->binary.code_size;
 
@@ -6253,6 +4876,8 @@ static unsigned si_get_shader_binary_size(struct si_shader *shader)
                size += shader->prolog->binary.code_size;
        if (shader->previous_stage)
                size += shader->previous_stage->binary.code_size;
+       if (shader->prolog2)
+               size += shader->prolog2->binary.code_size;
        if (shader->epilog)
                size += shader->epilog->binary.code_size;
        return size;
@@ -6264,6 +4889,8 @@ int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
                shader->prolog ? &shader->prolog->binary : NULL;
        const struct ac_shader_binary *previous_stage =
                shader->previous_stage ? &shader->previous_stage->binary : NULL;
+       const struct ac_shader_binary *prolog2 =
+               shader->prolog2 ? &shader->prolog2->binary : NULL;
        const struct ac_shader_binary *epilog =
                shader->epilog ? &shader->epilog->binary : NULL;
        const struct ac_shader_binary *mainb = &shader->binary;
@@ -6273,15 +4900,11 @@ int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
 
        assert(!prolog || !prolog->rodata_size);
        assert(!previous_stage || !previous_stage->rodata_size);
-       assert((!prolog && !previous_stage && !epilog) || !mainb->rodata_size);
+       assert(!prolog2 || !prolog2->rodata_size);
+       assert((!prolog && !previous_stage && !prolog2 && !epilog) ||
+              !mainb->rodata_size);
        assert(!epilog || !epilog->rodata_size);
 
-       /* GFX9 can fetch at most 128 bytes past the end of the shader.
-        * Prevent VM faults.
-        */
-       if (sscreen->b.chip_class >= GFX9)
-               bo_size += 128;
-
        r600_resource_reference(&shader->bo, NULL);
        shader->bo = (struct r600_resource*)
                     pipe_buffer_create(&sscreen->b.b, 0,
@@ -6295,23 +4918,28 @@ int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
                                        PIPE_TRANSFER_READ_WRITE |
                                        PIPE_TRANSFER_UNSYNCHRONIZED);
 
+       /* Don't use util_memcpy_cpu_to_le32. LLVM binaries are
+        * endian-independent. */
        if (prolog) {
-               util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size);
+               memcpy(ptr, prolog->code, prolog->code_size);
                ptr += prolog->code_size;
        }
        if (previous_stage) {
-               util_memcpy_cpu_to_le32(ptr, previous_stage->code,
-                                       previous_stage->code_size);
+               memcpy(ptr, previous_stage->code, previous_stage->code_size);
                ptr += previous_stage->code_size;
        }
+       if (prolog2) {
+               memcpy(ptr, prolog2->code, prolog2->code_size);
+               ptr += prolog2->code_size;
+       }
 
-       util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size);
+       memcpy(ptr, mainb->code, mainb->code_size);
        ptr += mainb->code_size;
 
        if (epilog)
-               util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size);
+               memcpy(ptr, epilog->code, epilog->code_size);
        else if (mainb->rodata_size > 0)
-               util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size);
+               memcpy(ptr, mainb->rodata, mainb->rodata_size);
 
        sscreen->b.ws->buffer_unmap(shader->bo->buf);
        return 0;
@@ -6366,13 +4994,13 @@ static void si_shader_dump_disassembly(const struct ac_shader_binary *binary,
 }
 
 static void si_shader_dump_stats(struct si_screen *sscreen,
-                                struct si_shader *shader,
+                                const struct si_shader *shader,
                                 struct pipe_debug_callback *debug,
                                 unsigned processor,
                                 FILE *file,
                                 bool check_debug_option)
 {
-       struct si_shader_config *conf = &shader->config;
+       const struct si_shader_config *conf = &shader->config;
        unsigned num_inputs = shader->selector ? shader->selector->info.num_inputs : 0;
        unsigned code_size = si_get_shader_binary_size(shader);
        unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
@@ -6458,7 +5086,7 @@ static void si_shader_dump_stats(struct si_screen *sscreen,
                           conf->spilled_vgprs, conf->private_mem_vgprs);
 }
 
-const char *si_get_shader_name(struct si_shader *shader, unsigned processor)
+const char *si_get_shader_name(const struct si_shader *shader, unsigned processor)
 {
        switch (processor) {
        case PIPE_SHADER_VERTEX:
@@ -6489,7 +5117,7 @@ const char *si_get_shader_name(struct si_shader *shader, unsigned processor)
        }
 }
 
-void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
+void si_shader_dump(struct si_screen *sscreen, const struct si_shader *shader,
                    struct pipe_debug_callback *debug, unsigned processor,
                    FILE *file, bool check_debug_option)
 {
@@ -6498,6 +5126,13 @@ void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
                si_dump_shader_key(processor, shader, file);
 
        if (!check_debug_option && shader->binary.llvm_ir_string) {
+               if (shader->previous_stage &&
+                   shader->previous_stage->binary.llvm_ir_string) {
+                       fprintf(file, "\n%s - previous stage - LLVM IR:\n\n",
+                               si_get_shader_name(shader, processor));
+                       fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
+               }
+
                fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
                        si_get_shader_name(shader, processor));
                fprintf(file, "%s\n", shader->binary.llvm_ir_string);
@@ -6514,6 +5149,9 @@ void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
                if (shader->previous_stage)
                        si_shader_dump_disassembly(&shader->previous_stage->binary,
                                                   debug, "previous stage", file);
+               if (shader->prolog2)
+                       si_shader_dump_disassembly(&shader->prolog2->binary,
+                                                  debug, "prolog2", file);
 
                si_shader_dump_disassembly(&shader->binary, debug, "main", file);
 
@@ -6527,14 +5165,14 @@ void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
                             check_debug_option);
 }
 
-int si_compile_llvm(struct si_screen *sscreen,
-                   struct ac_shader_binary *binary,
-                   struct si_shader_config *conf,
-                   LLVMTargetMachineRef tm,
-                   LLVMModuleRef mod,
-                   struct pipe_debug_callback *debug,
-                   unsigned processor,
-                   const char *name)
+static int si_compile_llvm(struct si_screen *sscreen,
+                          struct ac_shader_binary *binary,
+                          struct si_shader_config *conf,
+                          LLVMTargetMachineRef tm,
+                          LLVMModuleRef mod,
+                          struct pipe_debug_callback *debug,
+                          unsigned processor,
+                          const char *name)
 {
        int r = 0;
        unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
@@ -6647,8 +5285,7 @@ si_generate_gs_copy_shader(struct si_screen *sscreen,
        preload_ring_buffers(&ctx);
 
        LLVMValueRef voffset =
-               lp_build_mul_imm(uint, LLVMGetParam(ctx.main_fn,
-                                                   ctx.param_vertex_id), 4);
+               lp_build_mul_imm(uint, ctx.abi.vertex_id, 4);
 
        /* Fetch the vertex stream ID.*/
        LLVMValueRef stream_id;
@@ -6707,7 +5344,8 @@ si_generate_gs_copy_shader(struct si_screen *sscreen,
                                        ac_build_buffer_load(&ctx.ac,
                                                             ctx.gsvs_ring[0], 1,
                                                             ctx.i32_0, voffset,
-                                                            soffset, 0, 1, 1, true);
+                                                            soffset, 0, 1, 1,
+                                                            true, false);
                        }
                }
 
@@ -6728,13 +5366,8 @@ si_generate_gs_copy_shader(struct si_screen *sscreen,
 
        LLVMBuildRetVoid(gallivm->builder);
 
-       /* Dump LLVM IR before any optimization passes */
-       if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
-           r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
-               ac_dump_module(ctx.gallivm.module);
-
-       si_llvm_finalize_module(&ctx,
-               r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_GEOMETRY));
+       ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */
+       si_llvm_optimize_module(&ctx);
 
        r = si_compile_llvm(sscreen, &ctx.shader->binary,
                            &ctx.shader->config, ctx.tm,
@@ -6760,16 +5393,14 @@ si_generate_gs_copy_shader(struct si_screen *sscreen,
        return shader;
 }
 
-static void si_dump_shader_key_vs(struct si_shader_key *key,
-                                 struct si_vs_prolog_bits *prolog,
+static void si_dump_shader_key_vs(const struct si_shader_key *key,
+                                 const struct si_vs_prolog_bits *prolog,
                                  const char *prefix, FILE *f)
 {
-       fprintf(f, "  %s.instance_divisors = {", prefix);
-       for (int i = 0; i < ARRAY_SIZE(prolog->instance_divisors); i++) {
-               fprintf(f, !i ? "%u" : ", %u",
-                       prolog->instance_divisors[i]);
-       }
-       fprintf(f, "}\n");
+       fprintf(f, "  %s.instance_divisor_is_one = %u\n",
+               prefix, prolog->instance_divisor_is_one);
+       fprintf(f, "  %s.instance_divisor_is_fetched = %u\n",
+               prefix, prolog->instance_divisor_is_fetched);
 
        fprintf(f, "  mono.vs.fix_fetch = {");
        for (int i = 0; i < SI_MAX_ATTRIBS; i++)
@@ -6777,10 +5408,10 @@ static void si_dump_shader_key_vs(struct si_shader_key *key,
        fprintf(f, "}\n");
 }
 
-static void si_dump_shader_key(unsigned processor, struct si_shader *shader,
+static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
                               FILE *f)
 {
-       struct si_shader_key *key = &shader->key;
+       const struct si_shader_key *key = &shader->key;
 
        fprintf(f, "SHADER KEY\n");
 
@@ -6790,8 +5421,8 @@ static void si_dump_shader_key(unsigned processor, struct si_shader *shader,
                                      "part.vs.prolog", f);
                fprintf(f, "  as_es = %u\n", key->as_es);
                fprintf(f, "  as_ls = %u\n", key->as_ls);
-               fprintf(f, "  part.vs.epilog.export_prim_id = %u\n",
-                       key->part.vs.epilog.export_prim_id);
+               fprintf(f, "  mono.u.vs_export_prim_id = %u\n",
+                       key->mono.u.vs_export_prim_id);
                break;
 
        case PIPE_SHADER_TESS_CTRL:
@@ -6800,15 +5431,24 @@ static void si_dump_shader_key(unsigned processor, struct si_shader *shader,
                                              "part.tcs.ls_prolog", f);
                }
                fprintf(f, "  part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
-               fprintf(f, "  mono.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.ff_tcs_inputs_to_copy);
+               fprintf(f, "  mono.u.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.u.ff_tcs_inputs_to_copy);
                break;
 
        case PIPE_SHADER_TESS_EVAL:
-               fprintf(f, "  part.tes.epilog.export_prim_id = %u\n", key->part.tes.epilog.export_prim_id);
                fprintf(f, "  as_es = %u\n", key->as_es);
+               fprintf(f, "  mono.u.vs_export_prim_id = %u\n",
+                       key->mono.u.vs_export_prim_id);
                break;
 
        case PIPE_SHADER_GEOMETRY:
+               if (shader->is_gs_copy_shader)
+                       break;
+
+               if (shader->selector->screen->b.chip_class >= GFX9 &&
+                   key->part.gs.es->type == PIPE_SHADER_VERTEX) {
+                       si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
+                                             "part.gs.vs_prolog", f);
+               }
                fprintf(f, "  part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
                break;
 
@@ -6843,9 +5483,8 @@ static void si_dump_shader_key(unsigned processor, struct si_shader *shader,
             processor == PIPE_SHADER_TESS_EVAL ||
             processor == PIPE_SHADER_VERTEX) &&
            !key->as_es && !key->as_ls) {
-               fprintf(f, "  opt.hw_vs.kill_outputs = 0x%"PRIx64"\n", key->opt.hw_vs.kill_outputs);
-               fprintf(f, "  opt.hw_vs.kill_outputs2 = 0x%x\n", key->opt.hw_vs.kill_outputs2);
-               fprintf(f, "  opt.hw_vs.clip_disable = %u\n", key->opt.hw_vs.clip_disable);
+               fprintf(f, "  opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs);
+               fprintf(f, "  opt.clip_disable = %u\n", key->opt.clip_disable);
        }
 }
 
@@ -6854,7 +5493,8 @@ static void si_init_shader_ctx(struct si_shader_context *ctx,
                               LLVMTargetMachineRef tm)
 {
        struct lp_build_tgsi_context *bld_base;
-       struct lp_build_tgsi_action tmpl = {};
+
+       ctx->abi.chip_class = sscreen->b.chip_class;
 
        si_llvm_context_init(ctx, sscreen, tm);
 
@@ -6865,53 +5505,6 @@ static void si_init_shader_ctx(struct si_shader_context *ctx,
        bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
        bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;
 
-       bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action;
-       bld_base->op_actions[TGSI_OPCODE_TEX_LZ] = tex_action;
-       bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action;
-       bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action;
-       bld_base->op_actions[TGSI_OPCODE_TXB2] = tex_action;
-       bld_base->op_actions[TGSI_OPCODE_TXD] = tex_action;
-       bld_base->op_actions[TGSI_OPCODE_TXF] = tex_action;
-       bld_base->op_actions[TGSI_OPCODE_TXF_LZ] = tex_action;
-       bld_base->op_actions[TGSI_OPCODE_TXL] = tex_action;
-       bld_base->op_actions[TGSI_OPCODE_TXL2] = tex_action;
-       bld_base->op_actions[TGSI_OPCODE_TXP] = tex_action;
-       bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = txq_fetch_args;
-       bld_base->op_actions[TGSI_OPCODE_TXQ].emit = txq_emit;
-       bld_base->op_actions[TGSI_OPCODE_TG4] = tex_action;
-       bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
-       bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;
-
-       bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args;
-       bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
-       bld_base->op_actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args;
-       bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
-       bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
-       bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
-
-       tmpl.fetch_args = atomic_fetch_args;
-       tmpl.emit = atomic_emit;
-       bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl;
-       bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
-       bld_base->op_actions[TGSI_OPCODE_ATOMXCHG] = tmpl;
-       bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
-       bld_base->op_actions[TGSI_OPCODE_ATOMCAS] = tmpl;
-       bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
-       bld_base->op_actions[TGSI_OPCODE_ATOMAND] = tmpl;
-       bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
-       bld_base->op_actions[TGSI_OPCODE_ATOMOR] = tmpl;
-       bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
-       bld_base->op_actions[TGSI_OPCODE_ATOMXOR] = tmpl;
-       bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
-       bld_base->op_actions[TGSI_OPCODE_ATOMUMIN] = tmpl;
-       bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
-       bld_base->op_actions[TGSI_OPCODE_ATOMUMAX] = tmpl;
-       bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
-       bld_base->op_actions[TGSI_OPCODE_ATOMIMIN] = tmpl;
-       bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
-       bld_base->op_actions[TGSI_OPCODE_ATOMIMAX] = tmpl;
-       bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";
-
        bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
 
        bld_base->op_actions[TGSI_OPCODE_CLOCK].emit = clock_emit;
@@ -6936,22 +5529,22 @@ static void si_init_shader_ctx(struct si_shader_context *ctx,
        bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
 }
 
-static void si_eliminate_const_vs_outputs(struct si_shader_context *ctx)
+static void si_optimize_vs_outputs(struct si_shader_context *ctx)
 {
        struct si_shader *shader = ctx->shader;
        struct tgsi_shader_info *info = &shader->selector->info;
 
-       if (ctx->type == PIPE_SHADER_FRAGMENT ||
-           ctx->type == PIPE_SHADER_COMPUTE ||
-           shader->key.as_es ||
-           shader->key.as_ls)
+       if ((ctx->type != PIPE_SHADER_VERTEX &&
+            ctx->type != PIPE_SHADER_TESS_EVAL) ||
+           shader->key.as_ls ||
+           shader->key.as_es)
                return;
 
-       ac_eliminate_const_vs_outputs(&ctx->ac,
-                                     ctx->main_fn,
-                                     shader->info.vs_output_param_offset,
-                                     info->num_outputs,
-                                     &shader->info.nr_param_exports);
+       ac_optimize_vs_outputs(&ctx->ac,
+                              ctx->main_fn,
+                              shader->info.vs_output_param_offset,
+                              info->num_outputs,
+                              &shader->info.nr_param_exports);
 }
 
 static void si_count_scratch_private_memory(struct si_shader_context *ctx)
@@ -6980,12 +5573,34 @@ static void si_count_scratch_private_memory(struct si_shader_context *ctx)
        }
 }
 
+static void si_init_exec_full_mask(struct si_shader_context *ctx)
+{
+       LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
+       lp_build_intrinsic(ctx->gallivm.builder,
+                          "llvm.amdgcn.init.exec", ctx->voidt,
+                          &full_mask, 1, LP_FUNC_ATTR_CONVERGENT);
+}
+
+static void si_init_exec_from_input(struct si_shader_context *ctx,
+                                   unsigned param, unsigned bitoffset)
+{
+       LLVMValueRef args[] = {
+               LLVMGetParam(ctx->main_fn, param),
+               LLVMConstInt(ctx->i32, bitoffset, 0),
+       };
+       lp_build_intrinsic(ctx->gallivm.builder,
+                          "llvm.amdgcn.init.exec.from.input",
+                          ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
+}
+
 static bool si_compile_tgsi_main(struct si_shader_context *ctx,
-                                struct si_shader *shader)
+                                bool is_monolithic)
 {
+       struct si_shader *shader = ctx->shader;
        struct si_shader_selector *sel = shader->selector;
        struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
 
+       // TODO clean all this up!
        switch (ctx->type) {
        case PIPE_SHADER_VERTEX:
                ctx->load_input = declare_input_vs;
@@ -6993,8 +5608,10 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx,
                        bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
                else if (shader->key.as_es)
                        bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
-               else
-                       bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
+               else {
+                       ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
+                       bld_base->emit_epilogue = si_tgsi_emit_epilogue;
+               }
                break;
        case PIPE_SHADER_TESS_CTRL:
                bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
@@ -7006,8 +5623,10 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx,
                bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
                if (shader->key.as_es)
                        bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
-               else
-                       bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
+               else {
+                       ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
+                       bld_base->emit_epilogue = si_tgsi_emit_epilogue;
+               }
                break;
        case PIPE_SHADER_GEOMETRY:
                bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
@@ -7015,7 +5634,8 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx,
                break;
        case PIPE_SHADER_FRAGMENT:
                ctx->load_input = declare_input_fs;
-               bld_base->emit_epilogue = si_llvm_return_fs_outputs;
+               ctx->abi.emit_outputs = si_llvm_return_fs_outputs;
+               bld_base->emit_epilogue = si_tgsi_emit_epilogue;
                break;
        case PIPE_SHADER_COMPUTE:
                ctx->declare_memory_region = declare_compute_memory;
@@ -7025,9 +5645,50 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx,
                return false;
        }
 
+       ctx->abi.load_ubo = load_ubo;
+
        create_function(ctx);
        preload_ring_buffers(ctx);
 
+       /* For GFX9 merged shaders:
+        * - Set EXEC for the first shader. If the prolog is present, set
+        *   EXEC there instead.
+        * - Add a barrier before the second shader.
+        * - In the second shader, reset EXEC to ~0 and wrap the main part in
+        *   an if-statement. This is required for correctness in geometry
+        *   shaders, to ensure that empty GS waves do not send GS_EMIT and
+        *   GS_CUT messages.
+        *
+        * For monolithic merged shaders, the first shader is wrapped in an
+        * if-block together with its prolog in si_build_wrapper_function.
+        */
+       if (ctx->screen->b.chip_class >= GFX9) {
+               if (!is_monolithic &&
+                   sel->info.num_instructions > 1 && /* not empty shader */
+                   (shader->key.as_es || shader->key.as_ls) &&
+                   (ctx->type == PIPE_SHADER_TESS_EVAL ||
+                    (ctx->type == PIPE_SHADER_VERTEX &&
+                     !sel->vs_needs_prolog))) {
+                       si_init_exec_from_input(ctx,
+                                               ctx->param_merged_wave_info, 0);
+               } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
+                          ctx->type == PIPE_SHADER_GEOMETRY) {
+                       if (!is_monolithic)
+                               si_init_exec_full_mask(ctx);
+
+                       /* The barrier must execute for all shaders in a
+                        * threadgroup.
+                        */
+                       si_llvm_emit_barrier(NULL, bld_base, NULL);
+
+                       LLVMValueRef num_threads = unpack_param(ctx, ctx->param_merged_wave_info, 8, 8);
+                       LLVMValueRef ena =
+                               LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
+                                           ac_get_thread_id(&ctx->ac), num_threads, "");
+                       lp_build_if(&ctx->merged_wrap_if_state, &ctx->gallivm, ena);
+               }
+       }
+
        if (ctx->type == PIPE_SHADER_GEOMETRY) {
                int i;
                for (i = 0; i < 4; i++) {
@@ -7037,9 +5698,22 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx,
                }
        }
 
-       if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
-               fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
-               return false;
+       if (ctx->type == PIPE_SHADER_FRAGMENT && sel->info.uses_kill &&
+           ctx->screen->b.debug_flags & DBG_FS_CORRECT_DERIVS_AFTER_KILL) {
+               /* This is initialized to 0.0 = not kill. */
+               ctx->postponed_kill = lp_build_alloca(&ctx->gallivm, ctx->f32, "");
+       }
+
+       if (sel->tokens) {
+               if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
+                       fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
+                       return false;
+               }
+       } else {
+               if (!si_nir_build_llvm(ctx, sel->nir)) {
+                       fprintf(stderr, "Failed to translate shader from NIR to LLVM\n");
+                       return false;
+               }
        }
 
        si_llvm_build_ret(ctx, ctx->return_value);
@@ -7066,33 +5740,21 @@ static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
        key->vs_prolog.states = *prolog_key;
        key->vs_prolog.num_input_sgprs = num_input_sgprs;
        key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
+       key->vs_prolog.as_ls = shader_out->key.as_ls;
 
-       /* Set the instanceID flag. */
-       for (unsigned i = 0; i < info->num_inputs; i++)
-               if (key->vs_prolog.states.instance_divisors[i])
-                       shader_out->info.uses_instanceid = true;
-}
-
-/**
- * Compute the VS epilog key, which contains all the information needed to
- * build the VS epilog function, and set the PrimitiveID output offset.
- */
-static void si_get_vs_epilog_key(struct si_shader *shader,
-                                struct si_vs_epilog_bits *states,
-                                union si_shader_part_key *key)
-{
-       memset(key, 0, sizeof(*key));
-       key->vs_epilog.states = *states;
+       if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
+               key->vs_prolog.as_ls = 1;
+               key->vs_prolog.num_merged_next_stage_vgprs = 2;
+       } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
+               key->vs_prolog.num_merged_next_stage_vgprs = 5;
+       }
 
-       /* Set up the PrimitiveID output. */
-       if (shader->key.part.vs.epilog.export_prim_id) {
-               unsigned index = shader->selector->info.num_outputs;
-               unsigned offset = shader->info.nr_param_exports++;
+       /* Enable loading the InstanceID VGPR. */
+       uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
 
-               key->vs_epilog.prim_id_param_offset = offset;
-               assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset));
-               shader->info.vs_output_param_offset[index] = offset;
-       }
+       if ((key->vs_prolog.states.instance_divisor_is_one |
+            key->vs_prolog.states.instance_divisor_is_fetched) & input_mask)
+               shader_out->info.uses_instanceid = true;
 }
 
 /**
@@ -7253,29 +5915,45 @@ static void si_get_ps_epilog_key(struct si_shader *shader,
 static void si_build_gs_prolog_function(struct si_shader_context *ctx,
                                        union si_shader_part_key *key)
 {
-       const unsigned num_sgprs = SI_GS_NUM_USER_SGPR + 2;
-       const unsigned num_vgprs = 8;
+       unsigned num_sgprs, num_vgprs;
        struct gallivm_state *gallivm = &ctx->gallivm;
+       struct si_function_info fninfo;
        LLVMBuilderRef builder = gallivm->builder;
-       LLVMTypeRef params[32];
-       LLVMTypeRef returns[32];
+       LLVMTypeRef returns[48];
        LLVMValueRef func, ret;
 
+       si_init_function_info(&fninfo);
+
+       if (ctx->screen->b.chip_class >= GFX9) {
+               num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR;
+               num_vgprs = 5; /* ES inputs are not needed by GS */
+       } else {
+               num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
+               num_vgprs = 8;
+       }
+
        for (unsigned i = 0; i < num_sgprs; ++i) {
-               params[i] = ctx->i32;
+               add_arg(&fninfo, ARG_SGPR, ctx->i32);
                returns[i] = ctx->i32;
        }
 
        for (unsigned i = 0; i < num_vgprs; ++i) {
-               params[num_sgprs + i] = ctx->i32;
+               add_arg(&fninfo, ARG_VGPR, ctx->i32);
                returns[num_sgprs + i] = ctx->f32;
        }
 
        /* Create the function. */
        si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
-                          params, num_sgprs + num_vgprs, num_sgprs - 1);
+                          &fninfo, 0);
        func = ctx->main_fn;
 
+       /* Set the full EXEC mask for the prolog, because we are only fiddling
+        * with registers here. The main shader part will set the correct EXEC
+        * mask.
+        */
+       if (ctx->screen->b.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
+               si_init_exec_full_mask(ctx);
+
        /* Copy inputs to outputs. This should be no-op, as the registers match,
         * but it will prevent the compiler from overwriting them unintentionally.
         */
@@ -7292,7 +5970,7 @@ static void si_build_gs_prolog_function(struct si_shader_context *ctx,
 
        if (key->gs_prolog.states.tri_strip_adj_fix) {
                /* Remap the input vertices for every other primitive. */
-               const unsigned vtx_params[6] = {
+               const unsigned gfx6_vtx_params[6] = {
                        num_sgprs,
                        num_sgprs + 1,
                        num_sgprs + 3,
@@ -7300,18 +5978,53 @@ static void si_build_gs_prolog_function(struct si_shader_context *ctx,
                        num_sgprs + 5,
                        num_sgprs + 6
                };
+               const unsigned gfx9_vtx_params[3] = {
+                       num_sgprs,
+                       num_sgprs + 1,
+                       num_sgprs + 4,
+               };
+               LLVMValueRef vtx_in[6], vtx_out[6];
                LLVMValueRef prim_id, rotate;
 
+               if (ctx->screen->b.chip_class >= GFX9) {
+                       for (unsigned i = 0; i < 3; i++) {
+                               vtx_in[i*2] = unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
+                               vtx_in[i*2+1] = unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
+                       }
+               } else {
+                       for (unsigned i = 0; i < 6; i++)
+                               vtx_in[i] = LLVMGetParam(func, gfx6_vtx_params[i]);
+               }
+
                prim_id = LLVMGetParam(func, num_sgprs + 2);
                rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");
 
                for (unsigned i = 0; i < 6; ++i) {
-                       LLVMValueRef base, rotated, actual;
-                       base = LLVMGetParam(func, vtx_params[i]);
-                       rotated = LLVMGetParam(func, vtx_params[(i + 4) % 6]);
-                       actual = LLVMBuildSelect(builder, rotate, rotated, base, "");
-                       actual = LLVMBuildBitCast(builder, actual, ctx->f32, "");
-                       ret = LLVMBuildInsertValue(builder, ret, actual, vtx_params[i], "");
+                       LLVMValueRef base, rotated;
+                       base = vtx_in[i];
+                       rotated = vtx_in[(i + 4) % 6];
+                       vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
+               }
+
+               if (ctx->screen->b.chip_class >= GFX9) {
+                       for (unsigned i = 0; i < 3; i++) {
+                               LLVMValueRef hi, out;
+
+                               hi = LLVMBuildShl(builder, vtx_out[i*2+1],
+                                                 LLVMConstInt(ctx->i32, 16, 0), "");
+                               out = LLVMBuildOr(builder, vtx_out[i*2], hi, "");
+                               out = LLVMBuildBitCast(builder, out, ctx->f32, "");
+                               ret = LLVMBuildInsertValue(builder, ret, out,
+                                                          gfx9_vtx_params[i], "");
+                       }
+               } else {
+                       for (unsigned i = 0; i < 6; i++) {
+                               LLVMValueRef out;
+
+                               out = LLVMBuildBitCast(builder, vtx_out[i], ctx->f32, "");
+                               ret = LLVMBuildInsertValue(builder, ret, out,
+                                                          gfx6_vtx_params[i], "");
+                       }
                }
        }
 
@@ -7325,20 +6038,26 @@ static void si_build_gs_prolog_function(struct si_shader_context *ctx,
 static void si_build_wrapper_function(struct si_shader_context *ctx,
                                      LLVMValueRef *parts,
                                      unsigned num_parts,
-                                     unsigned main_part)
+                                     unsigned main_part,
+                                     unsigned next_shader_first_part)
 {
        struct gallivm_state *gallivm = &ctx->gallivm;
        LLVMBuilderRef builder = ctx->gallivm.builder;
-       /* PS epilog has one arg per color component */
-       LLVMTypeRef param_types[48];
-       LLVMValueRef out[48];
+       /* PS epilog has one arg per color component; gfx9 merged shader
+        * prologs need to forward 32 user SGPRs.
+        */
+       struct si_function_info fninfo;
+       LLVMValueRef initial[64], out[64];
        LLVMTypeRef function_type;
-       unsigned num_params;
-       unsigned num_out;
+       unsigned num_first_params;
+       unsigned num_out, initial_num_out;
        MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */
+       MAYBE_UNUSED unsigned initial_num_out_sgpr; /* used in debug checks */
        unsigned num_sgprs, num_vgprs;
-       unsigned last_sgpr_param;
        unsigned gprs;
+       struct lp_build_if_state if_state;
+
+       si_init_function_info(&fninfo);
 
        for (unsigned i = 0; i < num_parts; ++i) {
                lp_add_function_attr(parts[i], -1, LP_FUNC_ATTR_ALWAYSINLINE);
@@ -7354,9 +6073,9 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
        num_vgprs = 0;
 
        function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
-       num_params = LLVMCountParamTypes(function_type);
+       num_first_params = LLVMCountParamTypes(function_type);
 
-       for (unsigned i = 0; i < num_params; ++i) {
+       for (unsigned i = 0; i < num_first_params; ++i) {
                LLVMValueRef param = LLVMGetParam(parts[0], i);
 
                if (ac_is_sgpr_param(param)) {
@@ -7366,20 +6085,14 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
                        num_vgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
                }
        }
-       assert(num_vgprs + num_sgprs <= ARRAY_SIZE(param_types));
 
-       num_params = 0;
-       last_sgpr_param = 0;
        gprs = 0;
        while (gprs < num_sgprs + num_vgprs) {
-               LLVMValueRef param = LLVMGetParam(parts[main_part], num_params);
-               unsigned size;
+               LLVMValueRef param = LLVMGetParam(parts[main_part], fninfo.num_params);
+               LLVMTypeRef type = LLVMTypeOf(param);
+               unsigned size = llvm_get_type_size(type) / 4;
 
-               param_types[num_params] = LLVMTypeOf(param);
-               if (gprs < num_sgprs)
-                       last_sgpr_param = num_params;
-               size = llvm_get_type_size(param_types[num_params]) / 4;
-               num_params++;
+               add_arg(&fninfo, gprs < num_sgprs ? ARG_SGPR : ARG_VGPR, type);
 
                assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
                assert(gprs + size <= num_sgprs + num_vgprs &&
@@ -7388,7 +6101,11 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
                gprs += size;
        }
 
-       si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params, last_sgpr_param);
+       si_create_function(ctx, "wrapper", NULL, 0, &fninfo,
+                          si_get_max_workgroup_size(ctx->shader));
+
+       if (is_merged_shader(ctx->shader))
+               si_init_exec_full_mask(ctx);
 
        /* Record the arguments of the function as if they were an output of
         * a previous part.
@@ -7396,10 +6113,10 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
        num_out = 0;
        num_out_sgpr = 0;
 
-       for (unsigned i = 0; i < num_params; ++i) {
+       for (unsigned i = 0; i < fninfo.num_params; ++i) {
                LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
                LLVMTypeRef param_type = LLVMTypeOf(param);
-               LLVMTypeRef out_type = i <= last_sgpr_param ? ctx->i32 : ctx->f32;
+               LLVMTypeRef out_type = i < fninfo.num_sgpr_params ? ctx->i32 : ctx->f32;
                unsigned size = llvm_get_type_size(param_type) / 4;
 
                if (size == 1) {
@@ -7422,19 +6139,33 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
                                        builder, param, LLVMConstInt(ctx->i32, j, 0), "");
                }
 
-               if (i <= last_sgpr_param)
+               if (i < fninfo.num_sgpr_params)
                        num_out_sgpr = num_out;
        }
 
+       memcpy(initial, out, sizeof(out));
+       initial_num_out = num_out;
+       initial_num_out_sgpr = num_out_sgpr;
+
        /* Now chain the parts. */
        for (unsigned part = 0; part < num_parts; ++part) {
                LLVMValueRef in[48];
                LLVMValueRef ret;
                LLVMTypeRef ret_type;
                unsigned out_idx = 0;
-
-               num_params = LLVMCountParams(parts[part]);
-               assert(num_params <= ARRAY_SIZE(param_types));
+               unsigned num_params = LLVMCountParams(parts[part]);
+
+               /* Merged shaders are executed conditionally depending
+                * on the number of enabled threads passed in the input SGPRs. */
+               if (is_merged_shader(ctx->shader) && part == 0) {
+                       LLVMValueRef ena, count = initial[3];
+
+                       count = LLVMBuildAnd(builder, count,
+                                            LLVMConstInt(ctx->i32, 0x7f, 0), "");
+                       ena = LLVMBuildICmp(builder, LLVMIntULT,
+                                           ac_get_thread_id(&ctx->ac), count, "");
+                       lp_build_if(&if_state, &ctx->gallivm, ena);
+               }
 
                /* Derive arguments for the next part from outputs of the
                 * previous one.
@@ -7483,9 +6214,27 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
                }
 
                ret = LLVMBuildCall(builder, parts[part], in, num_params, "");
-               ret_type = LLVMTypeOf(ret);
+
+               if (is_merged_shader(ctx->shader) &&
+                   part + 1 == next_shader_first_part) {
+                       lp_build_endif(&if_state);
+
+                       /* The second half of the merged shader should use
+                        * the inputs from the toplevel (wrapper) function,
+                        * not the return value from the last call.
+                        *
+                        * That's because the last call was executed condi-
+                        * tionally, so we can't consume it in the main
+                        * block.
+                        */
+                       memcpy(out, initial, sizeof(initial));
+                       num_out = initial_num_out;
+                       num_out_sgpr = initial_num_out_sgpr;
+                       continue;
+               }
 
                /* Extract the returned GPRs. */
+               ret_type = LLVMTypeOf(ret);
                num_out = 0;
                num_out_sgpr = 0;
 
@@ -7498,6 +6247,7 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
                                LLVMValueRef val =
                                        LLVMBuildExtractValue(builder, ret, i, "");
 
+                               assert(num_out < ARRAY_SIZE(out));
                                out[num_out++] = val;
 
                                if (LLVMTypeOf(val) == ctx->i32) {
@@ -7525,7 +6275,10 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
         * conversion fails. */
        if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
            !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
-               tgsi_dump(sel->tokens, 0);
+               if (sel->tokens)
+                       tgsi_dump(sel->tokens, 0);
+               else
+                       nir_print_shader(sel->nir, stderr);
                si_dump_streamout(&sel->so);
        }
 
@@ -7540,20 +6293,16 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
 
        ctx.load_system_value = declare_system_value;
 
-       if (!si_compile_tgsi_main(&ctx, shader)) {
+       if (!si_compile_tgsi_main(&ctx, is_monolithic)) {
                si_llvm_dispose(&ctx);
                return -1;
        }
 
        if (is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
-               LLVMValueRef parts[3];
-               bool need_prolog;
-               bool need_epilog;
-
-               need_prolog = sel->vs_needs_prolog;
-               need_epilog = !shader->key.as_es && !shader->key.as_ls;
+               LLVMValueRef parts[2];
+               bool need_prolog = sel->vs_needs_prolog;
 
-               parts[need_prolog ? 1 : 0] = ctx.main_fn;
+               parts[1] = ctx.main_fn;
 
                if (need_prolog) {
                        union si_shader_part_key prolog_key;
@@ -7565,51 +6314,144 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
                        parts[0] = ctx.main_fn;
                }
 
-               if (need_epilog) {
+               si_build_wrapper_function(&ctx, parts + !need_prolog,
+                                         1 + need_prolog, need_prolog, 0);
+       } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
+               if (sscreen->b.chip_class >= GFX9) {
+                       struct si_shader_selector *ls = shader->key.part.tcs.ls;
+                       LLVMValueRef parts[4];
+
+                       /* TCS main part */
+                       parts[2] = ctx.main_fn;
+
+                       /* TCS epilog */
+                       union si_shader_part_key tcs_epilog_key;
+                       memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
+                       tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
+                       si_build_tcs_epilog_function(&ctx, &tcs_epilog_key);
+                       parts[3] = ctx.main_fn;
+
+                       /* VS prolog */
+                       if (ls->vs_needs_prolog) {
+                               union si_shader_part_key vs_prolog_key;
+                               si_get_vs_prolog_key(&ls->info,
+                                                    shader->info.num_input_sgprs,
+                                                    &shader->key.part.tcs.ls_prolog,
+                                                    shader, &vs_prolog_key);
+                               vs_prolog_key.vs_prolog.is_monolithic = true;
+                               si_build_vs_prolog_function(&ctx, &vs_prolog_key);
+                               parts[0] = ctx.main_fn;
+                       }
+
+                       /* VS as LS main part */
+                       struct si_shader shader_ls = {};
+                       shader_ls.selector = ls;
+                       shader_ls.key.as_ls = 1;
+                       shader_ls.key.mono = shader->key.mono;
+                       shader_ls.key.opt = shader->key.opt;
+                       si_llvm_context_set_tgsi(&ctx, &shader_ls);
+
+                       if (!si_compile_tgsi_main(&ctx, true)) {
+                               si_llvm_dispose(&ctx);
+                               return -1;
+                       }
+                       shader->info.uses_instanceid |= ls->info.uses_instanceid;
+                       parts[1] = ctx.main_fn;
+
+                       /* Reset the shader context. */
+                       ctx.shader = shader;
+                       ctx.type = PIPE_SHADER_TESS_CTRL;
+
+                       si_build_wrapper_function(&ctx,
+                                                 parts + !ls->vs_needs_prolog,
+                                                 4 - !ls->vs_needs_prolog, 0,
+                                                 ls->vs_needs_prolog ? 2 : 1);
+               } else {
+                       LLVMValueRef parts[2];
                        union si_shader_part_key epilog_key;
-                       si_get_vs_epilog_key(shader, &shader->key.part.vs.epilog, &epilog_key);
-                       si_build_vs_epilog_function(&ctx, &epilog_key);
-                       parts[need_prolog ? 2 : 1] = ctx.main_fn;
+
+                       parts[0] = ctx.main_fn;
+
+                       memset(&epilog_key, 0, sizeof(epilog_key));
+                       epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
+                       si_build_tcs_epilog_function(&ctx, &epilog_key);
+                       parts[1] = ctx.main_fn;
+
+                       si_build_wrapper_function(&ctx, parts, 2, 0, 0);
                }
+       } else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
+               if (ctx.screen->b.chip_class >= GFX9) {
+                       struct si_shader_selector *es = shader->key.part.gs.es;
+                       LLVMValueRef es_prolog = NULL;
+                       LLVMValueRef es_main = NULL;
+                       LLVMValueRef gs_prolog = NULL;
+                       LLVMValueRef gs_main = ctx.main_fn;
+
+                       /* GS prolog */
+                       union si_shader_part_key gs_prolog_key;
+                       memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
+                       gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
+                       gs_prolog_key.gs_prolog.is_monolithic = true;
+                       si_build_gs_prolog_function(&ctx, &gs_prolog_key);
+                       gs_prolog = ctx.main_fn;
+
+                       /* ES prolog */
+                       if (es->vs_needs_prolog) {
+                               union si_shader_part_key vs_prolog_key;
+                               si_get_vs_prolog_key(&es->info,
+                                                    shader->info.num_input_sgprs,
+                                                    &shader->key.part.tcs.ls_prolog,
+                                                    shader, &vs_prolog_key);
+                               vs_prolog_key.vs_prolog.is_monolithic = true;
+                               si_build_vs_prolog_function(&ctx, &vs_prolog_key);
+                               es_prolog = ctx.main_fn;
+                       }
 
-               si_build_wrapper_function(&ctx, parts, 1 + need_prolog + need_epilog,
-                                         need_prolog ? 1 : 0);
-       } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
-               LLVMValueRef parts[2];
-               union si_shader_part_key epilog_key;
+                       /* ES main part */
+                       struct si_shader shader_es = {};
+                       shader_es.selector = es;
+                       shader_es.key.as_es = 1;
+                       shader_es.key.mono = shader->key.mono;
+                       shader_es.key.opt = shader->key.opt;
+                       si_llvm_context_set_tgsi(&ctx, &shader_es);
 
-               parts[0] = ctx.main_fn;
+                       if (!si_compile_tgsi_main(&ctx, true)) {
+                               si_llvm_dispose(&ctx);
+                               return -1;
+                       }
+                       shader->info.uses_instanceid |= es->info.uses_instanceid;
+                       es_main = ctx.main_fn;
 
-               memset(&epilog_key, 0, sizeof(epilog_key));
-               epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
-               si_build_tcs_epilog_function(&ctx, &epilog_key);
-               parts[1] = ctx.main_fn;
+                       /* Reset the shader context. */
+                       ctx.shader = shader;
+                       ctx.type = PIPE_SHADER_GEOMETRY;
 
-               si_build_wrapper_function(&ctx, parts, 2, 0);
-       } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_EVAL &&
-                  !shader->key.as_es) {
-               LLVMValueRef parts[2];
-               union si_shader_part_key epilog_key;
+                       /* Prepare the array of shader parts. */
+                       LLVMValueRef parts[4];
+                       unsigned num_parts = 0, main_part, next_first_part;
 
-               parts[0] = ctx.main_fn;
+                       if (es_prolog)
+                               parts[num_parts++] = es_prolog;
 
-               si_get_vs_epilog_key(shader, &shader->key.part.tes.epilog, &epilog_key);
-               si_build_vs_epilog_function(&ctx, &epilog_key);
-               parts[1] = ctx.main_fn;
+                       parts[main_part = num_parts++] = es_main;
+                       parts[next_first_part = num_parts++] = gs_prolog;
+                       parts[num_parts++] = gs_main;
 
-               si_build_wrapper_function(&ctx, parts, 2, 0);
-       } else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
-               LLVMValueRef parts[2];
-               union si_shader_part_key prolog_key;
+                       si_build_wrapper_function(&ctx, parts, num_parts,
+                                                 main_part, next_first_part);
+               } else {
+                       LLVMValueRef parts[2];
+                       union si_shader_part_key prolog_key;
 
-               parts[1] = ctx.main_fn;
+                       parts[1] = ctx.main_fn;
 
-               memset(&prolog_key, 0, sizeof(prolog_key));
-               prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
-               si_build_gs_prolog_function(&ctx, &prolog_key);
-               parts[0] = ctx.main_fn;
+                       memset(&prolog_key, 0, sizeof(prolog_key));
+                       prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
+                       si_build_gs_prolog_function(&ctx, &prolog_key);
+                       parts[0] = ctx.main_fn;
 
-               si_build_wrapper_function(&ctx, parts, 2, 1);
+                       si_build_wrapper_function(&ctx, parts, 2, 1, 0);
+               }
        } else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
                LLVMValueRef parts[3];
                union si_shader_part_key prolog_key;
@@ -7630,19 +6472,14 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
                si_build_ps_epilog_function(&ctx, &epilog_key);
                parts[need_prolog ? 2 : 1] = ctx.main_fn;
 
-               si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2, need_prolog ? 1 : 0);
+               si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2,
+                                         need_prolog ? 1 : 0, 0);
        }
 
-       /* Dump LLVM IR before any optimization passes */
-       if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
-           r600_can_dump_shader(&sscreen->b, ctx.type))
-               LLVMDumpModule(ctx.gallivm.module);
-
-       si_llvm_finalize_module(&ctx,
-                                   r600_extra_shader_checks(&sscreen->b, ctx.type));
+       si_llvm_optimize_module(&ctx);
 
        /* Post-optimization transformations and analysis. */
-       si_eliminate_const_vs_outputs(&ctx);
+       si_optimize_vs_outputs(&ctx);
 
        if ((debug && debug->debug_message) ||
            r600_can_dump_shader(&sscreen->b, ctx.type))
@@ -7689,7 +6526,7 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
        }
 
        /* Add the scratch offset to input SGPRs. */
-       if (shader->config.scratch_bytes_per_wave)
+       if (shader->config.scratch_bytes_per_wave && !is_merged_shader(shader))
                shader->info.num_input_sgprs += 1; /* scratch byte offset */
 
        /* Calculate the number of fragment input VGPRs. */
@@ -7808,8 +6645,7 @@ si_get_shader_part(struct si_screen *sscreen,
        build(&ctx, key);
 
        /* Compile. */
-       si_llvm_finalize_module(&ctx,
-               r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_FRAGMENT));
+       si_llvm_optimize_module(&ctx);
 
        if (si_compile_llvm(sscreen, &result->binary, &result->config, tm,
                            gallivm->module, debug, ctx.type, name)) {
@@ -7827,6 +6663,21 @@ out:
        return result;
 }
 
+static LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx)
+{
+       LLVMBuilderRef builder = ctx->gallivm.builder;
+       LLVMValueRef halves[2], addr;
+
+       /* Gather the two SGPR halves of the rw-buffers address, view them as
+        * one i64 and convert that to a pointer to the v4i32 array. */
+       halves[0] = LLVMGetParam(ctx->main_fn, SI_SGPR_RW_BUFFERS);
+       halves[1] = LLVMGetParam(ctx->main_fn, SI_SGPR_RW_BUFFERS_HI);
+       addr = lp_build_gather_values(&ctx->gallivm, halves, 2);
+       addr = LLVMBuildBitCast(builder, addr, ctx->i64, "");
+       return LLVMBuildIntToPtr(builder, addr,
+                                si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS), "");
+}
+
 /**
  * Build the vertex shader prolog function.
  *
@@ -7847,45 +6698,51 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
                                        union si_shader_part_key *key)
 {
        struct gallivm_state *gallivm = &ctx->gallivm;
-       LLVMTypeRef *params, *returns;
+       struct si_function_info fninfo;
+       LLVMTypeRef *returns;
        LLVMValueRef ret, func;
-       int last_sgpr, num_params, num_returns, i;
+       int num_returns, i;
+       unsigned first_vs_vgpr = key->vs_prolog.num_input_sgprs +
+                                key->vs_prolog.num_merged_next_stage_vgprs;
+       unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
+       unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
+                                     num_input_vgprs;
+       unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
 
-       ctx->param_vertex_id = key->vs_prolog.num_input_sgprs;
-       ctx->param_instance_id = key->vs_prolog.num_input_sgprs + 3;
+       si_init_function_info(&fninfo);
 
        /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
-       params = alloca((key->vs_prolog.num_input_sgprs + 4) *
-                       sizeof(LLVMTypeRef));
-       returns = alloca((key->vs_prolog.num_input_sgprs + 4 +
-                         key->vs_prolog.last_input + 1) *
+       returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) *
                         sizeof(LLVMTypeRef));
-       num_params = 0;
        num_returns = 0;
 
        /* Declare input and output SGPRs. */
-       num_params = 0;
        for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
-               params[num_params++] = ctx->i32;
+               add_arg(&fninfo, ARG_SGPR, ctx->i32);
                returns[num_returns++] = ctx->i32;
        }
-       last_sgpr = num_params - 1;
 
-       /* 4 preloaded VGPRs (outputs must be floats) */
-       for (i = 0; i < 4; i++) {
-               params[num_params++] = ctx->i32;
+       /* Preloaded VGPRs (outputs must be floats) */
+       for (i = 0; i < num_input_vgprs; i++) {
+               add_arg(&fninfo, ARG_VGPR, ctx->i32);
                returns[num_returns++] = ctx->f32;
        }
 
+       fninfo.assign[first_vs_vgpr] = &ctx->abi.vertex_id;
+       fninfo.assign[first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1)] = &ctx->abi.instance_id;
+
        /* Vertex load indices. */
        for (i = 0; i <= key->vs_prolog.last_input; i++)
                returns[num_returns++] = ctx->f32;
 
        /* Create the function. */
-       si_create_function(ctx, "vs_prolog", returns, num_returns, params,
-                          num_params, last_sgpr);
+       si_create_function(ctx, "vs_prolog", returns, num_returns, &fninfo, 0);
        func = ctx->main_fn;
 
+       if (key->vs_prolog.num_merged_next_stage_vgprs &&
+           !key->vs_prolog.is_monolithic)
+               si_init_exec_from_input(ctx, 3, 0);
+
        /* Copy inputs to outputs. This should be no-op, as the registers match,
         * but it will prevent the compiler from overwriting them unintentionally.
         */
@@ -7894,88 +6751,61 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
                LLVMValueRef p = LLVMGetParam(func, i);
                ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
        }
-       for (i = num_params - 4; i < num_params; i++) {
+       for (; i < fninfo.num_params; i++) {
                LLVMValueRef p = LLVMGetParam(func, i);
                p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, "");
                ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
        }
 
        /* Compute vertex load indices from instance divisors. */
+       LLVMValueRef instance_divisor_constbuf = NULL;
+
+       if (key->vs_prolog.states.instance_divisor_is_fetched) {
+               LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
+               LLVMValueRef buf_index =
+                       LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
+               instance_divisor_constbuf =
+                       ac_build_indexed_load_const(&ctx->ac, list, buf_index);
+       }
+
        for (i = 0; i <= key->vs_prolog.last_input; i++) {
-               unsigned divisor = key->vs_prolog.states.instance_divisors[i];
+               bool divisor_is_one =
+                       key->vs_prolog.states.instance_divisor_is_one & (1u << i);
+               bool divisor_is_fetched =
+                       key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
                LLVMValueRef index;
 
-               if (divisor) {
+               if (divisor_is_one || divisor_is_fetched) {
+                       LLVMValueRef divisor = ctx->i32_1;
+
+                       if (divisor_is_fetched) {
+                               divisor = buffer_load_const(ctx, instance_divisor_constbuf,
+                                                           LLVMConstInt(ctx->i32, i * 4, 0));
+                               divisor = LLVMBuildBitCast(gallivm->builder, divisor,
+                                                          ctx->i32, "");
+                       }
+
                        /* InstanceID / Divisor + StartInstance */
                        index = get_instance_index_for_fetch(ctx,
+                                                            user_sgpr_base +
                                                             SI_SGPR_START_INSTANCE,
                                                             divisor);
                } else {
                        /* VertexID + BaseVertex */
                        index = LLVMBuildAdd(gallivm->builder,
-                                            LLVMGetParam(func, ctx->param_vertex_id),
-                                            LLVMGetParam(func, SI_SGPR_BASE_VERTEX), "");
+                                            ctx->abi.vertex_id,
+                                            LLVMGetParam(func, user_sgpr_base +
+                                                               SI_SGPR_BASE_VERTEX), "");
                }
 
                index = LLVMBuildBitCast(gallivm->builder, index, ctx->f32, "");
                ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
-                                          num_params++, "");
+                                          fninfo.num_params + i, "");
        }
 
        si_llvm_build_ret(ctx, ret);
 }
 
-/**
- * Build the vertex shader epilog function. This is also used by the tessellation
- * evaluation shader compiled as VS.
- *
- * The input is PrimitiveID.
- *
- * If PrimitiveID is required by the pixel shader, export it.
- * Otherwise, do nothing.
- */
-static void si_build_vs_epilog_function(struct si_shader_context *ctx,
-                                       union si_shader_part_key *key)
-{
-       struct gallivm_state *gallivm = &ctx->gallivm;
-       struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
-       LLVMTypeRef params[5];
-       int num_params, i;
-
-       /* Declare input VGPRs. */
-       num_params = key->vs_epilog.states.export_prim_id ?
-                          (VS_EPILOG_PRIMID_LOC + 1) : 0;
-       assert(num_params <= ARRAY_SIZE(params));
-
-       for (i = 0; i < num_params; i++)
-               params[i] = ctx->f32;
-
-       /* Create the function. */
-       si_create_function(ctx, "vs_epilog", NULL, 0, params, num_params, -1);
-
-       /* Emit exports. */
-       if (key->vs_epilog.states.export_prim_id) {
-               struct lp_build_context *base = &bld_base->base;
-               struct ac_export_args args;
-
-               args.enabled_channels = 0x1; /* enabled channels */
-               args.valid_mask = 0; /* whether the EXEC mask is valid */
-               args.done = 0; /* DONE bit */
-               args.target = V_008DFC_SQ_EXP_PARAM +
-                             key->vs_epilog.prim_id_param_offset;
-               args.compr = 0; /* COMPR flag (0 = 32-bit export) */
-               args.out[0] = LLVMGetParam(ctx->main_fn,
-                                      VS_EPILOG_PRIMID_LOC); /* X */
-               args.out[1] = base->undef; /* Y */
-               args.out[2] = base->undef; /* Z */
-               args.out[3] = base->undef; /* W */
-
-               ac_build_export(&ctx->ac, &args);
-       }
-
-       LLVMBuildRetVoid(gallivm->builder);
-}
-
 static bool si_get_vs_prolog(struct si_screen *sscreen,
                             LLVMTargetMachineRef tm,
                             struct si_shader *shader,
@@ -8002,27 +6832,6 @@ static bool si_get_vs_prolog(struct si_screen *sscreen,
        return shader->prolog != NULL;
 }
 
-/**
- * Create & compile a vertex shader epilog. This a helper used by VS and TES.
- */
-static bool si_get_vs_epilog(struct si_screen *sscreen,
-                            LLVMTargetMachineRef tm,
-                            struct si_shader *shader,
-                            struct pipe_debug_callback *debug,
-                            struct si_vs_epilog_bits *states)
-{
-       union si_shader_part_key epilog_key;
-
-       si_get_vs_epilog_key(shader, states, &epilog_key);
-
-       shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs,
-                                           PIPE_SHADER_VERTEX, true,
-                                           &epilog_key, tm, debug,
-                                           si_build_vs_epilog_function,
-                                           "Vertex Shader Epilog");
-       return shader->epilog != NULL;
-}
-
 /**
  * Select and compile (or reuse) vertex shader parts (prolog & epilog).
  */
@@ -8031,33 +6840,8 @@ static bool si_shader_select_vs_parts(struct si_screen *sscreen,
                                      struct si_shader *shader,
                                      struct pipe_debug_callback *debug)
 {
-       if (!si_get_vs_prolog(sscreen, tm, shader, debug, shader,
-                             &shader->key.part.vs.prolog))
-               return false;
-
-       /* Get the epilog. */
-       if (!shader->key.as_es && !shader->key.as_ls &&
-           !si_get_vs_epilog(sscreen, tm, shader, debug,
-                             &shader->key.part.vs.epilog))
-               return false;
-
-       return true;
-}
-
-/**
- * Select and compile (or reuse) TES parts (epilog).
- */
-static bool si_shader_select_tes_parts(struct si_screen *sscreen,
-                                      LLVMTargetMachineRef tm,
-                                      struct si_shader *shader,
-                                      struct pipe_debug_callback *debug)
-{
-       if (shader->key.as_es)
-               return true;
-
-       /* TES compiled as VS. */
-       return si_get_vs_epilog(sscreen, tm, shader, debug,
-                               &shader->key.part.tes.epilog);
+       return si_get_vs_prolog(sscreen, tm, shader, debug, shader,
+                               &shader->key.part.vs.prolog);
 }
 
 /**
@@ -8069,38 +6853,63 @@ static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
 {
        struct gallivm_state *gallivm = &ctx->gallivm;
        struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
-       LLVMTypeRef params[16];
+       struct si_function_info fninfo;
        LLVMValueRef func;
-       int last_sgpr, num_params = 0;
-
-       /* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used. */
-       params[ctx->param_rw_buffers = num_params++] =
-               const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
-       params[ctx->param_const_buffers = num_params++] = ctx->i64;
-       params[ctx->param_samplers = num_params++] = ctx->i64;
-       params[ctx->param_images = num_params++] = ctx->i64;
-       params[ctx->param_shader_buffers = num_params++] = ctx->i64;
-       params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
-       params[ctx->param_tcs_out_lds_offsets = num_params++] = ctx->i32;
-       params[ctx->param_tcs_out_lds_layout = num_params++] = ctx->i32;
-       params[ctx->param_vs_state_bits = num_params++] = ctx->i32;
-       params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
-       params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
-       last_sgpr = num_params - 1;
-
-       params[num_params++] = ctx->i32; /* patch index within the wave (REL_PATCH_ID) */
-       params[num_params++] = ctx->i32; /* invocation ID within the patch */
-       params[num_params++] = ctx->i32; /* LDS offset where tess factors should be loaded from */
+
+       si_init_function_info(&fninfo);
+
+       if (ctx->screen->b.chip_class >= GFX9) {
+               add_arg(&fninfo, ARG_SGPR, ctx->i64);
+               ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               add_arg(&fninfo, ARG_SGPR, ctx->i32); /* wave info */
+               ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               add_arg(&fninfo, ARG_SGPR, ctx->i64);
+               add_arg(&fninfo, ARG_SGPR, ctx->i64);
+               add_arg(&fninfo, ARG_SGPR, ctx->i64);
+               add_arg(&fninfo, ARG_SGPR, ctx->i64);
+               add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+       } else {
+               add_arg(&fninfo, ARG_SGPR, ctx->i64);
+               add_arg(&fninfo, ARG_SGPR, ctx->i64);
+               add_arg(&fninfo, ARG_SGPR, ctx->i64);
+               ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+               ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
+       }
+
+       add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */
+       add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */
+       unsigned tess_factors_idx =
+               add_arg(&fninfo, ARG_VGPR, ctx->i32); /* patch index within the wave (REL_PATCH_ID) */
+       add_arg(&fninfo, ARG_VGPR, ctx->i32); /* invocation ID within the patch */
+       add_arg(&fninfo, ARG_VGPR, ctx->i32); /* LDS offset where tess factors should be loaded from */
 
        /* Create the function. */
-       si_create_function(ctx, "tcs_epilog", NULL, 0, params, num_params, last_sgpr);
-       declare_tess_lds(ctx);
+       si_create_function(ctx, "tcs_epilog", NULL, 0, &fninfo,
+                          ctx->screen->b.chip_class >= CIK ? 128 : 64);
+       declare_lds_as_pointer(ctx);
        func = ctx->main_fn;
 
        si_write_tess_factors(bld_base,
-                             LLVMGetParam(func, last_sgpr + 1),
-                             LLVMGetParam(func, last_sgpr + 2),
-                             LLVMGetParam(func, last_sgpr + 3));
+                             LLVMGetParam(func, tess_factors_idx),
+                             LLVMGetParam(func, tess_factors_idx + 1),
+                             LLVMGetParam(func, tess_factors_idx + 2));
 
        LLVMBuildRetVoid(gallivm->builder);
 }
@@ -8145,20 +6954,31 @@ static bool si_shader_select_gs_parts(struct si_screen *sscreen,
                                      struct si_shader *shader,
                                      struct pipe_debug_callback *debug)
 {
-       union si_shader_part_key prolog_key;
+       if (sscreen->b.chip_class >= GFX9) {
+               struct si_shader *es_main_part =
+                       shader->key.part.gs.es->main_shader_part_es;
+
+               if (shader->key.part.gs.es->type == PIPE_SHADER_VERTEX &&
+                   !si_get_vs_prolog(sscreen, tm, shader, debug, es_main_part,
+                                     &shader->key.part.gs.vs_prolog))
+                       return false;
+
+               shader->previous_stage = es_main_part;
+       }
 
        if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
                return true;
 
+       union si_shader_part_key prolog_key;
        memset(&prolog_key, 0, sizeof(prolog_key));
        prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
 
-       shader->prolog = si_get_shader_part(sscreen, &sscreen->gs_prologs,
+       shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs,
                                            PIPE_SHADER_GEOMETRY, true,
                                            &prolog_key, tm, debug,
                                            si_build_gs_prolog_function,
                                            "Geometry Shader Prolog");
-       return shader->prolog != NULL;
+       return shader->prolog2 != NULL;
 }
 
 /**
@@ -8175,42 +6995,37 @@ static void si_build_ps_prolog_function(struct si_shader_context *ctx,
                                        union si_shader_part_key *key)
 {
        struct gallivm_state *gallivm = &ctx->gallivm;
-       LLVMTypeRef *params;
+       struct si_function_info fninfo;
        LLVMValueRef ret, func;
-       int last_sgpr, num_params, num_returns, i, num_color_channels;
+       int num_returns, i, num_color_channels;
 
        assert(si_need_ps_prolog(key));
 
-       /* Number of inputs + 8 color elements. */
-       params = alloca((key->ps_prolog.num_input_sgprs +
-                        key->ps_prolog.num_input_vgprs + 8) *
-                       sizeof(LLVMTypeRef));
+       si_init_function_info(&fninfo);
 
        /* Declare inputs. */
-       num_params = 0;
        for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
-               params[num_params++] = ctx->i32;
-       last_sgpr = num_params - 1;
+               add_arg(&fninfo, ARG_SGPR, ctx->i32);
 
        for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
-               params[num_params++] = ctx->f32;
+               add_arg(&fninfo, ARG_VGPR, ctx->f32);
 
        /* Declare outputs (same as inputs + add colors if needed) */
-       num_returns = num_params;
+       num_returns = fninfo.num_params;
        num_color_channels = util_bitcount(key->ps_prolog.colors_read);
        for (i = 0; i < num_color_channels; i++)
-               params[num_returns++] = ctx->f32;
+               fninfo.types[num_returns++] = ctx->f32;
 
        /* Create the function. */
-       si_create_function(ctx, "ps_prolog", params, num_returns, params,
-                          num_params, last_sgpr);
+       si_create_function(ctx, "ps_prolog", fninfo.types, num_returns,
+                          &fninfo, 0);
        func = ctx->main_fn;
 
        /* Copy inputs to outputs. This should be no-op, as the registers match,
         * but it will prevent the compiler from overwriting them unintentionally.
         */
        ret = ctx->return_value;
-       for (i = 0; i < num_params; i++) {
+       for (i = 0; i < fninfo.num_params; i++) {
                LLVMValueRef p = LLVMGetParam(func, i);
                ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
        }
@@ -8220,15 +7035,7 @@ static void si_build_ps_prolog_function(struct si_shader_context *ctx,
                /* POS_FIXED_PT is always last. */
                unsigned pos = key->ps_prolog.num_input_sgprs +
                               key->ps_prolog.num_input_vgprs - 1;
-               LLVMValueRef ptr[2], list;
-
-               /* Get the pointer to rw buffers. */
-               ptr[0] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS);
-               ptr[1] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS_HI);
-               list = lp_build_gather_values(gallivm, ptr, 2);
-               list = LLVMBuildBitCast(gallivm->builder, list, ctx->i64, "");
-               list = LLVMBuildIntToPtr(gallivm->builder, list,
-                                         const_array(ctx->v16i8, SI_NUM_RW_BUFFERS), "");
+               LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
 
                si_llvm_emit_polygon_stipple(ctx, list, pos);
        }
@@ -8351,6 +7158,7 @@ static void si_build_ps_prolog_function(struct si_shader_context *ctx,
        }
 
        /* Interpolate colors. */
+       unsigned color_out_idx = 0;
        for (i = 0; i < 2; i++) {
                unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
                unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
@@ -8392,7 +7200,7 @@ static void si_build_ps_prolog_function(struct si_shader_context *ctx,
                while (writemask) {
                        unsigned chan = u_bit_scan(&writemask);
                        ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
-                                                  num_params++, "");
+                                                  fninfo.num_params + color_out_idx++, "");
                }
        }
 
@@ -8414,44 +7222,41 @@ static void si_build_ps_epilog_function(struct si_shader_context *ctx,
 {
        struct gallivm_state *gallivm = &ctx->gallivm;
        struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
-       LLVMTypeRef params[16+8*4+3];
+       struct si_function_info fninfo;
        LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
-       int last_sgpr, num_params = 0, i;
+       int i;
        struct si_ps_exports exp = {};
 
+       si_init_function_info(&fninfo);
+
        /* Declare input SGPRs. */
-       params[ctx->param_rw_buffers = num_params++] = ctx->i64;
-       params[ctx->param_const_buffers = num_params++] = ctx->i64;
-       params[ctx->param_samplers = num_params++] = ctx->i64;
-       params[ctx->param_images = num_params++] = ctx->i64;
-       params[ctx->param_shader_buffers = num_params++] = ctx->i64;
-       assert(num_params == SI_PARAM_ALPHA_REF);
-       params[SI_PARAM_ALPHA_REF] = ctx->f32;
-       last_sgpr = SI_PARAM_ALPHA_REF;
+       ctx->param_rw_buffers = add_arg(&fninfo, ARG_SGPR, ctx->i64);
+       ctx->param_const_and_shader_buffers = add_arg(&fninfo, ARG_SGPR, ctx->i64);
+       ctx->param_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->i64);
+       add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF);
 
        /* Declare input VGPRs. */
-       num_params = (last_sgpr + 1) +
+       unsigned required_num_params =
+                    fninfo.num_sgpr_params +
                     util_bitcount(key->ps_epilog.colors_written) * 4 +
                     key->ps_epilog.writes_z +
                     key->ps_epilog.writes_stencil +
                     key->ps_epilog.writes_samplemask;
 
-       num_params = MAX2(num_params,
-                         last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
-
-       assert(num_params <= ARRAY_SIZE(params));
+       required_num_params = MAX2(required_num_params,
+                                  fninfo.num_sgpr_params + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
 
-       for (i = last_sgpr + 1; i < num_params; i++)
-               params[i] = ctx->f32;
+       while (fninfo.num_params < required_num_params)
+               add_arg(&fninfo, ARG_VGPR, ctx->f32);
 
        /* Create the function. */
-       si_create_function(ctx, "ps_epilog", NULL, 0, params, num_params, last_sgpr);
+       si_create_function(ctx, "ps_epilog", NULL, 0, &fninfo, 0);
        /* Disable elimination of unused inputs. */
        si_llvm_add_attribute(ctx->main_fn,
                                  "InitialPSInputAddr", 0xffffff);
 
        /* Process colors. */
-       unsigned vgpr = last_sgpr + 1;
+       unsigned vgpr = fninfo.num_sgpr_params;
        unsigned colors_written = key->ps_epilog.colors_written;
        int last_color_export = -1;
 
@@ -8465,7 +7270,7 @@ static void si_build_ps_epilog_function(struct si_shader_context *ctx,
                if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
                        /* Just set this if any of the colorbuffers are enabled. */
                        if (spi_format &
-                           ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
+                           ((1ull << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
                                last_color_export = 0;
                } else {
                        for (i = 0; i < 8; i++)
@@ -8483,7 +7288,7 @@ static void si_build_ps_epilog_function(struct si_shader_context *ctx,
                        color[i] = LLVMGetParam(ctx->main_fn, vgpr++);
 
                si_export_mrt_color(bld_base, color, mrt,
-                                   num_params - 1,
+                                   fninfo.num_params - 1,
                                    mrt == last_color_export, &exp);
        }
 
@@ -8653,7 +7458,7 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
                if (r)
                        return r;
        } else {
-               /* The shader consists of 2-3 parts:
+               /* The shader consists of several parts:
                 *
                 * - the middle part is the user shader, it has 1 variant only
                 *   and it was compiled during the creation of the shader
@@ -8662,8 +7467,15 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
                 * - the epilog part is inserted at the end
                 *
                 * The prolog and epilog have many (but simple) variants.
+                *
+                * Starting with gfx9, geometry and tessellation control
+                * shaders also contain the prolog and user shader parts of
+                * the previous shader stage.
                 */
 
+               if (!mainp)
+                       return -1;
+
                /* Copy the compiled TGSI shader data over. */
                shader->is_binary_shared = true;
                shader->binary = mainp->binary;
@@ -8689,8 +7501,6 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
                                return -1;
                        break;
                case PIPE_SHADER_TESS_EVAL:
-                       if (!si_shader_select_tes_parts(sscreen, tm, shader, debug))
-                               return -1;
                        break;
                case PIPE_SHADER_GEOMETRY:
                        if (!si_shader_select_gs_parts(sscreen, tm, shader, debug))
@@ -8735,6 +7545,12 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
                        shader->info.uses_instanceid |=
                                shader->previous_stage->info.uses_instanceid;
                }
+               if (shader->prolog2) {
+                       shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
+                                                       shader->prolog2->config.num_sgprs);
+                       shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
+                                                       shader->prolog2->config.num_vgprs);
+               }
                if (shader->epilog) {
                        shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
                                                        shader->epilog->config.num_sgprs);