ac: add support for 16bit UBO loads
[mesa.git] / src / amd / common / ac_llvm_build.c
index a5cb72daed544712ca24e497680cd2ace8cd3bad..54b7e98701585c13682a7a99a2955d8b66d5fce0 100644 (file)
 #include "util/bitscan.h"
 #include "util/macros.h"
 #include "util/u_atomic.h"
+#include "util/u_math.h"
 #include "sid.h"
 
 #include "shader_enums.h"
 
+#define AC_LLVM_INITIAL_CF_DEPTH 4
+
+/* Data for if/else/endif and bgnloop/endloop control flow structures.
+ */
+struct ac_llvm_flow {
+       /* Loop exit or next part of if/else/endif. */
+       LLVMBasicBlockRef next_block;
+       LLVMBasicBlockRef loop_entry_block;
+};
+
 /* Initialize module-independent parts of the context.
  *
  * The caller is responsible for initializing ctx::module and ctx::builder.
  */
 void
-ac_llvm_context_init(struct ac_llvm_context *ctx, LLVMContextRef context,
+ac_llvm_context_init(struct ac_llvm_context *ctx,
                     enum chip_class chip_class, enum radeon_family family)
 {
        LLVMValueRef args[1];
 
+       ctx->context = LLVMContextCreate();
+
        ctx->chip_class = chip_class;
        ctx->family = family;
-
-       ctx->context = context;
        ctx->module = NULL;
        ctx->builder = NULL;
 
@@ -64,9 +75,11 @@ ac_llvm_context_init(struct ac_llvm_context *ctx, LLVMContextRef context,
        ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
        ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
        ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
+       ctx->intptr = HAVE_32BIT_POINTERS ? ctx->i32 : ctx->i64;
        ctx->f16 = LLVMHalfTypeInContext(ctx->context);
        ctx->f32 = LLVMFloatTypeInContext(ctx->context);
        ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
+       ctx->v2i16 = LLVMVectorType(ctx->i16, 2);
        ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
        ctx->v3i32 = LLVMVectorType(ctx->i32, 3);
        ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
@@ -103,6 +116,14 @@ ac_llvm_context_init(struct ac_llvm_context *ctx, LLVMContextRef context,
        ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);
 }
 
+void
+ac_llvm_context_dispose(struct ac_llvm_context *ctx)
+{
+       free(ctx->flow);
+       ctx->flow = NULL;
+       ctx->flow_depth_max = 0;
+}
+
 int
 ac_get_llvm_num_components(LLVMValueRef value)
 {
@@ -127,6 +148,25 @@ ac_llvm_extract_elem(struct ac_llvm_context *ac,
                                       LLVMConstInt(ac->i32, index, false), "");
 }
 
+int
+ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type)
+{
+       if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
+               type = LLVMGetElementType(type);
+
+       if (LLVMGetTypeKind(type) == LLVMIntegerTypeKind)
+               return LLVMGetIntTypeWidth(type);
+
+       if (type == ctx->f16)
+               return 16;
+       if (type == ctx->f32)
+               return 32;
+       if (type == ctx->f64)
+               return 64;
+
+       unreachable("Unhandled type kind in get_elem_bits");
+}
+
 unsigned
 ac_get_type_size(LLVMTypeRef type)
 {
@@ -135,10 +175,15 @@ ac_get_type_size(LLVMTypeRef type)
        switch (kind) {
        case LLVMIntegerTypeKind:
                return LLVMGetIntTypeWidth(type) / 8;
+       case LLVMHalfTypeKind:
+               return 2;
        case LLVMFloatTypeKind:
                return 4;
        case LLVMDoubleTypeKind:
+               return 8;
        case LLVMPointerTypeKind:
+               if (LLVMGetPointerAddressSpace(type) == AC_CONST_32BIT_ADDR_SPACE)
+                       return 4;
                return 8;
        case LLVMVectorTypeKind:
                return LLVMGetVectorSize(type) *
@@ -219,8 +264,7 @@ ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,
                   unsigned param_count, unsigned attrib_mask)
 {
        LLVMValueRef function, call;
-       bool set_callsite_attrs = HAVE_LLVM >= 0x0400 &&
-                                 !(attrib_mask & AC_FUNC_ATTR_LEGACY);
+       bool set_callsite_attrs = !(attrib_mask & AC_FUNC_ATTR_LEGACY);
 
        function = LLVMGetNamedFunction(ctx->module, name);
        if (!function) {
@@ -278,6 +322,9 @@ void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize)
        case LLVMIntegerTypeKind:
                snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
                break;
+       case LLVMHalfTypeKind:
+               snprintf(buf, bufsize, "f16");
+               break;
        case LLVMFloatTypeKind:
                snprintf(buf, bufsize, "f32");
                break;
@@ -343,6 +390,14 @@ ac_build_optimization_barrier(struct ac_llvm_context *ctx,
        }
 }
 
+LLVMValueRef
+ac_build_shader_clock(struct ac_llvm_context *ctx)
+{
+       LLVMValueRef tmp = ac_build_intrinsic(ctx, "llvm.readcyclecounter",
+                                             ctx->i64, NULL, 0, 0);
+       return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");
+}
+
 LLVMValueRef
 ac_build_ballot(struct ac_llvm_context *ctx,
                LLVMValueRef value)
@@ -358,8 +413,7 @@ ac_build_ballot(struct ac_llvm_context *ctx,
         */
        ac_build_optimization_barrier(ctx, &args[0]);
 
-       if (LLVMTypeOf(args[0]) != ctx->i32)
-               args[0] = LLVMBuildBitCast(ctx->builder, args[0], ctx->i32, "");
+       args[0] = ac_to_integer(ctx, args[0]);
 
        return ac_build_intrinsic(ctx,
                                  "llvm.amdgcn.icmp.i32",
@@ -720,20 +774,6 @@ ac_build_fs_interp(struct ac_llvm_context *ctx,
 {
        LLVMValueRef args[5];
        LLVMValueRef p1;
-       
-       if (HAVE_LLVM < 0x0400) {
-               LLVMValueRef ij[2];
-               ij[0] = LLVMBuildBitCast(ctx->builder, i, ctx->i32, "");
-               ij[1] = LLVMBuildBitCast(ctx->builder, j, ctx->i32, "");
-
-               args[0] = llvm_chan;
-               args[1] = attr_number;
-               args[2] = params;
-               args[3] = ac_build_gather_values(ctx, ij, 2);
-               return ac_build_intrinsic(ctx, "llvm.SI.fs.interp",
-                                         ctx->f32, args, 4,
-                                         AC_FUNC_ATTR_READNONE);
-       }
 
        args[0] = i;
        args[1] = llvm_chan;
@@ -761,16 +801,6 @@ ac_build_fs_interp_mov(struct ac_llvm_context *ctx,
                       LLVMValueRef params)
 {
        LLVMValueRef args[4];
-       if (HAVE_LLVM < 0x0400) {
-               args[0] = llvm_chan;
-               args[1] = attr_number;
-               args[2] = params;
-
-               return ac_build_intrinsic(ctx,
-                                         "llvm.SI.fs.constant",
-                                         ctx->f32, args, 3,
-                                         AC_FUNC_ATTR_READNONE);
-       }
 
        args[0] = parameter;
        args[1] = llvm_chan;
@@ -863,36 +893,35 @@ ac_build_buffer_store_dword(struct ac_llvm_context *ctx,
                            bool writeonly_memory,
                            bool swizzle_enable_hint)
 {
+       /* Split 3 channel stores, becase LLVM doesn't support 3-channel
+        * intrinsics. */
+       if (num_channels == 3) {
+               LLVMValueRef v[3], v01;
+
+               for (int i = 0; i < 3; i++) {
+                       v[i] = LLVMBuildExtractElement(ctx->builder, vdata,
+                                       LLVMConstInt(ctx->i32, i, 0), "");
+               }
+               v01 = ac_build_gather_values(ctx, v, 2);
+
+               ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset,
+                                           soffset, inst_offset, glc, slc,
+                                           writeonly_memory, swizzle_enable_hint);
+               ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset,
+                                           soffset, inst_offset + 8,
+                                           glc, slc,
+                                           writeonly_memory, swizzle_enable_hint);
+               return;
+       }
+
        /* SWIZZLE_ENABLE requires that soffset isn't folded into voffset
         * (voffset is swizzled, but soffset isn't swizzled).
         * llvm.amdgcn.buffer.store doesn't have a separate soffset parameter.
         */
        if (!swizzle_enable_hint) {
-               /* Split 3 channel stores, becase LLVM doesn't support 3-channel
-                * intrinsics. */
-               if (num_channels == 3) {
-                       LLVMValueRef v[3], v01;
-
-                       for (int i = 0; i < 3; i++) {
-                               v[i] = LLVMBuildExtractElement(ctx->builder, vdata,
-                                               LLVMConstInt(ctx->i32, i, 0), "");
-                       }
-                       v01 = ac_build_gather_values(ctx, v, 2);
-
-                       ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset,
-                                                   soffset, inst_offset, glc, slc,
-                                                   writeonly_memory, swizzle_enable_hint);
-                       ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset,
-                                                   soffset, inst_offset + 8,
-                                                   glc, slc,
-                                                   writeonly_memory, swizzle_enable_hint);
-                       return;
-               }
+               LLVMValueRef offset = soffset;
 
-               unsigned func = CLAMP(num_channels, 1, 3) - 1;
                static const char *types[] = {"f32", "v2f32", "v4f32"};
-               char name[256];
-               LLVMValueRef offset = soffset;
 
                if (inst_offset)
                        offset = LLVMBuildAdd(ctx->builder, offset,
@@ -909,53 +938,46 @@ ac_build_buffer_store_dword(struct ac_llvm_context *ctx,
                        LLVMConstInt(ctx->i1, slc, 0),
                };
 
+               char name[256];
                snprintf(name, sizeof(name), "llvm.amdgcn.buffer.store.%s",
-                        types[func]);
+                        types[CLAMP(num_channels, 1, 3) - 1]);
 
                ac_build_intrinsic(ctx, name, ctx->voidt,
                                   args, ARRAY_SIZE(args),
                                   writeonly_memory ?
-                                          AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY :
-                                          AC_FUNC_ATTR_WRITEONLY);
+                                  AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY :
+                                  AC_FUNC_ATTR_WRITEONLY);
                return;
        }
 
-       static unsigned dfmt[] = {
+       static const unsigned dfmt[] = {
                V_008F0C_BUF_DATA_FORMAT_32,
                V_008F0C_BUF_DATA_FORMAT_32_32,
                V_008F0C_BUF_DATA_FORMAT_32_32_32,
                V_008F0C_BUF_DATA_FORMAT_32_32_32_32
        };
-       assert(num_channels >= 1 && num_channels <= 4);
-
+       static const char *types[] = {"i32", "v2i32", "v4i32"};
        LLVMValueRef args[] = {
-               rsrc,
                vdata,
-               LLVMConstInt(ctx->i32, num_channels, 0),
-               voffset ? voffset : LLVMGetUndef(ctx->i32),
+               LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
+               LLVMConstInt(ctx->i32, 0, 0),
+               voffset ? voffset : LLVMConstInt(ctx->i32, 0, 0),
                soffset,
                LLVMConstInt(ctx->i32, inst_offset, 0),
                LLVMConstInt(ctx->i32, dfmt[num_channels - 1], 0),
                LLVMConstInt(ctx->i32, V_008F0C_BUF_NUM_FORMAT_UINT, 0),
-               LLVMConstInt(ctx->i32, voffset != NULL, 0),
-               LLVMConstInt(ctx->i32, 0, 0), /* idxen */
-               LLVMConstInt(ctx->i32, glc, 0),
-               LLVMConstInt(ctx->i32, slc, 0),
-               LLVMConstInt(ctx->i32, 0, 0), /* tfe*/
+               LLVMConstInt(ctx->i1, glc, 0),
+               LLVMConstInt(ctx->i1, slc, 0),
        };
-
-       /* The instruction offset field has 12 bits */
-       assert(voffset || inst_offset < (1 << 12));
-
-       /* The intrinsic is overloaded, we need to add a type suffix for overloading to work. */
-       unsigned func = CLAMP(num_channels, 1, 3) - 1;
-       const char *types[] = {"i32", "v2i32", "v4i32"};
        char name[256];
-       snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]);
+       snprintf(name, sizeof(name), "llvm.amdgcn.tbuffer.store.%s",
+                types[CLAMP(num_channels, 1, 3) - 1]);
 
        ac_build_intrinsic(ctx, name, ctx->voidt,
                           args, ARRAY_SIZE(args),
-                          AC_FUNC_ATTR_LEGACY);
+                          writeonly_memory ?
+                                  AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY :
+                                  AC_FUNC_ATTR_WRITEONLY);
 }
 
 static LLVMValueRef
@@ -1049,13 +1071,63 @@ LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx,
                                         LLVMValueRef vindex,
                                         LLVMValueRef voffset,
                                         unsigned num_channels,
+                                        bool glc,
                                         bool can_speculate)
 {
        return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset,
-                                          num_channels, false, false,
+                                          num_channels, glc, false,
                                           can_speculate, true);
 }
 
+LLVMValueRef ac_build_buffer_load_format_gfx9_safe(struct ac_llvm_context *ctx,
+                                                  LLVMValueRef rsrc,
+                                                  LLVMValueRef vindex,
+                                                  LLVMValueRef voffset,
+                                                  unsigned num_channels,
+                                                  bool glc,
+                                                  bool can_speculate)
+{
+       LLVMValueRef elem_count = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 2, 0), "");
+       LLVMValueRef stride = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 1, 0), "");
+       stride = LLVMBuildLShr(ctx->builder, stride, LLVMConstInt(ctx->i32, 16, 0), "");
+
+       LLVMValueRef new_elem_count = LLVMBuildSelect(ctx->builder,
+                                                     LLVMBuildICmp(ctx->builder, LLVMIntUGT, elem_count, stride, ""),
+                                                     elem_count, stride, "");
+
+       LLVMValueRef new_rsrc = LLVMBuildInsertElement(ctx->builder, rsrc, new_elem_count,
+                                                      LLVMConstInt(ctx->i32, 2, 0), "");
+
+       return ac_build_buffer_load_common(ctx, new_rsrc, vindex, voffset,
+                                          num_channels, glc, false,
+                                          can_speculate, true);
+}
+
+LLVMValueRef
+ac_build_tbuffer_load_short(struct ac_llvm_context *ctx,
+                           LLVMValueRef rsrc,
+                           LLVMValueRef vindex,
+                           LLVMValueRef voffset,
+                               LLVMValueRef soffset,
+                               LLVMValueRef immoffset)
+{
+       const char *name = "llvm.amdgcn.tbuffer.load.i32";
+       LLVMTypeRef type = ctx->i32;
+       LLVMValueRef params[] = {
+                               rsrc,
+                               vindex,
+                               voffset,
+                               soffset,
+                               immoffset,
+                               LLVMConstInt(ctx->i32, V_008F0C_BUF_DATA_FORMAT_16, false),
+                               LLVMConstInt(ctx->i32, V_008F0C_BUF_NUM_FORMAT_UINT, false),
+                               ctx->i1false,
+                               ctx->i1false,
+       };
+       LLVMValueRef res = ac_build_intrinsic(ctx, name, type, params, 9, 0);
+       return LLVMBuildTrunc(ctx->builder, res, ctx->i16, "");
+}
+
 /**
  * Set range metadata on an instruction.  This can only be used on load and
  * call instructions.  If you know an instruction can only produce the values
@@ -1128,7 +1200,21 @@ ac_build_ddxy(struct ac_llvm_context *ctx,
        LLVMValueRef tl, trbl, args[2];
        LLVMValueRef result;
 
-       if (ctx->chip_class >= VI) {
+       if (HAVE_LLVM >= 0x0700) {
+               unsigned tl_lanes[4], trbl_lanes[4];
+
+               for (unsigned i = 0; i < 4; ++i) {
+                       tl_lanes[i] = i & mask;
+                       trbl_lanes[i] = (i & mask) + idx;
+               }
+
+               tl = ac_build_quad_swizzle(ctx, val,
+                                          tl_lanes[0], tl_lanes[1],
+                                          tl_lanes[2], tl_lanes[3]);
+               trbl = ac_build_quad_swizzle(ctx, val,
+                                            trbl_lanes[0], trbl_lanes[1],
+                                            trbl_lanes[2], trbl_lanes[3]);
+       } else if (ctx->chip_class >= VI) {
                LLVMValueRef thread_id, tl_tid, trbl_tid;
                thread_id = ac_get_thread_id(ctx);
 
@@ -1198,6 +1284,13 @@ ac_build_ddxy(struct ac_llvm_context *ctx,
        tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, "");
        trbl = LLVMBuildBitCast(ctx->builder, trbl, ctx->f32, "");
        result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
+
+       if (HAVE_LLVM >= 0x0700) {
+               result = ac_build_intrinsic(ctx,
+                       "llvm.amdgcn.wqm.f32", ctx->f32,
+                       &result, 1, 0);
+       }
+
        return result;
 }
 
@@ -1207,10 +1300,9 @@ ac_build_sendmsg(struct ac_llvm_context *ctx,
                 LLVMValueRef wave_id)
 {
        LLVMValueRef args[2];
-       const char *intr_name = (HAVE_LLVM < 0x0400) ? "llvm.SI.sendmsg" : "llvm.amdgcn.s.sendmsg";
        args[0] = LLVMConstInt(ctx->i32, msg, false);
        args[1] = wave_id;
-       ac_build_intrinsic(ctx, intr_name, ctx->voidt, args, 2, 0);
+       ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, args, 2, 0);
 }
 
 LLVMValueRef
@@ -1218,9 +1310,7 @@ ac_build_imsb(struct ac_llvm_context *ctx,
              LLVMValueRef arg,
              LLVMTypeRef dst_type)
 {
-       const char *intr_name = (HAVE_LLVM < 0x0400) ? "llvm.AMDGPU.flbit.i32" :
-                                                      "llvm.amdgcn.sffbh.i32";
-       LLVMValueRef msb = ac_build_intrinsic(ctx, intr_name,
+       LLVMValueRef msb = ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32",
                                              dst_type, &arg, 1,
                                              AC_FUNC_ATTR_READNONE);
 
@@ -1244,23 +1334,40 @@ ac_build_umsb(struct ac_llvm_context *ctx,
              LLVMValueRef arg,
              LLVMTypeRef dst_type)
 {
-       LLVMValueRef args[2] = {
+       const char *intrin_name;
+       LLVMTypeRef type;
+       LLVMValueRef highest_bit;
+       LLVMValueRef zero;
+
+       if (ac_get_elem_bits(ctx, LLVMTypeOf(arg)) == 64) {
+               intrin_name = "llvm.ctlz.i64";
+               type = ctx->i64;
+               highest_bit = LLVMConstInt(ctx->i64, 63, false);
+               zero = ctx->i64_0;
+       } else {
+               intrin_name = "llvm.ctlz.i32";
+               type = ctx->i32;
+               highest_bit = LLVMConstInt(ctx->i32, 31, false);
+               zero = ctx->i32_0;
+       }
+
+       LLVMValueRef params[2] = {
                arg,
                ctx->i1true,
        };
-       LLVMValueRef msb = ac_build_intrinsic(ctx, "llvm.ctlz.i32",
-                                             dst_type, args, ARRAY_SIZE(args),
+
+       LLVMValueRef msb = ac_build_intrinsic(ctx, intrin_name, type,
+                                             params, 2,
                                              AC_FUNC_ATTR_READNONE);
 
        /* The HW returns the last bit index from MSB, but TGSI/NIR wants
         * the index from LSB. Invert it by doing "31 - msb". */
-       msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false),
-                          msb, "");
+       msb = LLVMBuildSub(ctx->builder, highest_bit, msb, "");
+       msb = LLVMBuildTruncOrBitCast(ctx->builder, msb, ctx->i32, "");
 
        /* check for zero */
        return LLVMBuildSelect(ctx->builder,
-                              LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg,
-                                            LLVMConstInt(ctx->i32, 0, 0), ""),
+                              LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, zero, ""),
                               LLVMConstInt(ctx->i32, -1, true), msb, "");
 }
 
@@ -1280,6 +1387,20 @@ LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a,
                                  AC_FUNC_ATTR_READNONE);
 }
 
+LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a,
+                          LLVMValueRef b)
+{
+       LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSLE, a, b, "");
+       return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
+}
+
+LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a,
+                          LLVMValueRef b)
+{
+       LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, a, b, "");
+       return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
+}
+
 LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a,
                           LLVMValueRef b)
 {
@@ -1289,220 +1410,597 @@ LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a,
 
 LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
 {
-       if (HAVE_LLVM >= 0x0500) {
-               return ac_build_fmin(ctx, ac_build_fmax(ctx, value, ctx->f32_0),
-                                    ctx->f32_1);
-       }
-
-       LLVMValueRef args[3] = {
-               value,
-               LLVMConstReal(ctx->f32, 0),
-               LLVMConstReal(ctx->f32, 1),
-       };
-
-       return ac_build_intrinsic(ctx, "llvm.AMDGPU.clamp.", ctx->f32, args, 3,
-                                 AC_FUNC_ATTR_READNONE |
-                                 AC_FUNC_ATTR_LEGACY);
+       return ac_build_fmin(ctx, ac_build_fmax(ctx, value, ctx->f32_0),
+                            ctx->f32_1);
 }
 
 void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
 {
        LLVMValueRef args[9];
 
-       if (HAVE_LLVM >= 0x0500) {
-               args[0] = LLVMConstInt(ctx->i32, a->target, 0);
-               args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
+       args[0] = LLVMConstInt(ctx->i32, a->target, 0);
+       args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
 
-               if (a->compr) {
-                       LLVMTypeRef i16 = LLVMInt16TypeInContext(ctx->context);
-                       LLVMTypeRef v2i16 = LLVMVectorType(i16, 2);
+       if (a->compr) {
+               LLVMTypeRef i16 = LLVMInt16TypeInContext(ctx->context);
+               LLVMTypeRef v2i16 = LLVMVectorType(i16, 2);
 
-                       args[2] = LLVMBuildBitCast(ctx->builder, a->out[0],
-                                                  v2i16, "");
-                       args[3] = LLVMBuildBitCast(ctx->builder, a->out[1],
-                                                  v2i16, "");
-                       args[4] = LLVMConstInt(ctx->i1, a->done, 0);
-                       args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
+               args[2] = LLVMBuildBitCast(ctx->builder, a->out[0],
+                               v2i16, "");
+               args[3] = LLVMBuildBitCast(ctx->builder, a->out[1],
+                               v2i16, "");
+               args[4] = LLVMConstInt(ctx->i1, a->done, 0);
+               args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
 
-                       ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16",
-                                          ctx->voidt, args, 6, 0);
-               } else {
-                       args[2] = a->out[0];
-                       args[3] = a->out[1];
-                       args[4] = a->out[2];
-                       args[5] = a->out[3];
-                       args[6] = LLVMConstInt(ctx->i1, a->done, 0);
-                       args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
-
-                       ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32",
-                                          ctx->voidt, args, 8, 0);
-               }
-               return;
+               ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16",
+                                  ctx->voidt, args, 6, 0);
+       } else {
+               args[2] = a->out[0];
+               args[3] = a->out[1];
+               args[4] = a->out[2];
+               args[5] = a->out[3];
+               args[6] = LLVMConstInt(ctx->i1, a->done, 0);
+               args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
+
+               ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32",
+                                  ctx->voidt, args, 8, 0);
        }
+}
+
+void ac_build_export_null(struct ac_llvm_context *ctx)
+{
+       struct ac_export_args args;
 
-       args[0] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
-       args[1] = LLVMConstInt(ctx->i32, a->valid_mask, 0);
-       args[2] = LLVMConstInt(ctx->i32, a->done, 0);
-       args[3] = LLVMConstInt(ctx->i32, a->target, 0);
-       args[4] = LLVMConstInt(ctx->i32, a->compr, 0);
-       memcpy(args + 5, a->out, sizeof(a->out[0]) * 4);
+       args.enabled_channels = 0x0; /* enabled channels */
+       args.valid_mask = 1; /* whether the EXEC mask is valid */
+       args.done = 1; /* DONE bit */
+       args.target = V_008DFC_SQ_EXP_NULL;
+       args.compr = 0; /* COMPR flag (0 = 32-bit export) */
+       args.out[0] = LLVMGetUndef(ctx->f32); /* R */
+       args.out[1] = LLVMGetUndef(ctx->f32); /* G */
+       args.out[2] = LLVMGetUndef(ctx->f32); /* B */
+       args.out[3] = LLVMGetUndef(ctx->f32); /* A */
 
-       ac_build_intrinsic(ctx, "llvm.SI.export", ctx->voidt, args, 9,
-                          AC_FUNC_ATTR_LEGACY);
+       ac_build_export(ctx, &args);
 }
 
-LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx,
-                                  struct ac_image_args *a)
+static unsigned ac_num_coords(enum ac_image_dim dim)
 {
-       LLVMTypeRef dst_type;
-       LLVMValueRef args[11];
-       unsigned num_args = 0;
+       switch (dim) {
+       case ac_image_1d:
+               return 1;
+       case ac_image_2d:
+       case ac_image_1darray:
+                return 2;
+       case ac_image_3d:
+       case ac_image_cube:
+       case ac_image_2darray:
+       case ac_image_2dmsaa:
+               return 3;
+       case ac_image_2darraymsaa:
+               return 4;
+       default:
+               unreachable("ac_num_coords: bad dim");
+       }
+}
+
+static unsigned ac_num_derivs(enum ac_image_dim dim)
+{
+       switch (dim) {
+       case ac_image_1d:
+       case ac_image_1darray:
+               return 2;
+       case ac_image_2d:
+       case ac_image_2darray:
+       case ac_image_cube:
+               return 4;
+       case ac_image_3d:
+               return 6;
+       case ac_image_2dmsaa:
+       case ac_image_2darraymsaa:
+       default:
+               unreachable("derivatives not supported");
+       }
+}
+
+static const char *get_atomic_name(enum ac_atomic_op op)
+{
+       switch (op) {
+       case ac_atomic_swap: return "swap";
+       case ac_atomic_add: return "add";
+       case ac_atomic_sub: return "sub";
+       case ac_atomic_smin: return "smin";
+       case ac_atomic_umin: return "umin";
+       case ac_atomic_smax: return "smax";
+       case ac_atomic_umax: return "umax";
+       case ac_atomic_and: return "and";
+       case ac_atomic_or: return "or";
+       case ac_atomic_xor: return "xor";
+       }
+       unreachable("bad atomic op");
+}
+
+/* LLVM 6 and older */
+static LLVMValueRef ac_build_image_opcode_llvm6(struct ac_llvm_context *ctx,
+                                               struct ac_image_args *a)
+{
+       LLVMValueRef args[16];
+       LLVMTypeRef retty = ctx->v4f32;
        const char *name = NULL;
-       char intr_name[128], type[64];
+       const char *atomic_subop = "";
+       char intr_name[128], coords_type[64];
+
+       bool sample = a->opcode == ac_image_sample ||
+                     a->opcode == ac_image_gather4 ||
+                     a->opcode == ac_image_get_lod;
+       bool atomic = a->opcode == ac_image_atomic ||
+                     a->opcode == ac_image_atomic_cmpswap;
+       bool da = a->dim == ac_image_cube ||
+                 a->dim == ac_image_1darray ||
+                 a->dim == ac_image_2darray ||
+                 a->dim == ac_image_2darraymsaa;
+       if (a->opcode == ac_image_get_lod)
+               da = false;
+
+       unsigned num_coords =
+               a->opcode != ac_image_get_resinfo ? ac_num_coords(a->dim) : 0;
+       LLVMValueRef addr;
+       unsigned num_addr = 0;
+
+       if (a->opcode == ac_image_get_lod) {
+               switch (a->dim) {
+               case ac_image_1darray:
+                       num_coords = 1;
+                       break;
+               case ac_image_2darray:
+               case ac_image_cube:
+                       num_coords = 2;
+                       break;
+               default:
+                       break;
+               }
+       }
+
+       if (a->offset)
+               args[num_addr++] = ac_to_integer(ctx, a->offset);
+       if (a->bias)
+               args[num_addr++] = ac_to_integer(ctx, a->bias);
+       if (a->compare)
+               args[num_addr++] = ac_to_integer(ctx, a->compare);
+       if (a->derivs[0]) {
+               unsigned num_derivs = ac_num_derivs(a->dim);
+               for (unsigned i = 0; i < num_derivs; ++i)
+                       args[num_addr++] = ac_to_integer(ctx, a->derivs[i]);
+       }
+       for (unsigned i = 0; i < num_coords; ++i)
+               args[num_addr++] = ac_to_integer(ctx, a->coords[i]);
+       if (a->lod)
+               args[num_addr++] = ac_to_integer(ctx, a->lod);
 
-       if (HAVE_LLVM >= 0x0400) {
-               bool sample = a->opcode == ac_image_sample ||
-                             a->opcode == ac_image_gather4 ||
-                             a->opcode == ac_image_get_lod;
+       unsigned pad_goal = util_next_power_of_two(num_addr);
+       while (num_addr < pad_goal)
+               args[num_addr++] = LLVMGetUndef(ctx->i32);
 
-               if (sample)
-                       args[num_args++] = ac_to_float(ctx, a->addr);
-               else
-                       args[num_args++] = a->addr;
+       addr = ac_build_gather_values(ctx, args, num_addr);
 
-               args[num_args++] = a->resource;
-               if (sample)
-                       args[num_args++] = a->sampler;
+       unsigned num_args = 0;
+       if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
+               args[num_args++] = a->data[0];
+               if (a->opcode == ac_image_atomic_cmpswap)
+                       args[num_args++] = a->data[1];
+       }
+
+       unsigned coords_arg = num_args;
+       if (sample)
+               args[num_args++] = ac_to_float(ctx, addr);
+       else
+               args[num_args++] = ac_to_integer(ctx, addr);
+
+       args[num_args++] = a->resource;
+       if (sample)
+               args[num_args++] = a->sampler;
+       if (!atomic) {
                args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, 0);
                if (sample)
                        args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, 0);
-               args[num_args++] = ctx->i1false; /* glc */
-               args[num_args++] = ctx->i1false; /* slc */
+               args[num_args++] = a->cache_policy & ac_glc ? ctx->i1true : ctx->i1false;
+               args[num_args++] = a->cache_policy & ac_slc ? ctx->i1true : ctx->i1false;
                args[num_args++] = ctx->i1false; /* lwe */
-               args[num_args++] = LLVMConstInt(ctx->i1, a->da, 0);
+               args[num_args++] = LLVMConstInt(ctx->i1, da, 0);
+       } else {
+               args[num_args++] = ctx->i1false; /* r128 */
+               args[num_args++] = LLVMConstInt(ctx->i1, da, 0);
+               args[num_args++] = a->cache_policy & ac_slc ? ctx->i1true : ctx->i1false;
+       }
 
-               switch (a->opcode) {
-               case ac_image_sample:
-                       name = "llvm.amdgcn.image.sample";
-                       break;
-               case ac_image_gather4:
-                       name = "llvm.amdgcn.image.gather4";
-                       break;
-               case ac_image_load:
-                       name = "llvm.amdgcn.image.load";
-                       break;
-               case ac_image_load_mip:
-                       name = "llvm.amdgcn.image.load.mip";
-                       break;
-               case ac_image_get_lod:
-                       name = "llvm.amdgcn.image.getlod";
-                       break;
-               case ac_image_get_resinfo:
-                       name = "llvm.amdgcn.image.getresinfo";
-                       break;
-               default:
-                       unreachable("invalid image opcode");
+       switch (a->opcode) {
+       case ac_image_sample:
+               name = "llvm.amdgcn.image.sample";
+               break;
+       case ac_image_gather4:
+               name = "llvm.amdgcn.image.gather4";
+               break;
+       case ac_image_load:
+               name = "llvm.amdgcn.image.load";
+               break;
+       case ac_image_load_mip:
+               name = "llvm.amdgcn.image.load.mip";
+               break;
+       case ac_image_store:
+               name = "llvm.amdgcn.image.store";
+               retty = ctx->voidt;
+               break;
+       case ac_image_store_mip:
+               name = "llvm.amdgcn.image.store.mip";
+               retty = ctx->voidt;
+               break;
+       case ac_image_atomic:
+       case ac_image_atomic_cmpswap:
+               name = "llvm.amdgcn.image.atomic.";
+               retty = ctx->i32;
+               if (a->opcode == ac_image_atomic_cmpswap) {
+                       atomic_subop = "cmpswap";
+               } else {
+                       atomic_subop = get_atomic_name(a->atomic);
                }
+               break;
+       case ac_image_get_lod:
+               name = "llvm.amdgcn.image.getlod";
+               break;
+       case ac_image_get_resinfo:
+               name = "llvm.amdgcn.image.getresinfo";
+               break;
+       default:
+               unreachable("invalid image opcode");
+       }
 
-               ac_build_type_name_for_intr(LLVMTypeOf(args[0]), type,
-                                           sizeof(type));
+       ac_build_type_name_for_intr(LLVMTypeOf(args[coords_arg]), coords_type,
+                                   sizeof(coords_type));
+
+       if (atomic) {
+               snprintf(intr_name, sizeof(intr_name), "llvm.amdgcn.image.atomic.%s.%s",
+                        atomic_subop, coords_type);
+       } else {
+               bool lod_suffix =
+                       a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
 
                snprintf(intr_name, sizeof(intr_name), "%s%s%s%s.v4f32.%s.v8i32",
                        name,
                        a->compare ? ".c" : "",
                        a->bias ? ".b" :
-                       a->lod ? ".l" :
-                       a->deriv ? ".d" :
+                       lod_suffix ? ".l" :
+                       a->derivs[0] ? ".d" :
                        a->level_zero ? ".lz" : "",
                        a->offset ? ".o" : "",
-                       type);
+                       coords_type);
+       }
 
-               LLVMValueRef result =
-                       ac_build_intrinsic(ctx, intr_name,
-                                          ctx->v4f32, args, num_args,
-                                          AC_FUNC_ATTR_READNONE);
-               if (!sample) {
-                       result = LLVMBuildBitCast(ctx->builder, result,
-                                                 ctx->v4i32, "");
+       LLVMValueRef result =
+               ac_build_intrinsic(ctx, intr_name, retty, args, num_args,
+                                  a->attributes);
+       if (!sample && retty == ctx->v4f32) {
+               result = LLVMBuildBitCast(ctx->builder, result,
+                                         ctx->v4i32, "");
+       }
+       return result;
+}
+
+LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx,
+                                  struct ac_image_args *a)
+{
+       const char *overload[3] = { "", "", "" };
+       unsigned num_overloads = 0;
+       LLVMValueRef args[18];
+       unsigned num_args = 0;
+       enum ac_image_dim dim = a->dim;
+
+       assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 ||
+              !a->level_zero);
+       assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
+               a->opcode != ac_image_store_mip) ||
+              a->lod);
+       assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
+              (!a->compare && !a->offset));
+       assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
+               a->opcode == ac_image_get_lod) ||
+              !a->bias);
+       assert((a->bias ? 1 : 0) +
+              (a->lod ? 1 : 0) +
+              (a->level_zero ? 1 : 0) +
+              (a->derivs[0] ? 1 : 0) <= 1);
+
+       if (HAVE_LLVM < 0x0700)
+               return ac_build_image_opcode_llvm6(ctx, a);
+
+       if (a->opcode == ac_image_get_lod) {
+               switch (dim) {
+               case ac_image_1darray:
+                       dim = ac_image_1d;
+                       break;
+               case ac_image_2darray:
+               case ac_image_cube:
+                       dim = ac_image_2d;
+                       break;
+               default:
+                       break;
                }
-               return result;
        }
 
-       args[num_args++] = a->addr;
-       args[num_args++] = a->resource;
+       bool sample = a->opcode == ac_image_sample ||
+                     a->opcode == ac_image_gather4 ||
+                     a->opcode == ac_image_get_lod;
+       bool atomic = a->opcode == ac_image_atomic ||
+                     a->opcode == ac_image_atomic_cmpswap;
+       LLVMTypeRef coord_type = sample ? ctx->f32 : ctx->i32;
+
+       if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
+               args[num_args++] = a->data[0];
+               if (a->opcode == ac_image_atomic_cmpswap)
+                       args[num_args++] = a->data[1];
+       }
 
-       if (a->opcode == ac_image_load ||
-           a->opcode == ac_image_load_mip ||
-           a->opcode == ac_image_get_resinfo) {
-               dst_type = ctx->v4i32;
-       } else {
-               dst_type = ctx->v4f32;
+       if (!atomic)
+               args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, false);
+
+       if (a->offset)
+               args[num_args++] = ac_to_integer(ctx, a->offset);
+       if (a->bias) {
+               args[num_args++] = ac_to_float(ctx, a->bias);
+               overload[num_overloads++] = ".f32";
+       }
+       if (a->compare)
+               args[num_args++] = ac_to_float(ctx, a->compare);
+       if (a->derivs[0]) {
+               unsigned count = ac_num_derivs(dim);
+               for (unsigned i = 0; i < count; ++i)
+                       args[num_args++] = ac_to_float(ctx, a->derivs[i]);
+               overload[num_overloads++] = ".f32";
+       }
+       unsigned num_coords =
+               a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0;
+       for (unsigned i = 0; i < num_coords; ++i)
+               args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");
+       if (a->lod)
+               args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");
+       overload[num_overloads++] = sample ? ".f32" : ".i32";
+
+       args[num_args++] = a->resource;
+       if (sample) {
                args[num_args++] = a->sampler;
+               args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);
        }
 
-       args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, 0);
-       args[num_args++] = LLVMConstInt(ctx->i32, a->unorm, 0);
-       args[num_args++] = LLVMConstInt(ctx->i32, 0, 0); /* r128 */
-       args[num_args++] = LLVMConstInt(ctx->i32, a->da, 0);
-       args[num_args++] = LLVMConstInt(ctx->i32, 0, 0); /* glc */
-       args[num_args++] = LLVMConstInt(ctx->i32, 0, 0); /* slc */
-       args[num_args++] = LLVMConstInt(ctx->i32, 0, 0); /* tfe */
-       args[num_args++] = LLVMConstInt(ctx->i32, 0, 0); /* lwe */
+       args[num_args++] = ctx->i32_0; /* texfailctrl */
+       args[num_args++] = LLVMConstInt(ctx->i32, a->cache_policy, false);
 
+       const char *name;
+       const char *atomic_subop = "";
        switch (a->opcode) {
-       case ac_image_sample:
-               name = "llvm.SI.image.sample";
+       case ac_image_sample: name = "sample"; break;
+       case ac_image_gather4: name = "gather4"; break;
+       case ac_image_load: name = "load"; break;
+       case ac_image_load_mip: name = "load.mip"; break;
+       case ac_image_store: name = "store"; break;
+       case ac_image_store_mip: name = "store.mip"; break;
+       case ac_image_atomic:
+               name = "atomic.";
+               atomic_subop = get_atomic_name(a->atomic);
                break;
-       case ac_image_gather4:
-               name = "llvm.SI.gather4";
-               break;
-       case ac_image_load:
-               name = "llvm.SI.image.load";
-               break;
-       case ac_image_load_mip:
-               name = "llvm.SI.image.load.mip";
-               break;
-       case ac_image_get_lod:
-               name = "llvm.SI.getlod";
-               break;
-       case ac_image_get_resinfo:
-               name = "llvm.SI.getresinfo";
+       case ac_image_atomic_cmpswap:
+               name = "atomic.";
+               atomic_subop = "cmpswap";
                break;
+       case ac_image_get_lod: name = "getlod"; break;
+       case ac_image_get_resinfo: name = "getresinfo"; break;
+       default: unreachable("invalid image opcode");
        }
 
-       ac_build_type_name_for_intr(LLVMTypeOf(a->addr), type, sizeof(type));
-       snprintf(intr_name, sizeof(intr_name), "%s%s%s%s.%s",
-               name,
-               a->compare ? ".c" : "",
-               a->bias ? ".b" :
-               a->lod ? ".l" :
-               a->deriv ? ".d" :
-               a->level_zero ? ".lz" : "",
-               a->offset ? ".o" : "",
-               type);
+       const char *dimname;
+       switch (dim) {
+       case ac_image_1d: dimname = "1d"; break;
+       case ac_image_2d: dimname = "2d"; break;
+       case ac_image_3d: dimname = "3d"; break;
+       case ac_image_cube: dimname = "cube"; break;
+       case ac_image_1darray: dimname = "1darray"; break;
+       case ac_image_2darray: dimname = "2darray"; break;
+       case ac_image_2dmsaa: dimname = "2dmsaa"; break;
+       case ac_image_2darraymsaa: dimname = "2darraymsaa"; break;
+       default: unreachable("invalid dim");
+       }
 
-       return ac_build_intrinsic(ctx, intr_name,
-                                 dst_type, args, num_args,
-                                 AC_FUNC_ATTR_READNONE |
-                                 AC_FUNC_ATTR_LEGACY);
+       bool lod_suffix =
+               a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
+       char intr_name[96];
+       snprintf(intr_name, sizeof(intr_name),
+                "llvm.amdgcn.image.%s%s" /* base name */
+                "%s%s%s" /* sample/gather modifiers */
+                ".%s.%s%s%s%s", /* dimension and type overloads */
+                name, atomic_subop,
+                a->compare ? ".c" : "",
+                a->bias ? ".b" :
+                lod_suffix ? ".l" :
+                a->derivs[0] ? ".d" :
+                a->level_zero ? ".lz" : "",
+                a->offset ? ".o" : "",
+                dimname,
+                atomic ? "i32" : "v4f32",
+                overload[0], overload[1], overload[2]);
+
+       LLVMTypeRef retty;
+       if (atomic)
+               retty = ctx->i32;
+       else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
+               retty = ctx->voidt;
+       else
+               retty = ctx->v4f32;
+
+       LLVMValueRef result =
+               ac_build_intrinsic(ctx, intr_name, retty, args, num_args,
+                                  a->attributes);
+       if (!sample && retty == ctx->v4f32) {
+               result = LLVMBuildBitCast(ctx->builder, result,
+                                         ctx->v4i32, "");
+       }
+       return result;
 }
 
 LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx,
                                    LLVMValueRef args[2])
 {
-       if (HAVE_LLVM >= 0x0500) {
-               LLVMTypeRef v2f16 =
-                       LLVMVectorType(LLVMHalfTypeInContext(ctx->context), 2);
+       LLVMTypeRef v2f16 =
+               LLVMVectorType(LLVMHalfTypeInContext(ctx->context), 2);
+
+       return ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", v2f16,
+                                 args, 2, AC_FUNC_ATTR_READNONE);
+}
+
+/* Upper 16 bits must be zero. */
+static LLVMValueRef ac_llvm_pack_two_int16(struct ac_llvm_context *ctx,
+                                          LLVMValueRef val[2])
+{
+       return LLVMBuildOr(ctx->builder, val[0],
+                          LLVMBuildShl(ctx->builder, val[1],
+                                       LLVMConstInt(ctx->i32, 16, 0),
+                                       ""), "");
+}
+
+/* Upper 16 bits are ignored and will be dropped. */
+static LLVMValueRef ac_llvm_pack_two_int32_as_int16(struct ac_llvm_context *ctx,
+                                                   LLVMValueRef val[2])
+{
+       LLVMValueRef v[2] = {
+               LLVMBuildAnd(ctx->builder, val[0],
+                            LLVMConstInt(ctx->i32, 0xffff, 0), ""),
+               val[1],
+       };
+       return ac_llvm_pack_two_int16(ctx, v);
+}
+
+LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx,
+                                    LLVMValueRef args[2])
+{
+       if (HAVE_LLVM >= 0x0600) {
                LLVMValueRef res =
-                       ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz",
-                                          v2f16, args, 2,
+                       ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16",
+                                          ctx->v2i16, args, 2,
                                           AC_FUNC_ATTR_READNONE);
                return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
        }
 
-       return ac_build_intrinsic(ctx, "llvm.SI.packf16", ctx->i32, args, 2,
-                                 AC_FUNC_ATTR_READNONE |
-                                 AC_FUNC_ATTR_LEGACY);
+       LLVMValueRef val[2];
+
+       for (int chan = 0; chan < 2; chan++) {
+               /* Clamp between [-1, 1]. */
+               val[chan] = ac_build_fmin(ctx, args[chan], ctx->f32_1);
+               val[chan] = ac_build_fmax(ctx, val[chan], LLVMConstReal(ctx->f32, -1));
+               /* Convert to a signed integer in [-32767, 32767]. */
+               val[chan] = LLVMBuildFMul(ctx->builder, val[chan],
+                                         LLVMConstReal(ctx->f32, 32767), "");
+               /* If positive, add 0.5, else add -0.5. */
+               val[chan] = LLVMBuildFAdd(ctx->builder, val[chan],
+                               LLVMBuildSelect(ctx->builder,
+                                       LLVMBuildFCmp(ctx->builder, LLVMRealOGE,
+                                                     val[chan], ctx->f32_0, ""),
+                                       LLVMConstReal(ctx->f32, 0.5),
+                                       LLVMConstReal(ctx->f32, -0.5), ""), "");
+               val[chan] = LLVMBuildFPToSI(ctx->builder, val[chan], ctx->i32, "");
+       }
+       return ac_llvm_pack_two_int32_as_int16(ctx, val);
+}
+
+LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx,
+                                    LLVMValueRef args[2])
+{
+       if (HAVE_LLVM >= 0x0600) {
+               LLVMValueRef res =
+                       ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16",
+                                          ctx->v2i16, args, 2,
+                                          AC_FUNC_ATTR_READNONE);
+               return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
+       }
+
+       LLVMValueRef val[2];
+
+       for (int chan = 0; chan < 2; chan++) {
+               val[chan] = ac_build_clamp(ctx, args[chan]);
+               val[chan] = LLVMBuildFMul(ctx->builder, val[chan],
+                                         LLVMConstReal(ctx->f32, 65535), "");
+               val[chan] = LLVMBuildFAdd(ctx->builder, val[chan],
+                                         LLVMConstReal(ctx->f32, 0.5), "");
+               val[chan] = LLVMBuildFPToUI(ctx->builder, val[chan],
+                                           ctx->i32, "");
+       }
+       return ac_llvm_pack_two_int32_as_int16(ctx, val);
+}
+
+/* The 8-bit and 10-bit clamping is for HW workarounds. */
+LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx,
+                                LLVMValueRef args[2], unsigned bits, bool hi)
+{
+       assert(bits == 8 || bits == 10 || bits == 16);
+
+       LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
+               bits == 8 ? 127 : bits == 10 ? 511 : 32767, 0);
+       LLVMValueRef min_rgb = LLVMConstInt(ctx->i32,
+               bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0);
+       LLVMValueRef max_alpha =
+               bits != 10 ? max_rgb : ctx->i32_1;
+       LLVMValueRef min_alpha =
+               bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
+       bool has_intrinsic = HAVE_LLVM >= 0x0600;
+
+       /* Clamp. */
+       if (!has_intrinsic || bits != 16) {
+               for (int i = 0; i < 2; i++) {
+                       bool alpha = hi && i == 1;
+                       args[i] = ac_build_imin(ctx, args[i],
+                                               alpha ? max_alpha : max_rgb);
+                       args[i] = ac_build_imax(ctx, args[i],
+                                               alpha ? min_alpha : min_rgb);
+               }
+       }
+
+       if (has_intrinsic) {
+               LLVMValueRef res =
+                       ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16",
+                                          ctx->v2i16, args, 2,
+                                          AC_FUNC_ATTR_READNONE);
+               return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
+       }
+
+       return ac_llvm_pack_two_int32_as_int16(ctx, args);
+}
+
+/* The 8-bit and 10-bit clamping is for HW workarounds. */
+LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx,
+                                LLVMValueRef args[2], unsigned bits, bool hi)
+{
+       assert(bits == 8 || bits == 10 || bits == 16);
+
+       LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
+               bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0);
+       LLVMValueRef max_alpha =
+               bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
+       bool has_intrinsic = HAVE_LLVM >= 0x0600;
+
+       /* Clamp. */
+       if (!has_intrinsic || bits != 16) {
+               for (int i = 0; i < 2; i++) {
+                       bool alpha = hi && i == 1;
+                       args[i] = ac_build_umin(ctx, args[i],
+                                               alpha ? max_alpha : max_rgb);
+               }
+       }
+
+       if (has_intrinsic) {
+               LLVMValueRef res =
+                       ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16",
+                                          ctx->v2i16, args, 2,
+                                          AC_FUNC_ATTR_READNONE);
+               return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
+       }
+
+       return ac_llvm_pack_two_int16(ctx, args);
 }
 
 LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1)
@@ -1537,20 +2035,11 @@ LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input,
                width,
        };
 
-       if (HAVE_LLVM >= 0x0500) {
-               return ac_build_intrinsic(ctx,
-                                         is_signed ? "llvm.amdgcn.sbfe.i32" :
-                                                     "llvm.amdgcn.ubfe.i32",
-                                         ctx->i32, args, 3,
-                                         AC_FUNC_ATTR_READNONE);
-       }
-
        return ac_build_intrinsic(ctx,
-                                 is_signed ? "llvm.AMDGPU.bfe.i32" :
-                                             "llvm.AMDGPU.bfe.u32",
+                                 is_signed ? "llvm.amdgcn.sbfe.i32" :
+                                             "llvm.amdgcn.ubfe.i32",
                                  ctx->i32, args, 3,
-                                 AC_FUNC_ATTR_READNONE |
-                                 AC_FUNC_ATTR_LEGACY);
+                                 AC_FUNC_ATTR_READNONE);
 }
 
 void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned simm16)
@@ -1562,34 +2051,77 @@ void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned simm16)
                           ctx->voidt, args, 1, 0);
 }
 
-void ac_get_image_intr_name(const char *base_name,
-                           LLVMTypeRef data_type,
-                           LLVMTypeRef coords_type,
-                           LLVMTypeRef rsrc_type,
-                           char *out_name, unsigned out_len)
+LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0,
+                           unsigned bitsize)
 {
-        char coords_type_name[8];
+       LLVMTypeRef type;
+       char *intr;
 
-        ac_build_type_name_for_intr(coords_type, coords_type_name,
-                            sizeof(coords_type_name));
+       if (bitsize == 32) {
+               intr = "llvm.floor.f32";
+               type = ctx->f32;
+       } else {
+               intr = "llvm.floor.f64";
+               type = ctx->f64;
+       }
+
+       LLVMValueRef params[] = {
+               src0,
+       };
+       LLVMValueRef floor = ac_build_intrinsic(ctx, intr, type, params, 1,
+                                               AC_FUNC_ATTR_READNONE);
+       return LLVMBuildFSub(ctx->builder, src0, floor, "");
+}
+
+LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0,
+                           unsigned bitsize)
+{
+       LLVMValueRef cmp, val, zero, one;
+       LLVMTypeRef type;
 
-        if (HAVE_LLVM <= 0x0309) {
-                snprintf(out_name, out_len, "%s.%s", base_name, coords_type_name);
-        } else {
-                char data_type_name[8];
-                char rsrc_type_name[8];
+       if (bitsize == 32) {
+               type = ctx->i32;
+               zero = ctx->i32_0;
+               one = ctx->i32_1;
+       } else {
+               type = ctx->i64;
+               zero = ctx->i64_0;
+               one = ctx->i64_1;
+       }
+
+       cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, src0, zero, "");
+       val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
+       cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGE, val, zero, "");
+       val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstInt(type, -1, true), "");
+       return val;
+}
+
+LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src0,
+                           unsigned bitsize)
+{
+       LLVMValueRef cmp, val, zero, one;
+       LLVMTypeRef type;
+
+       if (bitsize == 32) {
+               type = ctx->f32;
+               zero = ctx->f32_0;
+               one = ctx->f32_1;
+       } else {
+               type = ctx->f64;
+               zero = ctx->f64_0;
+               one = ctx->f64_1;
+       }
 
-                ac_build_type_name_for_intr(data_type, data_type_name,
-                                        sizeof(data_type_name));
-                ac_build_type_name_for_intr(rsrc_type, rsrc_type_name,
-                                        sizeof(rsrc_type_name));
-                snprintf(out_name, out_len, "%s.%s.%s.%s", base_name,
-                         data_type_name, coords_type_name, rsrc_type_name);
-        }
+       cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src0, zero, "");
+       val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
+       cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGE, val, zero, "");
+       val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstReal(type, -1.0), "");
+       return val;
 }
 
-#define AC_EXP_TARGET (HAVE_LLVM >= 0x0500 ? 0 : 3)
-#define AC_EXP_OUT0 (HAVE_LLVM >= 0x0500 ? 2 : 5)
+#define AC_EXP_TARGET          0
+#define AC_EXP_ENABLED_CHANNELS 1
+#define AC_EXP_OUT0            2
 
 enum ac_ir_type {
        AC_IR_UNDEF,
@@ -1661,7 +2193,8 @@ static bool ac_eliminate_const_output(uint8_t *vs_output_param_offset,
        return true;
 }
 
-static bool ac_eliminate_duplicated_output(uint8_t *vs_output_param_offset,
+static bool ac_eliminate_duplicated_output(struct ac_llvm_context *ctx,
+                                          uint8_t *vs_output_param_offset,
                                           uint32_t num_outputs,
                                           struct ac_vs_exports *processed,
                                           struct ac_vs_exp_inst *exp)
@@ -1713,6 +2246,10 @@ static bool ac_eliminate_duplicated_output(uint8_t *vs_output_param_offset,
         */
        struct ac_vs_exp_inst *match = &processed->exp[p];
 
+       /* Get current enabled channels mask. */
+       LLVMValueRef arg = LLVMGetOperand(match->inst, AC_EXP_ENABLED_CHANNELS);
+       unsigned enabled_channels = LLVMConstIntGetZExtValue(arg);
+
        while (copy_back_channels) {
                unsigned chan = u_bit_scan(&copy_back_channels);
 
@@ -1720,6 +2257,13 @@ static bool ac_eliminate_duplicated_output(uint8_t *vs_output_param_offset,
                LLVMSetOperand(match->inst, AC_EXP_OUT0 + chan,
                               exp->chan[chan].value);
                match->chan[chan] = exp->chan[chan];
+
+               /* Update number of enabled channels because the original mask
+                * is not always 0xf.
+                */
+               enabled_channels |= (1 << chan);
+               LLVMSetOperand(match->inst, AC_EXP_ENABLED_CHANNELS,
+                              LLVMConstInt(ctx->i32, enabled_channels, 0));
        }
 
        /* The PARAM export is duplicated. Kill it. */
@@ -1807,7 +2351,8 @@ void ac_optimize_vs_outputs(struct ac_llvm_context *ctx,
                        /* Eliminate constant and duplicated PARAM exports. */
                        if (ac_eliminate_const_output(vs_output_param_offset,
                                                      num_outputs, &exp) ||
-                           ac_eliminate_duplicated_output(vs_output_param_offset,
+                           ac_eliminate_duplicated_output(ctx,
+                                                          vs_output_param_offset,
                                                           num_outputs, &exports,
                                                           &exp)) {
                                removed_any = true;
@@ -1885,6 +2430,20 @@ LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx,
                         LLVMTypeRef dst_type,
                         LLVMValueRef src0)
 {
+       unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
+       const char *intrin_name;
+       LLVMTypeRef type;
+       LLVMValueRef zero;
+       if (src0_bitsize == 64) {
+               intrin_name = "llvm.cttz.i64";
+               type = ctx->i64;
+               zero = ctx->i64_0;
+       } else {
+               intrin_name = "llvm.cttz.i32";
+               type = ctx->i32;
+               zero = ctx->i32_0;
+       }
+
        LLVMValueRef params[2] = {
                src0,
 
@@ -1900,15 +2459,19 @@ LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx,
                LLVMConstInt(ctx->i1, 1, false),
        };
 
-       LLVMValueRef lsb = ac_build_intrinsic(ctx, "llvm.cttz.i32", ctx->i32,
+       LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin_name, type,
                                              params, 2,
                                              AC_FUNC_ATTR_READNONE);
 
+       if (src0_bitsize == 64) {
+               lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, "");
+       }
+
        /* TODO: We need an intrinsic to skip this conditional. */
        /* Check for zero: */
        return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder,
                                                           LLVMIntEQ, src0,
-                                                          ctx->i32_0, ""),
+                                                          zero, ""),
                               LLVMConstInt(ctx->i32, -1, 0), lsb, "");
 }
 
@@ -1917,3 +2480,771 @@ LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type)
        return LLVMPointerType(LLVMArrayType(elem_type, 0),
                               AC_CONST_ADDR_SPACE);
 }
+
+LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type)
+{
+       if (!HAVE_32BIT_POINTERS)
+               return ac_array_in_const_addr_space(elem_type);
+
+       return LLVMPointerType(LLVMArrayType(elem_type, 0),
+                              AC_CONST_32BIT_ADDR_SPACE);
+}
+
+static struct ac_llvm_flow *
+get_current_flow(struct ac_llvm_context *ctx)
+{
+       if (ctx->flow_depth > 0)
+               return &ctx->flow[ctx->flow_depth - 1];
+       return NULL;
+}
+
+static struct ac_llvm_flow *
+get_innermost_loop(struct ac_llvm_context *ctx)
+{
+       for (unsigned i = ctx->flow_depth; i > 0; --i) {
+               if (ctx->flow[i - 1].loop_entry_block)
+                       return &ctx->flow[i - 1];
+       }
+       return NULL;
+}
+
+static struct ac_llvm_flow *
+push_flow(struct ac_llvm_context *ctx)
+{
+       struct ac_llvm_flow *flow;
+
+       if (ctx->flow_depth >= ctx->flow_depth_max) {
+               unsigned new_max = MAX2(ctx->flow_depth << 1,
+                                       AC_LLVM_INITIAL_CF_DEPTH);
+
+               ctx->flow = realloc(ctx->flow, new_max * sizeof(*ctx->flow));
+               ctx->flow_depth_max = new_max;
+       }
+
+       flow = &ctx->flow[ctx->flow_depth];
+       ctx->flow_depth++;
+
+       flow->next_block = NULL;
+       flow->loop_entry_block = NULL;
+       return flow;
+}
+
+static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base,
+                               int label_id)
+{
+       char buf[32];
+       snprintf(buf, sizeof(buf), "%s%d", base, label_id);
+       LLVMSetValueName(LLVMBasicBlockAsValue(bb), buf);
+}
+
+/* Append a basic block at the level of the parent flow.
+ */
+static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx,
+                                           const char *name)
+{
+       assert(ctx->flow_depth >= 1);
+
+       if (ctx->flow_depth >= 2) {
+               struct ac_llvm_flow *flow = &ctx->flow[ctx->flow_depth - 2];
+
+               return LLVMInsertBasicBlockInContext(ctx->context,
+                                                    flow->next_block, name);
+       }
+
+       LLVMValueRef main_fn =
+               LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder));
+       return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name);
+}
+
+/* Emit a branch to the given default target for the current block if
+ * applicable -- that is, if the current block does not already contain a
+ * branch from a break or continue.
+ */
+static void emit_default_branch(LLVMBuilderRef builder,
+                               LLVMBasicBlockRef target)
+{
+       if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder)))
+                LLVMBuildBr(builder, target);
+}
+
+void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id)
+{
+       struct ac_llvm_flow *flow = push_flow(ctx);
+       flow->loop_entry_block = append_basic_block(ctx, "LOOP");
+       flow->next_block = append_basic_block(ctx, "ENDLOOP");
+       set_basicblock_name(flow->loop_entry_block, "loop", label_id);
+       LLVMBuildBr(ctx->builder, flow->loop_entry_block);
+       LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block);
+}
+
+void ac_build_break(struct ac_llvm_context *ctx)
+{
+       struct ac_llvm_flow *flow = get_innermost_loop(ctx);
+       LLVMBuildBr(ctx->builder, flow->next_block);
+}
+
+void ac_build_continue(struct ac_llvm_context *ctx)
+{
+       struct ac_llvm_flow *flow = get_innermost_loop(ctx);
+       LLVMBuildBr(ctx->builder, flow->loop_entry_block);
+}
+
+void ac_build_else(struct ac_llvm_context *ctx, int label_id)
+{
+       struct ac_llvm_flow *current_branch = get_current_flow(ctx);
+       LLVMBasicBlockRef endif_block;
+
+       assert(!current_branch->loop_entry_block);
+
+       endif_block = append_basic_block(ctx, "ENDIF");
+       emit_default_branch(ctx->builder, endif_block);
+
+       LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
+       set_basicblock_name(current_branch->next_block, "else", label_id);
+
+       current_branch->next_block = endif_block;
+}
+
+void ac_build_endif(struct ac_llvm_context *ctx, int label_id)
+{
+       struct ac_llvm_flow *current_branch = get_current_flow(ctx);
+
+       assert(!current_branch->loop_entry_block);
+
+       emit_default_branch(ctx->builder, current_branch->next_block);
+       LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
+       set_basicblock_name(current_branch->next_block, "endif", label_id);
+
+       ctx->flow_depth--;
+}
+
+void ac_build_endloop(struct ac_llvm_context *ctx, int label_id)
+{
+       struct ac_llvm_flow *current_loop = get_current_flow(ctx);
+
+       assert(current_loop->loop_entry_block);
+
+       emit_default_branch(ctx->builder, current_loop->loop_entry_block);
+
+       LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block);
+       set_basicblock_name(current_loop->next_block, "endloop", label_id);
+       ctx->flow_depth--;
+}
+
+static void if_cond_emit(struct ac_llvm_context *ctx, LLVMValueRef cond,
+                        int label_id)
+{
+       struct ac_llvm_flow *flow = push_flow(ctx);
+       LLVMBasicBlockRef if_block;
+
+       if_block = append_basic_block(ctx, "IF");
+       flow->next_block = append_basic_block(ctx, "ELSE");
+       set_basicblock_name(if_block, "if", label_id);
+       LLVMBuildCondBr(ctx->builder, cond, if_block, flow->next_block);
+       LLVMPositionBuilderAtEnd(ctx->builder, if_block);
+}
+
+void ac_build_if(struct ac_llvm_context *ctx, LLVMValueRef value,
+                int label_id)
+{
+       LLVMValueRef cond = LLVMBuildFCmp(ctx->builder, LLVMRealUNE,
+                                         value, ctx->f32_0, "");
+       if_cond_emit(ctx, cond, label_id);
+}
+
+void ac_build_uif(struct ac_llvm_context *ctx, LLVMValueRef value,
+                 int label_id)
+{
+       LLVMValueRef cond = LLVMBuildICmp(ctx->builder, LLVMIntNE,
+                                         ac_to_integer(ctx, value),
+                                         ctx->i32_0, "");
+       if_cond_emit(ctx, cond, label_id);
+}
+
+LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, LLVMTypeRef type,
+                            const char *name)
+{
+       LLVMBuilderRef builder = ac->builder;
+       LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder);
+       LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
+       LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function);
+       LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block);
+       LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context);
+       LLVMValueRef res;
+
+       if (first_instr) {
+               LLVMPositionBuilderBefore(first_builder, first_instr);
+       } else {
+               LLVMPositionBuilderAtEnd(first_builder, first_block);
+       }
+
+       res = LLVMBuildAlloca(first_builder, type, name);
+       LLVMBuildStore(builder, LLVMConstNull(type), res);
+
+       LLVMDisposeBuilder(first_builder);
+
+       return res;
+}
+
+LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac,
+                                  LLVMTypeRef type, const char *name)
+{
+       LLVMValueRef ptr = ac_build_alloca(ac, type, name);
+       LLVMBuildStore(ac->builder, LLVMGetUndef(type), ptr);
+       return ptr;
+}
+
+LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr,
+                         LLVMTypeRef type)
+{
+       int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
+       return LLVMBuildBitCast(ctx->builder, ptr,
+                               LLVMPointerType(type, addr_space), "");
+}
+
+LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value,
+                           unsigned count)
+{
+       unsigned num_components = ac_get_llvm_num_components(value);
+       if (count == num_components)
+               return value;
+
+       LLVMValueRef masks[] = {
+           LLVMConstInt(ctx->i32, 0, false), LLVMConstInt(ctx->i32, 1, false),
+           LLVMConstInt(ctx->i32, 2, false), LLVMConstInt(ctx->i32, 3, false)};
+
+       if (count == 1)
+               return LLVMBuildExtractElement(ctx->builder, value, masks[0],
+                                              "");
+
+       LLVMValueRef swizzle = LLVMConstVector(masks, count);
+       return LLVMBuildShuffleVector(ctx->builder, value, value, swizzle, "");
+}
+
+LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param,
+                            unsigned rshift, unsigned bitwidth)
+{
+       LLVMValueRef value = param;
+       if (rshift)
+               value = LLVMBuildLShr(ctx->builder, value,
+                                     LLVMConstInt(ctx->i32, rshift, false), "");
+
+       if (rshift + bitwidth < 32) {
+               unsigned mask = (1 << bitwidth) - 1;
+               value = LLVMBuildAnd(ctx->builder, value,
+                                    LLVMConstInt(ctx->i32, mask, false), "");
+       }
+       return value;
+}
+
+/* Adjust the sample index according to FMASK.
+ *
+ * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
+ * which is the identity mapping. Each nibble says which physical sample
+ * should be fetched to get that sample.
+ *
+ * For example, 0x11111100 means there are only 2 samples stored and
+ * the second sample covers 3/4 of the pixel. When reading samples 0
+ * and 1, return physical sample 0 (determined by the first two 0s
+ * in FMASK), otherwise return physical sample 1.
+ *
+ * The sample index should be adjusted as follows:
+ *   addr[sample_index] = (fmask >> (addr[sample_index] * 4)) & 0xF;
+ */
+void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask,
+                             LLVMValueRef *addr, bool is_array_tex)
+{
+       struct ac_image_args fmask_load = {};
+       fmask_load.opcode = ac_image_load;
+       fmask_load.resource = fmask;
+       fmask_load.dmask = 0xf;
+       fmask_load.dim = is_array_tex ? ac_image_2darray : ac_image_2d;
+
+       fmask_load.coords[0] = addr[0];
+       fmask_load.coords[1] = addr[1];
+       if (is_array_tex)
+               fmask_load.coords[2] = addr[2];
+
+       LLVMValueRef fmask_value = ac_build_image_opcode(ac, &fmask_load);
+       fmask_value = LLVMBuildExtractElement(ac->builder, fmask_value,
+                                             ac->i32_0, "");
+
+       /* Apply the formula. */
+       unsigned sample_chan = is_array_tex ? 3 : 2;
+       LLVMValueRef final_sample;
+       final_sample = LLVMBuildMul(ac->builder, addr[sample_chan],
+                                   LLVMConstInt(ac->i32, 4, 0), "");
+       final_sample = LLVMBuildLShr(ac->builder, fmask_value, final_sample, "");
+       /* Mask the sample index by 0x7, because 0x8 means an unknown value
+        * with EQAA, so those will map to 0. */
+       final_sample = LLVMBuildAnd(ac->builder, final_sample,
+                                   LLVMConstInt(ac->i32, 0x7, 0), "");
+
+       /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
+        * resource descriptor is 0 (invalid).
+        */
+       LLVMValueRef tmp;
+       tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, "");
+       tmp = LLVMBuildExtractElement(ac->builder, tmp, ac->i32_1, "");
+       tmp = LLVMBuildICmp(ac->builder, LLVMIntNE, tmp, ac->i32_0, "");
+
+       /* Replace the MSAA sample index. */
+       addr[sample_chan] = LLVMBuildSelect(ac->builder, tmp, final_sample,
+                                           addr[sample_chan], "");
+}
+
+static LLVMValueRef
+_ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
+{
+       ac_build_optimization_barrier(ctx, &src);
+       return ac_build_intrinsic(ctx,
+                       lane == NULL ? "llvm.amdgcn.readfirstlane" : "llvm.amdgcn.readlane",
+                       LLVMTypeOf(src), (LLVMValueRef []) {
+                       src, lane },
+                       lane == NULL ? 1 : 2,
+                       AC_FUNC_ATTR_READNONE |
+                       AC_FUNC_ATTR_CONVERGENT);
+}
+
+/**
+ * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic.
+ * @param ctx
+ * @param src
+ * @param lane - id of the lane or NULL for the first active lane
+ * @return value of the lane
+ */
+LLVMValueRef
+ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
+{
+       LLVMTypeRef src_type = LLVMTypeOf(src);
+       src = ac_to_integer(ctx, src);
+       unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
+       LLVMValueRef ret;
+
+       if (bits == 32) {
+               ret = _ac_build_readlane(ctx, src, lane);
+       } else {
+               assert(bits % 32 == 0);
+               LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
+               LLVMValueRef src_vector =
+                       LLVMBuildBitCast(ctx->builder, src, vec_type, "");
+               ret = LLVMGetUndef(vec_type);
+               for (unsigned i = 0; i < bits / 32; i++) {
+                       src = LLVMBuildExtractElement(ctx->builder, src_vector,
+                                               LLVMConstInt(ctx->i32, i, 0), "");
+                       LLVMValueRef ret_comp = _ac_build_readlane(ctx, src, lane);
+                       ret = LLVMBuildInsertElement(ctx->builder, ret, ret_comp,
+                                               LLVMConstInt(ctx->i32, i, 0), "");
+               }
+       }
+       return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
+}
+
+LLVMValueRef
+ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value, LLVMValueRef lane)
+{
+       /* TODO: Use the actual instruction when LLVM adds an intrinsic for it.
+        */
+       LLVMValueRef pred = LLVMBuildICmp(ctx->builder, LLVMIntEQ, lane,
+                                         ac_get_thread_id(ctx), "");
+       return LLVMBuildSelect(ctx->builder, pred, value, src, "");
+}
+
+LLVMValueRef
+ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask)
+{
+       LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask,
+                                                LLVMVectorType(ctx->i32, 2),
+                                                "");
+       LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec,
+                                                      ctx->i32_0, "");
+       LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec,
+                                                      ctx->i32_1, "");
+       LLVMValueRef val =
+               ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
+                                  (LLVMValueRef []) { mask_lo, ctx->i32_0 },
+                                  2, AC_FUNC_ATTR_READNONE);
+       val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32,
+                                (LLVMValueRef []) { mask_hi, val },
+                                2, AC_FUNC_ATTR_READNONE);
+       return val;
+}
+
+enum dpp_ctrl {
+       _dpp_quad_perm = 0x000,
+       _dpp_row_sl = 0x100,
+       _dpp_row_sr = 0x110,
+       _dpp_row_rr = 0x120,
+       dpp_wf_sl1 = 0x130,
+       dpp_wf_rl1 = 0x134,
+       dpp_wf_sr1 = 0x138,
+       dpp_wf_rr1 = 0x13C,
+       dpp_row_mirror = 0x140,
+       dpp_row_half_mirror = 0x141,
+       dpp_row_bcast15 = 0x142,
+       dpp_row_bcast31 = 0x143
+};
+
+static inline enum dpp_ctrl
+dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3)
+{
+       assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4);
+       return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6);
+}
+
+static inline enum dpp_ctrl
+dpp_row_sl(unsigned amount)
+{
+       assert(amount > 0 && amount < 16);
+       return _dpp_row_sl | amount;
+}
+
+static inline enum dpp_ctrl
+dpp_row_sr(unsigned amount)
+{
+       assert(amount > 0 && amount < 16);
+       return _dpp_row_sr | amount;
+}
+
+static LLVMValueRef
+_ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
+             enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
+             bool bound_ctrl)
+{
+       return ac_build_intrinsic(ctx, "llvm.amdgcn.update.dpp.i32",
+                                       LLVMTypeOf(old),
+                                       (LLVMValueRef[]) {
+                                               old, src,
+                                               LLVMConstInt(ctx->i32, dpp_ctrl, 0),
+                                               LLVMConstInt(ctx->i32, row_mask, 0),
+                                               LLVMConstInt(ctx->i32, bank_mask, 0),
+                                               LLVMConstInt(ctx->i1, bound_ctrl, 0) },
+                                       6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
+}
+
+static LLVMValueRef
+ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
+            enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
+            bool bound_ctrl)
+{
+       LLVMTypeRef src_type = LLVMTypeOf(src);
+       src = ac_to_integer(ctx, src);
+       old = ac_to_integer(ctx, old);
+       unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
+       LLVMValueRef ret;
+       if (bits == 32) {
+               ret = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask,
+                                   bank_mask, bound_ctrl);
+       } else {
+               assert(bits % 32 == 0);
+               LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
+               LLVMValueRef src_vector =
+                       LLVMBuildBitCast(ctx->builder, src, vec_type, "");
+               LLVMValueRef old_vector =
+                       LLVMBuildBitCast(ctx->builder, old, vec_type, "");
+               ret = LLVMGetUndef(vec_type);
+               for (unsigned i = 0; i < bits / 32; i++) {
+                       src = LLVMBuildExtractElement(ctx->builder, src_vector,
+                                                     LLVMConstInt(ctx->i32, i,
+                                                                  0), "");
+                       old = LLVMBuildExtractElement(ctx->builder, old_vector,
+                                                     LLVMConstInt(ctx->i32, i,
+                                                                  0), "");
+                       LLVMValueRef ret_comp = _ac_build_dpp(ctx, old, src,
+                                                             dpp_ctrl,
+                                                             row_mask,
+                                                             bank_mask,
+                                                             bound_ctrl);
+                       ret = LLVMBuildInsertElement(ctx->builder, ret,
+                                                    ret_comp,
+                                                    LLVMConstInt(ctx->i32, i,
+                                                                 0), "");
+               }
+       }
+       return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
+}
+
+static inline unsigned
+ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
+{
+       assert(and_mask < 32 && or_mask < 32 && xor_mask < 32);
+       return and_mask | (or_mask << 5) | (xor_mask << 10);
+}
+
+static LLVMValueRef
+_ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
+{
+       return ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle",
+                                  LLVMTypeOf(src), (LLVMValueRef []) {
+                                       src, LLVMConstInt(ctx->i32, mask, 0) },
+                                  2, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
+}
+
+LLVMValueRef
+ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
+{
+       LLVMTypeRef src_type = LLVMTypeOf(src);
+       src = ac_to_integer(ctx, src);
+       unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
+       LLVMValueRef ret;
+       if (bits == 32) {
+               ret = _ac_build_ds_swizzle(ctx, src, mask);
+       } else {
+               assert(bits % 32 == 0);
+               LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
+               LLVMValueRef src_vector =
+                       LLVMBuildBitCast(ctx->builder, src, vec_type, "");
+               ret = LLVMGetUndef(vec_type);
+               for (unsigned i = 0; i < bits / 32; i++) {
+                       src = LLVMBuildExtractElement(ctx->builder, src_vector,
+                                                     LLVMConstInt(ctx->i32, i,
+                                                                  0), "");
+                       LLVMValueRef ret_comp = _ac_build_ds_swizzle(ctx, src,
+                                                                    mask);
+                       ret = LLVMBuildInsertElement(ctx->builder, ret,
+                                                    ret_comp,
+                                                    LLVMConstInt(ctx->i32, i,
+                                                                 0), "");
+               }
+       }
+       return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
+}
+
+static LLVMValueRef
+ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src)
+{
+       char name[32], type[8];
+       ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
+       snprintf(name, sizeof(name), "llvm.amdgcn.wwm.%s", type);
+       return ac_build_intrinsic(ctx, name, LLVMTypeOf(src),
+                                 (LLVMValueRef []) { src }, 1,
+                                 AC_FUNC_ATTR_READNONE);
+}
+
+static LLVMValueRef
+ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src,
+                     LLVMValueRef inactive)
+{
+       char name[33], type[8];
+       LLVMTypeRef src_type = LLVMTypeOf(src);
+       src = ac_to_integer(ctx, src);
+       inactive = ac_to_integer(ctx, inactive);
+       ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
+       snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type);
+       LLVMValueRef ret =
+               ac_build_intrinsic(ctx, name,
+                                       LLVMTypeOf(src), (LLVMValueRef []) {
+                                       src, inactive }, 2,
+                                       AC_FUNC_ATTR_READNONE |
+                                       AC_FUNC_ATTR_CONVERGENT);
+       return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
+}
+
+static LLVMValueRef
+get_reduction_identity(struct ac_llvm_context *ctx, nir_op op, unsigned type_size)
+{
+       if (type_size == 4) {
+               switch (op) {
+               case nir_op_iadd: return ctx->i32_0;
+               case nir_op_fadd: return ctx->f32_0;
+               case nir_op_imul: return ctx->i32_1;
+               case nir_op_fmul: return ctx->f32_1;
+               case nir_op_imin: return LLVMConstInt(ctx->i32, INT32_MAX, 0);
+               case nir_op_umin: return LLVMConstInt(ctx->i32, UINT32_MAX, 0);
+               case nir_op_fmin: return LLVMConstReal(ctx->f32, INFINITY);
+               case nir_op_imax: return LLVMConstInt(ctx->i32, INT32_MIN, 0);
+               case nir_op_umax: return ctx->i32_0;
+               case nir_op_fmax: return LLVMConstReal(ctx->f32, -INFINITY);
+               case nir_op_iand: return LLVMConstInt(ctx->i32, -1, 0);
+               case nir_op_ior: return ctx->i32_0;
+               case nir_op_ixor: return ctx->i32_0;
+               default:
+                       unreachable("bad reduction intrinsic");
+               }
+       } else { /* type_size == 64bit */
+               switch (op) {
+               case nir_op_iadd: return ctx->i64_0;
+               case nir_op_fadd: return ctx->f64_0;
+               case nir_op_imul: return ctx->i64_1;
+               case nir_op_fmul: return ctx->f64_1;
+               case nir_op_imin: return LLVMConstInt(ctx->i64, INT64_MAX, 0);
+               case nir_op_umin: return LLVMConstInt(ctx->i64, UINT64_MAX, 0);
+               case nir_op_fmin: return LLVMConstReal(ctx->f64, INFINITY);
+               case nir_op_imax: return LLVMConstInt(ctx->i64, INT64_MIN, 0);
+               case nir_op_umax: return ctx->i64_0;
+               case nir_op_fmax: return LLVMConstReal(ctx->f64, -INFINITY);
+               case nir_op_iand: return LLVMConstInt(ctx->i64, -1, 0);
+               case nir_op_ior: return ctx->i64_0;
+               case nir_op_ixor: return ctx->i64_0;
+               default:
+                       unreachable("bad reduction intrinsic");
+               }
+       }
+}
+
+static LLVMValueRef
+ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs, nir_op op)
+{
+       bool _64bit = ac_get_type_size(LLVMTypeOf(lhs)) == 8;
+       switch (op) {
+       case nir_op_iadd: return LLVMBuildAdd(ctx->builder, lhs, rhs, "");
+       case nir_op_fadd: return LLVMBuildFAdd(ctx->builder, lhs, rhs, "");
+       case nir_op_imul: return LLVMBuildMul(ctx->builder, lhs, rhs, "");
+       case nir_op_fmul: return LLVMBuildFMul(ctx->builder, lhs, rhs, "");
+       case nir_op_imin: return LLVMBuildSelect(ctx->builder,
+                                       LLVMBuildICmp(ctx->builder, LLVMIntSLT, lhs, rhs, ""),
+                                       lhs, rhs, "");
+       case nir_op_umin: return LLVMBuildSelect(ctx->builder,
+                                       LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, ""),
+                                       lhs, rhs, "");
+       case nir_op_fmin: return ac_build_intrinsic(ctx,
+                                       _64bit ? "llvm.minnum.f64" : "llvm.minnum.f32",
+                                       _64bit ? ctx->f64 : ctx->f32,
+                                       (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE);
+       case nir_op_imax: return LLVMBuildSelect(ctx->builder,
+                                       LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, ""),
+                                       lhs, rhs, "");
+       case nir_op_umax: return LLVMBuildSelect(ctx->builder,
+                                       LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, ""),
+                                       lhs, rhs, "");
+       case nir_op_fmax: return ac_build_intrinsic(ctx,
+                                       _64bit ? "llvm.maxnum.f64" : "llvm.maxnum.f32",
+                                       _64bit ? ctx->f64 : ctx->f32,
+                                       (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE);
+       case nir_op_iand: return LLVMBuildAnd(ctx->builder, lhs, rhs, "");
+       case nir_op_ior: return LLVMBuildOr(ctx->builder, lhs, rhs, "");
+       case nir_op_ixor: return LLVMBuildXor(ctx->builder, lhs, rhs, "");
+       default:
+               unreachable("bad reduction intrinsic");
+       }
+}
+
+/* TODO: add inclusive and excluse scan functions for SI chip class.  */
+static LLVMValueRef
+ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValueRef identity)
+{
+       LLVMValueRef result, tmp;
+       result = src;
+       tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
+       result = ac_build_alu_op(ctx, result, tmp, op);
+       tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
+       result = ac_build_alu_op(ctx, result, tmp, op);
+       tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
+       result = ac_build_alu_op(ctx, result, tmp, op);
+       tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
+       result = ac_build_alu_op(ctx, result, tmp, op);
+       tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
+       result = ac_build_alu_op(ctx, result, tmp, op);
+       tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
+       result = ac_build_alu_op(ctx, result, tmp, op);
+       tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
+       result = ac_build_alu_op(ctx, result, tmp, op);
+       return result;
+}
+
+LLVMValueRef
+ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
+{
+       ac_build_optimization_barrier(ctx, &src);
+       LLVMValueRef result;
+       LLVMValueRef identity = get_reduction_identity(ctx, op,
+                                                               ac_get_type_size(LLVMTypeOf(src)));
+       result = LLVMBuildBitCast(ctx->builder,
+                                                               ac_build_set_inactive(ctx, src, identity),
+                                                               LLVMTypeOf(identity), "");
+       result = ac_build_scan(ctx, op, result, identity);
+
+       return ac_build_wwm(ctx, result);
+}
+
+LLVMValueRef
+ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
+{
+       ac_build_optimization_barrier(ctx, &src);
+       LLVMValueRef result;
+       LLVMValueRef identity = get_reduction_identity(ctx, op,
+                                                               ac_get_type_size(LLVMTypeOf(src)));
+       result = LLVMBuildBitCast(ctx->builder,
+                                                               ac_build_set_inactive(ctx, src, identity),
+                                                               LLVMTypeOf(identity), "");
+       result = ac_build_dpp(ctx, identity, result, dpp_wf_sr1, 0xf, 0xf, false);
+       result = ac_build_scan(ctx, op, result, identity);
+
+       return ac_build_wwm(ctx, result);
+}
+
+LLVMValueRef
+ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsigned cluster_size)
+{
+       if (cluster_size == 1) return src;
+       ac_build_optimization_barrier(ctx, &src);
+       LLVMValueRef result, swap;
+       LLVMValueRef identity = get_reduction_identity(ctx, op,
+                                                               ac_get_type_size(LLVMTypeOf(src)));
+       result = LLVMBuildBitCast(ctx->builder,
+                                                               ac_build_set_inactive(ctx, src, identity),
+                                                               LLVMTypeOf(identity), "");
+       swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2);
+       result = ac_build_alu_op(ctx, result, swap, op);
+       if (cluster_size == 2) return ac_build_wwm(ctx, result);
+
+       swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1);
+       result = ac_build_alu_op(ctx, result, swap, op);
+       if (cluster_size == 4) return ac_build_wwm(ctx, result);
+
+       if (ctx->chip_class >= VI)
+               swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false);
+       else
+               swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04));
+       result = ac_build_alu_op(ctx, result, swap, op);
+       if (cluster_size == 8) return ac_build_wwm(ctx, result);
+
+       if (ctx->chip_class >= VI)
+               swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false);
+       else
+               swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08));
+       result = ac_build_alu_op(ctx, result, swap, op);
+       if (cluster_size == 16) return ac_build_wwm(ctx, result);
+
+       if (ctx->chip_class >= VI && cluster_size != 32)
+               swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
+       else
+               swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10));
+       result = ac_build_alu_op(ctx, result, swap, op);
+       if (cluster_size == 32) return ac_build_wwm(ctx, result);
+
+       if (ctx->chip_class >= VI) {
+               swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
+               result = ac_build_alu_op(ctx, result, swap, op);
+               result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0));
+               return ac_build_wwm(ctx, result);
+       } else {
+               swap = ac_build_readlane(ctx, result, ctx->i32_0);
+               result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0));
+               result = ac_build_alu_op(ctx, result, swap, op);
+               return ac_build_wwm(ctx, result);
+       }
+}
+
+LLVMValueRef
+ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
+               unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3)
+{
+       unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3);
+       if (ctx->chip_class >= VI && HAVE_LLVM >= 0x0600) {
+               return ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false);
+       } else {
+               return ac_build_ds_swizzle(ctx, src, (1 << 15) | mask);
+       }
+}
+
+LLVMValueRef
+ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index)
+{
+       index = LLVMBuildMul(ctx->builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
+       return ac_build_intrinsic(ctx,
+                 "llvm.amdgcn.ds.bpermute", ctx->i32,
+                 (LLVMValueRef []) {index, src}, 2,
+                 AC_FUNC_ATTR_READNONE |
+                 AC_FUNC_ATTR_CONVERGENT);
+}