#include "util/bitscan.h"
#include "util/macros.h"
#include "util/u_atomic.h"
+#include "util/u_math.h"
#include "sid.h"
#include "shader_enums.h"
* The caller is responsible for initializing ctx::module and ctx::builder.
*/
void
-ac_llvm_context_init(struct ac_llvm_context *ctx, LLVMContextRef context,
+ac_llvm_context_init(struct ac_llvm_context *ctx,
enum chip_class chip_class, enum radeon_family family)
{
LLVMValueRef args[1];
+ ctx->context = LLVMContextCreate();
+
ctx->chip_class = chip_class;
ctx->family = family;
-
- ctx->context = context;
ctx->module = NULL;
ctx->builder = NULL;
ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
- ctx->intptr = HAVE_32BIT_POINTERS ? ctx->i32 : ctx->i64;
+ ctx->intptr = ctx->i32;
ctx->f16 = LLVMHalfTypeInContext(ctx->context);
ctx->f32 = LLVMFloatTypeInContext(ctx->context);
ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
+ ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false);
+ ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false);
ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false);
switch (kind) {
case LLVMIntegerTypeKind:
return LLVMGetIntTypeWidth(type) / 8;
+ case LLVMHalfTypeKind:
+ return 2;
case LLVMFloatTypeKind:
return 4;
case LLVMDoubleTypeKind:
return 8;
case LLVMPointerTypeKind:
- if (LLVMGetPointerAddressSpace(type) == AC_CONST_32BIT_ADDR_SPACE)
+ if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT)
return 4;
return 8;
case LLVMVectorTypeKind:
return LLVMVectorType(to_integer_type_scalar(ctx, elem_type),
LLVMGetVectorSize(t));
}
+ if (LLVMGetTypeKind(t) == LLVMPointerTypeKind) {
+ switch (LLVMGetPointerAddressSpace(t)) {
+ case AC_ADDR_SPACE_GLOBAL:
+ return ctx->i64;
+ case AC_ADDR_SPACE_LDS:
+ return ctx->i32;
+ default:
+ unreachable("unhandled address space");
+ }
+ }
return to_integer_type_scalar(ctx, t);
}
ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v)
{
LLVMTypeRef type = LLVMTypeOf(v);
+ if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
+ return LLVMBuildPtrToInt(ctx->builder, v, ac_to_integer_type(ctx, type), "");
+ }
return LLVMBuildBitCast(ctx->builder, v, ac_to_integer_type(ctx, type), "");
}
+/* Cast v to its integer representation (via ac_to_integer), except that
+ * pointer values are returned unchanged instead of being ptrtoint'ed. */
+LLVMValueRef
+ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v)
+{
+ LLVMTypeRef type = LLVMTypeOf(v);
+ if (LLVMGetTypeKind(type) == LLVMPointerTypeKind)
+ return v;
+ return ac_to_integer(ctx, v);
+}
+
static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
if (t == ctx->i16 || t == ctx->f16)
case LLVMIntegerTypeKind:
snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
break;
+ case LLVMHalfTypeKind:
+ snprintf(buf, bufsize, "f16");
+ break;
case LLVMFloatTypeKind:
snprintf(buf, bufsize, "f32");
break;
return phi;
}
+/* Emit the llvm.amdgcn.s.barrier intrinsic (an execution barrier).
+ * Marked convergent so LLVM cannot move it across divergent control flow. */
+void ac_build_s_barrier(struct ac_llvm_context *ctx)
+{
+ ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL,
+ 0, AC_FUNC_ATTR_CONVERGENT);
+}
+
/* Prevent optimizations (at least of memory accesses) across the current
* point in the program by emitting empty inline assembly that is marked as
* having side effects.
return ac_build_gather_values_extended(ctx, values, value_count, 1, false, false);
}
-/* Expand a scalar or vector to <4 x type> by filling the remaining channels
- * with undef. Extract at most num_channels components from the input.
+/* Expand a scalar or vector to <dst_channels x type> by filling the remaining
+ * channels with undef. Extract at most src_channels components from the input.
*/
-LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx,
- LLVMValueRef value,
- unsigned num_channels)
+static LLVMValueRef
+ac_build_expand(struct ac_llvm_context *ctx,
+ LLVMValueRef value,
+ unsigned src_channels,
+ unsigned dst_channels)
{
LLVMTypeRef elemtype;
- LLVMValueRef chan[4];
+ LLVMValueRef chan[dst_channels];
if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
- num_channels = MIN2(num_channels, vec_size);
- if (num_channels >= 4)
+ /* Fast path: the input already has exactly the requested shape. */
+ if (src_channels == dst_channels && vec_size == dst_channels)
return value;
- for (unsigned i = 0; i < num_channels; i++)
+ /* Never extract more components than the input vector has. */
+ src_channels = MIN2(src_channels, vec_size);
+
+ for (unsigned i = 0; i < src_channels; i++)
chan[i] = ac_llvm_extract_elem(ctx, value, i);
elemtype = LLVMGetElementType(LLVMTypeOf(value));
} else {
- if (num_channels) {
- assert(num_channels == 1);
+ if (src_channels) {
+ assert(src_channels == 1);
chan[0] = value;
}
elemtype = LLVMTypeOf(value);
}
- while (num_channels < 4)
- chan[num_channels++] = LLVMGetUndef(elemtype);
+ /* Pad the remaining channels with undef. */
+ for (unsigned i = src_channels; i < dst_channels; i++)
+ chan[i] = LLVMGetUndef(elemtype);
+
+ return ac_build_gather_values(ctx, chan, dst_channels);
+}
+
+/* Expand a scalar or vector to <4 x type> by filling the remaining channels
+ * with undef. Extract at most num_channels components from the input.
+ * Convenience wrapper around ac_build_expand() with dst_channels = 4.
+ */
+LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx,
+ LLVMValueRef value,
+ unsigned num_channels)
+{
+ return ac_build_expand(ctx, value, num_channels, 4);
+}
+
+/* Round value to an integral value with the llvm.rint intrinsic, selecting
+ * the f16/f32/f64 overload from the operand's type size. */
+LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value)
+{
+ unsigned type_size = ac_get_type_size(LLVMTypeOf(value));
+ const char *name;
- return ac_build_gather_values(ctx, chan, 4);
+ if (type_size == 2)
+ name = "llvm.rint.f16";
+ else if (type_size == 4)
+ name = "llvm.rint.f32";
+ else
+ name = "llvm.rint.f64";
+
+ return ac_build_intrinsic(ctx, name, LLVMTypeOf(value), &value, 1,
+ AC_FUNC_ATTR_READNONE);
}
LLVMValueRef
LLVMValueRef num,
LLVMValueRef den)
{
- LLVMValueRef ret = LLVMBuildFDiv(ctx->builder, num, den, "");
+ /* If we do (num / den), LLVM >= 7.0 does:
+ * return num * v_rcp_f32(den * (fabs(den) > 0x1.0p+96f ? 0x1.0p-32f : 1.0f));
+ *
+ * If we do (num * (1 / den)), LLVM does:
+ * return num * v_rcp_f32(den);
+ */
+ LLVMValueRef one = LLVMConstReal(LLVMTypeOf(num), 1.0);
+ LLVMValueRef rcp = LLVMBuildFDiv(ctx->builder, one, den, "");
+ LLVMValueRef ret = LLVMBuildFMul(ctx->builder, num, rcp, "");
/* Use v_rcp_f32 instead of precise division. */
if (!LLVMIsConstant(ret))
return ret;
}
+/* See fast_idiv_by_const.h. */
+/* Set: increment = util_fast_udiv_info::increment ? multiplier : 0; */
+LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx,
+ LLVMValueRef num,
+ LLVMValueRef multiplier,
+ LLVMValueRef pre_shift,
+ LLVMValueRef post_shift,
+ LLVMValueRef increment)
+{
+ LLVMBuilderRef builder = ctx->builder;
+
+ num = LLVMBuildLShr(builder, num, pre_shift, "");
+ /* 32x32 -> 64-bit multiply of the pre-shifted numerator. */
+ num = LLVMBuildMul(builder,
+ LLVMBuildZExt(builder, num, ctx->i64, ""),
+ LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
+ num = LLVMBuildAdd(builder, num,
+ LLVMBuildZExt(builder, increment, ctx->i64, ""), "");
+ /* Keep only the high 32 bits of the 64-bit result. */
+ num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
+ num = LLVMBuildTrunc(builder, num, ctx->i32, "");
+ return LLVMBuildLShr(builder, num, post_shift, "");
+}
+
+/* See fast_idiv_by_const.h. */
+/* If num != UINT_MAX, this more efficient version can be used. */
+/* Set: increment = util_fast_udiv_info::increment; */
+LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx,
+ LLVMValueRef num,
+ LLVMValueRef multiplier,
+ LLVMValueRef pre_shift,
+ LLVMValueRef post_shift,
+ LLVMValueRef increment)
+{
+ LLVMBuilderRef builder = ctx->builder;
+
+ num = LLVMBuildLShr(builder, num, pre_shift, "");
+ /* The "nuw" add is valid because num != UINT_MAX (no wraparound). */
+ num = LLVMBuildNUWAdd(builder, num, increment, "");
+ num = LLVMBuildMul(builder,
+ LLVMBuildZExt(builder, num, ctx->i64, ""),
+ LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
+ /* Keep only the high 32 bits of the 64-bit product. */
+ num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
+ num = LLVMBuildTrunc(builder, num, ctx->i32, "");
+ return LLVMBuildLShr(builder, num, post_shift, "");
+}
+
+/* See fast_idiv_by_const.h. */
+/* Both operands must fit in 31 bits and the divisor must not be 1. */
+/* Under those constraints no pre-shift or increment is needed. */
+LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx,
+ LLVMValueRef num,
+ LLVMValueRef multiplier,
+ LLVMValueRef post_shift)
+{
+ LLVMBuilderRef builder = ctx->builder;
+
+ num = LLVMBuildMul(builder,
+ LLVMBuildZExt(builder, num, ctx->i64, ""),
+ LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
+ /* Keep only the high 32 bits of the 64-bit product. */
+ num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
+ num = LLVMBuildTrunc(builder, num, ctx->i32, "");
+ return LLVMBuildLShr(builder, num, post_shift, "");
+}
+
/* Coordinates for cube map selection. sc, tc, and ma are as in Table 8.27
* of the OpenGL 4.5 (Compatibility Profile) specification, except ma is
* already multiplied by two. id is the cube face number.
LLVMValueRef invma;
if (is_array && !is_lod) {
- LLVMValueRef tmp = coords_arg[3];
- tmp = ac_build_intrinsic(ctx, "llvm.rint.f32", ctx->f32, &tmp, 1, 0);
+ LLVMValueRef tmp = ac_build_round(ctx, coords_arg[3]);
/* Section 8.9 (Texture Functions) of the GLSL 4.50 spec says:
*
if (is_array) {
/* for cube arrays coord.z = coord.w(array_index) * 8 + face */
/* coords_arg.w component - array_index for cube arrays */
- LLVMValueRef tmp = LLVMBuildFMul(ctx->builder, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), "");
- coords[2] = LLVMBuildFAdd(ctx->builder, tmp, coords[2], "");
+ coords[2] = ac_build_fmad(ctx, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), coords[2]);
}
memcpy(coords_arg, coords, sizeof(coords));
ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
}
+/* Build a single-index GEP: &base_ptr[index]. */
+LLVMValueRef
+ac_build_gep_ptr(struct ac_llvm_context *ctx,
+ LLVMValueRef base_ptr,
+ LLVMValueRef index)
+{
+ return LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");
+}
+
+/* Build a two-index GEP with a leading zero index: &base_ptr[0][index]. */
LLVMValueRef
ac_build_gep0(struct ac_llvm_context *ctx,
LLVMValueRef base_ptr,
LLVMValueRef index)
{
LLVMValueRef indices[2] = {
- LLVMConstInt(ctx->i32, 0, 0),
+ ctx->i32_0,
index,
};
- return LLVMBuildGEP(ctx->builder, base_ptr,
- indices, 2, "");
+ return LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, "");
+}
+
+/* Return ptr advanced by index elements (via ac_build_gep0), cast back to
+ * ptr's own pointer type. */
+LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr,
+ LLVMValueRef index)
+{
+ return LLVMBuildPointerCast(ctx->builder,
+ ac_build_gep0(ctx, ptr, index),
+ LLVMTypeOf(ptr), "");
}
void
* \param uniform Whether the base_ptr and index can be assumed to be
* dynamically uniform (i.e. load to an SGPR)
* \param invariant Whether the load is invariant (no other opcodes affect it)
+ * \param no_unsigned_wraparound
+ * For all possible re-associations and re-distributions of an expression
+ * "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs
+ * without inbounds in base_ptr), this parameter is true if "addr + offset"
+ * does not result in an unsigned integer wraparound. This is used for
+ * optimal code generation of 32-bit pointer arithmetic.
+ *
+ * For example, a 32-bit immediate offset that causes a 32-bit unsigned
+ * integer wraparound can't be an imm offset in s_load_dword, because
+ * the instruction performs "addr + offset" in 64 bits.
+ *
+ * Expected usage for bindless textures by chaining GEPs:
+ * // possible unsigned wraparound, don't use InBounds:
+ * ptr1 = LLVMBuildGEP(base_ptr, index);
+ * image = load(ptr1); // becomes "s_load ptr1, 0"
+ *
+ * ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize);
+ * sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds
*/
static LLVMValueRef
ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
- LLVMValueRef index, bool uniform, bool invariant)
+ LLVMValueRef index, bool uniform, bool invariant,
+ bool no_unsigned_wraparound)
{
LLVMValueRef pointer, result;
+ LLVMValueRef indices[2] = {ctx->i32_0, index};
+
+ if (no_unsigned_wraparound &&
+ LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT)
+ pointer = LLVMBuildInBoundsGEP(ctx->builder, base_ptr, indices, 2, "");
+ else
+ pointer = LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, "");
- pointer = ac_build_gep0(ctx, base_ptr, index);
if (uniform)
LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
result = LLVMBuildLoad(ctx->builder, pointer, "");
+/* Plain load: not uniform, not invariant. See ac_build_load_custom(). */
LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
LLVMValueRef index)
{
- return ac_build_load_custom(ctx, base_ptr, index, false, false);
+ return ac_build_load_custom(ctx, base_ptr, index, false, false, false);
}
+/* Invariant load (no other opcodes affect it). See ac_build_load_custom(). */
LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx,
LLVMValueRef base_ptr, LLVMValueRef index)
{
- return ac_build_load_custom(ctx, base_ptr, index, false, true);
+ return ac_build_load_custom(ctx, base_ptr, index, false, true, false);
}
+/* This assumes that there is no unsigned integer wraparound during the address
+ * computation, excluding all GEPs within base_ptr. */
+/* Uniform + invariant load (intended to land in an SGPR); see
+ * ac_build_load_custom(). */
LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx,
LLVMValueRef base_ptr, LLVMValueRef index)
{
- return ac_build_load_custom(ctx, base_ptr, index, true, true);
+ return ac_build_load_custom(ctx, base_ptr, index, true, true, true);
+}
+
+/* See ac_build_load_custom() documentation. */
+/* Like ac_build_load_to_sgpr(), but without the no-wraparound guarantee. */
+LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx,
+ LLVMValueRef base_ptr, LLVMValueRef index)
+{
+ return ac_build_load_custom(ctx, base_ptr, index, true, true, false);
}
/* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
bool writeonly_memory,
bool swizzle_enable_hint)
{
+ /* Split 3 channel stores, becase LLVM doesn't support 3-channel
+ * intrinsics. */
+ if (num_channels == 3) {
+ LLVMValueRef v[3], v01;
+
+ for (int i = 0; i < 3; i++) {
+ v[i] = LLVMBuildExtractElement(ctx->builder, vdata,
+ LLVMConstInt(ctx->i32, i, 0), "");
+ }
+ v01 = ac_build_gather_values(ctx, v, 2);
+
+ ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset,
+ soffset, inst_offset, glc, slc,
+ writeonly_memory, swizzle_enable_hint);
+ ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset,
+ soffset, inst_offset + 8,
+ glc, slc,
+ writeonly_memory, swizzle_enable_hint);
+ return;
+ }
+
/* SWIZZLE_ENABLE requires that soffset isn't folded into voffset
* (voffset is swizzled, but soffset isn't swizzled).
* llvm.amdgcn.buffer.store doesn't have a separate soffset parameter.
*/
if (!swizzle_enable_hint) {
- /* Split 3 channel stores, becase LLVM doesn't support 3-channel
- * intrinsics. */
- if (num_channels == 3) {
- LLVMValueRef v[3], v01;
-
- for (int i = 0; i < 3; i++) {
- v[i] = LLVMBuildExtractElement(ctx->builder, vdata,
- LLVMConstInt(ctx->i32, i, 0), "");
- }
- v01 = ac_build_gather_values(ctx, v, 2);
-
- ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset,
- soffset, inst_offset, glc, slc,
- writeonly_memory, swizzle_enable_hint);
- ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset,
- soffset, inst_offset + 8,
- glc, slc,
- writeonly_memory, swizzle_enable_hint);
- return;
- }
+ LLVMValueRef offset = soffset;
- unsigned func = CLAMP(num_channels, 1, 3) - 1;
static const char *types[] = {"f32", "v2f32", "v4f32"};
- char name[256];
- LLVMValueRef offset = soffset;
if (inst_offset)
offset = LLVMBuildAdd(ctx->builder, offset,
LLVMValueRef args[] = {
ac_to_float(ctx, vdata),
LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
- LLVMConstInt(ctx->i32, 0, 0),
+ ctx->i32_0,
offset,
LLVMConstInt(ctx->i1, glc, 0),
LLVMConstInt(ctx->i1, slc, 0),
};
+ char name[256];
snprintf(name, sizeof(name), "llvm.amdgcn.buffer.store.%s",
- types[func]);
+ types[CLAMP(num_channels, 1, 3) - 1]);
ac_build_intrinsic(ctx, name, ctx->voidt,
args, ARRAY_SIZE(args),
writeonly_memory ?
- AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY :
- AC_FUNC_ATTR_WRITEONLY);
+ AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY :
+ AC_FUNC_ATTR_WRITEONLY);
return;
}
- static unsigned dfmt[] = {
+ static const unsigned dfmt[] = {
V_008F0C_BUF_DATA_FORMAT_32,
V_008F0C_BUF_DATA_FORMAT_32_32,
V_008F0C_BUF_DATA_FORMAT_32_32_32,
V_008F0C_BUF_DATA_FORMAT_32_32_32_32
};
- assert(num_channels >= 1 && num_channels <= 4);
-
+ static const char *types[] = {"i32", "v2i32", "v4i32"};
LLVMValueRef args[] = {
- rsrc,
vdata,
- LLVMConstInt(ctx->i32, num_channels, 0),
- voffset ? voffset : LLVMGetUndef(ctx->i32),
+ LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
+ ctx->i32_0,
+ voffset ? voffset : ctx->i32_0,
soffset,
LLVMConstInt(ctx->i32, inst_offset, 0),
LLVMConstInt(ctx->i32, dfmt[num_channels - 1], 0),
LLVMConstInt(ctx->i32, V_008F0C_BUF_NUM_FORMAT_UINT, 0),
- LLVMConstInt(ctx->i32, voffset != NULL, 0),
- LLVMConstInt(ctx->i32, 0, 0), /* idxen */
- LLVMConstInt(ctx->i32, glc, 0),
- LLVMConstInt(ctx->i32, slc, 0),
- LLVMConstInt(ctx->i32, 0, 0), /* tfe*/
+ LLVMConstInt(ctx->i1, glc, 0),
+ LLVMConstInt(ctx->i1, slc, 0),
};
-
- /* The instruction offset field has 12 bits */
- assert(voffset || inst_offset < (1 << 12));
-
- /* The intrinsic is overloaded, we need to add a type suffix for overloading to work. */
- unsigned func = CLAMP(num_channels, 1, 3) - 1;
- const char *types[] = {"i32", "v2i32", "v4i32"};
char name[256];
- snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]);
+ snprintf(name, sizeof(name), "llvm.amdgcn.tbuffer.store.%s",
+ types[CLAMP(num_channels, 1, 3) - 1]);
ac_build_intrinsic(ctx, name, ctx->voidt,
args, ARRAY_SIZE(args),
- AC_FUNC_ATTR_LEGACY);
+ writeonly_memory ?
+ AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY :
+ AC_FUNC_ATTR_WRITEONLY);
}
static LLVMValueRef
{
LLVMValueRef args[] = {
LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
- vindex ? vindex : LLVMConstInt(ctx->i32, 0, 0),
+ vindex ? vindex : ctx->i32_0,
voffset,
LLVMConstInt(ctx->i1, glc, 0),
LLVMConstInt(ctx->i1, slc, 0)
ac_get_load_intr_attribs(can_speculate));
}
+/* Common builder for llvm.amdgcn.{raw,struct}.buffer.load[.format] (LLVM 8+).
+ * The "struct" (structurized) variants take an extra vindex operand.
+ * Returns f32, v2f32 or v4f32 depending on num_channels. */
+static LLVMValueRef
+ac_build_llvm8_buffer_load_common(struct ac_llvm_context *ctx,
+ LLVMValueRef rsrc,
+ LLVMValueRef vindex,
+ LLVMValueRef voffset,
+ LLVMValueRef soffset,
+ unsigned num_channels,
+ bool glc,
+ bool slc,
+ bool can_speculate,
+ bool use_format,
+ bool structurized)
+{
+ LLVMValueRef args[5];
+ int idx = 0;
+ args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
+ if (structurized)
+ args[idx++] = vindex ? vindex : ctx->i32_0;
+ args[idx++] = voffset ? voffset : ctx->i32_0;
+ args[idx++] = soffset ? soffset : ctx->i32_0;
+ /* Cache policy operand: bit 0 = glc, bit 1 = slc. */
+ args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
+ unsigned func = CLAMP(num_channels, 1, 3) - 1;
+
+ LLVMTypeRef types[] = {ctx->f32, ctx->v2f32, ctx->v4f32};
+ const char *type_names[] = {"f32", "v2f32", "v4f32"};
+ const char *indexing_kind = structurized ? "struct" : "raw";
+ char name[256];
+
+ if (use_format) {
+ snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s",
+ indexing_kind, type_names[func]);
+ } else {
+ snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s",
+ indexing_kind, type_names[func]);
+ }
+
+ return ac_build_intrinsic(ctx, name, types[func], args,
+ idx,
+ ac_get_load_intr_attribs(can_speculate));
+}
+
LLVMValueRef
ac_build_buffer_load(struct ac_llvm_context *ctx,
LLVMValueRef rsrc,
if (soffset)
offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");
- /* TODO: VI and later generations can use SMEM with GLC=1.*/
- if (allow_smem && !glc && !slc) {
+ if (allow_smem && !slc &&
+ (!glc || (HAVE_LLVM >= 0x0800 && ctx->chip_class >= VI))) {
assert(vindex == NULL);
LLVMValueRef result[8];
offset = LLVMBuildAdd(ctx->builder, offset,
LLVMConstInt(ctx->i32, 4, 0), "");
}
- LLVMValueRef args[2] = {rsrc, offset};
- result[i] = ac_build_intrinsic(ctx, "llvm.SI.load.const.v4i32",
- ctx->f32, args, 2,
+ const char *intrname =
+ HAVE_LLVM >= 0x0800 ? "llvm.amdgcn.s.buffer.load.f32"
+ : "llvm.SI.load.const.v4i32";
+ unsigned num_args = HAVE_LLVM >= 0x0800 ? 3 : 2;
+ LLVMValueRef args[3] = {
+ rsrc,
+ offset,
+ glc ? ctx->i32_1 : ctx->i32_0,
+ };
+ result[i] = ac_build_intrinsic(ctx, intrname,
+ ctx->f32, args, num_args,
AC_FUNC_ATTR_READNONE |
- AC_FUNC_ATTR_LEGACY);
+ (HAVE_LLVM < 0x0800 ? AC_FUNC_ATTR_LEGACY : 0));
}
if (num_channels == 1)
return result[0];
bool glc,
bool can_speculate)
{
+ if (HAVE_LLVM >= 0x800) {
+ return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0,
+ num_channels, glc, false,
+ can_speculate, true, true);
+ }
return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset,
num_channels, glc, false,
can_speculate, true);
bool glc,
bool can_speculate)
{
+ if (HAVE_LLVM >= 0x800) {
+ return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0,
+ num_channels, glc, false,
+ can_speculate, true, true);
+ }
+
LLVMValueRef elem_count = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 2, 0), "");
- LLVMValueRef stride = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 1, 0), "");
+ LLVMValueRef stride = LLVMBuildExtractElement(ctx->builder, rsrc, ctx->i32_1, "");
stride = LLVMBuildLShr(ctx->builder, stride, LLVMConstInt(ctx->i32, 16, 0), "");
LLVMValueRef new_elem_count = LLVMBuildSelect(ctx->builder,
can_speculate, true);
}
+/* Load one 16-bit value through a typed buffer load (dfmt = 16, nfmt = uint)
+ * and truncate the i32 result to i16.
+ *
+ * With LLVM >= 8 the immediate offset is folded into voffset, because the
+ * raw/struct tbuffer intrinsics have no separate immoffset operand. */
+LLVMValueRef
+ac_build_tbuffer_load_short(struct ac_llvm_context *ctx,
+ LLVMValueRef rsrc,
+ LLVMValueRef vindex,
+ LLVMValueRef voffset,
+ LLVMValueRef soffset,
+ LLVMValueRef immoffset,
+ LLVMValueRef glc)
+{
+ unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16;
+ unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
+ LLVMValueRef res;
+
+ if (HAVE_LLVM >= 0x0800) {
+ voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");
+
+ res = ac_build_llvm8_tbuffer_load(ctx, rsrc, vindex, voffset,
+ soffset, 1, dfmt, nfmt, glc,
+ false, true, true);
+ } else {
+ const char *name = "llvm.amdgcn.tbuffer.load.i32";
+ LLVMTypeRef type = ctx->i32;
+ LLVMValueRef params[] = {
+ rsrc,
+ vindex,
+ voffset,
+ soffset,
+ immoffset,
+ LLVMConstInt(ctx->i32, dfmt, false),
+ LLVMConstInt(ctx->i32, nfmt, false),
+ glc,
+ ctx->i1false,
+ };
+ res = ac_build_intrinsic(ctx, name, type, params, 9, 0);
+ }
+
+ return LLVMBuildTrunc(ctx->builder, res, ctx->i16, "");
+}
+
+/* Builder for llvm.amdgcn.{raw,struct}.tbuffer.load (LLVM 8+).
+ * dfmt and nfmt are packed into a single format operand (dfmt | nfmt << 4);
+ * glc/slc form the cache-policy operand (bit 0 / bit 1). */
+LLVMValueRef
+ac_build_llvm8_tbuffer_load(struct ac_llvm_context *ctx,
+ LLVMValueRef rsrc,
+ LLVMValueRef vindex,
+ LLVMValueRef voffset,
+ LLVMValueRef soffset,
+ unsigned num_channels,
+ unsigned dfmt,
+ unsigned nfmt,
+ bool glc,
+ bool slc,
+ bool can_speculate,
+ bool structurized)
+{
+ LLVMValueRef args[6];
+ int idx = 0;
+ args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
+ if (structurized)
+ args[idx++] = vindex ? vindex : ctx->i32_0;
+ args[idx++] = voffset ? voffset : ctx->i32_0;
+ args[idx++] = soffset ? soffset : ctx->i32_0;
+ args[idx++] = LLVMConstInt(ctx->i32, dfmt | (nfmt << 4), 0);
+ args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
+ unsigned func = CLAMP(num_channels, 1, 3) - 1;
+
+ LLVMTypeRef types[] = {ctx->i32, ctx->v2i32, ctx->v4i32};
+ const char *type_names[] = {"i32", "v2i32", "v4i32"};
+ const char *indexing_kind = structurized ? "struct" : "raw";
+ char name[256];
+
+ snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s",
+ indexing_kind, type_names[func]);
+
+ return ac_build_intrinsic(ctx, name, types[func], args,
+ idx,
+ ac_get_load_intr_attribs(can_speculate));
+}
+
/**
* Set range metadata on an instruction. This can only be used on load and
* call instructions. If you know an instruction can only produce the values
LLVMValueRef tid_args[2];
tid_args[0] = LLVMConstInt(ctx->i32, 0xffffffff, false);
- tid_args[1] = LLVMConstInt(ctx->i32, 0, false);
+ tid_args[1] = ctx->i32_0;
tid_args[1] = ac_build_intrinsic(ctx,
"llvm.amdgcn.mbcnt.lo", ctx->i32,
tid_args, 2, AC_FUNC_ATTR_READNONE);
int idx,
LLVMValueRef val)
{
- LLVMValueRef tl, trbl, args[2];
+ unsigned tl_lanes[4], trbl_lanes[4];
+ LLVMValueRef tl, trbl;
LLVMValueRef result;
- if (ctx->chip_class >= VI) {
- LLVMValueRef thread_id, tl_tid, trbl_tid;
- thread_id = ac_get_thread_id(ctx);
-
- tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
- LLVMConstInt(ctx->i32, mask, false), "");
-
- trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
- LLVMConstInt(ctx->i32, idx, false), "");
-
- args[0] = LLVMBuildMul(ctx->builder, tl_tid,
- LLVMConstInt(ctx->i32, 4, false), "");
- args[1] = val;
- tl = ac_build_intrinsic(ctx,
- "llvm.amdgcn.ds.bpermute", ctx->i32,
- args, 2,
- AC_FUNC_ATTR_READNONE |
- AC_FUNC_ATTR_CONVERGENT);
-
- args[0] = LLVMBuildMul(ctx->builder, trbl_tid,
- LLVMConstInt(ctx->i32, 4, false), "");
- trbl = ac_build_intrinsic(ctx,
- "llvm.amdgcn.ds.bpermute", ctx->i32,
- args, 2,
- AC_FUNC_ATTR_READNONE |
- AC_FUNC_ATTR_CONVERGENT);
- } else {
- uint32_t masks[2] = {};
-
- switch (mask) {
- case AC_TID_MASK_TOP_LEFT:
- masks[0] = 0x8000;
- if (idx == 1)
- masks[1] = 0x8055;
- else
- masks[1] = 0x80aa;
-
- break;
- case AC_TID_MASK_TOP:
- masks[0] = 0x8044;
- masks[1] = 0x80ee;
- break;
- case AC_TID_MASK_LEFT:
- masks[0] = 0x80a0;
- masks[1] = 0x80f5;
- break;
- default:
- assert(0);
- }
-
- args[0] = val;
- args[1] = LLVMConstInt(ctx->i32, masks[0], false);
-
- tl = ac_build_intrinsic(ctx,
- "llvm.amdgcn.ds.swizzle", ctx->i32,
- args, 2,
- AC_FUNC_ATTR_READNONE |
- AC_FUNC_ATTR_CONVERGENT);
-
- args[1] = LLVMConstInt(ctx->i32, masks[1], false);
- trbl = ac_build_intrinsic(ctx,
- "llvm.amdgcn.ds.swizzle", ctx->i32,
- args, 2,
- AC_FUNC_ATTR_READNONE |
- AC_FUNC_ATTR_CONVERGENT);
+ for (unsigned i = 0; i < 4; ++i) {
+ tl_lanes[i] = i & mask;
+ trbl_lanes[i] = (i & mask) + idx;
}
+ tl = ac_build_quad_swizzle(ctx, val,
+ tl_lanes[0], tl_lanes[1],
+ tl_lanes[2], tl_lanes[3]);
+ trbl = ac_build_quad_swizzle(ctx, val,
+ trbl_lanes[0], trbl_lanes[1],
+ trbl_lanes[2], trbl_lanes[3]);
+
tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, "");
trbl = LLVMBuildBitCast(ctx->builder, trbl, ctx->f32, "");
result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
+
+ result = ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.f32", ctx->f32,
+ &result, 1, 0);
+
return result;
}
LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true);
LLVMValueRef cond = LLVMBuildOr(ctx->builder,
LLVMBuildICmp(ctx->builder, LLVMIntEQ,
- arg, LLVMConstInt(ctx->i32, 0, 0), ""),
+ arg, ctx->i32_0, ""),
LLVMBuildICmp(ctx->builder, LLVMIntEQ,
arg, all_ones, ""), "");
LLVMTypeRef type;
LLVMValueRef highest_bit;
LLVMValueRef zero;
+ unsigned bitsize;
- if (ac_get_elem_bits(ctx, LLVMTypeOf(arg)) == 64) {
+ bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg));
+ switch (bitsize) {
+ case 64:
intrin_name = "llvm.ctlz.i64";
type = ctx->i64;
highest_bit = LLVMConstInt(ctx->i64, 63, false);
zero = ctx->i64_0;
- } else {
+ break;
+ case 32:
intrin_name = "llvm.ctlz.i32";
type = ctx->i32;
highest_bit = LLVMConstInt(ctx->i32, 31, false);
zero = ctx->i32_0;
+ break;
+ case 16:
+ intrin_name = "llvm.ctlz.i16";
+ type = ctx->i16;
+ highest_bit = LLVMConstInt(ctx->i16, 15, false);
+ zero = ctx->i16_0;
+ break;
+ default:
+ unreachable(!"invalid bitsize");
+ break;
}
LLVMValueRef params[2] = {
+/* minnum(a, b), selecting the llvm.minnum overload (f16/f32/f64) that
+ * matches the operands' bit width. */
LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a,
LLVMValueRef b)
{
+ char name[64];
+ snprintf(name, sizeof(name), "llvm.minnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a)));
LLVMValueRef args[2] = {a, b};
- return ac_build_intrinsic(ctx, "llvm.minnum.f32", ctx->f32, args, 2,
+ return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2,
AC_FUNC_ATTR_READNONE);
}
+/* maxnum(a, b), selecting the llvm.maxnum overload (f16/f32/f64) that
+ * matches the operands' bit width. */
LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a,
LLVMValueRef b)
{
+ char name[64];
+ snprintf(name, sizeof(name), "llvm.maxnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a)));
LLVMValueRef args[2] = {a, b};
- return ac_build_intrinsic(ctx, "llvm.maxnum.f32", ctx->f32, args, 2,
+ return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2,
AC_FUNC_ATTR_READNONE);
}
+/* Clamp value to [0, 1] using fmin/fmax in the value's own float type. */
LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
{
- if (HAVE_LLVM >= 0x0500) {
- return ac_build_fmin(ctx, ac_build_fmax(ctx, value, ctx->f32_0),
- ctx->f32_1);
- }
-
- LLVMValueRef args[3] = {
- value,
- LLVMConstReal(ctx->f32, 0),
- LLVMConstReal(ctx->f32, 1),
- };
-
- return ac_build_intrinsic(ctx, "llvm.AMDGPU.clamp.", ctx->f32, args, 3,
- AC_FUNC_ATTR_READNONE |
- AC_FUNC_ATTR_LEGACY);
+ LLVMTypeRef t = LLVMTypeOf(value);
+ return ac_build_fmin(ctx, ac_build_fmax(ctx, value, LLVMConstReal(t, 0.0)),
+ LLVMConstReal(t, 1.0));
}
+/* Emit an export: llvm.amdgcn.exp.compr.v2i16 when a->compr is set (the two
+ * outputs are bitcast to v2i16), llvm.amdgcn.exp.f32 otherwise. */
void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
{
LLVMValueRef args[9];
- if (HAVE_LLVM >= 0x0500) {
- args[0] = LLVMConstInt(ctx->i32, a->target, 0);
- args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
-
- if (a->compr) {
- LLVMTypeRef i16 = LLVMInt16TypeInContext(ctx->context);
- LLVMTypeRef v2i16 = LLVMVectorType(i16, 2);
-
- args[2] = LLVMBuildBitCast(ctx->builder, a->out[0],
- v2i16, "");
- args[3] = LLVMBuildBitCast(ctx->builder, a->out[1],
- v2i16, "");
- args[4] = LLVMConstInt(ctx->i1, a->done, 0);
- args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
-
- ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16",
- ctx->voidt, args, 6, 0);
- } else {
- args[2] = a->out[0];
- args[3] = a->out[1];
- args[4] = a->out[2];
- args[5] = a->out[3];
- args[6] = LLVMConstInt(ctx->i1, a->done, 0);
- args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
-
- ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32",
- ctx->voidt, args, 8, 0);
- }
- return;
- }
+ args[0] = LLVMConstInt(ctx->i32, a->target, 0);
+ args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
+
+ if (a->compr) {
+ LLVMTypeRef i16 = LLVMInt16TypeInContext(ctx->context);
+ LLVMTypeRef v2i16 = LLVMVectorType(i16, 2);
- args[0] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
- args[1] = LLVMConstInt(ctx->i32, a->valid_mask, 0);
- args[2] = LLVMConstInt(ctx->i32, a->done, 0);
- args[3] = LLVMConstInt(ctx->i32, a->target, 0);
- args[4] = LLVMConstInt(ctx->i32, a->compr, 0);
- memcpy(args + 5, a->out, sizeof(a->out[0]) * 4);
+ args[2] = LLVMBuildBitCast(ctx->builder, a->out[0],
+ v2i16, "");
+ args[3] = LLVMBuildBitCast(ctx->builder, a->out[1],
+ v2i16, "");
+ args[4] = LLVMConstInt(ctx->i1, a->done, 0);
+ args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
- ac_build_intrinsic(ctx, "llvm.SI.export", ctx->voidt, args, 9,
- AC_FUNC_ATTR_LEGACY);
+ ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16",
+ ctx->voidt, args, 6, 0);
+ } else {
+ args[2] = a->out[0];
+ args[3] = a->out[1];
+ args[4] = a->out[2];
+ args[5] = a->out[3];
+ args[6] = LLVMConstInt(ctx->i1, a->done, 0);
+ args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
+
+ ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32",
+ ctx->voidt, args, 8, 0);
+ }
}
void ac_build_export_null(struct ac_llvm_context *ctx)
ac_build_export(ctx, &args);
}
+/* Return the number of coordinate components (including array index and
+ * sample index where applicable) used to address an image of the given
+ * dimensionality. */
+static unsigned ac_num_coords(enum ac_image_dim dim)
+{
+ switch (dim) {
+ case ac_image_1d:
+ return 1;
+ case ac_image_2d:
+ case ac_image_1darray:
+ return 2;
+ case ac_image_3d:
+ case ac_image_cube:
+ case ac_image_2darray:
+ case ac_image_2dmsaa:
+ return 3;
+ case ac_image_2darraymsaa:
+ return 4;
+ default:
+ unreachable("ac_num_coords: bad dim");
+ }
+}
+
+/* Return the number of derivative values for the given image dimensionality
+ * (2 for 1D, 4 for 2D/cube, 6 for 3D). MSAA images never take derivatives,
+ * hence the unreachable. */
+static unsigned ac_num_derivs(enum ac_image_dim dim)
+{
+ switch (dim) {
+ case ac_image_1d:
+ case ac_image_1darray:
+ return 2;
+ case ac_image_2d:
+ case ac_image_2darray:
+ case ac_image_cube:
+ return 4;
+ case ac_image_3d:
+ return 6;
+ case ac_image_2dmsaa:
+ case ac_image_2darraymsaa:
+ default:
+ unreachable("derivatives not supported");
+ }
+}
+
+/* Map an ac_atomic_op to its intrinsic name suffix. */
+static const char *get_atomic_name(enum ac_atomic_op op)
+{
+ switch (op) {
+ case ac_atomic_swap: return "swap";
+ case ac_atomic_add: return "add";
+ case ac_atomic_sub: return "sub";
+ case ac_atomic_smin: return "smin";
+ case ac_atomic_umin: return "umin";
+ case ac_atomic_smax: return "smax";
+ case ac_atomic_umax: return "umax";
+ case ac_atomic_and: return "and";
+ case ac_atomic_or: return "or";
+ case ac_atomic_xor: return "xor";
+ }
+ unreachable("bad atomic op");
+}
+
LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx,
struct ac_image_args *a)
{
- LLVMValueRef args[11];
+ const char *overload[3] = { "", "", "" };
+ unsigned num_overloads = 0;
+ LLVMValueRef args[18];
unsigned num_args = 0;
- const char *name = NULL;
- char intr_name[128], type[64];
+ enum ac_image_dim dim = a->dim;
+
+ assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 ||
+ !a->level_zero);
+ assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
+ a->opcode != ac_image_store_mip) ||
+ a->lod);
+ assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
+ (!a->compare && !a->offset));
+ assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
+ a->opcode == ac_image_get_lod) ||
+ !a->bias);
+ assert((a->bias ? 1 : 0) +
+ (a->lod ? 1 : 0) +
+ (a->level_zero ? 1 : 0) +
+ (a->derivs[0] ? 1 : 0) <= 1);
+
+ if (a->opcode == ac_image_get_lod) {
+ switch (dim) {
+ case ac_image_1darray:
+ dim = ac_image_1d;
+ break;
+ case ac_image_2darray:
+ case ac_image_cube:
+ dim = ac_image_2d;
+ break;
+ default:
+ break;
+ }
+ }
bool sample = a->opcode == ac_image_sample ||
a->opcode == ac_image_gather4 ||
a->opcode == ac_image_get_lod;
- bool da = a->dim == ac_image_cube ||
- a->dim == ac_image_1darray ||
- a->dim == ac_image_2darray ||
- a->dim == ac_image_2darraymsaa;
- if (a->opcode == ac_image_get_lod)
- da = false;
-
- if (sample)
- args[num_args++] = ac_to_float(ctx, a->addr);
- else
- args[num_args++] = a->addr;
+ bool atomic = a->opcode == ac_image_atomic ||
+ a->opcode == ac_image_atomic_cmpswap;
+ LLVMTypeRef coord_type = sample ? ctx->f32 : ctx->i32;
+
+ if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
+ args[num_args++] = a->data[0];
+ if (a->opcode == ac_image_atomic_cmpswap)
+ args[num_args++] = a->data[1];
+ }
+
+ if (!atomic)
+ args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, false);
+
+ if (a->offset)
+ args[num_args++] = ac_to_integer(ctx, a->offset);
+ if (a->bias) {
+ args[num_args++] = ac_to_float(ctx, a->bias);
+ overload[num_overloads++] = ".f32";
+ }
+ if (a->compare)
+ args[num_args++] = ac_to_float(ctx, a->compare);
+ if (a->derivs[0]) {
+ unsigned count = ac_num_derivs(dim);
+ for (unsigned i = 0; i < count; ++i)
+ args[num_args++] = ac_to_float(ctx, a->derivs[i]);
+ overload[num_overloads++] = ".f32";
+ }
+ unsigned num_coords =
+ a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0;
+ for (unsigned i = 0; i < num_coords; ++i)
+ args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");
+ if (a->lod)
+ args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");
+ overload[num_overloads++] = sample ? ".f32" : ".i32";
args[num_args++] = a->resource;
- if (sample)
+ if (sample) {
args[num_args++] = a->sampler;
- args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, 0);
- if (sample)
- args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, 0);
- args[num_args++] = ctx->i1false; /* glc */
- args[num_args++] = ctx->i1false; /* slc */
- args[num_args++] = ctx->i1false; /* lwe */
- args[num_args++] = LLVMConstInt(ctx->i1, da, 0);
+ args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);
+ }
+ args[num_args++] = ctx->i32_0; /* texfailctrl */
+ args[num_args++] = LLVMConstInt(ctx->i32, a->cache_policy, false);
+
+ const char *name;
+ const char *atomic_subop = "";
switch (a->opcode) {
- case ac_image_sample:
- name = "llvm.amdgcn.image.sample";
- break;
- case ac_image_gather4:
- name = "llvm.amdgcn.image.gather4";
- break;
- case ac_image_load:
- name = "llvm.amdgcn.image.load";
+ case ac_image_sample: name = "sample"; break;
+ case ac_image_gather4: name = "gather4"; break;
+ case ac_image_load: name = "load"; break;
+ case ac_image_load_mip: name = "load.mip"; break;
+ case ac_image_store: name = "store"; break;
+ case ac_image_store_mip: name = "store.mip"; break;
+ case ac_image_atomic:
+ name = "atomic.";
+ atomic_subop = get_atomic_name(a->atomic);
break;
- case ac_image_load_mip:
- name = "llvm.amdgcn.image.load.mip";
+ case ac_image_atomic_cmpswap:
+ name = "atomic.";
+ atomic_subop = "cmpswap";
break;
- case ac_image_get_lod:
- name = "llvm.amdgcn.image.getlod";
- break;
- case ac_image_get_resinfo:
- name = "llvm.amdgcn.image.getresinfo";
- break;
- default:
- unreachable("invalid image opcode");
+ case ac_image_get_lod: name = "getlod"; break;
+ case ac_image_get_resinfo: name = "getresinfo"; break;
+ default: unreachable("invalid image opcode");
}
- ac_build_type_name_for_intr(LLVMTypeOf(args[0]), type,
- sizeof(type));
+ const char *dimname;
+ switch (dim) {
+ case ac_image_1d: dimname = "1d"; break;
+ case ac_image_2d: dimname = "2d"; break;
+ case ac_image_3d: dimname = "3d"; break;
+ case ac_image_cube: dimname = "cube"; break;
+ case ac_image_1darray: dimname = "1darray"; break;
+ case ac_image_2darray: dimname = "2darray"; break;
+ case ac_image_2dmsaa: dimname = "2dmsaa"; break;
+ case ac_image_2darraymsaa: dimname = "2darraymsaa"; break;
+ default: unreachable("invalid dim");
+ }
- snprintf(intr_name, sizeof(intr_name), "%s%s%s%s.v4f32.%s.v8i32",
- name,
- a->compare ? ".c" : "",
- a->bias ? ".b" :
- a->lod ? ".l" :
- a->deriv ? ".d" :
- a->level_zero ? ".lz" : "",
- a->offset ? ".o" : "",
- type);
+ bool lod_suffix =
+ a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
+ char intr_name[96];
+ snprintf(intr_name, sizeof(intr_name),
+ "llvm.amdgcn.image.%s%s" /* base name */
+ "%s%s%s" /* sample/gather modifiers */
+ ".%s.%s%s%s%s", /* dimension and type overloads */
+ name, atomic_subop,
+ a->compare ? ".c" : "",
+ a->bias ? ".b" :
+ lod_suffix ? ".l" :
+ a->derivs[0] ? ".d" :
+ a->level_zero ? ".lz" : "",
+ a->offset ? ".o" : "",
+ dimname,
+ atomic ? "i32" : "v4f32",
+ overload[0], overload[1], overload[2]);
+
+ LLVMTypeRef retty;
+ if (atomic)
+ retty = ctx->i32;
+ else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
+ retty = ctx->voidt;
+ else
+ retty = ctx->v4f32;
LLVMValueRef result =
- ac_build_intrinsic(ctx, intr_name,
- ctx->v4f32, args, num_args,
- AC_FUNC_ATTR_READNONE);
- if (!sample) {
+ ac_build_intrinsic(ctx, intr_name, retty, args, num_args,
+ a->attributes);
+ if (!sample && retty == ctx->v4f32) {
result = LLVMBuildBitCast(ctx->builder, result,
ctx->v4i32, "");
}
LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx,
LLVMValueRef args[2])
{
- if (HAVE_LLVM >= 0x0500) {
- LLVMTypeRef v2f16 =
- LLVMVectorType(LLVMHalfTypeInContext(ctx->context), 2);
- LLVMValueRef res =
- ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz",
- v2f16, args, 2,
- AC_FUNC_ATTR_READNONE);
- return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
- }
+ LLVMTypeRef v2f16 =
+ LLVMVectorType(LLVMHalfTypeInContext(ctx->context), 2);
- return ac_build_intrinsic(ctx, "llvm.SI.packf16", ctx->i32, args, 2,
- AC_FUNC_ATTR_READNONE |
- AC_FUNC_ATTR_LEGACY);
-}
-
-/* Upper 16 bits must be zero. */
-static LLVMValueRef ac_llvm_pack_two_int16(struct ac_llvm_context *ctx,
- LLVMValueRef val[2])
-{
- return LLVMBuildOr(ctx->builder, val[0],
- LLVMBuildShl(ctx->builder, val[1],
- LLVMConstInt(ctx->i32, 16, 0),
- ""), "");
-}
-
-/* Upper 16 bits are ignored and will be dropped. */
-static LLVMValueRef ac_llvm_pack_two_int32_as_int16(struct ac_llvm_context *ctx,
- LLVMValueRef val[2])
-{
- LLVMValueRef v[2] = {
- LLVMBuildAnd(ctx->builder, val[0],
- LLVMConstInt(ctx->i32, 0xffff, 0), ""),
- val[1],
- };
- return ac_llvm_pack_two_int16(ctx, v);
+ /* NOTE(review): unlike the removed path, this now returns a v2f16
+ * rather than an i32 bitcast — callers presumably cast as needed;
+ * confirm against call sites. */
+ return ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", v2f16,
+ args, 2, AC_FUNC_ATTR_READNONE);
}
LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx,
LLVMValueRef args[2])
{
- if (HAVE_LLVM >= 0x0600) {
- LLVMValueRef res =
- ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16",
- ctx->v2i16, args, 2,
- AC_FUNC_ATTR_READNONE);
- return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
- }
-
- LLVMValueRef val[2];
-
- for (int chan = 0; chan < 2; chan++) {
- /* Clamp between [-1, 1]. */
- val[chan] = ac_build_fmin(ctx, args[chan], ctx->f32_1);
- val[chan] = ac_build_fmax(ctx, val[chan], LLVMConstReal(ctx->f32, -1));
- /* Convert to a signed integer in [-32767, 32767]. */
- val[chan] = LLVMBuildFMul(ctx->builder, val[chan],
- LLVMConstReal(ctx->f32, 32767), "");
- /* If positive, add 0.5, else add -0.5. */
- val[chan] = LLVMBuildFAdd(ctx->builder, val[chan],
- LLVMBuildSelect(ctx->builder,
- LLVMBuildFCmp(ctx->builder, LLVMRealOGE,
- val[chan], ctx->f32_0, ""),
- LLVMConstReal(ctx->f32, 0.5),
- LLVMConstReal(ctx->f32, -0.5), ""), "");
- val[chan] = LLVMBuildFPToSI(ctx->builder, val[chan], ctx->i32, "");
- }
- return ac_llvm_pack_two_int32_as_int16(ctx, val);
+ /* Pack two floats as signed normalized 16-bit values; the intrinsic
+ * result (v2i16) is returned bitcast to i32. */
+ LLVMValueRef res =
+ ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16",
+ ctx->v2i16, args, 2,
+ AC_FUNC_ATTR_READNONE);
+ return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
}
LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx,
LLVMValueRef args[2])
{
- if (HAVE_LLVM >= 0x0600) {
- LLVMValueRef res =
- ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16",
- ctx->v2i16, args, 2,
- AC_FUNC_ATTR_READNONE);
- return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
- }
-
- LLVMValueRef val[2];
-
- for (int chan = 0; chan < 2; chan++) {
- val[chan] = ac_build_clamp(ctx, args[chan]);
- val[chan] = LLVMBuildFMul(ctx->builder, val[chan],
- LLVMConstReal(ctx->f32, 65535), "");
- val[chan] = LLVMBuildFAdd(ctx->builder, val[chan],
- LLVMConstReal(ctx->f32, 0.5), "");
- val[chan] = LLVMBuildFPToUI(ctx->builder, val[chan],
- ctx->i32, "");
- }
- return ac_llvm_pack_two_int32_as_int16(ctx, val);
+ /* Pack two floats as unsigned normalized 16-bit values; the intrinsic
+ * result (v2i16) is returned bitcast to i32. */
+ LLVMValueRef res =
+ ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16",
+ ctx->v2i16, args, 2,
+ AC_FUNC_ATTR_READNONE);
+ return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
}
/* The 8-bit and 10-bit clamping is for HW workarounds. */
bits != 10 ? max_rgb : ctx->i32_1;
LLVMValueRef min_alpha =
bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
- bool has_intrinsic = HAVE_LLVM >= 0x0600;
/* Clamp. */
- if (!has_intrinsic || bits != 16) {
+ if (bits != 16) {
for (int i = 0; i < 2; i++) {
bool alpha = hi && i == 1;
args[i] = ac_build_imin(ctx, args[i],
}
}
- if (has_intrinsic) {
- LLVMValueRef res =
- ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16",
- ctx->v2i16, args, 2,
- AC_FUNC_ATTR_READNONE);
- return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
- }
-
- return ac_llvm_pack_two_int32_as_int16(ctx, args);
+ LLVMValueRef res =
+ ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16",
+ ctx->v2i16, args, 2,
+ AC_FUNC_ATTR_READNONE);
+ return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
}
/* The 8-bit and 10-bit clamping is for HW workarounds. */
bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0);
LLVMValueRef max_alpha =
bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
- bool has_intrinsic = HAVE_LLVM >= 0x0600;
/* Clamp. */
- if (!has_intrinsic || bits != 16) {
+ if (bits != 16) {
for (int i = 0; i < 2; i++) {
bool alpha = hi && i == 1;
args[i] = ac_build_umin(ctx, args[i],
}
}
- if (has_intrinsic) {
- LLVMValueRef res =
- ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16",
- ctx->v2i16, args, 2,
- AC_FUNC_ATTR_READNONE);
- return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
- }
-
- return ac_llvm_pack_two_int16(ctx, args);
+ LLVMValueRef res =
+ ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16",
+ ctx->v2i16, args, 2,
+ AC_FUNC_ATTR_READNONE);
+ return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
}
+/* Whole-quad-mode vote: true iff the condition holds in all lanes of the
+ * pixel quad (llvm.amdgcn.wqm.vote). */
LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1)
{
- assert(HAVE_LLVM >= 0x0600);
return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1,
&i1, 1, AC_FUNC_ATTR_READNONE);
}
+/* Kill (discard) the invocation when i1 is false, via llvm.amdgcn.kill. */
void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1)
{
- if (HAVE_LLVM >= 0x0600) {
- ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt,
- &i1, 1, 0);
- return;
- }
-
- LLVMValueRef value = LLVMBuildSelect(ctx->builder, i1,
- LLVMConstReal(ctx->f32, 1),
- LLVMConstReal(ctx->f32, -1), "");
- ac_build_intrinsic(ctx, "llvm.AMDGPU.kill", ctx->voidt,
- &value, 1, AC_FUNC_ATTR_LEGACY);
+ ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt,
+ &i1, 1, 0);
}
LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input,
width,
};
- if (HAVE_LLVM >= 0x0500) {
- return ac_build_intrinsic(ctx,
- is_signed ? "llvm.amdgcn.sbfe.i32" :
- "llvm.amdgcn.ubfe.i32",
- ctx->i32, args, 3,
- AC_FUNC_ATTR_READNONE);
- }
-
return ac_build_intrinsic(ctx,
- is_signed ? "llvm.AMDGPU.bfe.i32" :
- "llvm.AMDGPU.bfe.u32",
+ is_signed ? "llvm.amdgcn.sbfe.i32" :
+ "llvm.amdgcn.ubfe.i32",
ctx->i32, args, 3,
- AC_FUNC_ATTR_READNONE |
- AC_FUNC_ATTR_LEGACY);
+ AC_FUNC_ATTR_READNONE);
+}
+
+/* Integer multiply-add: returns s0 * s1 + s2. */
+LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0,
+ LLVMValueRef s1, LLVMValueRef s2)
+{
+ return LLVMBuildAdd(ctx->builder,
+ LLVMBuildMul(ctx->builder, s0, s1, ""), s2, "");
+}
+
+/* Floating-point multiply-add: returns s0 * s1 + s2 (separate mul + add,
+ * not a fused-multiply-add intrinsic). */
+LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0,
+ LLVMValueRef s1, LLVMValueRef s2)
+{
+ return LLVMBuildFAdd(ctx->builder,
+ LLVMBuildFMul(ctx->builder, s0, s1, ""), s2, "");
+}
void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned simm16)
LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0,
unsigned bitsize)
{
- LLVMValueRef cmp, val, zero, one;
- LLVMTypeRef type;
-
- if (bitsize == 32) {
- type = ctx->i32;
- zero = ctx->i32_0;
- one = ctx->i32_1;
- } else {
- type = ctx->i64;
- zero = ctx->i64_0;
- one = ctx->i64_1;
- }
+ LLVMTypeRef type = LLVMIntTypeInContext(ctx->context, bitsize);
+ LLVMValueRef zero = LLVMConstInt(type, 0, false);
+ LLVMValueRef one = LLVMConstInt(type, 1, false);
+ LLVMValueRef cmp, val;
cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, src0, zero, "");
val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGE, val, zero, "");
return val;
}
-void ac_get_image_intr_name(const char *base_name,
- LLVMTypeRef data_type,
- LLVMTypeRef coords_type,
- LLVMTypeRef rsrc_type,
- char *out_name, unsigned out_len)
+/* Population count of src0 (16/32/64-bit), always returned as i32
+ * (the i64 result is truncated). */
+LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)
+{
+ LLVMValueRef result;
+ unsigned bitsize;
+
+ bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
+
+ switch (bitsize) {
+ case 64:
+ result = ac_build_intrinsic(ctx, "llvm.ctpop.i64", ctx->i64,
+ (LLVMValueRef []) { src0 }, 1,
+ AC_FUNC_ATTR_READNONE);
+
+ result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
+ break;
+ case 32:
+ result = ac_build_intrinsic(ctx, "llvm.ctpop.i32", ctx->i32,
+ (LLVMValueRef []) { src0 }, 1,
+ AC_FUNC_ATTR_READNONE);
+ break;
+ case 16:
+ result = ac_build_intrinsic(ctx, "llvm.ctpop.i16", ctx->i16,
+ (LLVMValueRef []) { src0 }, 1,
+ AC_FUNC_ATTR_READNONE);
+ break;
+ default:
+ unreachable(!"invalid bitsize");
+ break;
+ }
+
+ return result;
+}
+
+/* Reverse the bits of src0 (16- or 32-bit) via llvm.bitreverse; the result
+ * keeps the source width. */
+LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx,
+ LLVMValueRef src0)
{
- char coords_type_name[8];
+ LLVMValueRef result;
+ unsigned bitsize;
- ac_build_type_name_for_intr(coords_type, coords_type_name,
- sizeof(coords_type_name));
+ bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
- char data_type_name[8];
- char rsrc_type_name[8];
+ switch (bitsize) {
+ case 32:
+ result = ac_build_intrinsic(ctx, "llvm.bitreverse.i32", ctx->i32,
+ (LLVMValueRef []) { src0 }, 1,
+ AC_FUNC_ATTR_READNONE);
+ break;
+ case 16:
+ result = ac_build_intrinsic(ctx, "llvm.bitreverse.i16", ctx->i16,
+ (LLVMValueRef []) { src0 }, 1,
+ AC_FUNC_ATTR_READNONE);
+ break;
+ default:
+ unreachable(!"invalid bitsize");
+ break;
+ }
- ac_build_type_name_for_intr(data_type, data_type_name,
- sizeof(data_type_name));
- ac_build_type_name_for_intr(rsrc_type, rsrc_type_name,
- sizeof(rsrc_type_name));
- snprintf(out_name, out_len, "%s.%s.%s.%s", base_name,
- data_type_name, coords_type_name, rsrc_type_name);
+ return result;
}
-#define AC_EXP_TARGET (HAVE_LLVM >= 0x0500 ? 0 : 3)
-#define AC_EXP_ENABLED_CHANNELS (HAVE_LLVM >= 0x0500 ? 1 : 0)
-#define AC_EXP_OUT0 (HAVE_LLVM >= 0x0500 ? 2 : 5)
+#define AC_EXP_TARGET 0
+#define AC_EXP_ENABLED_CHANNELS 1
+#define AC_EXP_OUT0 2
enum ac_ir_type {
AC_IR_UNDEF,
{
unsigned lds_size = ctx->chip_class >= CIK ? 65536 : 32768;
ctx->lds = LLVMBuildIntToPtr(ctx->builder, ctx->i32_0,
- LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_LOCAL_ADDR_SPACE),
+ LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_ADDR_SPACE_LDS),
"lds");
}
const char *intrin_name;
LLVMTypeRef type;
LLVMValueRef zero;
- if (src0_bitsize == 64) {
+
+ switch (src0_bitsize) {
+ case 64:
intrin_name = "llvm.cttz.i64";
type = ctx->i64;
zero = ctx->i64_0;
- } else {
+ break;
+ case 32:
intrin_name = "llvm.cttz.i32";
type = ctx->i32;
zero = ctx->i32_0;
+ break;
+ case 16:
+ intrin_name = "llvm.cttz.i16";
+ type = ctx->i16;
+ zero = ctx->i16_0;
+ break;
+ default:
+ unreachable(!"invalid bitsize");
}
LLVMValueRef params[2] = {
*
* The hardware already implements the correct behavior.
*/
- LLVMConstInt(ctx->i1, 1, false),
+ ctx->i1true,
};
LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin_name, type,
+/* Pointer-to-unsized-array of elem_type in the (64-bit) constant address
+ * space. */
LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type)
{
return LLVMPointerType(LLVMArrayType(elem_type, 0),
- AC_CONST_ADDR_SPACE);
+ AC_ADDR_SPACE_CONST);
}
+/* Pointer-to-unsized-array of elem_type in the 32-bit constant address
+ * space (the HAVE_32BIT_POINTERS fallback has been removed). */
LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type)
{
- if (!HAVE_32BIT_POINTERS)
- return ac_array_in_const_addr_space(elem_type);
-
return LLVMPointerType(LLVMArrayType(elem_type, 0),
- AC_CONST_32BIT_ADDR_SPACE);
+ AC_ADDR_SPACE_CONST_32BIT);
}
static struct ac_llvm_flow *
ctx->flow_depth--;
}
-static void if_cond_emit(struct ac_llvm_context *ctx, LLVMValueRef cond,
- int label_id)
+void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id)
{
struct ac_llvm_flow *flow = push_flow(ctx);
LLVMBasicBlockRef if_block;
{
LLVMValueRef cond = LLVMBuildFCmp(ctx->builder, LLVMRealUNE,
value, ctx->f32_0, "");
- if_cond_emit(ctx, cond, label_id);
+ ac_build_ifcc(ctx, cond, label_id);
}
void ac_build_uif(struct ac_llvm_context *ctx, LLVMValueRef value,
LLVMValueRef cond = LLVMBuildICmp(ctx->builder, LLVMIntNE,
ac_to_integer(ctx, value),
ctx->i32_0, "");
- if_cond_emit(ctx, cond, label_id);
+ ac_build_ifcc(ctx, cond, label_id);
}
-LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, LLVMTypeRef type,
+LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type,
const char *name)
{
LLVMBuilderRef builder = ac->builder;
}
res = LLVMBuildAlloca(first_builder, type, name);
- LLVMBuildStore(builder, LLVMConstNull(type), res);
-
LLVMDisposeBuilder(first_builder);
-
return res;
}
-LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac,
+/* Entry-block alloca that is zero-initialized: builds on
+ * ac_build_alloca_undef and stores a null constant of the given type. */
+LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac,
LLVMTypeRef type, const char *name)
{
- LLVMValueRef ptr = ac_build_alloca(ac, type, name);
- LLVMBuildStore(ac->builder, LLVMGetUndef(type), ptr);
+ LLVMValueRef ptr = ac_build_alloca_undef(ac, type, name);
+ LLVMBuildStore(ac->builder, LLVMConstNull(type), ptr);
return ptr;
}
if (count == num_components)
return value;
- LLVMValueRef masks[] = {
- LLVMConstInt(ctx->i32, 0, false), LLVMConstInt(ctx->i32, 1, false),
- LLVMConstInt(ctx->i32, 2, false), LLVMConstInt(ctx->i32, 3, false)};
+ LLVMValueRef masks[MAX2(count, 2)];
+ masks[0] = ctx->i32_0;
+ masks[1] = ctx->i32_1;
+ for (unsigned i = 2; i < count; i++)
+ masks[i] = LLVMConstInt(ctx->i32, i, false);
if (count == 1)
return LLVMBuildExtractElement(ctx->builder, value, masks[0],
fmask_load.dmask = 0xf;
fmask_load.dim = is_array_tex ? ac_image_2darray : ac_image_2d;
- LLVMValueRef fmask_addr[4];
- memcpy(fmask_addr, addr, sizeof(fmask_addr[0]) * 3);
- fmask_addr[3] = LLVMGetUndef(ac->i32);
-
- fmask_load.addr = ac_build_gather_values(ac, fmask_addr,
- is_array_tex ? 4 : 2);
+ fmask_load.coords[0] = addr[0];
+ fmask_load.coords[1] = addr[1];
+ if (is_array_tex)
+ fmask_load.coords[2] = addr[2];
LLVMValueRef fmask_value = ac_build_image_opcode(ac, &fmask_load);
fmask_value = LLVMBuildExtractElement(ac->builder, fmask_value,
final_sample = LLVMBuildMul(ac->builder, addr[sample_chan],
LLVMConstInt(ac->i32, 4, 0), "");
final_sample = LLVMBuildLShr(ac->builder, fmask_value, final_sample, "");
+ /* Mask the sample index by 0x7, because 0x8 means an unknown value
+ * with EQAA, so those will map to 0. */
final_sample = LLVMBuildAnd(ac->builder, final_sample,
- LLVMConstInt(ac->i32, 0xF, 0), "");
+ LLVMConstInt(ac->i32, 0x7, 0), "");
/* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
- * resource descriptor is 0 (invalid),
+ * resource descriptor is 0 (invalid).
*/
LLVMValueRef tmp;
tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, "");
ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src,
LLVMValueRef inactive)
{
- char name[32], type[8];
+ char name[33], type[8];
LLVMTypeRef src_type = LLVMTypeOf(src);
src = ac_to_integer(ctx, src);
inactive = ac_to_integer(ctx, inactive);
}
}
-/* TODO: add inclusive and excluse scan functions for SI chip class. */
+/**
+ * \param maxprefix specifies that the result only needs to be correct for a
+ * prefix of this many threads
+ *
+ * TODO: add inclusive and exclusive scan functions for SI chip class.
+ */
static LLVMValueRef
-ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValueRef identity)
+ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValueRef identity,
+ unsigned maxprefix)
{
LLVMValueRef result, tmp;
result = src;
+ /* Each step below combines values shifted by a growing DPP row offset;
+ * early-out as soon as the covered prefix reaches maxprefix. */
+ if (maxprefix <= 1)
+ return result;
tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
result = ac_build_alu_op(ctx, result, tmp, op);
+ if (maxprefix <= 2)
+ return result;
tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
result = ac_build_alu_op(ctx, result, tmp, op);
+ if (maxprefix <= 3)
+ return result;
tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
result = ac_build_alu_op(ctx, result, tmp, op);
+ if (maxprefix <= 4)
+ return result;
tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
result = ac_build_alu_op(ctx, result, tmp, op);
+ if (maxprefix <= 8)
+ return result;
tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
result = ac_build_alu_op(ctx, result, tmp, op);
+ if (maxprefix <= 16)
+ return result;
tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
result = ac_build_alu_op(ctx, result, tmp, op);
+ if (maxprefix <= 32)
+ return result;
tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
result = ac_build_alu_op(ctx, result, tmp, op);
return result;
LLVMValueRef
ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
- ac_build_optimization_barrier(ctx, &src);
LLVMValueRef result;
- LLVMValueRef identity = get_reduction_identity(ctx, op,
- ac_get_type_size(LLVMTypeOf(src)));
- result = LLVMBuildBitCast(ctx->builder,
- ac_build_set_inactive(ctx, src, identity),
- LLVMTypeOf(identity), "");
- result = ac_build_scan(ctx, op, result, identity);
+
+ /* Special case: an i1 add scan is just "how many earlier lanes
+ * (including this one) are true", computed with ballot + mbcnt. */
+ if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
+ LLVMBuilderRef builder = ctx->builder;
+ src = LLVMBuildZExt(builder, src, ctx->i32, "");
+ result = ac_build_ballot(ctx, src);
+ result = ac_build_mbcnt(ctx, result);
+ result = LLVMBuildAdd(builder, result, src, "");
+ return result;
+ }
+
+ ac_build_optimization_barrier(ctx, &src);
+
+ LLVMValueRef identity =
+ get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
+ result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
+ LLVMTypeOf(identity), "");
+ result = ac_build_scan(ctx, op, result, identity, 64);
return ac_build_wwm(ctx, result);
}
LLVMValueRef
ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
- ac_build_optimization_barrier(ctx, &src);
LLVMValueRef result;
- LLVMValueRef identity = get_reduction_identity(ctx, op,
- ac_get_type_size(LLVMTypeOf(src)));
- result = LLVMBuildBitCast(ctx->builder,
- ac_build_set_inactive(ctx, src, identity),
- LLVMTypeOf(identity), "");
+
+ /* Special case: an i1 add scan is just the count of earlier true
+ * lanes, which mbcnt of the ballot mask gives directly. */
+ if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
+ LLVMBuilderRef builder = ctx->builder;
+ src = LLVMBuildZExt(builder, src, ctx->i32, "");
+ result = ac_build_ballot(ctx, src);
+ result = ac_build_mbcnt(ctx, result);
+ return result;
+ }
+
+ ac_build_optimization_barrier(ctx, &src);
+
+ LLVMValueRef identity =
+ get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
+ result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
+ LLVMTypeOf(identity), "");
result = ac_build_dpp(ctx, identity, result, dpp_wf_sr1, 0xf, 0xf, false);
- result = ac_build_scan(ctx, op, result, identity);
+ result = ac_build_scan(ctx, op, result, identity, 64);
return ac_build_wwm(ctx, result);
}
}
}
+/**
+ * "Top half" of a scan that reduces per-wave values across an entire
+ * workgroup.
+ *
+ * The source value must be present in the highest lane of the wave, and the
+ * highest lane must be live.
+ */
+void
+ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
+{
+ if (ws->maxwaves <= 1)
+ return;
+
+ const LLVMValueRef i32_63 = LLVMConstInt(ctx->i32, 63, false);
+ LLVMBuilderRef builder = ctx->builder;
+ LLVMValueRef tid = ac_get_thread_id(ctx);
+ LLVMValueRef tmp;
+
+ /* Only the last lane of the wave (lane 63) writes its value to the
+ * per-wave scratch slot. */
+ tmp = LLVMBuildICmp(builder, LLVMIntEQ, tid, i32_63, "");
+ ac_build_ifcc(ctx, tmp, 1000);
+ LLVMBuildStore(builder, ws->src, LLVMBuildGEP(builder, ws->scratch, &ws->waveidx, 1, ""));
+ ac_build_endif(ctx, 1000);
+}
+
+/**
+ * "Bottom half" of a scan that reduces per-wave values across an entire
+ * workgroup.
+ *
+ * The caller must place a barrier between the top and bottom halves.
+ */
+void
+ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
+{
+ const LLVMTypeRef type = LLVMTypeOf(ws->src);
+ const LLVMValueRef identity =
+ get_reduction_identity(ctx, ws->op, ac_get_type_size(type));
+
+ /* Single wave: nothing to combine across waves. */
+ if (ws->maxwaves <= 1) {
+ ws->result_reduce = ws->src;
+ ws->result_inclusive = ws->src;
+ ws->result_exclusive = identity;
+ return;
+ }
+ assert(ws->maxwaves <= 32);
+
+ LLVMBuilderRef builder = ctx->builder;
+ LLVMValueRef tid = ac_get_thread_id(ctx);
+ LLVMBasicBlockRef bbs[2];
+ LLVMValueRef phivalues_scan[2];
+ LLVMValueRef tmp, tmp2;
+
+ bbs[0] = LLVMGetInsertBlock(builder);
+ phivalues_scan[0] = LLVMGetUndef(type);
+
+ /* Restrict the scan to the lanes that actually hold per-wave values;
+ * the comparison chosen depends on which results are requested. */
+ if (ws->enable_reduce)
+ tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->numwaves, "");
+ else if (ws->enable_inclusive)
+ tmp = LLVMBuildICmp(builder, LLVMIntULE, tid, ws->waveidx, "");
+ else
+ tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->waveidx, "");
+ ac_build_ifcc(ctx, tmp, 1001);
+ {
+ tmp = LLVMBuildLoad(builder, LLVMBuildGEP(builder, ws->scratch, &tid, 1, ""), "");
+
+ ac_build_optimization_barrier(ctx, &tmp);
+
+ bbs[1] = LLVMGetInsertBlock(builder);
+ phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves);
+ }
+ ac_build_endif(ctx, 1001);
+
+ /* Merge the scanned value with the undef from the skipped path. */
+ const LLVMValueRef scan = ac_build_phi(ctx, type, 2, phivalues_scan, bbs);
+
+ if (ws->enable_reduce) {
+ tmp = LLVMBuildSub(builder, ws->numwaves, ctx->i32_1, "");
+ ws->result_reduce = ac_build_readlane(ctx, scan, tmp);
+ }
+ if (ws->enable_inclusive)
+ ws->result_inclusive = ac_build_readlane(ctx, scan, ws->waveidx);
+ if (ws->enable_exclusive) {
+ /* Exclusive result of wave 0 is the identity element. */
+ tmp = LLVMBuildSub(builder, ws->waveidx, ctx->i32_1, "");
+ tmp = ac_build_readlane(ctx, scan, tmp);
+ tmp2 = LLVMBuildICmp(builder, LLVMIntEQ, ws->waveidx, ctx->i32_0, "");
+ ws->result_exclusive = LLVMBuildSelect(builder, tmp2, identity, tmp, "");
+ }
+}
+
+/**
+ * Inclusive scan of a per-wave value across an entire workgroup.
+ *
+ * This implies an s_barrier instruction.
+ *
+ * Unlike ac_build_inclusive_scan, the caller \em must ensure that all threads
+ * of the workgroup are live. (This requirement cannot easily be relaxed in a
+ * useful manner because of the barrier in the algorithm.)
+ */
+void
+ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
+{
+ ac_build_wg_wavescan_top(ctx, ws);
+ ac_build_s_barrier(ctx);
+ ac_build_wg_wavescan_bottom(ctx, ws);
+}
+
+/**
+ * "Top half" of a scan that reduces per-thread values across an entire
+ * workgroup.
+ *
+ * All lanes must be active when this code runs.
+ */
+void
+ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
+{
+ if (ws->enable_exclusive) {
+ ws->extra = ac_build_exclusive_scan(ctx, ws->src, ws->op);
+ /* The exclusive i1-add path zero-extends internally, so widen
+ * src to match before combining. */
+ if (LLVMTypeOf(ws->src) == ctx->i1 && ws->op == nir_op_iadd)
+ ws->src = LLVMBuildZExt(ctx->builder, ws->src, ctx->i32, "");
+ ws->src = ac_build_alu_op(ctx, ws->extra, ws->src, ws->op);
+ } else {
+ ws->src = ac_build_inclusive_scan(ctx, ws->src, ws->op);
+ }
+
+ /* The wavescan stage needs the exclusive result whenever the caller
+ * asked for inclusive; temporarily adjust the flags. */
+ bool enable_inclusive = ws->enable_inclusive;
+ bool enable_exclusive = ws->enable_exclusive;
+ ws->enable_inclusive = false;
+ ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
+ ac_build_wg_wavescan_top(ctx, ws);
+ ws->enable_inclusive = enable_inclusive;
+ ws->enable_exclusive = enable_exclusive;
+}
+
+/**
+ * "Bottom half" of a scan that reduces per-thread values across an entire
+ * workgroup.
+ *
+ * The caller must place a barrier between the top and bottom halves.
+ */
+void
+ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
+{
+ /* Mirror the flag adjustment done in ac_build_wg_scan_top. */
+ bool enable_inclusive = ws->enable_inclusive;
+ bool enable_exclusive = ws->enable_exclusive;
+ ws->enable_inclusive = false;
+ ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
+ ac_build_wg_wavescan_bottom(ctx, ws);
+ ws->enable_inclusive = enable_inclusive;
+ ws->enable_exclusive = enable_exclusive;
+
+ /* ws->result_reduce is already the correct value */
+ if (ws->enable_inclusive)
+ ws->result_inclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->src, ws->op);
+ if (ws->enable_exclusive)
+ ws->result_exclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->extra, ws->op);
+}
+
+/**
+ * A scan that reduces per-thread values across an entire workgroup.
+ *
+ * The caller must ensure that all lanes are active when this code runs
+ * (WWM is insufficient!), because there is an implied barrier.
+ */
+void
+ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
+{
+ ac_build_wg_scan_top(ctx, ws);
+ ac_build_s_barrier(ctx);
+ ac_build_wg_scan_bottom(ctx, ws);
+}
+
LLVMValueRef
ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3)
{
unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3);
- if (ctx->chip_class >= VI && HAVE_LLVM >= 0x0600) {
+ if (ctx->chip_class >= VI) {
return ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false);
} else {
return ac_build_ds_swizzle(ctx, src, (1 << 15) | mask);