ac: use new LLVM 8 intrinsic when storing 16-bit values
[mesa.git] / src / amd / common / ac_llvm_build.c
index 6955df48e003805c9c7d8612a834060599f748e8..73dd0230785b11a23cb3b2fb31454e5e89fe77e0 100644 (file)
@@ -75,7 +75,7 @@ ac_llvm_context_init(struct ac_llvm_context *ctx,
        ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
        ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
        ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
-       ctx->intptr = HAVE_32BIT_POINTERS ? ctx->i32 : ctx->i64;
+       ctx->intptr = ctx->i32;
        ctx->f16 = LLVMHalfTypeInContext(ctx->context);
        ctx->f32 = LLVMFloatTypeInContext(ctx->context);
        ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
@@ -184,7 +184,7 @@ ac_get_type_size(LLVMTypeRef type)
        case LLVMDoubleTypeKind:
                return 8;
        case LLVMPointerTypeKind:
-               if (LLVMGetPointerAddressSpace(type) == AC_CONST_32BIT_ADDR_SPACE)
+               if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT)
                        return 4;
                return 8;
        case LLVMVectorTypeKind:
@@ -219,6 +219,16 @@ ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
                return LLVMVectorType(to_integer_type_scalar(ctx, elem_type),
                                      LLVMGetVectorSize(t));
        }
+       if (LLVMGetTypeKind(t) == LLVMPointerTypeKind) {
+               switch (LLVMGetPointerAddressSpace(t)) {
+               case AC_ADDR_SPACE_GLOBAL:
+                       return ctx->i64;
+               case AC_ADDR_SPACE_LDS:
+                       return ctx->i32;
+               default:
+                       unreachable("unhandled address space");
+               }
+       }
        return to_integer_type_scalar(ctx, t);
 }
 
@@ -226,9 +236,21 @@ LLVMValueRef
 ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v)
 {
        LLVMTypeRef type = LLVMTypeOf(v);
+       if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
+               return LLVMBuildPtrToInt(ctx->builder, v, ac_to_integer_type(ctx, type), "");
+       }
        return LLVMBuildBitCast(ctx->builder, v, ac_to_integer_type(ctx, type), "");
 }
 
+LLVMValueRef
+ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v)
+{
+       LLVMTypeRef type = LLVMTypeOf(v);
+       if (LLVMGetTypeKind(type) == LLVMPointerTypeKind)
+               return v;
+       return ac_to_integer(ctx, v);
+}
+
 static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
 {
        if (t == ctx->i16 || t == ctx->f16)
@@ -523,39 +545,68 @@ ac_build_gather_values(struct ac_llvm_context *ctx,
        return ac_build_gather_values_extended(ctx, values, value_count, 1, false, false);
 }
 
-/* Expand a scalar or vector to <4 x type> by filling the remaining channels
- * with undef. Extract at most num_channels components from the input.
+/* Expand a scalar or vector to <dst_channels x type> by filling the remaining
+ * channels with undef. Extract at most src_channels components from the input.
  */
-LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx,
-                                    LLVMValueRef value,
-                                    unsigned num_channels)
+static LLVMValueRef
+ac_build_expand(struct ac_llvm_context *ctx,
+               LLVMValueRef value,
+               unsigned src_channels,
+               unsigned dst_channels)
 {
        LLVMTypeRef elemtype;
-       LLVMValueRef chan[4];
+       LLVMValueRef chan[dst_channels];
 
        if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
                unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
-               num_channels = MIN2(num_channels, vec_size);
 
-               if (num_channels >= 4)
+               if (src_channels == dst_channels && vec_size == dst_channels)
                        return value;
 
-               for (unsigned i = 0; i < num_channels; i++)
+               src_channels = MIN2(src_channels, vec_size);
+
+               for (unsigned i = 0; i < src_channels; i++)
                        chan[i] = ac_llvm_extract_elem(ctx, value, i);
 
                elemtype = LLVMGetElementType(LLVMTypeOf(value));
        } else {
-               if (num_channels) {
-                       assert(num_channels == 1);
+               if (src_channels) {
+                       assert(src_channels == 1);
                        chan[0] = value;
                }
                elemtype = LLVMTypeOf(value);
        }
 
-       while (num_channels < 4)
-               chan[num_channels++] = LLVMGetUndef(elemtype);
+       for (unsigned i = src_channels; i < dst_channels; i++)
+               chan[i] = LLVMGetUndef(elemtype);
+
+       return ac_build_gather_values(ctx, chan, dst_channels);
+}
+
+/* Expand a scalar or vector to <4 x type> by filling the remaining channels
+ * with undef. Extract at most num_channels components from the input.
+ */
+LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx,
+                                    LLVMValueRef value,
+                                    unsigned num_channels)
+{
+       return ac_build_expand(ctx, value, num_channels, 4);
+}
+
+LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value)
+{
+       unsigned type_size = ac_get_type_size(LLVMTypeOf(value));
+       const char *name;
+
+       if (type_size == 2)
+               name = "llvm.rint.f16";
+       else if (type_size == 4)
+               name = "llvm.rint.f32";
+       else
+               name = "llvm.rint.f64";
 
-       return ac_build_gather_values(ctx, chan, 4);
+       return ac_build_intrinsic(ctx, name, LLVMTypeOf(value), &value, 1,
+                                 AC_FUNC_ATTR_READNONE);
 }
 
 LLVMValueRef
@@ -569,7 +620,8 @@ ac_build_fdiv(struct ac_llvm_context *ctx,
         * If we do (num * (1 / den)), LLVM does:
         *    return num * v_rcp_f32(den);
         */
-       LLVMValueRef rcp = LLVMBuildFDiv(ctx->builder, ctx->f32_1, den, "");
+       LLVMValueRef one = LLVMConstReal(LLVMTypeOf(num), 1.0);
+       LLVMValueRef rcp = LLVMBuildFDiv(ctx->builder, one, den, "");
        LLVMValueRef ret = LLVMBuildFMul(ctx->builder, num, rcp, "");
 
        /* Use v_rcp_f32 instead of precise division. */
@@ -578,6 +630,67 @@ ac_build_fdiv(struct ac_llvm_context *ctx,
        return ret;
 }
 
+/* See fast_idiv_by_const.h. */
+/* Set: increment = util_fast_udiv_info::increment ? multiplier : 0; */
+LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx,
+                               LLVMValueRef num,
+                               LLVMValueRef multiplier,
+                               LLVMValueRef pre_shift,
+                               LLVMValueRef post_shift,
+                               LLVMValueRef increment)
+{
+       LLVMBuilderRef builder = ctx->builder;
+
+       num = LLVMBuildLShr(builder, num, pre_shift, "");
+       num = LLVMBuildMul(builder,
+                          LLVMBuildZExt(builder, num, ctx->i64, ""),
+                          LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
+       num = LLVMBuildAdd(builder, num,
+                          LLVMBuildZExt(builder, increment, ctx->i64, ""), "");
+       num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
+       num = LLVMBuildTrunc(builder, num, ctx->i32, "");
+       return LLVMBuildLShr(builder, num, post_shift, "");
+}
+
+/* See fast_idiv_by_const.h. */
+/* If num != UINT_MAX, this more efficient version can be used. */
+/* Set: increment = util_fast_udiv_info::increment; */
+LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx,
+                                   LLVMValueRef num,
+                                   LLVMValueRef multiplier,
+                                   LLVMValueRef pre_shift,
+                                   LLVMValueRef post_shift,
+                                   LLVMValueRef increment)
+{
+       LLVMBuilderRef builder = ctx->builder;
+
+       num = LLVMBuildLShr(builder, num, pre_shift, "");
+       num = LLVMBuildNUWAdd(builder, num, increment, "");
+       num = LLVMBuildMul(builder,
+                          LLVMBuildZExt(builder, num, ctx->i64, ""),
+                          LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
+       num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
+       num = LLVMBuildTrunc(builder, num, ctx->i32, "");
+       return LLVMBuildLShr(builder, num, post_shift, "");
+}
+
+/* See fast_idiv_by_const.h. */
+/* Both operands must fit in 31 bits and the divisor must not be 1. */
+LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx,
+                                             LLVMValueRef num,
+                                             LLVMValueRef multiplier,
+                                             LLVMValueRef post_shift)
+{
+       LLVMBuilderRef builder = ctx->builder;
+
+       num = LLVMBuildMul(builder,
+                          LLVMBuildZExt(builder, num, ctx->i64, ""),
+                          LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
+       num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
+       num = LLVMBuildTrunc(builder, num, ctx->i32, "");
+       return LLVMBuildLShr(builder, num, post_shift, "");
+}
+
 /* Coordinates for cube map selection. sc, tc, and ma are as in Table 8.27
  * of the OpenGL 4.5 (Compatibility Profile) specification, except ma is
  * already multiplied by two. id is the cube face number.
@@ -675,8 +788,7 @@ ac_prepare_cube_coords(struct ac_llvm_context *ctx,
        LLVMValueRef invma;
 
        if (is_array && !is_lod) {
-               LLVMValueRef tmp = coords_arg[3];
-               tmp = ac_build_intrinsic(ctx, "llvm.rint.f32", ctx->f32, &tmp, 1, 0);
+               LLVMValueRef tmp = ac_build_round(ctx, coords_arg[3]);
 
                /* Section 8.9 (Texture Functions) of the GLSL 4.50 spec says:
                 *
@@ -807,6 +919,37 @@ ac_build_fs_interp(struct ac_llvm_context *ctx,
                                  ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
 }
 
+LLVMValueRef
+ac_build_fs_interp_f16(struct ac_llvm_context *ctx,
+                      LLVMValueRef llvm_chan,
+                      LLVMValueRef attr_number,
+                      LLVMValueRef params,
+                      LLVMValueRef i,
+                      LLVMValueRef j)
+{
+       LLVMValueRef args[6];
+       LLVMValueRef p1;
+
+       args[0] = i;
+       args[1] = llvm_chan;
+       args[2] = attr_number;
+       args[3] = ctx->i1false;
+       args[4] = params;
+
+       p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16",
+                               ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
+
+       args[0] = p1;
+       args[1] = j;
+       args[2] = llvm_chan;
+       args[3] = attr_number;
+       args[4] = ctx->i1false;
+       args[5] = params;
+
+       return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16",
+                                 ctx->f16, args, 6, AC_FUNC_ATTR_READNONE);
+}
+
 LLVMValueRef
 ac_build_fs_interp_mov(struct ac_llvm_context *ctx,
                       LLVMValueRef parameter,
@@ -825,6 +968,14 @@ ac_build_fs_interp_mov(struct ac_llvm_context *ctx,
                                  ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
 }
 
+LLVMValueRef
+ac_build_gep_ptr(struct ac_llvm_context *ctx,
+                LLVMValueRef base_ptr,
+                LLVMValueRef index)
+{
+       return LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");
+}
+
 LLVMValueRef
 ac_build_gep0(struct ac_llvm_context *ctx,
              LLVMValueRef base_ptr,
@@ -891,7 +1042,7 @@ ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
        LLVMValueRef indices[2] = {ctx->i32_0, index};
 
        if (no_unsigned_wraparound &&
-           LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_CONST_32BIT_ADDR_SPACE)
+           LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT)
                pointer = LLVMBuildInBoundsGEP(ctx->builder, base_ptr, indices, 2, "");
        else
                pointer = LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, "");
@@ -931,6 +1082,106 @@ LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx,
        return ac_build_load_custom(ctx, base_ptr, index, true, true, false);
 }
 
+static void
+ac_build_buffer_store_common(struct ac_llvm_context *ctx,
+                            LLVMValueRef rsrc,
+                            LLVMValueRef data,
+                            LLVMValueRef vindex,
+                            LLVMValueRef voffset,
+                            unsigned num_channels,
+                            bool glc,
+                            bool slc,
+                            bool writeonly_memory,
+                            bool use_format)
+{
+       LLVMValueRef args[] = {
+               data,
+               LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
+               vindex ? vindex : ctx->i32_0,
+               voffset,
+               LLVMConstInt(ctx->i1, glc, 0),
+               LLVMConstInt(ctx->i1, slc, 0)
+       };
+       unsigned func = CLAMP(num_channels, 1, 3) - 1;
+
+       const char *type_names[] = {"f32", "v2f32", "v4f32"};
+       char name[256];
+
+       if (use_format) {
+               snprintf(name, sizeof(name), "llvm.amdgcn.buffer.store.format.%s",
+                        type_names[func]);
+       } else {
+               snprintf(name, sizeof(name), "llvm.amdgcn.buffer.store.%s",
+                        type_names[func]);
+       }
+
+       ac_build_intrinsic(ctx, name, ctx->voidt, args, ARRAY_SIZE(args),
+                          ac_get_store_intr_attribs(writeonly_memory));
+}
+
+static void
+ac_build_llvm8_buffer_store_common(struct ac_llvm_context *ctx,
+                                  LLVMValueRef rsrc,
+                                  LLVMValueRef data,
+                                  LLVMValueRef vindex,
+                                  LLVMValueRef voffset,
+                                  LLVMValueRef soffset,
+                                  unsigned num_channels,
+                                  bool glc,
+                                  bool slc,
+                                  bool writeonly_memory,
+                                  bool use_format,
+                                  bool structurized)
+{
+       LLVMValueRef args[6];
+       int idx = 0;
+       args[idx++] = data;
+       args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
+       if (structurized)
+               args[idx++] = vindex ? vindex : ctx->i32_0;
+       args[idx++] = voffset ? voffset : ctx->i32_0;
+       args[idx++] = soffset ? soffset : ctx->i32_0;
+       args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
+       unsigned func = CLAMP(num_channels, 1, 3) - 1;
+
+       const char *type_names[] = {"f32", "v2f32", "v4f32"};
+       const char *indexing_kind = structurized ? "struct" : "raw";
+       char name[256];
+
+       if (use_format) {
+               snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s",
+                        indexing_kind, type_names[func]);
+       } else {
+               snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s",
+                        indexing_kind, type_names[func]);
+       }
+
+       ac_build_intrinsic(ctx, name, ctx->voidt, args, idx,
+                          ac_get_store_intr_attribs(writeonly_memory));
+}
+
+void
+ac_build_buffer_store_format(struct ac_llvm_context *ctx,
+                            LLVMValueRef rsrc,
+                            LLVMValueRef data,
+                            LLVMValueRef vindex,
+                            LLVMValueRef voffset,
+                            unsigned num_channels,
+                            bool glc,
+                            bool writeonly_memory)
+{
+       if (HAVE_LLVM >= 0x800) {
+               ac_build_llvm8_buffer_store_common(ctx, rsrc, data, vindex,
+                                                  voffset, NULL, num_channels,
+                                                  glc, false, writeonly_memory,
+                                                  true, true);
+       } else {
+               ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset,
+                                            num_channels, glc, false,
+                                            writeonly_memory, true);
+       }
+}
+
 /* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
  * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
  * or v4i32 (num_channels=3,4).
@@ -999,9 +1250,7 @@ ac_build_buffer_store_dword(struct ac_llvm_context *ctx,
 
                ac_build_intrinsic(ctx, name, ctx->voidt,
                                   args, ARRAY_SIZE(args),
-                                  writeonly_memory ?
-                                  AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY :
-                                  AC_FUNC_ATTR_WRITEONLY);
+                                  ac_get_store_intr_attribs(writeonly_memory));
                return;
        }
 
@@ -1030,9 +1279,7 @@ ac_build_buffer_store_dword(struct ac_llvm_context *ctx,
 
        ac_build_intrinsic(ctx, name, ctx->voidt,
                           args, ARRAY_SIZE(args),
-                          writeonly_memory ?
-                                  AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY :
-                                  AC_FUNC_ATTR_WRITEONLY);
+                          ac_get_store_intr_attribs(writeonly_memory));
 }
 
 static LLVMValueRef
@@ -1072,6 +1319,47 @@ ac_build_buffer_load_common(struct ac_llvm_context *ctx,
                                  ac_get_load_intr_attribs(can_speculate));
 }
 
+static LLVMValueRef
+ac_build_llvm8_buffer_load_common(struct ac_llvm_context *ctx,
+                                 LLVMValueRef rsrc,
+                                 LLVMValueRef vindex,
+                                 LLVMValueRef voffset,
+                                 LLVMValueRef soffset,
+                                 unsigned num_channels,
+                                 bool glc,
+                                 bool slc,
+                                 bool can_speculate,
+                                 bool use_format,
+                                 bool structurized)
+{
+       LLVMValueRef args[5];
+       int idx = 0;
+       args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
+       if (structurized)
+               args[idx++] = vindex ? vindex : ctx->i32_0;
+       args[idx++] = voffset ? voffset : ctx->i32_0;
+       args[idx++] = soffset ? soffset : ctx->i32_0;
+       args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
+       unsigned func = CLAMP(num_channels, 1, 3) - 1;
+
+       LLVMTypeRef types[] = {ctx->f32, ctx->v2f32, ctx->v4f32};
+       const char *type_names[] = {"f32", "v2f32", "v4f32"};
+       const char *indexing_kind = structurized ? "struct" : "raw";
+       char name[256];
+
+       if (use_format) {
+               snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s",
+                        indexing_kind, type_names[func]);
+       } else {
+               snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s",
+                        indexing_kind, type_names[func]);
+       }
+
+       return ac_build_intrinsic(ctx, name, types[func], args,
+                                 idx,
+                                 ac_get_load_intr_attribs(can_speculate));
+}
+
 LLVMValueRef
 ac_build_buffer_load(struct ac_llvm_context *ctx,
                     LLVMValueRef rsrc,
@@ -1091,8 +1379,8 @@ ac_build_buffer_load(struct ac_llvm_context *ctx,
        if (soffset)
                offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");
 
-       /* TODO: VI and later generations can use SMEM with GLC=1.*/
-       if (allow_smem && !glc && !slc) {
+       if (allow_smem && !slc &&
+           (!glc || (HAVE_LLVM >= 0x0800 && ctx->chip_class >= VI))) {
                assert(vindex == NULL);
 
                LLVMValueRef result[8];
@@ -1102,11 +1390,19 @@ ac_build_buffer_load(struct ac_llvm_context *ctx,
                                offset = LLVMBuildAdd(ctx->builder, offset,
                                                      LLVMConstInt(ctx->i32, 4, 0), "");
                        }
-                       LLVMValueRef args[2] = {rsrc, offset};
-                       result[i] = ac_build_intrinsic(ctx, "llvm.SI.load.const.v4i32",
-                                                      ctx->f32, args, 2,
+                       const char *intrname =
+                               HAVE_LLVM >= 0x0800 ? "llvm.amdgcn.s.buffer.load.f32"
+                                                   : "llvm.SI.load.const.v4i32";
+                       unsigned num_args = HAVE_LLVM >= 0x0800 ? 3 : 2;
+                       LLVMValueRef args[3] = {
+                               rsrc,
+                               offset,
+                               glc ? ctx->i32_1 : ctx->i32_0,
+                       };
+                       result[i] = ac_build_intrinsic(ctx, intrname,
+                                                      ctx->f32, args, num_args,
                                                       AC_FUNC_ATTR_READNONE |
-                                                      AC_FUNC_ATTR_LEGACY);
+                                                      (HAVE_LLVM < 0x0800 ? AC_FUNC_ATTR_LEGACY : 0));
                }
                if (num_channels == 1)
                        return result[0];
@@ -1116,6 +1412,14 @@ ac_build_buffer_load(struct ac_llvm_context *ctx,
                return ac_build_gather_values(ctx, result, num_channels);
        }
 
+       if (HAVE_LLVM >= 0x0800) {
+               return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex,
+                                                        offset, ctx->i32_0,
+                                                        num_channels, glc, slc,
+                                                        can_speculate, false,
+                                                        false);
+       }
+
        return ac_build_buffer_load_common(ctx, rsrc, vindex, offset,
                                           num_channels, glc, slc,
                                           can_speculate, false);
@@ -1129,6 +1433,11 @@ LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx,
                                         bool glc,
                                         bool can_speculate)
 {
+       if (HAVE_LLVM >= 0x800) {
+               return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0,
+                                                        num_channels, glc, false,
+                                                        can_speculate, true, true);
+       }
        return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset,
                                           num_channels, glc, false,
                                           can_speculate, true);
@@ -1142,6 +1451,12 @@ LLVMValueRef ac_build_buffer_load_format_gfx9_safe(struct ac_llvm_context *ctx,
                                                   bool glc,
                                                   bool can_speculate)
 {
+       if (HAVE_LLVM >= 0x800) {
+               return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0,
+                                                        num_channels, glc, false,
+                                                        can_speculate, true, true);
+       }
+
        LLVMValueRef elem_count = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 2, 0), "");
        LLVMValueRef stride = LLVMBuildExtractElement(ctx->builder, rsrc, ctx->i32_1, "");
        stride = LLVMBuildLShr(ctx->builder, stride, LLVMConstInt(ctx->i32, 16, 0), "");
@@ -1158,31 +1473,294 @@ LLVMValueRef ac_build_buffer_load_format_gfx9_safe(struct ac_llvm_context *ctx,
                                           can_speculate, true);
 }
 
-LLVMValueRef
-ac_build_tbuffer_load_short(struct ac_llvm_context *ctx,
+static LLVMValueRef
+ac_build_llvm8_tbuffer_load(struct ac_llvm_context *ctx,
+                           LLVMValueRef rsrc,
+                           LLVMValueRef vindex,
+                           LLVMValueRef voffset,
+                           LLVMValueRef soffset,
+                           unsigned num_channels,
+                           unsigned dfmt,
+                           unsigned nfmt,
+                           bool glc,
+                           bool slc,
+                           bool can_speculate,
+                           bool structurized)
+{
+       LLVMValueRef args[6];
+       int idx = 0;
+       args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
+       if (structurized)
+               args[idx++] = vindex ? vindex : ctx->i32_0;
+       args[idx++] = voffset ? voffset : ctx->i32_0;
+       args[idx++] = soffset ? soffset : ctx->i32_0;
+       args[idx++] = LLVMConstInt(ctx->i32, dfmt | (nfmt << 4), 0);
+       args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
+       unsigned func = CLAMP(num_channels, 1, 3) - 1;
+
+       LLVMTypeRef types[] = {ctx->i32, ctx->v2i32, ctx->v4i32};
+       const char *type_names[] = {"i32", "v2i32", "v4i32"};
+       const char *indexing_kind = structurized ? "struct" : "raw";
+       char name[256];
+
+       snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s",
+                indexing_kind, type_names[func]);
+
+       return ac_build_intrinsic(ctx, name, types[func], args,
+                                 idx,
+                                 ac_get_load_intr_attribs(can_speculate));
+}
+
+static LLVMValueRef
+ac_build_tbuffer_load(struct ac_llvm_context *ctx,
                            LLVMValueRef rsrc,
                            LLVMValueRef vindex,
                            LLVMValueRef voffset,
-                               LLVMValueRef soffset,
-                               LLVMValueRef immoffset)
+                           LLVMValueRef soffset,
+                           LLVMValueRef immoffset,
+                           unsigned num_channels,
+                           unsigned dfmt,
+                           unsigned nfmt,
+                           bool glc,
+                           bool slc,
+                           bool can_speculate,
+                           bool structurized) /* only matters for LLVM 8+ */
 {
-       const char *name = "llvm.amdgcn.tbuffer.load.i32";
-       LLVMTypeRef type = ctx->i32;
-       LLVMValueRef params[] = {
-                               rsrc,
-                               vindex,
-                               voffset,
-                               soffset,
-                               immoffset,
-                               LLVMConstInt(ctx->i32, V_008F0C_BUF_DATA_FORMAT_16, false),
-                               LLVMConstInt(ctx->i32, V_008F0C_BUF_NUM_FORMAT_UINT, false),
-                               ctx->i1false,
-                               ctx->i1false,
+       if (HAVE_LLVM >= 0x800) {
+               voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");
+
+               return ac_build_llvm8_tbuffer_load(ctx, rsrc, vindex, voffset,
+                                                  soffset, num_channels,
+                                                  dfmt, nfmt, glc, slc,
+                                                  can_speculate, structurized);
+       }
+
+       LLVMValueRef args[] = {
+               rsrc,
+               vindex ? vindex : ctx->i32_0,
+               voffset,
+               soffset,
+               immoffset,
+               LLVMConstInt(ctx->i32, dfmt, false),
+               LLVMConstInt(ctx->i32, nfmt, false),
+               LLVMConstInt(ctx->i32, glc, false),
+               LLVMConstInt(ctx->i32, slc, false),
        };
-       LLVMValueRef res = ac_build_intrinsic(ctx, name, type, params, 9, 0);
+       unsigned func = CLAMP(num_channels, 1, 3) - 1;
+       LLVMTypeRef types[] = {ctx->i32, ctx->v2i32, ctx->v4i32};
+       const char *type_names[] = {"i32", "v2i32", "v4i32"};
+       char name[256];
+
+       snprintf(name, sizeof(name), "llvm.amdgcn.tbuffer.load.%s",
+                type_names[func]);
+
+       return ac_build_intrinsic(ctx, name, types[func], args, 9,
+                                 ac_get_load_intr_attribs(can_speculate));
+}
+
+LLVMValueRef
+ac_build_struct_tbuffer_load(struct ac_llvm_context *ctx,
+                            LLVMValueRef rsrc,
+                            LLVMValueRef vindex,
+                            LLVMValueRef voffset,
+                            LLVMValueRef soffset,
+                            LLVMValueRef immoffset,
+                            unsigned num_channels,
+                            unsigned dfmt,
+                            unsigned nfmt,
+                            bool glc,
+                            bool slc,
+                            bool can_speculate)
+{
+       return ac_build_tbuffer_load(ctx, rsrc, vindex, voffset, soffset,
+                                    immoffset, num_channels, dfmt, nfmt, glc,
+                                    slc, can_speculate, true);
+}
+
+LLVMValueRef
+ac_build_raw_tbuffer_load(struct ac_llvm_context *ctx,
+                         LLVMValueRef rsrc,
+                         LLVMValueRef voffset,
+                         LLVMValueRef soffset,
+                         LLVMValueRef immoffset,
+                         unsigned num_channels,
+                         unsigned dfmt,
+                         unsigned nfmt,
+                         bool glc,
+                         bool slc,
+                         bool can_speculate)
+{
+       return ac_build_tbuffer_load(ctx, rsrc, NULL, voffset, soffset,
+                                    immoffset, num_channels, dfmt, nfmt, glc,
+                                    slc, can_speculate, false);
+}
+
+LLVMValueRef
+ac_build_tbuffer_load_short(struct ac_llvm_context *ctx,
+                           LLVMValueRef rsrc,
+                           LLVMValueRef voffset,
+                           LLVMValueRef soffset,
+                           LLVMValueRef immoffset,
+                           bool glc)
+{
+       unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16;
+       unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
+       LLVMValueRef res;
+
+       res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset,
+                                       immoffset, 1, dfmt, nfmt, glc, false,
+                                       false);
+
        return LLVMBuildTrunc(ctx->builder, res, ctx->i16, "");
 }
 
+static void
+ac_build_llvm8_tbuffer_store(struct ac_llvm_context *ctx,
+                            LLVMValueRef rsrc,
+                            LLVMValueRef vdata,
+                            LLVMValueRef vindex,
+                            LLVMValueRef voffset,
+                            LLVMValueRef soffset,
+                            unsigned num_channels,
+                            unsigned dfmt,
+                            unsigned nfmt,
+                            bool glc,
+                            bool slc,
+                            bool writeonly_memory,
+                            bool structurized)
+{
+       LLVMValueRef args[7];
+       int idx = 0;
+       args[idx++] = vdata;
+       args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
+       if (structurized)
+               args[idx++] = vindex ? vindex : ctx->i32_0;
+       args[idx++] = voffset ? voffset : ctx->i32_0;
+       args[idx++] = soffset ? soffset : ctx->i32_0;
+       args[idx++] = LLVMConstInt(ctx->i32, dfmt | (nfmt << 4), 0);
+       args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
+       unsigned func = CLAMP(num_channels, 1, 3) - 1;
+
+       const char *type_names[] = {"i32", "v2i32", "v4i32"};
+       const char *indexing_kind = structurized ? "struct" : "raw";
+       char name[256];
+
+       snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.store.%s",
+                indexing_kind, type_names[func]);
+
+       ac_build_intrinsic(ctx, name, ctx->voidt, args, idx,
+                          ac_get_store_intr_attribs(writeonly_memory));
+}
+
+static void
+ac_build_tbuffer_store(struct ac_llvm_context *ctx,
+                      LLVMValueRef rsrc,
+                      LLVMValueRef vdata,
+                      LLVMValueRef vindex,
+                      LLVMValueRef voffset,
+                      LLVMValueRef soffset,
+                      LLVMValueRef immoffset,
+                      unsigned num_channels,
+                      unsigned dfmt,
+                      unsigned nfmt,
+                      bool glc,
+                      bool slc,
+                      bool writeonly_memory,
+                      bool structurized) /* only matters for LLVM 8+ */
+{
+       if (HAVE_LLVM >= 0x800) {
+               voffset = LLVMBuildAdd(ctx->builder,
+                                      voffset ? voffset : ctx->i32_0,
+                                      immoffset, "");
+
+               ac_build_llvm8_tbuffer_store(ctx, rsrc, vdata, vindex, voffset,
+                                            soffset, num_channels, dfmt, nfmt,
+                                            glc, slc, writeonly_memory,
+                                            structurized);
+       } else {
+               LLVMValueRef params[] = {
+                       vdata,
+                       rsrc,
+                       vindex ? vindex : ctx->i32_0,
+                       voffset ? voffset : ctx->i32_0,
+                       soffset ? soffset : ctx->i32_0,
+                       immoffset,
+                       LLVMConstInt(ctx->i32, dfmt, false),
+                       LLVMConstInt(ctx->i32, nfmt, false),
+                       LLVMConstInt(ctx->i32, glc, false),
+                       LLVMConstInt(ctx->i32, slc, false),
+               };
+               unsigned func = CLAMP(num_channels, 1, 3) - 1;
+               const char *type_names[] = {"i32", "v2i32", "v4i32"};
+               char name[256];
+
+               snprintf(name, sizeof(name), "llvm.amdgcn.tbuffer.store.%s",
+                        type_names[func]);
+
+               ac_build_intrinsic(ctx, name, ctx->voidt, params, 10,
+                                  ac_get_store_intr_attribs(writeonly_memory));
+       }
+}
+
+void
+ac_build_struct_tbuffer_store(struct ac_llvm_context *ctx,
+                             LLVMValueRef rsrc,
+                             LLVMValueRef vdata,
+                             LLVMValueRef vindex,
+                             LLVMValueRef voffset,
+                             LLVMValueRef soffset,
+                             LLVMValueRef immoffset,
+                             unsigned num_channels,
+                             unsigned dfmt,
+                             unsigned nfmt,
+                             bool glc,
+                             bool slc,
+                             bool writeonly_memory)
+{
+       ac_build_tbuffer_store(ctx, rsrc, vdata, vindex, voffset, soffset,
+                              immoffset, num_channels, dfmt, nfmt, glc, slc,
+                              writeonly_memory, true);
+}
+
+void
+ac_build_raw_tbuffer_store(struct ac_llvm_context *ctx,
+                          LLVMValueRef rsrc,
+                          LLVMValueRef vdata,
+                          LLVMValueRef voffset,
+                          LLVMValueRef soffset,
+                          LLVMValueRef immoffset,
+                          unsigned num_channels,
+                          unsigned dfmt,
+                          unsigned nfmt,
+                          bool glc,
+                          bool slc,
+                          bool writeonly_memory)
+{
+       ac_build_tbuffer_store(ctx, rsrc, vdata, NULL, voffset, soffset,
+                              immoffset, num_channels, dfmt, nfmt, glc, slc,
+                              writeonly_memory, false);
+}
+
+void
+ac_build_tbuffer_store_short(struct ac_llvm_context *ctx,
+                            LLVMValueRef rsrc,
+                            LLVMValueRef vdata,
+                            LLVMValueRef voffset,
+                            LLVMValueRef soffset,
+                            bool glc,
+                            bool writeonly_memory)
+{
+       unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16;
+       unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
+
+       vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, "");
+       vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, "");
+
+       ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset,
+                                  ctx->i32_0, 1, dfmt, nfmt, glc, false,
+                                  writeonly_memory);
+}
+
 /**
  * Set range metadata on an instruction.  This can only be used on load and
  * call instructions.  If you know an instruction can only produce the values
@@ -1252,99 +1830,28 @@ ac_build_ddxy(struct ac_llvm_context *ctx,
              int idx,
              LLVMValueRef val)
 {
-       LLVMValueRef tl, trbl, args[2];
+       unsigned tl_lanes[4], trbl_lanes[4];
+       LLVMValueRef tl, trbl;
        LLVMValueRef result;
 
-       if (HAVE_LLVM >= 0x0700) {
-               unsigned tl_lanes[4], trbl_lanes[4];
-
-               for (unsigned i = 0; i < 4; ++i) {
-                       tl_lanes[i] = i & mask;
-                       trbl_lanes[i] = (i & mask) + idx;
-               }
-
-               tl = ac_build_quad_swizzle(ctx, val,
-                                          tl_lanes[0], tl_lanes[1],
-                                          tl_lanes[2], tl_lanes[3]);
-               trbl = ac_build_quad_swizzle(ctx, val,
-                                            trbl_lanes[0], trbl_lanes[1],
-                                            trbl_lanes[2], trbl_lanes[3]);
-       } else if (ctx->chip_class >= VI) {
-               LLVMValueRef thread_id, tl_tid, trbl_tid;
-               thread_id = ac_get_thread_id(ctx);
-
-               tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
-                                     LLVMConstInt(ctx->i32, mask, false), "");
-
-               trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
-                                       LLVMConstInt(ctx->i32, idx, false), "");
-
-               args[0] = LLVMBuildMul(ctx->builder, tl_tid,
-                                      LLVMConstInt(ctx->i32, 4, false), "");
-               args[1] = val;
-               tl = ac_build_intrinsic(ctx,
-                                       "llvm.amdgcn.ds.bpermute", ctx->i32,
-                                       args, 2,
-                                       AC_FUNC_ATTR_READNONE |
-                                       AC_FUNC_ATTR_CONVERGENT);
-
-               args[0] = LLVMBuildMul(ctx->builder, trbl_tid,
-                                      LLVMConstInt(ctx->i32, 4, false), "");
-               trbl = ac_build_intrinsic(ctx,
-                                         "llvm.amdgcn.ds.bpermute", ctx->i32,
-                                         args, 2,
-                                         AC_FUNC_ATTR_READNONE |
-                                         AC_FUNC_ATTR_CONVERGENT);
-       } else {
-               uint32_t masks[2] = {};
-
-               switch (mask) {
-               case AC_TID_MASK_TOP_LEFT:
-                       masks[0] = 0x8000;
-                       if (idx == 1)
-                               masks[1] = 0x8055;
-                       else
-                               masks[1] = 0x80aa;
-
-                       break;
-               case AC_TID_MASK_TOP:
-                       masks[0] = 0x8044;
-                       masks[1] = 0x80ee;
-                       break;
-               case AC_TID_MASK_LEFT:
-                       masks[0] = 0x80a0;
-                       masks[1] = 0x80f5;
-                       break;
-               default:
-                       assert(0);
-               }
-
-               args[0] = val;
-               args[1] = LLVMConstInt(ctx->i32, masks[0], false);
-
-               tl = ac_build_intrinsic(ctx,
-                                       "llvm.amdgcn.ds.swizzle", ctx->i32,
-                                       args, 2,
-                                       AC_FUNC_ATTR_READNONE |
-                                       AC_FUNC_ATTR_CONVERGENT);
-
-               args[1] = LLVMConstInt(ctx->i32, masks[1], false);
-               trbl = ac_build_intrinsic(ctx,
-                                       "llvm.amdgcn.ds.swizzle", ctx->i32,
-                                       args, 2,
-                                       AC_FUNC_ATTR_READNONE |
-                                       AC_FUNC_ATTR_CONVERGENT);
+       for (unsigned i = 0; i < 4; ++i) {
+               tl_lanes[i] = i & mask;
+               trbl_lanes[i] = (i & mask) + idx;
        }
 
+       tl = ac_build_quad_swizzle(ctx, val,
+                                  tl_lanes[0], tl_lanes[1],
+                                  tl_lanes[2], tl_lanes[3]);
+       trbl = ac_build_quad_swizzle(ctx, val,
+                                    trbl_lanes[0], trbl_lanes[1],
+                                    trbl_lanes[2], trbl_lanes[3]);
+
        tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, "");
        trbl = LLVMBuildBitCast(ctx->builder, trbl, ctx->f32, "");
        result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
 
-       if (HAVE_LLVM >= 0x0700) {
-               result = ac_build_intrinsic(ctx,
-                       "llvm.amdgcn.wqm.f32", ctx->f32,
-                       &result, 1, 0);
-       }
+       result = ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.f32", ctx->f32,
+                                   &result, 1, 0);
 
        return result;
 }
@@ -1443,16 +1950,20 @@ ac_build_umsb(struct ac_llvm_context *ctx,
 LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a,
                           LLVMValueRef b)
 {
+       char name[64];
+       snprintf(name, sizeof(name), "llvm.minnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a)));
        LLVMValueRef args[2] = {a, b};
-       return ac_build_intrinsic(ctx, "llvm.minnum.f32", ctx->f32, args, 2,
+       return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2,
                                  AC_FUNC_ATTR_READNONE);
 }
 
 LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a,
                           LLVMValueRef b)
 {
+       char name[64];
+       snprintf(name, sizeof(name), "llvm.maxnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a)));
        LLVMValueRef args[2] = {a, b};
-       return ac_build_intrinsic(ctx, "llvm.maxnum.f32", ctx->f32, args, 2,
+       return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2,
                                  AC_FUNC_ATTR_READNONE);
 }
 
@@ -1479,8 +1990,9 @@ LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a,
 
 LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
 {
-       return ac_build_fmin(ctx, ac_build_fmax(ctx, value, ctx->f32_0),
-                            ctx->f32_1);
+       LLVMTypeRef t = LLVMTypeOf(value);
+       return ac_build_fmin(ctx, ac_build_fmax(ctx, value, LLVMConstReal(t, 0.0)),
+                            LLVMConstReal(t, 1.0));
 }
 
 void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
@@ -1589,171 +2101,6 @@ static const char *get_atomic_name(enum ac_atomic_op op)
        unreachable("bad atomic op");
 }
 
-/* LLVM 6 and older */
-static LLVMValueRef ac_build_image_opcode_llvm6(struct ac_llvm_context *ctx,
-                                               struct ac_image_args *a)
-{
-       LLVMValueRef args[16];
-       LLVMTypeRef retty = ctx->v4f32;
-       const char *name = NULL;
-       const char *atomic_subop = "";
-       char intr_name[128], coords_type[64];
-
-       bool sample = a->opcode == ac_image_sample ||
-                     a->opcode == ac_image_gather4 ||
-                     a->opcode == ac_image_get_lod;
-       bool atomic = a->opcode == ac_image_atomic ||
-                     a->opcode == ac_image_atomic_cmpswap;
-       bool da = a->dim == ac_image_cube ||
-                 a->dim == ac_image_1darray ||
-                 a->dim == ac_image_2darray ||
-                 a->dim == ac_image_2darraymsaa;
-       if (a->opcode == ac_image_get_lod)
-               da = false;
-
-       unsigned num_coords =
-               a->opcode != ac_image_get_resinfo ? ac_num_coords(a->dim) : 0;
-       LLVMValueRef addr;
-       unsigned num_addr = 0;
-
-       if (a->opcode == ac_image_get_lod) {
-               switch (a->dim) {
-               case ac_image_1darray:
-                       num_coords = 1;
-                       break;
-               case ac_image_2darray:
-               case ac_image_cube:
-                       num_coords = 2;
-                       break;
-               default:
-                       break;
-               }
-       }
-
-       if (a->offset)
-               args[num_addr++] = ac_to_integer(ctx, a->offset);
-       if (a->bias)
-               args[num_addr++] = ac_to_integer(ctx, a->bias);
-       if (a->compare)
-               args[num_addr++] = ac_to_integer(ctx, a->compare);
-       if (a->derivs[0]) {
-               unsigned num_derivs = ac_num_derivs(a->dim);
-               for (unsigned i = 0; i < num_derivs; ++i)
-                       args[num_addr++] = ac_to_integer(ctx, a->derivs[i]);
-       }
-       for (unsigned i = 0; i < num_coords; ++i)
-               args[num_addr++] = ac_to_integer(ctx, a->coords[i]);
-       if (a->lod)
-               args[num_addr++] = ac_to_integer(ctx, a->lod);
-
-       unsigned pad_goal = util_next_power_of_two(num_addr);
-       while (num_addr < pad_goal)
-               args[num_addr++] = LLVMGetUndef(ctx->i32);
-
-       addr = ac_build_gather_values(ctx, args, num_addr);
-
-       unsigned num_args = 0;
-       if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
-               args[num_args++] = a->data[0];
-               if (a->opcode == ac_image_atomic_cmpswap)
-                       args[num_args++] = a->data[1];
-       }
-
-       unsigned coords_arg = num_args;
-       if (sample)
-               args[num_args++] = ac_to_float(ctx, addr);
-       else
-               args[num_args++] = ac_to_integer(ctx, addr);
-
-       args[num_args++] = a->resource;
-       if (sample)
-               args[num_args++] = a->sampler;
-       if (!atomic) {
-               args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, 0);
-               if (sample)
-                       args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, 0);
-               args[num_args++] = a->cache_policy & ac_glc ? ctx->i1true : ctx->i1false;
-               args[num_args++] = a->cache_policy & ac_slc ? ctx->i1true : ctx->i1false;
-               args[num_args++] = ctx->i1false; /* lwe */
-               args[num_args++] = LLVMConstInt(ctx->i1, da, 0);
-       } else {
-               args[num_args++] = ctx->i1false; /* r128 */
-               args[num_args++] = LLVMConstInt(ctx->i1, da, 0);
-               args[num_args++] = a->cache_policy & ac_slc ? ctx->i1true : ctx->i1false;
-       }
-
-       switch (a->opcode) {
-       case ac_image_sample:
-               name = "llvm.amdgcn.image.sample";
-               break;
-       case ac_image_gather4:
-               name = "llvm.amdgcn.image.gather4";
-               break;
-       case ac_image_load:
-               name = "llvm.amdgcn.image.load";
-               break;
-       case ac_image_load_mip:
-               name = "llvm.amdgcn.image.load.mip";
-               break;
-       case ac_image_store:
-               name = "llvm.amdgcn.image.store";
-               retty = ctx->voidt;
-               break;
-       case ac_image_store_mip:
-               name = "llvm.amdgcn.image.store.mip";
-               retty = ctx->voidt;
-               break;
-       case ac_image_atomic:
-       case ac_image_atomic_cmpswap:
-               name = "llvm.amdgcn.image.atomic.";
-               retty = ctx->i32;
-               if (a->opcode == ac_image_atomic_cmpswap) {
-                       atomic_subop = "cmpswap";
-               } else {
-                       atomic_subop = get_atomic_name(a->atomic);
-               }
-               break;
-       case ac_image_get_lod:
-               name = "llvm.amdgcn.image.getlod";
-               break;
-       case ac_image_get_resinfo:
-               name = "llvm.amdgcn.image.getresinfo";
-               break;
-       default:
-               unreachable("invalid image opcode");
-       }
-
-       ac_build_type_name_for_intr(LLVMTypeOf(args[coords_arg]), coords_type,
-                                   sizeof(coords_type));
-
-       if (atomic) {
-               snprintf(intr_name, sizeof(intr_name), "llvm.amdgcn.image.atomic.%s.%s",
-                        atomic_subop, coords_type);
-       } else {
-               bool lod_suffix =
-                       a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
-
-               snprintf(intr_name, sizeof(intr_name), "%s%s%s%s.v4f32.%s.v8i32",
-                       name,
-                       a->compare ? ".c" : "",
-                       a->bias ? ".b" :
-                       lod_suffix ? ".l" :
-                       a->derivs[0] ? ".d" :
-                       a->level_zero ? ".lz" : "",
-                       a->offset ? ".o" : "",
-                       coords_type);
-       }
-
-       LLVMValueRef result =
-               ac_build_intrinsic(ctx, intr_name, retty, args, num_args,
-                                  a->attributes);
-       if (!sample && retty == ctx->v4f32) {
-               result = LLVMBuildBitCast(ctx->builder, result,
-                                         ctx->v4i32, "");
-       }
-       return result;
-}
-
 LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx,
                                   struct ac_image_args *a)
 {
@@ -1778,9 +2125,6 @@ LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx,
               (a->level_zero ? 1 : 0) +
               (a->derivs[0] ? 1 : 0) <= 1);
 
-       if (HAVE_LLVM < 0x0700)
-               return ac_build_image_opcode_llvm6(ctx, a);
-
        if (a->opcode == ac_image_get_lod) {
                switch (dim) {
                case ac_image_1darray:
@@ -2062,48 +2406,28 @@ LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0,
        char *intr;
 
        if (bitsize == 32) {
-               intr = "llvm.floor.f32";
+               intr = "llvm.amdgcn.fract.f32";
                type = ctx->f32;
        } else {
-               intr = "llvm.floor.f64";
+               intr = "llvm.amdgcn.fract.f64";
                type = ctx->f64;
        }
 
        LLVMValueRef params[] = {
                src0,
        };
-       LLVMValueRef floor = ac_build_intrinsic(ctx, intr, type, params, 1,
-                                               AC_FUNC_ATTR_READNONE);
-       return LLVMBuildFSub(ctx->builder, src0, floor, "");
+       return ac_build_intrinsic(ctx, intr, type, params, 1,
+                                 AC_FUNC_ATTR_READNONE);
 }
 
 LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0,
                            unsigned bitsize)
 {
-       LLVMValueRef cmp, val, zero, one;
-       LLVMTypeRef type;
-
-       switch (bitsize) {
-       case 64:
-               type = ctx->i64;
-               zero = ctx->i64_0;
-               one = ctx->i64_1;
-               break;
-       case 32:
-               type = ctx->i32;
-               zero = ctx->i32_0;
-               one = ctx->i32_1;
-               break;
-       case 16:
-               type = ctx->i16;
-               zero = ctx->i16_0;
-               one = ctx->i16_1;
-               break;
-       default:
-               unreachable(!"invalid bitsize");
-               break;
-       }
+       LLVMTypeRef type = LLVMIntTypeInContext(ctx->context, bitsize);
+       LLVMValueRef zero = LLVMConstInt(type, 0, false);
+       LLVMValueRef one = LLVMConstInt(type, 1, false);
 
+       LLVMValueRef cmp, val;
        cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, src0, zero, "");
        val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
        cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGE, val, zero, "");
@@ -2154,6 +2478,11 @@ LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)
                                            (LLVMValueRef []) { src0 }, 1,
                                            AC_FUNC_ATTR_READNONE);
                break;
+       case 16:
+               result = ac_build_intrinsic(ctx, "llvm.ctpop.i16", ctx->i16,
+                                           (LLVMValueRef []) { src0 }, 1,
+                                           AC_FUNC_ATTR_READNONE);
+               break;
        default:
                unreachable(!"invalid bitsize");
                break;
@@ -2176,6 +2505,11 @@ LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx,
                                            (LLVMValueRef []) { src0 }, 1,
                                            AC_FUNC_ATTR_READNONE);
                break;
+       case 16:
+               result = ac_build_intrinsic(ctx, "llvm.bitreverse.i16", ctx->i16,
+                                           (LLVMValueRef []) { src0 }, 1,
+                                           AC_FUNC_ATTR_READNONE);
+               break;
        default:
                unreachable(!"invalid bitsize");
                break;
@@ -2472,7 +2806,7 @@ void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx)
 {
        unsigned lds_size = ctx->chip_class >= CIK ? 65536 : 32768;
        ctx->lds = LLVMBuildIntToPtr(ctx->builder, ctx->i32_0,
-                                    LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_LOCAL_ADDR_SPACE),
+                                    LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_ADDR_SPACE_LDS),
                                     "lds");
 }
 
@@ -2554,16 +2888,13 @@ LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx,
 LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type)
 {
        return LLVMPointerType(LLVMArrayType(elem_type, 0),
-                              AC_CONST_ADDR_SPACE);
+                              AC_ADDR_SPACE_CONST);
 }
 
 LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type)
 {
-       if (!HAVE_32BIT_POINTERS)
-               return ac_array_in_const_addr_space(elem_type);
-
        return LLVMPointerType(LLVMArrayType(elem_type, 0),
-                              AC_CONST_32BIT_ADDR_SPACE);
+                              AC_ADDR_SPACE_CONST_32BIT);
 }
 
 static struct ac_llvm_flow *
@@ -2707,8 +3038,7 @@ void ac_build_endloop(struct ac_llvm_context *ctx, int label_id)
        ctx->flow_depth--;
 }
 
-static void if_cond_emit(struct ac_llvm_context *ctx, LLVMValueRef cond,
-                        int label_id)
+void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id)
 {
        struct ac_llvm_flow *flow = push_flow(ctx);
        LLVMBasicBlockRef if_block;
@@ -2725,7 +3055,7 @@ void ac_build_if(struct ac_llvm_context *ctx, LLVMValueRef value,
 {
        LLVMValueRef cond = LLVMBuildFCmp(ctx->builder, LLVMRealUNE,
                                          value, ctx->f32_0, "");
-       if_cond_emit(ctx, cond, label_id);
+       ac_build_ifcc(ctx, cond, label_id);
 }
 
 void ac_build_uif(struct ac_llvm_context *ctx, LLVMValueRef value,
@@ -2734,10 +3064,10 @@ void ac_build_uif(struct ac_llvm_context *ctx, LLVMValueRef value,
        LLVMValueRef cond = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                                          ac_to_integer(ctx, value),
                                          ctx->i32_0, "");
-       if_cond_emit(ctx, cond, label_id);
+       ac_build_ifcc(ctx, cond, label_id);
 }
 
-LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, LLVMTypeRef type,
+LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type,
                             const char *name)
 {
        LLVMBuilderRef builder = ac->builder;
@@ -2755,18 +3085,15 @@ LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, LLVMTypeRef type,
        }
 
        res = LLVMBuildAlloca(first_builder, type, name);
-       LLVMBuildStore(builder, LLVMConstNull(type), res);
-
        LLVMDisposeBuilder(first_builder);
-
        return res;
 }
 
-LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac,
+LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac,
                                   LLVMTypeRef type, const char *name)
 {
-       LLVMValueRef ptr = ac_build_alloca(ac, type, name);
-       LLVMBuildStore(ac->builder, LLVMGetUndef(type), ptr);
+       LLVMValueRef ptr = ac_build_alloca_undef(ac, type, name);
+       LLVMBuildStore(ac->builder, LLVMConstNull(type), ptr);
        return ptr;
 }
 
@@ -2785,9 +3112,11 @@ LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value,
        if (count == num_components)
                return value;
 
-       LLVMValueRef masks[] = {
-           ctx->i32_0, ctx->i32_1,
-           LLVMConstInt(ctx->i32, 2, false), LLVMConstInt(ctx->i32, 3, false)};
+       LLVMValueRef masks[MAX2(count, 2)];
+       masks[0] = ctx->i32_0;
+       masks[1] = ctx->i32_1;
+       for (unsigned i = 2; i < count; i++)
+               masks[i] = LLVMConstInt(ctx->i32, i, false);
 
        if (count == 1)
                return LLVMBuildExtractElement(ctx->builder, value, masks[0],
@@ -3195,24 +3524,44 @@ ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs,
        }
 }
 
-/* TODO: add inclusive and excluse scan functions for SI chip class.  */
+/**
+ * \param maxprefix specifies that the result only needs to be correct for a
+ *     prefix of this many threads
+ *
+ * TODO: add inclusive and excluse scan functions for SI chip class.
+ */
 static LLVMValueRef
-ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValueRef identity)
+ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValueRef identity,
+             unsigned maxprefix)
 {
        LLVMValueRef result, tmp;
        result = src;
+       if (maxprefix <= 1)
+               return result;
        tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
        result = ac_build_alu_op(ctx, result, tmp, op);
+       if (maxprefix <= 2)
+               return result;
        tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
        result = ac_build_alu_op(ctx, result, tmp, op);
+       if (maxprefix <= 3)
+               return result;
        tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
        result = ac_build_alu_op(ctx, result, tmp, op);
+       if (maxprefix <= 4)
+               return result;
        tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
        result = ac_build_alu_op(ctx, result, tmp, op);
+       if (maxprefix <= 8)
+               return result;
        tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
        result = ac_build_alu_op(ctx, result, tmp, op);
+       if (maxprefix <= 16)
+               return result;
        tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
        result = ac_build_alu_op(ctx, result, tmp, op);
+       if (maxprefix <= 32)
+               return result;
        tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
        result = ac_build_alu_op(ctx, result, tmp, op);
        return result;
@@ -3221,14 +3570,24 @@ ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValu
 LLVMValueRef
 ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
 {
-       ac_build_optimization_barrier(ctx, &src);
        LLVMValueRef result;
-       LLVMValueRef identity = get_reduction_identity(ctx, op,
-                                                               ac_get_type_size(LLVMTypeOf(src)));
-       result = LLVMBuildBitCast(ctx->builder,
-                                                               ac_build_set_inactive(ctx, src, identity),
-                                                               LLVMTypeOf(identity), "");
-       result = ac_build_scan(ctx, op, result, identity);
+
+       if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
+               LLVMBuilderRef builder = ctx->builder;
+               src = LLVMBuildZExt(builder, src, ctx->i32, "");
+               result = ac_build_ballot(ctx, src);
+               result = ac_build_mbcnt(ctx, result);
+               result = LLVMBuildAdd(builder, result, src, "");
+               return result;
+       }
+
+       ac_build_optimization_barrier(ctx, &src);
+
+       LLVMValueRef identity =
+               get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
+       result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
+                                 LLVMTypeOf(identity), "");
+       result = ac_build_scan(ctx, op, result, identity, 64);
 
        return ac_build_wwm(ctx, result);
 }
@@ -3236,15 +3595,24 @@ ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op
 LLVMValueRef
 ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
 {
-       ac_build_optimization_barrier(ctx, &src);
        LLVMValueRef result;
-       LLVMValueRef identity = get_reduction_identity(ctx, op,
-                                                               ac_get_type_size(LLVMTypeOf(src)));
-       result = LLVMBuildBitCast(ctx->builder,
-                                                               ac_build_set_inactive(ctx, src, identity),
-                                                               LLVMTypeOf(identity), "");
+
+       if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
+               LLVMBuilderRef builder = ctx->builder;
+               src = LLVMBuildZExt(builder, src, ctx->i32, "");
+               result = ac_build_ballot(ctx, src);
+               result = ac_build_mbcnt(ctx, result);
+               return result;
+       }
+
+       ac_build_optimization_barrier(ctx, &src);
+
+       LLVMValueRef identity =
+               get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
+       result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
+                                 LLVMTypeOf(identity), "");
        result = ac_build_dpp(ctx, identity, result, dpp_wf_sr1, 0xf, 0xf, false);
-       result = ac_build_scan(ctx, op, result, identity);
+       result = ac_build_scan(ctx, op, result, identity, 64);
 
        return ac_build_wwm(ctx, result);
 }
@@ -3302,6 +3670,175 @@ ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsign
        }
 }
 
+/**
+ * "Top half" of a scan that reduces per-wave values across an entire
+ * workgroup.
+ *
+ * The source value must be present in the highest lane of the wave, and the
+ * highest lane must be live.
+ */
+void
+ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
+{
+       if (ws->maxwaves <= 1)
+               return;
+
+       const LLVMValueRef i32_63 = LLVMConstInt(ctx->i32, 63, false);
+       LLVMBuilderRef builder = ctx->builder;
+       LLVMValueRef tid = ac_get_thread_id(ctx);
+       LLVMValueRef tmp;
+
+       tmp = LLVMBuildICmp(builder, LLVMIntEQ, tid, i32_63, "");
+       ac_build_ifcc(ctx, tmp, 1000);
+       LLVMBuildStore(builder, ws->src, LLVMBuildGEP(builder, ws->scratch, &ws->waveidx, 1, ""));
+       ac_build_endif(ctx, 1000);
+}
+
+/**
+ * "Bottom half" of a scan that reduces per-wave values across an entire
+ * workgroup.
+ *
+ * The caller must place a barrier between the top and bottom halves.
+ */
+void
+ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
+{
+       const LLVMTypeRef type = LLVMTypeOf(ws->src);
+       const LLVMValueRef identity =
+               get_reduction_identity(ctx, ws->op, ac_get_type_size(type));
+
+       if (ws->maxwaves <= 1) {
+               ws->result_reduce = ws->src;
+               ws->result_inclusive = ws->src;
+               ws->result_exclusive = identity;
+               return;
+       }
+       assert(ws->maxwaves <= 32);
+
+       LLVMBuilderRef builder = ctx->builder;
+       LLVMValueRef tid = ac_get_thread_id(ctx);
+       LLVMBasicBlockRef bbs[2];
+       LLVMValueRef phivalues_scan[2];
+       LLVMValueRef tmp, tmp2;
+
+       bbs[0] = LLVMGetInsertBlock(builder);
+       phivalues_scan[0] = LLVMGetUndef(type);
+
+       if (ws->enable_reduce)
+               tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->numwaves, "");
+       else if (ws->enable_inclusive)
+               tmp = LLVMBuildICmp(builder, LLVMIntULE, tid, ws->waveidx, "");
+       else
+               tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->waveidx, "");
+       ac_build_ifcc(ctx, tmp, 1001);
+       {
+               tmp = LLVMBuildLoad(builder, LLVMBuildGEP(builder, ws->scratch, &tid, 1, ""), "");
+
+               ac_build_optimization_barrier(ctx, &tmp);
+
+               bbs[1] = LLVMGetInsertBlock(builder);
+               phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves);
+       }
+       ac_build_endif(ctx, 1001);
+
+       const LLVMValueRef scan = ac_build_phi(ctx, type, 2, phivalues_scan, bbs);
+
+       if (ws->enable_reduce) {
+               tmp = LLVMBuildSub(builder, ws->numwaves, ctx->i32_1, "");
+               ws->result_reduce = ac_build_readlane(ctx, scan, tmp);
+       }
+       if (ws->enable_inclusive)
+               ws->result_inclusive = ac_build_readlane(ctx, scan, ws->waveidx);
+       if (ws->enable_exclusive) {
+               tmp = LLVMBuildSub(builder, ws->waveidx, ctx->i32_1, "");
+               tmp = ac_build_readlane(ctx, scan, tmp);
+               tmp2 = LLVMBuildICmp(builder, LLVMIntEQ, ws->waveidx, ctx->i32_0, "");
+               ws->result_exclusive = LLVMBuildSelect(builder, tmp2, identity, tmp, "");
+       }
+}
+
+/**
+ * Inclusive scan of a per-wave value across an entire workgroup.
+ *
+ * This implies an s_barrier instruction.
+ *
+ * Unlike ac_build_inclusive_scan, the caller \em must ensure that all threads
+ * of the workgroup are live. (This requirement cannot easily be relaxed in a
+ * useful manner because of the barrier in the algorithm.)
+ */
+void
+ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
+{
+       ac_build_wg_wavescan_top(ctx, ws);
+       ac_build_s_barrier(ctx);
+       ac_build_wg_wavescan_bottom(ctx, ws);
+}
+
+/**
+ * "Top half" of a scan that reduces per-thread values across an entire
+ * workgroup.
+ *
+ * All lanes must be active when this code runs.
+ */
+void
+ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
+{
+       if (ws->enable_exclusive) {
+               ws->extra = ac_build_exclusive_scan(ctx, ws->src, ws->op);
+               if (LLVMTypeOf(ws->src) == ctx->i1 && ws->op == nir_op_iadd)
+                       ws->src = LLVMBuildZExt(ctx->builder, ws->src, ctx->i32, "");
+               ws->src = ac_build_alu_op(ctx, ws->extra, ws->src, ws->op);
+       } else {
+               ws->src = ac_build_inclusive_scan(ctx, ws->src, ws->op);
+       }
+
+       bool enable_inclusive = ws->enable_inclusive;
+       bool enable_exclusive = ws->enable_exclusive;
+       ws->enable_inclusive = false;
+       ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
+       ac_build_wg_wavescan_top(ctx, ws);
+       ws->enable_inclusive = enable_inclusive;
+       ws->enable_exclusive = enable_exclusive;
+}
+
+/**
+ * "Bottom half" of a scan that reduces per-thread values across an entire
+ * workgroup.
+ *
+ * The caller must place a barrier between the top and bottom halves.
+ */
+void
+ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
+{
+       bool enable_inclusive = ws->enable_inclusive;
+       bool enable_exclusive = ws->enable_exclusive;
+       ws->enable_inclusive = false;
+       ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
+       ac_build_wg_wavescan_bottom(ctx, ws);
+       ws->enable_inclusive = enable_inclusive;
+       ws->enable_exclusive = enable_exclusive;
+
+       /* ws->result_reduce is already the correct value */
+       if (ws->enable_inclusive)
+               ws->result_inclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->src, ws->op);
+       if (ws->enable_exclusive)
+               ws->result_exclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->extra, ws->op);
+}
+
+/**
+ * A scan that reduces per-thread values across an entire workgroup.
+ *
+ * The caller must ensure that all lanes are active when this code runs
+ * (WWM is insufficient!), because there is an implied barrier.
+ */
+void
+ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
+{
+       ac_build_wg_scan_top(ctx, ws);
+       ac_build_s_barrier(ctx);
+       ac_build_wg_scan_bottom(ctx, ws);
+}
+
 LLVMValueRef
 ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
                unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3)