ac: replace glc,slc with cache_policy for stores
[mesa.git] / src / amd / common / ac_llvm_build.c
index 3ad9bb348058bd192829812b5dd3dc8d33f96a27..eb143e3fa100507940c48f95e2716f5938bc84e0 100644 (file)
@@ -84,6 +84,7 @@ ac_llvm_context_init(struct ac_llvm_context *ctx,
        ctx->v3i32 = LLVMVectorType(ctx->i32, 3);
        ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
        ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
+       ctx->v3f32 = LLVMVectorType(ctx->f32, 3);
        ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
        ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
 
@@ -440,6 +441,7 @@ LLVMValueRef
 ac_build_ballot(struct ac_llvm_context *ctx,
                LLVMValueRef value)
 {
+       const char *name = HAVE_LLVM >= 0x900 ? "llvm.amdgcn.icmp.i64.i32" : "llvm.amdgcn.icmp.i32";
        LLVMValueRef args[3] = {
                value,
                ctx->i32_0,
@@ -453,8 +455,7 @@ ac_build_ballot(struct ac_llvm_context *ctx,
 
        args[0] = ac_to_integer(ctx, args[0]);
 
-       return ac_build_intrinsic(ctx,
-                                 "llvm.amdgcn.icmp.i32",
+       return ac_build_intrinsic(ctx, name,
                                  ctx->i64, args, 3,
                                  AC_FUNC_ATTR_NOUNWIND |
                                  AC_FUNC_ATTR_READNONE |
@@ -464,6 +465,7 @@ ac_build_ballot(struct ac_llvm_context *ctx,
 LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx,
                                 LLVMValueRef value)
 {
+       const char *name = HAVE_LLVM >= 0x900 ? "llvm.amdgcn.icmp.i64.i1" : "llvm.amdgcn.icmp.i1";
        LLVMValueRef args[3] = {
                value,
                ctx->i1false,
@@ -471,7 +473,7 @@ LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx,
        };
 
        assert(HAVE_LLVM >= 0x0800);
-       return ac_build_intrinsic(ctx, "llvm.amdgcn.icmp.i1", ctx->i64, args, 3,
+       return ac_build_intrinsic(ctx, name, ctx->i64, args, 3,
                                  AC_FUNC_ATTR_NOUNWIND |
                                  AC_FUNC_ATTR_READNONE |
                                  AC_FUNC_ATTR_CONVERGENT);
@@ -1105,25 +1107,32 @@ LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx,
        return ac_build_load_custom(ctx, base_ptr, index, true, true, false);
 }
 
+static LLVMValueRef get_cache_policy(struct ac_llvm_context *ctx,
+                                    bool load, bool glc, bool slc)
+{      /* Builds the i32 constant that carries the cache-policy flags derived from glc/slc. */
+       return LLVMConstInt(ctx->i32,
+                           (glc ? ac_glc : 0) + /* flags are summed; assumes ac_* are distinct bits - TODO confirm */
+                           (slc ? ac_slc : 0) +
+                           (ctx->chip_class >= GFX10 && glc && load ? ac_dlc : 0), 0); /* ac_dlc is added only for glc loads on GFX10+ */
+}
+
 static void
-ac_build_buffer_store_common(struct ac_llvm_context *ctx,
-                            LLVMValueRef rsrc,
-                            LLVMValueRef data,
-                            LLVMValueRef vindex,
-                            LLVMValueRef voffset,
-                            unsigned num_channels,
-                            bool glc,
-                            bool slc,
-                            bool writeonly_memory,
-                            bool use_format)
+ac_build_llvm7_buffer_store_common(struct ac_llvm_context *ctx,
+                                  LLVMValueRef rsrc,
+                                  LLVMValueRef data,
+                                  LLVMValueRef vindex,
+                                  LLVMValueRef voffset,
+                                  unsigned num_channels,
+                                  unsigned cache_policy,
+                                  bool use_format)
 {
        LLVMValueRef args[] = {
                data,
                LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
                vindex ? vindex : ctx->i32_0,
                voffset,
-               LLVMConstInt(ctx->i1, glc, 0),
-               LLVMConstInt(ctx->i1, slc, 0)
+               LLVMConstInt(ctx->i1, !!(cache_policy & ac_glc), 0),
+               LLVMConstInt(ctx->i1, !!(cache_policy & ac_slc), 0)
        };
        unsigned func = CLAMP(num_channels, 1, 3) - 1;
 
@@ -1139,7 +1148,7 @@ ac_build_buffer_store_common(struct ac_llvm_context *ctx,
        }
 
        ac_build_intrinsic(ctx, name, ctx->voidt, args, ARRAY_SIZE(args),
-                          ac_get_store_intr_attribs(writeonly_memory));
+                          AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
 }
 
 static void
@@ -1151,9 +1160,7 @@ ac_build_llvm8_buffer_store_common(struct ac_llvm_context *ctx,
                                   LLVMValueRef soffset,
                                   unsigned num_channels,
                                   LLVMTypeRef return_channel_type,
-                                  bool glc,
-                                  bool slc,
-                                  bool writeonly_memory,
+                                  unsigned cache_policy,
                                   bool use_format,
                                   bool structurized)
 {
@@ -1165,8 +1172,8 @@ ac_build_llvm8_buffer_store_common(struct ac_llvm_context *ctx,
                args[idx++] = vindex ? vindex : ctx->i32_0;
        args[idx++] = voffset ? voffset : ctx->i32_0;
        args[idx++] = soffset ? soffset : ctx->i32_0;
-       args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
-       unsigned func = num_channels == 3 ? 4 : num_channels;
+       args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0);
+       unsigned func = !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels;
        const char *indexing_kind = structurized ? "struct" : "raw";
        char name[256], type_name[8];
 
@@ -1182,7 +1189,7 @@ ac_build_llvm8_buffer_store_common(struct ac_llvm_context *ctx,
        }
 
        ac_build_intrinsic(ctx, name, ctx->voidt, args, idx,
-                          ac_get_store_intr_attribs(writeonly_memory));
+                          AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
 }
 
 void
@@ -1192,18 +1199,17 @@ ac_build_buffer_store_format(struct ac_llvm_context *ctx,
                             LLVMValueRef vindex,
                             LLVMValueRef voffset,
                             unsigned num_channels,
-                            bool glc,
-                            bool writeonly_memory)
+                            unsigned cache_policy)
 {
        if (HAVE_LLVM >= 0x800) {
                ac_build_llvm8_buffer_store_common(ctx, rsrc, data, vindex,
                                                   voffset, NULL, num_channels,
-                                                  ctx->f32, glc, false,
-                                                  writeonly_memory, true, true);
+                                                  ctx->f32, cache_policy,
+                                                  true, true);
        } else {
-               ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset,
-                                            num_channels, glc, false,
-                                            writeonly_memory, true);
+               ac_build_llvm7_buffer_store_common(ctx, rsrc, data, vindex, voffset,
+                                                  num_channels, cache_policy,
+                                                  true);
        }
 }
 
@@ -1219,14 +1225,12 @@ ac_build_buffer_store_dword(struct ac_llvm_context *ctx,
                            LLVMValueRef voffset,
                            LLVMValueRef soffset,
                            unsigned inst_offset,
-                           bool glc,
-                           bool slc,
-                           bool writeonly_memory,
+                           unsigned cache_policy,
                            bool swizzle_enable_hint)
 {
-       /* Split 3 channel stores, becase LLVM doesn't support 3-channel
+       /* Split 3-channel stores, because only LLVM 9+ supports 3-channel
         * intrinsics. */
-       if (num_channels == 3) {
+       if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false)) {
                LLVMValueRef v[3], v01;
 
                for (int i = 0; i < 3; i++) {
@@ -1236,12 +1240,12 @@ ac_build_buffer_store_dword(struct ac_llvm_context *ctx,
                v01 = ac_build_gather_values(ctx, v, 2);
 
                ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset,
-                                           soffset, inst_offset, glc, slc,
-                                           writeonly_memory, swizzle_enable_hint);
+                                           soffset, inst_offset, cache_policy,
+                                           swizzle_enable_hint);
                ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset,
                                            soffset, inst_offset + 8,
-                                           glc, slc,
-                                           writeonly_memory, swizzle_enable_hint);
+                                           cache_policy,
+                                           swizzle_enable_hint);
                return;
        }
 
@@ -1263,18 +1267,17 @@ ac_build_buffer_store_dword(struct ac_llvm_context *ctx,
                                                           voffset, offset,
                                                           num_channels,
                                                           ctx->f32,
-                                                          glc, slc,
-                                                          writeonly_memory,
+                                                          cache_policy,
                                                           false, false);
                } else {
                        if (voffset)
                                offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");
 
-                       ac_build_buffer_store_common(ctx, rsrc,
-                                                    ac_to_float(ctx, vdata),
-                                                    ctx->i32_0, offset,
-                                                    num_channels, glc, slc,
-                                                    writeonly_memory, false);
+                       ac_build_llvm7_buffer_store_common(ctx, rsrc,
+                                                          ac_to_float(ctx, vdata),
+                                                          ctx->i32_0, offset,
+                                                          num_channels, cache_policy,
+                                                          false);
                }
                return;
        }
@@ -1290,20 +1293,19 @@ ac_build_buffer_store_dword(struct ac_llvm_context *ctx,
        LLVMValueRef immoffset = LLVMConstInt(ctx->i32, inst_offset, 0);
 
        ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset,
-                                  immoffset, num_channels, dfmt, nfmt, glc,
-                                  slc, writeonly_memory);
+                                  immoffset, num_channels, dfmt, nfmt, cache_policy);
 }
 
 static LLVMValueRef
-ac_build_buffer_load_common(struct ac_llvm_context *ctx,
-                           LLVMValueRef rsrc,
-                           LLVMValueRef vindex,
-                           LLVMValueRef voffset,
-                           unsigned num_channels,
-                           bool glc,
-                           bool slc,
-                           bool can_speculate,
-                           bool use_format)
+ac_build_llvm7_buffer_load_common(struct ac_llvm_context *ctx,
+                                 LLVMValueRef rsrc,
+                                 LLVMValueRef vindex,
+                                 LLVMValueRef voffset,
+                                 unsigned num_channels,
+                                 bool glc,
+                                 bool slc,
+                                 bool can_speculate,
+                                 bool use_format)
 {
        LLVMValueRef args[] = {
                LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
@@ -1352,8 +1354,8 @@ ac_build_llvm8_buffer_load_common(struct ac_llvm_context *ctx,
                args[idx++] = vindex ? vindex : ctx->i32_0;
        args[idx++] = voffset ? voffset : ctx->i32_0;
        args[idx++] = soffset ? soffset : ctx->i32_0;
-       args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
-       unsigned func = num_channels == 3 ? 4 : num_channels;
+       args[idx++] = get_cache_policy(ctx, true, glc, slc);
+       unsigned func = !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels;
        const char *indexing_kind = structurized ? "struct" : "raw";
        char name[256], type_name[8];
 
@@ -1409,7 +1411,7 @@ ac_build_buffer_load(struct ac_llvm_context *ctx,
                        LLVMValueRef args[3] = {
                                rsrc,
                                offset,
-                               glc ? ctx->i32_1 : ctx->i32_0,
+                               get_cache_policy(ctx, true, glc, false),
                        };
                        result[i] = ac_build_intrinsic(ctx, intrname,
                                                       ctx->f32, args, num_args,
@@ -1419,7 +1421,7 @@ ac_build_buffer_load(struct ac_llvm_context *ctx,
                if (num_channels == 1)
                        return result[0];
 
-               if (num_channels == 3)
+               if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false))
                        result[num_channels++] = LLVMGetUndef(ctx->f32);
                return ac_build_gather_values(ctx, result, num_channels);
        }
@@ -1433,9 +1435,9 @@ ac_build_buffer_load(struct ac_llvm_context *ctx,
                                                         false);
        }
 
-       return ac_build_buffer_load_common(ctx, rsrc, vindex, offset,
-                                          num_channels, glc, slc,
-                                          can_speculate, false);
+       return ac_build_llvm7_buffer_load_common(ctx, rsrc, vindex, offset,
+                                                num_channels, glc, slc,
+                                                can_speculate, false);
 }
 
 LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx,
@@ -1452,9 +1454,9 @@ LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx,
                                                         glc, false,
                                                         can_speculate, true, true);
        }
-       return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset,
-                                          num_channels, glc, false,
-                                          can_speculate, true);
+       return ac_build_llvm7_buffer_load_common(ctx, rsrc, vindex, voffset,
+                                                num_channels, glc, false,
+                                                can_speculate, true);
 }
 
 LLVMValueRef ac_build_buffer_load_format_gfx9_safe(struct ac_llvm_context *ctx,
@@ -1483,9 +1485,52 @@ LLVMValueRef ac_build_buffer_load_format_gfx9_safe(struct ac_llvm_context *ctx,
        LLVMValueRef new_rsrc = LLVMBuildInsertElement(ctx->builder, rsrc, new_elem_count,
                                                       LLVMConstInt(ctx->i32, 2, 0), "");
 
-       return ac_build_buffer_load_common(ctx, new_rsrc, vindex, voffset,
-                                          num_channels, glc, false,
-                                          can_speculate, true);
+       return ac_build_llvm7_buffer_load_common(ctx, new_rsrc, vindex, voffset,
+                                                num_channels, glc, false,
+                                                can_speculate, true);
+}
+
+/// Translate a (dfmt, nfmt) pair into the single format value handed to the
+/// LLVM8+ tbuffer intrinsics: a combined IMG_FORMAT on GFX10+, dfmt | (nfmt << 4) otherwise.
+static unsigned
+ac_get_tbuffer_format(struct ac_llvm_context *ctx,
+                     unsigned dfmt, unsigned nfmt)
+{
+       if (ctx->chip_class >= GFX10) {
+               unsigned format;
+               switch (dfmt) { // first pick the *_UINT combined format matching the data format
+               default: unreachable("bad dfmt");
+               case V_008F0C_BUF_DATA_FORMAT_8: format = V_008F0C_IMG_FORMAT_8_UINT; break;
+               case V_008F0C_BUF_DATA_FORMAT_8_8: format = V_008F0C_IMG_FORMAT_8_8_UINT; break;
+               case V_008F0C_BUF_DATA_FORMAT_8_8_8_8: format = V_008F0C_IMG_FORMAT_8_8_8_8_UINT; break;
+               case V_008F0C_BUF_DATA_FORMAT_16: format = V_008F0C_IMG_FORMAT_16_UINT; break;
+               case V_008F0C_BUF_DATA_FORMAT_16_16: format = V_008F0C_IMG_FORMAT_16_16_UINT; break;
+               case V_008F0C_BUF_DATA_FORMAT_16_16_16_16: format = V_008F0C_IMG_FORMAT_16_16_16_16_UINT; break;
+               case V_008F0C_BUF_DATA_FORMAT_32: format = V_008F0C_IMG_FORMAT_32_UINT; break;
+               case V_008F0C_BUF_DATA_FORMAT_32_32: format = V_008F0C_IMG_FORMAT_32_32_UINT; break;
+               case V_008F0C_BUF_DATA_FORMAT_32_32_32_32: format = V_008F0C_IMG_FORMAT_32_32_32_32_UINT; break;
+               }
+
+               // Use the regularity of the combined format enum: step from the *_UINT entry by a fixed offset per nfmt.
+               //
+               // Note: float is incompatible with 8-bit data formats,
+               //       [us]{norm,scaled} are incompatible with 32-bit data formats.
+               //       [us]scaled are not writable.
+               switch (nfmt) {
+               case V_008F0C_BUF_NUM_FORMAT_UNORM: format -= 4; break;
+               case V_008F0C_BUF_NUM_FORMAT_SNORM: format -= 3; break;
+               case V_008F0C_BUF_NUM_FORMAT_USCALED: format -= 2; break;
+               case V_008F0C_BUF_NUM_FORMAT_SSCALED: format -= 1; break;
+               default: unreachable("bad nfmt");
+               case V_008F0C_BUF_NUM_FORMAT_UINT: break; // already the *_UINT entry, no offset
+               case V_008F0C_BUF_NUM_FORMAT_SINT: format += 1; break;
+               case V_008F0C_BUF_NUM_FORMAT_FLOAT: format += 2; break;
+               }
+
+               return format;
+       } else {
+               return dfmt | (nfmt << 4); // pre-GFX10: dfmt in the low bits, nfmt starting at bit 4
+       }
 }
 
 static LLVMValueRef
@@ -1509,9 +1554,9 @@ ac_build_llvm8_tbuffer_load(struct ac_llvm_context *ctx,
                args[idx++] = vindex ? vindex : ctx->i32_0;
        args[idx++] = voffset ? voffset : ctx->i32_0;
        args[idx++] = soffset ? soffset : ctx->i32_0;
-       args[idx++] = LLVMConstInt(ctx->i32, dfmt | (nfmt << 4), 0);
-       args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
-       unsigned func = num_channels == 3 ? 4 : num_channels;
+       args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx, dfmt, nfmt), 0);
+       args[idx++] = get_cache_policy(ctx, true, glc, slc);
+       unsigned func = !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;
        const char *indexing_kind = structurized ? "struct" : "raw";
        char name[256], type_name[8];
 
@@ -1810,7 +1855,7 @@ ac_build_opencoded_load_format(struct ac_llvm_context *ctx,
                                        can_speculate, false, true);
                } else {
                        tmp = LLVMBuildAdd(ctx->builder, voffset, tmp, "");
-                       loads[i] = ac_build_buffer_load_common(
+                       loads[i] = ac_build_llvm7_buffer_load_common(
                                        ctx, rsrc, vindex, tmp,
                                        1 << (load_log_size - 2), glc, slc, can_speculate, false);
                }
@@ -1995,9 +2040,7 @@ ac_build_llvm8_tbuffer_store(struct ac_llvm_context *ctx,
                             unsigned num_channels,
                             unsigned dfmt,
                             unsigned nfmt,
-                            bool glc,
-                            bool slc,
-                            bool writeonly_memory,
+                            unsigned cache_policy,
                             bool structurized)
 {
        LLVMValueRef args[7];
@@ -2008,9 +2051,9 @@ ac_build_llvm8_tbuffer_store(struct ac_llvm_context *ctx,
                args[idx++] = vindex ? vindex : ctx->i32_0;
        args[idx++] = voffset ? voffset : ctx->i32_0;
        args[idx++] = soffset ? soffset : ctx->i32_0;
-       args[idx++] = LLVMConstInt(ctx->i32, dfmt | (nfmt << 4), 0);
-       args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
-       unsigned func = num_channels == 3 ? 4 : num_channels;
+       args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx, dfmt, nfmt), 0);
+       args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0);
+       unsigned func = !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;
        const char *indexing_kind = structurized ? "struct" : "raw";
        char name[256], type_name[8];
 
@@ -2021,7 +2064,7 @@ ac_build_llvm8_tbuffer_store(struct ac_llvm_context *ctx,
                 indexing_kind, type_name);
 
        ac_build_intrinsic(ctx, name, ctx->voidt, args, idx,
-                          ac_get_store_intr_attribs(writeonly_memory));
+                          AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
 }
 
 static void
@@ -2035,9 +2078,7 @@ ac_build_tbuffer_store(struct ac_llvm_context *ctx,
                       unsigned num_channels,
                       unsigned dfmt,
                       unsigned nfmt,
-                      bool glc,
-                      bool slc,
-                      bool writeonly_memory,
+                      unsigned cache_policy,
                       bool structurized) /* only matters for LLVM 8+ */
 {
        if (HAVE_LLVM >= 0x800) {
@@ -2047,8 +2088,7 @@ ac_build_tbuffer_store(struct ac_llvm_context *ctx,
 
                ac_build_llvm8_tbuffer_store(ctx, rsrc, vdata, vindex, voffset,
                                             soffset, num_channels, dfmt, nfmt,
-                                            glc, slc, writeonly_memory,
-                                            structurized);
+                                            cache_policy, structurized);
        } else {
                LLVMValueRef params[] = {
                        vdata,
@@ -2059,8 +2099,8 @@ ac_build_tbuffer_store(struct ac_llvm_context *ctx,
                        immoffset,
                        LLVMConstInt(ctx->i32, dfmt, false),
                        LLVMConstInt(ctx->i32, nfmt, false),
-                       LLVMConstInt(ctx->i1, glc, false),
-                       LLVMConstInt(ctx->i1, slc, false),
+                       LLVMConstInt(ctx->i1, !!(cache_policy & ac_glc), false),
+                       LLVMConstInt(ctx->i1, !!(cache_policy & ac_slc), false),
                };
                unsigned func = CLAMP(num_channels, 1, 3) - 1;
                const char *type_names[] = {"i32", "v2i32", "v4i32"};
@@ -2070,7 +2110,7 @@ ac_build_tbuffer_store(struct ac_llvm_context *ctx,
                         type_names[func]);
 
                ac_build_intrinsic(ctx, name, ctx->voidt, params, 10,
-                                  ac_get_store_intr_attribs(writeonly_memory));
+                                  AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
        }
 }
 
@@ -2085,13 +2125,11 @@ ac_build_struct_tbuffer_store(struct ac_llvm_context *ctx,
                              unsigned num_channels,
                              unsigned dfmt,
                              unsigned nfmt,
-                             bool glc,
-                             bool slc,
-                             bool writeonly_memory)
+                             unsigned cache_policy)
 {
        ac_build_tbuffer_store(ctx, rsrc, vdata, vindex, voffset, soffset,
-                              immoffset, num_channels, dfmt, nfmt, glc, slc,
-                              writeonly_memory, true);
+                              immoffset, num_channels, dfmt, nfmt, cache_policy,
+                              true);
 }
 
 void
@@ -2104,13 +2142,11 @@ ac_build_raw_tbuffer_store(struct ac_llvm_context *ctx,
                           unsigned num_channels,
                           unsigned dfmt,
                           unsigned nfmt,
-                          bool glc,
-                          bool slc,
-                          bool writeonly_memory)
+                          unsigned cache_policy)
 {
        ac_build_tbuffer_store(ctx, rsrc, vdata, NULL, voffset, soffset,
-                              immoffset, num_channels, dfmt, nfmt, glc, slc,
-                              writeonly_memory, false);
+                              immoffset, num_channels, dfmt, nfmt, cache_policy,
+                              false);
 }
 
 void
@@ -2119,8 +2155,7 @@ ac_build_tbuffer_store_short(struct ac_llvm_context *ctx,
                             LLVMValueRef vdata,
                             LLVMValueRef voffset,
                             LLVMValueRef soffset,
-                            bool glc,
-                            bool writeonly_memory)
+                            unsigned cache_policy)
 {
        vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, "");
 
@@ -2128,9 +2163,8 @@ ac_build_tbuffer_store_short(struct ac_llvm_context *ctx,
                /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
                ac_build_llvm8_buffer_store_common(ctx, rsrc, vdata, NULL,
                                                   voffset, soffset, 1,
-                                                  ctx->i16, glc, false,
-                                                  writeonly_memory, false,
-                                                  false);
+                                                  ctx->i16, cache_policy,
+                                                  false, false);
        } else {
                unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16;
                unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
@@ -2138,8 +2172,7 @@ ac_build_tbuffer_store_short(struct ac_llvm_context *ctx,
                vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, "");
 
                ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset,
-                                          ctx->i32_0, 1, dfmt, nfmt, glc, false,
-                                          writeonly_memory);
+                                          ctx->i32_0, 1, dfmt, nfmt, cache_policy);
        }
 }
 
@@ -2149,8 +2182,7 @@ ac_build_tbuffer_store_byte(struct ac_llvm_context *ctx,
                            LLVMValueRef vdata,
                            LLVMValueRef voffset,
                            LLVMValueRef soffset,
-                           bool glc,
-                           bool writeonly_memory)
+                           unsigned cache_policy)
 {
        vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, "");
 
@@ -2158,9 +2190,8 @@ ac_build_tbuffer_store_byte(struct ac_llvm_context *ctx,
                /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
                ac_build_llvm8_buffer_store_common(ctx, rsrc, vdata, NULL,
                                                   voffset, soffset, 1,
-                                                  ctx->i8, glc, false,
-                                                  writeonly_memory, false,
-                                                  false);
+                                                  ctx->i8, cache_policy,
+                                                  false, false);
        } else {
                unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8;
                unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
@@ -2168,8 +2199,7 @@ ac_build_tbuffer_store_byte(struct ac_llvm_context *ctx,
                vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, "");
 
                ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset,
-                                          ctx->i32_0, 1, dfmt, nfmt, glc, false,
-                                          writeonly_memory);
+                                          ctx->i32_0, 1, dfmt, nfmt, cache_policy);
        }
 }
 /**
@@ -2810,11 +2840,22 @@ LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input,
                width,
        };
 
-       return ac_build_intrinsic(ctx,
-                                 is_signed ? "llvm.amdgcn.sbfe.i32" :
-                                             "llvm.amdgcn.ubfe.i32",
-                                 ctx->i32, args, 3,
-                                 AC_FUNC_ATTR_READNONE);
+       LLVMValueRef result = ac_build_intrinsic(ctx,
+                                                is_signed ? "llvm.amdgcn.sbfe.i32" :
+                                                            "llvm.amdgcn.ubfe.i32",
+                                                ctx->i32, args, 3,
+                                                AC_FUNC_ATTR_READNONE);
+
+       if (HAVE_LLVM < 0x0800) {
+               /* FIXME: LLVM before 8 returns an incorrect result when count is 0.
+                * https://bugs.freedesktop.org/show_bug.cgi?id=107276
+                */
+               LLVMValueRef zero = ctx->i32_0;
+               LLVMValueRef icond = LLVMBuildICmp(ctx->builder, LLVMIntEQ, width, zero, "");
+               result = LLVMBuildSelect(ctx->builder, icond, zero, result, "");
+       }
+
+       return result;
 }
 
 LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0,
@@ -2831,13 +2872,49 @@ LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0,
                             LLVMBuildFMul(ctx->builder, s0, s1, ""), s2, "");
 }
 
-void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned simm16)
+void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags) /* wait_flags: mask of AC_WAIT_* bits; emits llvm.amdgcn.s.waitcnt */
 {
+       if (!wait_flags)
+               return; /* nothing requested: no intrinsic is emitted */
+
+       unsigned lgkmcnt = 63; /* defaults below are used when the matching AC_WAIT_* flag is not set */
+       unsigned expcnt = 7;
+       unsigned vmcnt = ctx->chip_class >= GFX9 ? 63 : 15; /* wider default on GFX9+; the bits above the low 4 are packed separately below */
+       unsigned vscnt = 63;
+
+       if (wait_flags & AC_WAIT_LGKM)
+               lgkmcnt = 0;
+       if (wait_flags & AC_WAIT_EXP)
+               expcnt = 0;
+       if (wait_flags & AC_WAIT_VLOAD)
+               vmcnt = 0;
+
+       if (wait_flags & AC_WAIT_VSTORE) {
+               if (ctx->chip_class >= GFX10)
+                       vscnt = 0; /* GFX10+: store waits use vscnt instead of vmcnt */
+               else
+                       vmcnt = 0;
+       }
+
+       unsigned simm16 = (lgkmcnt << 8) | /* lgkmcnt starts at bit 8 */
+                         (expcnt << 4) | /* expcnt starts at bit 4 */
+                         (vmcnt & 0xf) | /* low 4 bits of vmcnt in bits 0-3 */
+                         ((vmcnt >> 4) << 14); /* remaining vmcnt bits start at bit 14 */
+
         LLVMValueRef args[1] = {
                 LLVMConstInt(ctx->i32, simm16, false),
         };
         ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt",
                            ctx->voidt, args, 1, 0);
+
+       /* TODO: add llvm.amdgcn.s.waitcnt.vscnt into LLVM: */
+       if (0 && ctx->chip_class >= GFX10 && vscnt == 0) { /* compiled out by the constant 0: vscnt is never waited on for now */
+               LLVMValueRef args[1] = {
+                       LLVMConstInt(ctx->i32, vscnt, false),
+               };
+               ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt.vscnt",
+                                  ctx->voidt, args, 1, 0);
+       }
 }
 
 LLVMValueRef ac_build_fmed3(struct ac_llvm_context *ctx, LLVMValueRef src0,
@@ -3875,6 +3952,58 @@ ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
        return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
 }
 
+static LLVMValueRef
+_ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel,
+                    bool exchange_rows, bool bound_ctrl)
+{ /* Emit a single permlane16/permlanex16 intrinsic call, declared with an i32 result, on src. */
+       LLVMValueRef args[6] = {
+               src, /* src fills both leading operands; presumably "old" and the source value - TODO confirm */
+               src,
+               LLVMConstInt(ctx->i32, sel, false), /* sel as an i32 constant; presumably its low 32 bits - TODO confirm */
+               LLVMConstInt(ctx->i32, sel >> 32, false), /* upper 32 bits of sel */
+               ctx->i1true, /* fi */
+               bound_ctrl ? ctx->i1true : ctx->i1false, /* bound_ctrl forwarded as an i1 constant */
+       };
+       return ac_build_intrinsic(ctx, exchange_rows ? "llvm.amdgcn.permlanex16" /* exchange_rows selects the "x16" variant */
+                                                    : "llvm.amdgcn.permlane16",
+                                 ctx->i32, args, 6,
+                                 AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
+}
+
+static LLVMValueRef
+ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel,
+                   bool exchange_rows, bool bound_ctrl)
+{ /* Apply _ac_build_permlane16 to src, 32 bits at a time, and return a value of src's original type. */
+       LLVMTypeRef src_type = LLVMTypeOf(src); /* kept so the result can be cast back at the end */
+       src = ac_to_integer(ctx, src);
+       unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
+       LLVMValueRef ret;
+       if (bits == 32) { /* exactly one dword: a single call is enough */
+               ret = _ac_build_permlane16(ctx, src, sel, exchange_rows,
+                                          bound_ctrl);
+       } else {
+               assert(bits % 32 == 0); /* widths that are not a multiple of 32 are not handled */
+               LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32); /* view the value as bits/32 i32 elements */
+               LLVMValueRef src_vector =
+                       LLVMBuildBitCast(ctx->builder, src, vec_type, "");
+               ret = LLVMGetUndef(vec_type); /* every element is filled in by the loop below */
+               for (unsigned i = 0; i < bits / 32; i++) { /* same sel and flags for each element */
+                       src = LLVMBuildExtractElement(ctx->builder, src_vector, /* src now holds element i */
+                                                     LLVMConstInt(ctx->i32, i,
+                                                                  0), "");
+                       LLVMValueRef ret_comp =
+                               _ac_build_permlane16(ctx, src, sel,
+                                                    exchange_rows,
+                                                    bound_ctrl);
+                       ret = LLVMBuildInsertElement(ctx->builder, ret, /* store the result at the same index */
+                                                    ret_comp,
+                                                    LLVMConstInt(ctx->i32, i,
+                                                                 0), "");
+               }
+       }
+       return LLVMBuildBitCast(ctx->builder, ret, src_type, ""); /* back to the caller's type */
+}
+
 static inline unsigned
 ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
 {
@@ -4038,10 +4167,18 @@ ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs,
  */
 static LLVMValueRef
 ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValueRef identity,
-             unsigned maxprefix)
+             unsigned maxprefix, bool inclusive)
 {
        LLVMValueRef result, tmp;
-       result = src;
+
+       if (ctx->chip_class >= GFX10) {
+               result = inclusive ? src : identity;
+       } else {
+               if (inclusive)
+                       result = src;
+               else
+                       result = ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false);
+       }
        if (maxprefix <= 1)
                return result;
        tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
@@ -4064,6 +4201,38 @@ ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValu
        result = ac_build_alu_op(ctx, result, tmp, op);
        if (maxprefix <= 16)
                return result;
+
+       if (ctx->chip_class >= GFX10) {
+               /* dpp_row_bcast{15,31} are not supported on gfx10. */
+               LLVMBuilderRef builder = ctx->builder;
+               LLVMValueRef tid = ac_get_thread_id(ctx);
+               LLVMValueRef cc;
+               /* TODO-GFX10: Can we get better code-gen by putting this into
+                * a branch so that LLVM generates EXEC mask manipulations? */
+               if (inclusive)
+                       tmp = result;
+               else
+                       tmp = ac_build_alu_op(ctx, result, src, op);
+               tmp = ac_build_permlane16(ctx, tmp, ~(uint64_t)0, true, false);
+               tmp = ac_build_alu_op(ctx, result, tmp, op);
+               cc = LLVMBuildAnd(builder, tid, LLVMConstInt(ctx->i32, 16, false), "");
+               cc = LLVMBuildICmp(builder, LLVMIntNE, cc, ctx->i32_0, "");
+               result = LLVMBuildSelect(builder, cc, tmp, result, "");
+               if (maxprefix <= 32)
+                       return result;
+
+               if (inclusive)
+                       tmp = result;
+               else
+                       tmp = ac_build_alu_op(ctx, result, src, op);
+               tmp = ac_build_readlane(ctx, tmp, LLVMConstInt(ctx->i32, 31, false));
+               tmp = ac_build_alu_op(ctx, result, tmp, op);
+               cc = LLVMBuildICmp(builder, LLVMIntUGE, tid,
+                                  LLVMConstInt(ctx->i32, 32, false), "");
+               result = LLVMBuildSelect(builder, cc, tmp, result, "");
+               return result;
+       }
+
        tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
        result = ac_build_alu_op(ctx, result, tmp, op);
        if (maxprefix <= 32)
@@ -4093,7 +4262,7 @@ ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op
                get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
        result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
                                  LLVMTypeOf(identity), "");
-       result = ac_build_scan(ctx, op, result, identity, 64);
+       result = ac_build_scan(ctx, op, result, identity, 64, true);
 
        return ac_build_wwm(ctx, result);
 }
@@ -4117,8 +4286,7 @@ ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op
                get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
        result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
                                  LLVMTypeOf(identity), "");
-       result = ac_build_dpp(ctx, identity, result, dpp_wf_sr1, 0xf, 0xf, false);
-       result = ac_build_scan(ctx, op, result, identity, 64);
+       result = ac_build_scan(ctx, op, result, identity, 64, false);
 
        return ac_build_wwm(ctx, result);
 }
@@ -4156,7 +4324,9 @@ ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsign
        result = ac_build_alu_op(ctx, result, swap, op);
        if (cluster_size == 16) return ac_build_wwm(ctx, result);
 
-       if (ctx->chip_class >= GFX8 && cluster_size != 32)
+       if (ctx->chip_class >= GFX10)
+               swap = ac_build_permlane16(ctx, result, 0, true, false);
+       else if (ctx->chip_class >= GFX8 && cluster_size != 32)
                swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
        else
                swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10));
@@ -4164,7 +4334,10 @@ ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsign
        if (cluster_size == 32) return ac_build_wwm(ctx, result);
 
        if (ctx->chip_class >= GFX8) {
-               swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
+               if (ctx->chip_class >= GFX10)
+                       swap = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
+               else
+                       swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
                result = ac_build_alu_op(ctx, result, swap, op);
                result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0));
                return ac_build_wwm(ctx, result);
@@ -4243,7 +4416,7 @@ ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
                ac_build_optimization_barrier(ctx, &tmp);
 
                bbs[1] = LLVMGetInsertBlock(builder);
-               phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves);
+               phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves, true);
        }
        ac_build_endif(ctx, 1001);
 
@@ -4326,7 +4499,7 @@ ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
 
        /* ws->result_reduce is already the correct value */
        if (ws->enable_inclusive)
-               ws->result_inclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->src, ws->op);
+               ws->result_inclusive = ac_build_alu_op(ctx, ws->result_inclusive, ws->src, ws->op);
        if (ws->enable_exclusive)
                ws->result_exclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->extra, ws->op);
 }
@@ -4446,3 +4619,11 @@ ac_build_load_helper_invocation(struct ac_llvm_context *ctx)
        result = LLVMBuildNot(ctx->builder, result, "");
        return LLVMBuildSExt(ctx->builder, result, ctx->i32, "");
 }
+
+LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMValueRef func,
+                          LLVMValueRef *args, unsigned num_args)
+{ /* Build a call to func with the num_args values in args and return the call instruction. */
+       LLVMValueRef ret = LLVMBuildCall(ctx->builder, func, args, num_args, "");
+       LLVMSetInstructionCallConv(ret, LLVMGetFunctionCallConv(func)); /* call site uses func's own calling convention */
+       return ret;
+}