ctx->v3i32 = LLVMVectorType(ctx->i32, 3);
ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
+ ctx->v3f32 = LLVMVectorType(ctx->f32, 3);
ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
ac_build_ballot(struct ac_llvm_context *ctx,
LLVMValueRef value)
{
+ const char *name = HAVE_LLVM >= 0x900 ? "llvm.amdgcn.icmp.i64.i32" : "llvm.amdgcn.icmp.i32";
LLVMValueRef args[3] = {
value,
ctx->i32_0,
args[0] = ac_to_integer(ctx, args[0]);
- return ac_build_intrinsic(ctx,
- "llvm.amdgcn.icmp.i32",
+ return ac_build_intrinsic(ctx, name,
ctx->i64, args, 3,
AC_FUNC_ATTR_NOUNWIND |
AC_FUNC_ATTR_READNONE |
LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx,
LLVMValueRef value)
{
+ const char *name = HAVE_LLVM >= 0x900 ? "llvm.amdgcn.icmp.i64.i1" : "llvm.amdgcn.icmp.i1";
LLVMValueRef args[3] = {
value,
ctx->i1false,
};
assert(HAVE_LLVM >= 0x0800);
- return ac_build_intrinsic(ctx, "llvm.amdgcn.icmp.i1", ctx->i64, args, 3,
+ return ac_build_intrinsic(ctx, name, ctx->i64, args, 3,
AC_FUNC_ATTR_NOUNWIND |
AC_FUNC_ATTR_READNONE |
AC_FUNC_ATTR_CONVERGENT);
return ac_build_load_custom(ctx, base_ptr, index, true, true, false);
}
+/**
+ * Build the i32 cache-policy constant for a buffer intrinsic.
+ *
+ * ac_glc is added when glc is set and ac_slc when slc is set. ac_dlc is
+ * added only when all of the following hold: the chip is GFX10 or newer,
+ * glc is set, and the access is a load.
+ *
+ * Returns an i32 LLVM constant holding the sum of the selected flags.
+ */
+static LLVMValueRef get_cache_policy(struct ac_llvm_context *ctx,
+				     bool load, bool glc, bool slc)
+{
+	const bool dlc = ctx->chip_class >= GFX10 && glc && load;
+
+	return LLVMConstInt(ctx->i32,
+			    (glc ? ac_glc : 0) +
+			    (slc ? ac_slc : 0) +
+			    (dlc ? ac_dlc : 0), 0);
+}
+
static void
-ac_build_buffer_store_common(struct ac_llvm_context *ctx,
- LLVMValueRef rsrc,
- LLVMValueRef data,
- LLVMValueRef vindex,
- LLVMValueRef voffset,
- unsigned num_channels,
- bool glc,
- bool slc,
- bool writeonly_memory,
- bool use_format)
+ac_build_llvm7_buffer_store_common(struct ac_llvm_context *ctx,
+ LLVMValueRef rsrc,
+ LLVMValueRef data,
+ LLVMValueRef vindex,
+ LLVMValueRef voffset,
+ unsigned num_channels,
+ unsigned cache_policy,
+ bool use_format)
{
LLVMValueRef args[] = {
data,
LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
vindex ? vindex : ctx->i32_0,
voffset,
- LLVMConstInt(ctx->i1, glc, 0),
- LLVMConstInt(ctx->i1, slc, 0)
+ LLVMConstInt(ctx->i1, !!(cache_policy & ac_glc), 0),
+ LLVMConstInt(ctx->i1, !!(cache_policy & ac_slc), 0)
};
unsigned func = CLAMP(num_channels, 1, 3) - 1;
}
ac_build_intrinsic(ctx, name, ctx->voidt, args, ARRAY_SIZE(args),
- ac_get_store_intr_attribs(writeonly_memory));
+ AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
}
static void
LLVMValueRef soffset,
unsigned num_channels,
LLVMTypeRef return_channel_type,
- bool glc,
- bool slc,
- bool writeonly_memory,
+ unsigned cache_policy,
bool use_format,
bool structurized)
{
args[idx++] = vindex ? vindex : ctx->i32_0;
args[idx++] = voffset ? voffset : ctx->i32_0;
args[idx++] = soffset ? soffset : ctx->i32_0;
- args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
- unsigned func = num_channels == 3 ? 4 : num_channels;
+ args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0);
+ unsigned func = !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels;
const char *indexing_kind = structurized ? "struct" : "raw";
char name[256], type_name[8];
}
ac_build_intrinsic(ctx, name, ctx->voidt, args, idx,
- ac_get_store_intr_attribs(writeonly_memory));
+ AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
}
void
LLVMValueRef vindex,
LLVMValueRef voffset,
unsigned num_channels,
- bool glc,
- bool writeonly_memory)
+ unsigned cache_policy)
{
if (HAVE_LLVM >= 0x800) {
ac_build_llvm8_buffer_store_common(ctx, rsrc, data, vindex,
voffset, NULL, num_channels,
- ctx->f32, glc, false,
- writeonly_memory, true, true);
+ ctx->f32, cache_policy,
+ true, true);
} else {
- ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset,
- num_channels, glc, false,
- writeonly_memory, true);
+ ac_build_llvm7_buffer_store_common(ctx, rsrc, data, vindex, voffset,
+ num_channels, cache_policy,
+ true);
}
}
LLVMValueRef voffset,
LLVMValueRef soffset,
unsigned inst_offset,
- bool glc,
- bool slc,
- bool writeonly_memory,
+ unsigned cache_policy,
bool swizzle_enable_hint)
{
- /* Split 3 channel stores, becase LLVM doesn't support 3-channel
+ /* Split 3 channel stores, because only LLVM 9+ supports 3-channel
* intrinsics. */
- if (num_channels == 3) {
+ if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false)) {
LLVMValueRef v[3], v01;
for (int i = 0; i < 3; i++) {
v01 = ac_build_gather_values(ctx, v, 2);
ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset,
- soffset, inst_offset, glc, slc,
- writeonly_memory, swizzle_enable_hint);
+ soffset, inst_offset, cache_policy,
+ swizzle_enable_hint);
ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset,
soffset, inst_offset + 8,
- glc, slc,
- writeonly_memory, swizzle_enable_hint);
+ cache_policy,
+ swizzle_enable_hint);
return;
}
voffset, offset,
num_channels,
ctx->f32,
- glc, slc,
- writeonly_memory,
+ cache_policy,
false, false);
} else {
if (voffset)
offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");
- ac_build_buffer_store_common(ctx, rsrc,
- ac_to_float(ctx, vdata),
- ctx->i32_0, offset,
- num_channels, glc, slc,
- writeonly_memory, false);
+ ac_build_llvm7_buffer_store_common(ctx, rsrc,
+ ac_to_float(ctx, vdata),
+ ctx->i32_0, offset,
+ num_channels, cache_policy,
+ false);
}
return;
}
LLVMValueRef immoffset = LLVMConstInt(ctx->i32, inst_offset, 0);
ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset,
- immoffset, num_channels, dfmt, nfmt, glc,
- slc, writeonly_memory);
+ immoffset, num_channels, dfmt, nfmt, cache_policy);
}
static LLVMValueRef
-ac_build_buffer_load_common(struct ac_llvm_context *ctx,
- LLVMValueRef rsrc,
- LLVMValueRef vindex,
- LLVMValueRef voffset,
- unsigned num_channels,
- bool glc,
- bool slc,
- bool can_speculate,
- bool use_format)
+ac_build_llvm7_buffer_load_common(struct ac_llvm_context *ctx,
+ LLVMValueRef rsrc,
+ LLVMValueRef vindex,
+ LLVMValueRef voffset,
+ unsigned num_channels,
+ bool glc,
+ bool slc,
+ bool can_speculate,
+ bool use_format)
{
LLVMValueRef args[] = {
LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
args[idx++] = vindex ? vindex : ctx->i32_0;
args[idx++] = voffset ? voffset : ctx->i32_0;
args[idx++] = soffset ? soffset : ctx->i32_0;
- args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
- unsigned func = num_channels == 3 ? 4 : num_channels;
+ args[idx++] = get_cache_policy(ctx, true, glc, slc);
+ unsigned func = !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels;
const char *indexing_kind = structurized ? "struct" : "raw";
char name[256], type_name[8];
LLVMValueRef args[3] = {
rsrc,
offset,
- glc ? ctx->i32_1 : ctx->i32_0,
+ get_cache_policy(ctx, true, glc, false),
};
result[i] = ac_build_intrinsic(ctx, intrname,
ctx->f32, args, num_args,
if (num_channels == 1)
return result[0];
- if (num_channels == 3)
+ if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false))
result[num_channels++] = LLVMGetUndef(ctx->f32);
return ac_build_gather_values(ctx, result, num_channels);
}
false);
}
- return ac_build_buffer_load_common(ctx, rsrc, vindex, offset,
- num_channels, glc, slc,
- can_speculate, false);
+ return ac_build_llvm7_buffer_load_common(ctx, rsrc, vindex, offset,
+ num_channels, glc, slc,
+ can_speculate, false);
}
LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx,
glc, false,
can_speculate, true, true);
}
- return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset,
- num_channels, glc, false,
- can_speculate, true);
+ return ac_build_llvm7_buffer_load_common(ctx, rsrc, vindex, voffset,
+ num_channels, glc, false,
+ can_speculate, true);
}
LLVMValueRef ac_build_buffer_load_format_gfx9_safe(struct ac_llvm_context *ctx,
LLVMValueRef new_rsrc = LLVMBuildInsertElement(ctx->builder, rsrc, new_elem_count,
LLVMConstInt(ctx->i32, 2, 0), "");
- return ac_build_buffer_load_common(ctx, new_rsrc, vindex, voffset,
- num_channels, glc, false,
- can_speculate, true);
+ return ac_build_llvm7_buffer_load_common(ctx, new_rsrc, vindex, voffset,
+ num_channels, glc, false,
+ can_speculate, true);
+}
+
+/**
+ * Translate a (dfmt, nfmt) pair into the combined format value handed to
+ * the LLVM 8+ tbuffer intrinsics.
+ *
+ * Before GFX10 the result is simply dfmt | (nfmt << 4). On GFX10 and newer
+ * the result is a V_008F0C_IMG_FORMAT_* value: dfmt selects the matching
+ * *_UINT entry and nfmt is applied as an offset relative to that entry.
+ */
+static unsigned
+ac_get_tbuffer_format(struct ac_llvm_context *ctx,
+		      unsigned dfmt, unsigned nfmt)
+{
+	unsigned format;
+
+	if (ctx->chip_class < GFX10)
+		return dfmt | (nfmt << 4);
+
+	/* Start from the UINT flavour of the requested data format. */
+	switch (dfmt) {
+	default:
+		unreachable("bad dfmt");
+	case V_008F0C_BUF_DATA_FORMAT_8:
+		format = V_008F0C_IMG_FORMAT_8_UINT;
+		break;
+	case V_008F0C_BUF_DATA_FORMAT_8_8:
+		format = V_008F0C_IMG_FORMAT_8_8_UINT;
+		break;
+	case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
+		format = V_008F0C_IMG_FORMAT_8_8_8_8_UINT;
+		break;
+	case V_008F0C_BUF_DATA_FORMAT_16:
+		format = V_008F0C_IMG_FORMAT_16_UINT;
+		break;
+	case V_008F0C_BUF_DATA_FORMAT_16_16:
+		format = V_008F0C_IMG_FORMAT_16_16_UINT;
+		break;
+	case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
+		format = V_008F0C_IMG_FORMAT_16_16_16_16_UINT;
+		break;
+	case V_008F0C_BUF_DATA_FORMAT_32:
+		format = V_008F0C_IMG_FORMAT_32_UINT;
+		break;
+	case V_008F0C_BUF_DATA_FORMAT_32_32:
+		format = V_008F0C_IMG_FORMAT_32_32_UINT;
+		break;
+	case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
+		format = V_008F0C_IMG_FORMAT_32_32_32_32_UINT;
+		break;
+	}
+
+	/*
+	 * Rely on the regular layout of the combined format enum: the other
+	 * numeric formats sit at fixed distances from the UINT entry.
+	 *
+	 * Note: float is incompatible with 8-bit data formats,
+	 * [us]{norm,scaled} are incompatible with 32-bit data formats, and
+	 * [us]scaled are not writable.
+	 */
+	switch (nfmt) {
+	case V_008F0C_BUF_NUM_FORMAT_UNORM:
+		format -= 4;
+		break;
+	case V_008F0C_BUF_NUM_FORMAT_SNORM:
+		format -= 3;
+		break;
+	case V_008F0C_BUF_NUM_FORMAT_USCALED:
+		format -= 2;
+		break;
+	case V_008F0C_BUF_NUM_FORMAT_SSCALED:
+		format -= 1;
+		break;
+	default:
+		unreachable("bad nfmt");
+	case V_008F0C_BUF_NUM_FORMAT_UINT:
+		break;
+	case V_008F0C_BUF_NUM_FORMAT_SINT:
+		format += 1;
+		break;
+	case V_008F0C_BUF_NUM_FORMAT_FLOAT:
+		format += 2;
+		break;
+	}
+
+	return format;
}
static LLVMValueRef
args[idx++] = vindex ? vindex : ctx->i32_0;
args[idx++] = voffset ? voffset : ctx->i32_0;
args[idx++] = soffset ? soffset : ctx->i32_0;
- args[idx++] = LLVMConstInt(ctx->i32, dfmt | (nfmt << 4), 0);
- args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
- unsigned func = num_channels == 3 ? 4 : num_channels;
+ args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx, dfmt, nfmt), 0);
+ args[idx++] = get_cache_policy(ctx, true, glc, slc);
+ unsigned func = !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;
const char *indexing_kind = structurized ? "struct" : "raw";
char name[256], type_name[8];
can_speculate, false, true);
} else {
tmp = LLVMBuildAdd(ctx->builder, voffset, tmp, "");
- loads[i] = ac_build_buffer_load_common(
+ loads[i] = ac_build_llvm7_buffer_load_common(
ctx, rsrc, vindex, tmp,
1 << (load_log_size - 2), glc, slc, can_speculate, false);
}
unsigned num_channels,
unsigned dfmt,
unsigned nfmt,
- bool glc,
- bool slc,
- bool writeonly_memory,
+ unsigned cache_policy,
bool structurized)
{
LLVMValueRef args[7];
args[idx++] = vindex ? vindex : ctx->i32_0;
args[idx++] = voffset ? voffset : ctx->i32_0;
args[idx++] = soffset ? soffset : ctx->i32_0;
- args[idx++] = LLVMConstInt(ctx->i32, dfmt | (nfmt << 4), 0);
- args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
- unsigned func = num_channels == 3 ? 4 : num_channels;
+ args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx, dfmt, nfmt), 0);
+ args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0);
+ unsigned func = !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;
const char *indexing_kind = structurized ? "struct" : "raw";
char name[256], type_name[8];
indexing_kind, type_name);
ac_build_intrinsic(ctx, name, ctx->voidt, args, idx,
- ac_get_store_intr_attribs(writeonly_memory));
+ AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
}
static void
unsigned num_channels,
unsigned dfmt,
unsigned nfmt,
- bool glc,
- bool slc,
- bool writeonly_memory,
+ unsigned cache_policy,
bool structurized) /* only matters for LLVM 8+ */
{
if (HAVE_LLVM >= 0x800) {
ac_build_llvm8_tbuffer_store(ctx, rsrc, vdata, vindex, voffset,
soffset, num_channels, dfmt, nfmt,
- glc, slc, writeonly_memory,
- structurized);
+ cache_policy, structurized);
} else {
LLVMValueRef params[] = {
vdata,
immoffset,
LLVMConstInt(ctx->i32, dfmt, false),
LLVMConstInt(ctx->i32, nfmt, false),
- LLVMConstInt(ctx->i1, glc, false),
- LLVMConstInt(ctx->i1, slc, false),
+ LLVMConstInt(ctx->i1, !!(cache_policy & ac_glc), false),
+ LLVMConstInt(ctx->i1, !!(cache_policy & ac_slc), false),
};
unsigned func = CLAMP(num_channels, 1, 3) - 1;
const char *type_names[] = {"i32", "v2i32", "v4i32"};
type_names[func]);
ac_build_intrinsic(ctx, name, ctx->voidt, params, 10,
- ac_get_store_intr_attribs(writeonly_memory));
+ AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
}
}
unsigned num_channels,
unsigned dfmt,
unsigned nfmt,
- bool glc,
- bool slc,
- bool writeonly_memory)
+ unsigned cache_policy)
{
ac_build_tbuffer_store(ctx, rsrc, vdata, vindex, voffset, soffset,
- immoffset, num_channels, dfmt, nfmt, glc, slc,
- writeonly_memory, true);
+ immoffset, num_channels, dfmt, nfmt, cache_policy,
+ true);
}
void
unsigned num_channels,
unsigned dfmt,
unsigned nfmt,
- bool glc,
- bool slc,
- bool writeonly_memory)
+ unsigned cache_policy)
{
ac_build_tbuffer_store(ctx, rsrc, vdata, NULL, voffset, soffset,
- immoffset, num_channels, dfmt, nfmt, glc, slc,
- writeonly_memory, false);
+ immoffset, num_channels, dfmt, nfmt, cache_policy,
+ false);
}
void
LLVMValueRef vdata,
LLVMValueRef voffset,
LLVMValueRef soffset,
- bool glc,
- bool writeonly_memory)
+ unsigned cache_policy)
{
vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, "");
/* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
ac_build_llvm8_buffer_store_common(ctx, rsrc, vdata, NULL,
voffset, soffset, 1,
- ctx->i16, glc, false,
- writeonly_memory, false,
- false);
+ ctx->i16, cache_policy,
+ false, false);
} else {
unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16;
unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, "");
ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset,
- ctx->i32_0, 1, dfmt, nfmt, glc, false,
- writeonly_memory);
+ ctx->i32_0, 1, dfmt, nfmt, cache_policy);
}
}
LLVMValueRef vdata,
LLVMValueRef voffset,
LLVMValueRef soffset,
- bool glc,
- bool writeonly_memory)
+ unsigned cache_policy)
{
vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, "");
/* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
ac_build_llvm8_buffer_store_common(ctx, rsrc, vdata, NULL,
voffset, soffset, 1,
- ctx->i8, glc, false,
- writeonly_memory, false,
- false);
+ ctx->i8, cache_policy,
+ false, false);
} else {
unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8;
unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, "");
ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset,
- ctx->i32_0, 1, dfmt, nfmt, glc, false,
- writeonly_memory);
+ ctx->i32_0, 1, dfmt, nfmt, cache_policy);
}
}
/**
width,
};
- return ac_build_intrinsic(ctx,
- is_signed ? "llvm.amdgcn.sbfe.i32" :
- "llvm.amdgcn.ubfe.i32",
- ctx->i32, args, 3,
- AC_FUNC_ATTR_READNONE);
+ LLVMValueRef result = ac_build_intrinsic(ctx,
+ is_signed ? "llvm.amdgcn.sbfe.i32" :
+ "llvm.amdgcn.ubfe.i32",
+ ctx->i32, args, 3,
+ AC_FUNC_ATTR_READNONE);
+
+ if (HAVE_LLVM < 0x0800) {
+ /* FIXME: LLVM 7+ returns incorrect result when count is 0.
+ * https://bugs.freedesktop.org/show_bug.cgi?id=107276
+ */
+ LLVMValueRef zero = ctx->i32_0;
+ LLVMValueRef icond = LLVMBuildICmp(ctx->builder, LLVMIntEQ, width, zero, "");
+ result = LLVMBuildSelect(ctx->builder, icond, zero, result, "");
+ }
+
+ return result;
}
LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0,
LLVMBuildFMul(ctx->builder, s0, s1, ""), s2, "");
}
-void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned simm16)
+/**
+ * Emit an llvm.amdgcn.s.waitcnt call for the counters selected by wait_flags.
+ *
+ * wait_flags is a mask of AC_WAIT_LGKM, AC_WAIT_EXP, AC_WAIT_VLOAD and
+ * AC_WAIT_VSTORE. Nothing is emitted when wait_flags is 0.
+ */
+void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags)
{
+ if (!wait_flags)
+ return;
+
+ /* Starting counter values; each requested flag below resets its counter
+ * to 0. NOTE(review): presumably these are the "do not wait" maxima of
+ * each counter field - confirm against the ISA documentation. */
+ unsigned lgkmcnt = 63;
+ unsigned expcnt = 7;
+ unsigned vmcnt = ctx->chip_class >= GFX9 ? 63 : 15;
+ unsigned vscnt = 63;
+
+ if (wait_flags & AC_WAIT_LGKM)
+ lgkmcnt = 0;
+ if (wait_flags & AC_WAIT_EXP)
+ expcnt = 0;
+ if (wait_flags & AC_WAIT_VLOAD)
+ vmcnt = 0;
+
+ /* A store wait zeroes vscnt on GFX10+ and vmcnt on older chips. */
+ if (wait_flags & AC_WAIT_VSTORE) {
+ if (ctx->chip_class >= GFX10)
+ vscnt = 0;
+ else
+ vmcnt = 0;
+ }
+
+ /* Pack the immediate: low 4 bits of vmcnt at bit 0, expcnt at bit 4,
+ * lgkmcnt at bit 8 and the remaining vmcnt bits at bit 14. */
+ unsigned simm16 = (lgkmcnt << 8) |
+ (expcnt << 4) |
+ (vmcnt & 0xf) |
+ ((vmcnt >> 4) << 14);
+
LLVMValueRef args[1] = {
LLVMConstInt(ctx->i32, simm16, false),
};
ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt",
ctx->voidt, args, 1, 0);
+
+ /* TODO: add llvm.amdgcn.s.waitcnt.vscnt into LLVM: */
+ /* The constant 0 in the condition keeps this block disabled, so vscnt
+ * is computed above but never emitted. */
+ if (0 && ctx->chip_class >= GFX10 && vscnt == 0) {
+ LLVMValueRef args[1] = {
+ LLVMConstInt(ctx->i32, vscnt, false),
+ };
+ ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt.vscnt",
+ ctx->voidt, args, 1, 0);
+ }
}
LLVMValueRef ac_build_fmed3(struct ac_llvm_context *ctx, LLVMValueRef src0,
return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
}
+/**
+ * Emit a single llvm.amdgcn.permlane16 / llvm.amdgcn.permlanex16 call.
+ *
+ * src is passed as both of the first two operands. The selector is passed
+ * as two i32 constants built from sel and sel >> 32 (presumably the low
+ * and high halves - LLVMConstInt is relied on to truncate; confirm).
+ * exchange_rows picks the permlanex16 variant and bound_ctrl is forwarded
+ * as the last i1 operand. The call returns an i32.
+ */
+static LLVMValueRef
+_ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel,
+		     bool exchange_rows, bool bound_ctrl)
+{
+	const char *intrinsic = exchange_rows ? "llvm.amdgcn.permlanex16"
+					      : "llvm.amdgcn.permlane16";
+	LLVMValueRef operands[6];
+
+	operands[0] = src;
+	operands[1] = src;
+	operands[2] = LLVMConstInt(ctx->i32, sel, false);
+	operands[3] = LLVMConstInt(ctx->i32, sel >> 32, false);
+	operands[4] = ctx->i1true; /* fi */
+	operands[5] = bound_ctrl ? ctx->i1true : ctx->i1false;
+
+	return ac_build_intrinsic(ctx, intrinsic, ctx->i32, operands, 6,
+				  AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
+}
+
+/**
+ * permlane16 / permlanex16 for values whose width is a multiple of 32 bits.
+ *
+ * src is first passed through ac_to_integer. A 32-bit value is handled by
+ * one intrinsic call. Wider values are bitcast to a vector of i32, each
+ * element is run through the intrinsic separately, and the results are
+ * reassembled into a vector. The final value is bitcast back to the type
+ * src had on entry.
+ */
+static LLVMValueRef
+ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel,
+		    bool exchange_rows, bool bound_ctrl)
+{
+	LLVMTypeRef orig_type = LLVMTypeOf(src);
+	LLVMValueRef result;
+
+	src = ac_to_integer(ctx, src);
+	unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
+
+	if (bits == 32) {
+		result = _ac_build_permlane16(ctx, src, sel, exchange_rows,
+					      bound_ctrl);
+		return LLVMBuildBitCast(ctx->builder, result, orig_type, "");
+	}
+
+	assert(bits % 32 == 0);
+	const unsigned num_dwords = bits / 32;
+	LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, num_dwords);
+	LLVMValueRef dwords = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
+
+	result = LLVMGetUndef(vec_type);
+	for (unsigned i = 0; i < num_dwords; i++) {
+		LLVMValueRef index = LLVMConstInt(ctx->i32, i, 0);
+		LLVMValueRef dword =
+			LLVMBuildExtractElement(ctx->builder, dwords, index, "");
+		LLVMValueRef permuted =
+			_ac_build_permlane16(ctx, dword, sel, exchange_rows,
+					     bound_ctrl);
+
+		result = LLVMBuildInsertElement(ctx->builder, result, permuted,
+						index, "");
+	}
+
+	return LLVMBuildBitCast(ctx->builder, result, orig_type, "");
+}
+
static inline unsigned
ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
{
*/
static LLVMValueRef
ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValueRef identity,
- unsigned maxprefix)
+ unsigned maxprefix, bool inclusive)
{
LLVMValueRef result, tmp;
- result = src;
+
+ if (ctx->chip_class >= GFX10) {
+ result = inclusive ? src : identity;
+ } else {
+ if (inclusive)
+ result = src;
+ else
+ result = ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false);
+ }
if (maxprefix <= 1)
return result;
tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
result = ac_build_alu_op(ctx, result, tmp, op);
if (maxprefix <= 16)
return result;
+
+ if (ctx->chip_class >= GFX10) {
+ /* dpp_row_bcast{15,31} are not supported on gfx10. */
+ LLVMBuilderRef builder = ctx->builder;
+ LLVMValueRef tid = ac_get_thread_id(ctx);
+ LLVMValueRef cc;
+ /* TODO-GFX10: Can we get better code-gen by putting this into
+ * a branch so that LLVM generates EXEC mask manipulations? */
+ if (inclusive)
+ tmp = result;
+ else
+ tmp = ac_build_alu_op(ctx, result, src, op);
+ tmp = ac_build_permlane16(ctx, tmp, ~(uint64_t)0, true, false);
+ tmp = ac_build_alu_op(ctx, result, tmp, op);
+ cc = LLVMBuildAnd(builder, tid, LLVMConstInt(ctx->i32, 16, false), "");
+ cc = LLVMBuildICmp(builder, LLVMIntNE, cc, ctx->i32_0, "");
+ result = LLVMBuildSelect(builder, cc, tmp, result, "");
+ if (maxprefix <= 32)
+ return result;
+
+ if (inclusive)
+ tmp = result;
+ else
+ tmp = ac_build_alu_op(ctx, result, src, op);
+ tmp = ac_build_readlane(ctx, tmp, LLVMConstInt(ctx->i32, 31, false));
+ tmp = ac_build_alu_op(ctx, result, tmp, op);
+ cc = LLVMBuildICmp(builder, LLVMIntUGE, tid,
+ LLVMConstInt(ctx->i32, 32, false), "");
+ result = LLVMBuildSelect(builder, cc, tmp, result, "");
+ return result;
+ }
+
tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
result = ac_build_alu_op(ctx, result, tmp, op);
if (maxprefix <= 32)
get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
LLVMTypeOf(identity), "");
- result = ac_build_scan(ctx, op, result, identity, 64);
+ result = ac_build_scan(ctx, op, result, identity, 64, true);
return ac_build_wwm(ctx, result);
}
get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
LLVMTypeOf(identity), "");
- result = ac_build_dpp(ctx, identity, result, dpp_wf_sr1, 0xf, 0xf, false);
- result = ac_build_scan(ctx, op, result, identity, 64);
+ result = ac_build_scan(ctx, op, result, identity, 64, false);
return ac_build_wwm(ctx, result);
}
result = ac_build_alu_op(ctx, result, swap, op);
if (cluster_size == 16) return ac_build_wwm(ctx, result);
- if (ctx->chip_class >= GFX8 && cluster_size != 32)
+ if (ctx->chip_class >= GFX10)
+ swap = ac_build_permlane16(ctx, result, 0, true, false);
+ else if (ctx->chip_class >= GFX8 && cluster_size != 32)
swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
else
swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10));
if (cluster_size == 32) return ac_build_wwm(ctx, result);
if (ctx->chip_class >= GFX8) {
- swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
+ if (ctx->chip_class >= GFX10)
+ swap = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
+ else
+ swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
result = ac_build_alu_op(ctx, result, swap, op);
result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0));
return ac_build_wwm(ctx, result);
ac_build_optimization_barrier(ctx, &tmp);
bbs[1] = LLVMGetInsertBlock(builder);
- phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves);
+ phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves, true);
}
ac_build_endif(ctx, 1001);
/* ws->result_reduce is already the correct value */
if (ws->enable_inclusive)
- ws->result_inclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->src, ws->op);
+ ws->result_inclusive = ac_build_alu_op(ctx, ws->result_inclusive, ws->src, ws->op);
if (ws->enable_exclusive)
ws->result_exclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->extra, ws->op);
}
result = LLVMBuildNot(ctx->builder, result, "");
return LLVMBuildSExt(ctx->builder, result, ctx->i32, "");
}
+
+/**
+ * Emit a call to func with the given arguments and give the call
+ * instruction the same calling convention as func.
+ *
+ * Returns the call instruction.
+ */
+LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMValueRef func,
+			   LLVMValueRef *args, unsigned num_args)
+{
+	LLVMValueRef call = LLVMBuildCall(ctx->builder, func, args, num_args, "");
+	unsigned callee_cc = LLVMGetFunctionCallConv(func);
+
+	LLVMSetInstructionCallConv(call, callee_cc);
+	return call;
+}