* The caller is responsible for initializing ctx::module and ctx::builder.
*/
void
-ac_llvm_context_init(struct ac_llvm_context *ctx, LLVMContextRef context,
+ac_llvm_context_init(struct ac_llvm_context *ctx,
enum chip_class chip_class, enum radeon_family family)
{
LLVMValueRef args[1];
+ ctx->context = LLVMContextCreate();
+
ctx->chip_class = chip_class;
ctx->family = family;
-
- ctx->context = context;
ctx->module = NULL;
ctx->builder = NULL;
ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
- ctx->intptr = HAVE_32BIT_POINTERS ? ctx->i32 : ctx->i64;
+ ctx->intptr = ctx->i32;
ctx->f16 = LLVMHalfTypeInContext(ctx->context);
ctx->f32 = LLVMFloatTypeInContext(ctx->context);
ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
+ ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false);
+ ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false);
ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false);
switch (kind) {
case LLVMIntegerTypeKind:
return LLVMGetIntTypeWidth(type) / 8;
+ case LLVMHalfTypeKind:
+ return 2;
case LLVMFloatTypeKind:
return 4;
case LLVMDoubleTypeKind:
return 8;
case LLVMPointerTypeKind:
- if (LLVMGetPointerAddressSpace(type) == AC_CONST_32BIT_ADDR_SPACE)
+ if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT)
return 4;
return 8;
case LLVMVectorTypeKind:
return LLVMVectorType(to_integer_type_scalar(ctx, elem_type),
LLVMGetVectorSize(t));
}
+ if (LLVMGetTypeKind(t) == LLVMPointerTypeKind) {
+ switch (LLVMGetPointerAddressSpace(t)) {
+ case AC_ADDR_SPACE_GLOBAL:
+ return ctx->i64;
+ case AC_ADDR_SPACE_LDS:
+ return ctx->i32;
+ default:
+ unreachable("unhandled address space");
+ }
+ }
return to_integer_type_scalar(ctx, t);
}
ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v)
{
LLVMTypeRef type = LLVMTypeOf(v);
+ if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
+ return LLVMBuildPtrToInt(ctx->builder, v, ac_to_integer_type(ctx, type), "");
+ }
return LLVMBuildBitCast(ctx->builder, v, ac_to_integer_type(ctx, type), "");
}
+LLVMValueRef
+ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v)
+{
+ LLVMTypeRef type = LLVMTypeOf(v);
+ if (LLVMGetTypeKind(type) == LLVMPointerTypeKind)
+ return v;
+ return ac_to_integer(ctx, v);
+}
+
static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
if (t == ctx->i16 || t == ctx->f16)
case LLVMIntegerTypeKind:
snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
break;
+ case LLVMHalfTypeKind:
+ snprintf(buf, bufsize, "f16");
+ break;
case LLVMFloatTypeKind:
snprintf(buf, bufsize, "f32");
break;
return phi;
}
+void ac_build_s_barrier(struct ac_llvm_context *ctx)
+{
+ ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL,
+ 0, AC_FUNC_ATTR_CONVERGENT);
+}
+
/* Prevent optimizations (at least of memory accesses) across the current
* point in the program by emitting empty inline assembly that is marked as
* having side effects.
return ac_build_gather_values_extended(ctx, values, value_count, 1, false, false);
}
-/* Expand a scalar or vector to <4 x type> by filling the remaining channels
- * with undef. Extract at most num_channels components from the input.
+/* Expand a scalar or vector to <dst_channels x type> by filling the remaining
+ * channels with undef. Extract at most src_channels components from the input.
*/
-LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx,
- LLVMValueRef value,
- unsigned num_channels)
+static LLVMValueRef
+ac_build_expand(struct ac_llvm_context *ctx,
+ LLVMValueRef value,
+ unsigned src_channels,
+ unsigned dst_channels)
{
LLVMTypeRef elemtype;
- LLVMValueRef chan[4];
+ LLVMValueRef chan[dst_channels];
if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
- num_channels = MIN2(num_channels, vec_size);
- if (num_channels >= 4)
+ if (src_channels == dst_channels && vec_size == dst_channels)
return value;
- for (unsigned i = 0; i < num_channels; i++)
+ src_channels = MIN2(src_channels, vec_size);
+
+ for (unsigned i = 0; i < src_channels; i++)
chan[i] = ac_llvm_extract_elem(ctx, value, i);
elemtype = LLVMGetElementType(LLVMTypeOf(value));
} else {
- if (num_channels) {
- assert(num_channels == 1);
+ if (src_channels) {
+ assert(src_channels == 1);
chan[0] = value;
}
elemtype = LLVMTypeOf(value);
}
- while (num_channels < 4)
- chan[num_channels++] = LLVMGetUndef(elemtype);
+ for (unsigned i = src_channels; i < dst_channels; i++)
+ chan[i] = LLVMGetUndef(elemtype);
+
+ return ac_build_gather_values(ctx, chan, dst_channels);
+}
+
+/* Expand a scalar or vector to <4 x type> by filling the remaining channels
+ * with undef. Extract at most num_channels components from the input.
+ */
+LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx,
+ LLVMValueRef value,
+ unsigned num_channels)
+{
+ return ac_build_expand(ctx, value, num_channels, 4);
+}
+
+LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value)
+{
+ unsigned type_size = ac_get_type_size(LLVMTypeOf(value));
+ const char *name;
+
+ if (type_size == 2)
+ name = "llvm.rint.f16";
+ else if (type_size == 4)
+ name = "llvm.rint.f32";
+ else
+ name = "llvm.rint.f64";
- return ac_build_gather_values(ctx, chan, 4);
+ return ac_build_intrinsic(ctx, name, LLVMTypeOf(value), &value, 1,
+ AC_FUNC_ATTR_READNONE);
}
LLVMValueRef
LLVMValueRef num,
LLVMValueRef den)
{
- LLVMValueRef ret = LLVMBuildFDiv(ctx->builder, num, den, "");
+ /* If we do (num / den), LLVM >= 7.0 does:
+ * return num * v_rcp_f32(den * (fabs(den) > 0x1.0p+96f ? 0x1.0p-32f : 1.0f));
+ *
+ * If we do (num * (1 / den)), LLVM does:
+ * return num * v_rcp_f32(den);
+ */
+ LLVMValueRef one = LLVMConstReal(LLVMTypeOf(num), 1.0);
+ LLVMValueRef rcp = LLVMBuildFDiv(ctx->builder, one, den, "");
+ LLVMValueRef ret = LLVMBuildFMul(ctx->builder, num, rcp, "");
/* Use v_rcp_f32 instead of precise division. */
if (!LLVMIsConstant(ret))
return ret;
}
+/* See fast_idiv_by_const.h. */
+/* Set: increment = util_fast_udiv_info::increment ? multiplier : 0; */
+LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx,
+ LLVMValueRef num,
+ LLVMValueRef multiplier,
+ LLVMValueRef pre_shift,
+ LLVMValueRef post_shift,
+ LLVMValueRef increment)
+{
+ LLVMBuilderRef builder = ctx->builder;
+
+ num = LLVMBuildLShr(builder, num, pre_shift, "");
+ num = LLVMBuildMul(builder,
+ LLVMBuildZExt(builder, num, ctx->i64, ""),
+ LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
+ num = LLVMBuildAdd(builder, num,
+ LLVMBuildZExt(builder, increment, ctx->i64, ""), "");
+ num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
+ num = LLVMBuildTrunc(builder, num, ctx->i32, "");
+ return LLVMBuildLShr(builder, num, post_shift, "");
+}
+
+/* See fast_idiv_by_const.h. */
+/* If num != UINT_MAX, this more efficient version can be used. */
+/* Set: increment = util_fast_udiv_info::increment; */
+LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx,
+ LLVMValueRef num,
+ LLVMValueRef multiplier,
+ LLVMValueRef pre_shift,
+ LLVMValueRef post_shift,
+ LLVMValueRef increment)
+{
+ LLVMBuilderRef builder = ctx->builder;
+
+ num = LLVMBuildLShr(builder, num, pre_shift, "");
+ num = LLVMBuildNUWAdd(builder, num, increment, "");
+ num = LLVMBuildMul(builder,
+ LLVMBuildZExt(builder, num, ctx->i64, ""),
+ LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
+ num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
+ num = LLVMBuildTrunc(builder, num, ctx->i32, "");
+ return LLVMBuildLShr(builder, num, post_shift, "");
+}
+
+/* See fast_idiv_by_const.h. */
+/* Both operands must fit in 31 bits and the divisor must not be 1. */
+LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx,
+ LLVMValueRef num,
+ LLVMValueRef multiplier,
+ LLVMValueRef post_shift)
+{
+ LLVMBuilderRef builder = ctx->builder;
+
+ num = LLVMBuildMul(builder,
+ LLVMBuildZExt(builder, num, ctx->i64, ""),
+ LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
+ num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
+ num = LLVMBuildTrunc(builder, num, ctx->i32, "");
+ return LLVMBuildLShr(builder, num, post_shift, "");
+}
+
/* Coordinates for cube map selection. sc, tc, and ma are as in Table 8.27
* of the OpenGL 4.5 (Compatibility Profile) specification, except ma is
* already multiplied by two. id is the cube face number.
LLVMValueRef invma;
if (is_array && !is_lod) {
- LLVMValueRef tmp = coords_arg[3];
- tmp = ac_build_intrinsic(ctx, "llvm.rint.f32", ctx->f32, &tmp, 1, 0);
+ LLVMValueRef tmp = ac_build_round(ctx, coords_arg[3]);
/* Section 8.9 (Texture Functions) of the GLSL 4.50 spec says:
*
if (is_array) {
/* for cube arrays coord.z = coord.w(array_index) * 8 + face */
/* coords_arg.w component - array_index for cube arrays */
- LLVMValueRef tmp = LLVMBuildFMul(ctx->builder, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), "");
- coords[2] = LLVMBuildFAdd(ctx->builder, tmp, coords[2], "");
+ coords[2] = ac_build_fmad(ctx, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), coords[2]);
}
memcpy(coords_arg, coords, sizeof(coords));
ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
}
+LLVMValueRef
+ac_build_fs_interp_f16(struct ac_llvm_context *ctx,
+ LLVMValueRef llvm_chan,
+ LLVMValueRef attr_number,
+ LLVMValueRef params,
+ LLVMValueRef i,
+ LLVMValueRef j)
+{
+ LLVMValueRef args[6];
+ LLVMValueRef p1;
+
+ args[0] = i;
+ args[1] = llvm_chan;
+ args[2] = attr_number;
+ args[3] = ctx->i1false;
+ args[4] = params;
+
+ p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16",
+ ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
+
+ args[0] = p1;
+ args[1] = j;
+ args[2] = llvm_chan;
+ args[3] = attr_number;
+ args[4] = ctx->i1false;
+ args[5] = params;
+
+ return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16",
+ ctx->f16, args, 6, AC_FUNC_ATTR_READNONE);
+}
+
LLVMValueRef
ac_build_fs_interp_mov(struct ac_llvm_context *ctx,
LLVMValueRef parameter,
ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
}
+LLVMValueRef
+ac_build_gep_ptr(struct ac_llvm_context *ctx,
+ LLVMValueRef base_ptr,
+ LLVMValueRef index)
+{
+ return LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");
+}
+
LLVMValueRef
ac_build_gep0(struct ac_llvm_context *ctx,
LLVMValueRef base_ptr,
LLVMValueRef index)
{
LLVMValueRef indices[2] = {
- LLVMConstInt(ctx->i32, 0, 0),
+ ctx->i32_0,
index,
};
- return LLVMBuildGEP(ctx->builder, base_ptr,
- indices, 2, "");
+ return LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, "");
+}
+
+LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr,
+ LLVMValueRef index)
+{
+ return LLVMBuildPointerCast(ctx->builder,
+ ac_build_gep0(ctx, ptr, index),
+ LLVMTypeOf(ptr), "");
}
void
* \param uniform Whether the base_ptr and index can be assumed to be
* dynamically uniform (i.e. load to an SGPR)
* \param invariant Whether the load is invariant (no other opcodes affect it)
+ * \param no_unsigned_wraparound
+ * For all possible re-associations and re-distributions of an expression
+ * "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs
+ * without inbounds in base_ptr), this parameter is true if "addr + offset"
+ * does not result in an unsigned integer wraparound. This is used for
+ * optimal code generation of 32-bit pointer arithmetic.
+ *
+ * For example, a 32-bit immediate offset that causes a 32-bit unsigned
+ * integer wraparound can't be an imm offset in s_load_dword, because
+ * the instruction performs "addr + offset" in 64 bits.
+ *
+ * Expected usage for bindless textures by chaining GEPs:
+ * // possible unsigned wraparound, don't use InBounds:
+ * ptr1 = LLVMBuildGEP(base_ptr, index);
+ * image = load(ptr1); // becomes "s_load ptr1, 0"
+ *
+ * ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize);
+ * sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds
*/
static LLVMValueRef
ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
- LLVMValueRef index, bool uniform, bool invariant)
+ LLVMValueRef index, bool uniform, bool invariant,
+ bool no_unsigned_wraparound)
{
LLVMValueRef pointer, result;
+ LLVMValueRef indices[2] = {ctx->i32_0, index};
+
+ if (no_unsigned_wraparound &&
+ LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT)
+ pointer = LLVMBuildInBoundsGEP(ctx->builder, base_ptr, indices, 2, "");
+ else
+ pointer = LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, "");
- pointer = ac_build_gep0(ctx, base_ptr, index);
if (uniform)
LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
result = LLVMBuildLoad(ctx->builder, pointer, "");
LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
LLVMValueRef index)
{
- return ac_build_load_custom(ctx, base_ptr, index, false, false);
+ return ac_build_load_custom(ctx, base_ptr, index, false, false, false);
}
LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx,
LLVMValueRef base_ptr, LLVMValueRef index)
{
- return ac_build_load_custom(ctx, base_ptr, index, false, true);
+ return ac_build_load_custom(ctx, base_ptr, index, false, true, false);
}
+/* This assumes that there is no unsigned integer wraparound during the address
+ * computation, excluding all GEPs within base_ptr. */
LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx,
LLVMValueRef base_ptr, LLVMValueRef index)
{
- return ac_build_load_custom(ctx, base_ptr, index, true, true);
+ return ac_build_load_custom(ctx, base_ptr, index, true, true, true);
+}
+
+/* See ac_build_load_custom() documentation. */
+LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx,
+ LLVMValueRef base_ptr, LLVMValueRef index)
+{
+ return ac_build_load_custom(ctx, base_ptr, index, true, true, false);
+}
+
+static void
+ac_build_buffer_store_common(struct ac_llvm_context *ctx,
+ LLVMValueRef rsrc,
+ LLVMValueRef data,
+ LLVMValueRef vindex,
+ LLVMValueRef voffset,
+ unsigned num_channels,
+ bool glc,
+ bool slc,
+ bool writeonly_memory,
+ bool use_format)
+{
+ LLVMValueRef args[] = {
+ data,
+ LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
+ vindex ? vindex : ctx->i32_0,
+ voffset,
+ LLVMConstInt(ctx->i1, glc, 0),
+ LLVMConstInt(ctx->i1, slc, 0)
+ };
+ unsigned func = CLAMP(num_channels, 1, 3) - 1;
+
+ const char *type_names[] = {"f32", "v2f32", "v4f32"};
+ char name[256];
+
+ if (use_format) {
+ snprintf(name, sizeof(name), "llvm.amdgcn.buffer.store.format.%s",
+ type_names[func]);
+ } else {
+ snprintf(name, sizeof(name), "llvm.amdgcn.buffer.store.%s",
+ type_names[func]);
+ }
+
+ ac_build_intrinsic(ctx, name, ctx->voidt, args, ARRAY_SIZE(args),
+ ac_get_store_intr_attribs(writeonly_memory));
+}
+
+static void
+ac_build_llvm8_buffer_store_common(struct ac_llvm_context *ctx,
+ LLVMValueRef rsrc,
+ LLVMValueRef data,
+ LLVMValueRef vindex,
+ LLVMValueRef voffset,
+ LLVMValueRef soffset,
+ unsigned num_channels,
+ bool glc,
+ bool slc,
+ bool writeonly_memory,
+ bool use_format,
+ bool structurized)
+{
+ LLVMValueRef args[6];
+ int idx = 0;
+ args[idx++] = data;
+ args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
+ if (structurized)
+ args[idx++] = vindex ? vindex : ctx->i32_0;
+ args[idx++] = voffset ? voffset : ctx->i32_0;
+ args[idx++] = soffset ? soffset : ctx->i32_0;
+ args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
+ unsigned func = CLAMP(num_channels, 1, 3) - 1;
+
+ const char *type_names[] = {"f32", "v2f32", "v4f32"};
+ const char *indexing_kind = structurized ? "struct" : "raw";
+ char name[256];
+
+ if (use_format) {
+ snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s",
+ indexing_kind, type_names[func]);
+ } else {
+ snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s",
+ indexing_kind, type_names[func]);
+ }
+
+ ac_build_intrinsic(ctx, name, ctx->voidt, args, idx,
+ ac_get_store_intr_attribs(writeonly_memory));
+}
+
+void
+ac_build_buffer_store_format(struct ac_llvm_context *ctx,
+ LLVMValueRef rsrc,
+ LLVMValueRef data,
+ LLVMValueRef vindex,
+ LLVMValueRef voffset,
+ unsigned num_channels,
+ bool glc,
+ bool writeonly_memory)
+{
+ if (HAVE_LLVM >= 0x800) {
+ ac_build_llvm8_buffer_store_common(ctx, rsrc, data, vindex,
+ voffset, NULL, num_channels,
+ glc, false, writeonly_memory,
+ true, true);
+ } else {
+ ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset,
+ num_channels, glc, false,
+ writeonly_memory, true);
+ }
}
/* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
bool writeonly_memory,
bool swizzle_enable_hint)
{
+ /* Split 3 channel stores, becase LLVM doesn't support 3-channel
+ * intrinsics. */
+ if (num_channels == 3) {
+ LLVMValueRef v[3], v01;
+
+ for (int i = 0; i < 3; i++) {
+ v[i] = LLVMBuildExtractElement(ctx->builder, vdata,
+ LLVMConstInt(ctx->i32, i, 0), "");
+ }
+ v01 = ac_build_gather_values(ctx, v, 2);
+
+ ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset,
+ soffset, inst_offset, glc, slc,
+ writeonly_memory, swizzle_enable_hint);
+ ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset,
+ soffset, inst_offset + 8,
+ glc, slc,
+ writeonly_memory, swizzle_enable_hint);
+ return;
+ }
+
/* SWIZZLE_ENABLE requires that soffset isn't folded into voffset
* (voffset is swizzled, but soffset isn't swizzled).
* llvm.amdgcn.buffer.store doesn't have a separate soffset parameter.
*/
if (!swizzle_enable_hint) {
- /* Split 3 channel stores, becase LLVM doesn't support 3-channel
- * intrinsics. */
- if (num_channels == 3) {
- LLVMValueRef v[3], v01;
-
- for (int i = 0; i < 3; i++) {
- v[i] = LLVMBuildExtractElement(ctx->builder, vdata,
- LLVMConstInt(ctx->i32, i, 0), "");
- }
- v01 = ac_build_gather_values(ctx, v, 2);
-
- ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset,
- soffset, inst_offset, glc, slc,
- writeonly_memory, swizzle_enable_hint);
- ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset,
- soffset, inst_offset + 8,
- glc, slc,
- writeonly_memory, swizzle_enable_hint);
- return;
- }
+ LLVMValueRef offset = soffset;
- unsigned func = CLAMP(num_channels, 1, 3) - 1;
static const char *types[] = {"f32", "v2f32", "v4f32"};
- char name[256];
- LLVMValueRef offset = soffset;
if (inst_offset)
offset = LLVMBuildAdd(ctx->builder, offset,
LLVMValueRef args[] = {
ac_to_float(ctx, vdata),
LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
- LLVMConstInt(ctx->i32, 0, 0),
+ ctx->i32_0,
offset,
LLVMConstInt(ctx->i1, glc, 0),
LLVMConstInt(ctx->i1, slc, 0),
};
+ char name[256];
snprintf(name, sizeof(name), "llvm.amdgcn.buffer.store.%s",
- types[func]);
+ types[CLAMP(num_channels, 1, 3) - 1]);
ac_build_intrinsic(ctx, name, ctx->voidt,
args, ARRAY_SIZE(args),
- writeonly_memory ?
- AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY :
- AC_FUNC_ATTR_WRITEONLY);
+ ac_get_store_intr_attribs(writeonly_memory));
return;
}
- static unsigned dfmt[] = {
+ static const unsigned dfmt[] = {
V_008F0C_BUF_DATA_FORMAT_32,
V_008F0C_BUF_DATA_FORMAT_32_32,
V_008F0C_BUF_DATA_FORMAT_32_32_32,
V_008F0C_BUF_DATA_FORMAT_32_32_32_32
};
- assert(num_channels >= 1 && num_channels <= 4);
-
+ static const char *types[] = {"i32", "v2i32", "v4i32"};
LLVMValueRef args[] = {
- rsrc,
vdata,
- LLVMConstInt(ctx->i32, num_channels, 0),
- voffset ? voffset : LLVMGetUndef(ctx->i32),
+ LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
+ ctx->i32_0,
+ voffset ? voffset : ctx->i32_0,
soffset,
LLVMConstInt(ctx->i32, inst_offset, 0),
LLVMConstInt(ctx->i32, dfmt[num_channels - 1], 0),
LLVMConstInt(ctx->i32, V_008F0C_BUF_NUM_FORMAT_UINT, 0),
- LLVMConstInt(ctx->i32, voffset != NULL, 0),
- LLVMConstInt(ctx->i32, 0, 0), /* idxen */
- LLVMConstInt(ctx->i32, glc, 0),
- LLVMConstInt(ctx->i32, slc, 0),
- LLVMConstInt(ctx->i32, 0, 0), /* tfe*/
+ LLVMConstInt(ctx->i1, glc, 0),
+ LLVMConstInt(ctx->i1, slc, 0),
};
-
- /* The instruction offset field has 12 bits */
- assert(voffset || inst_offset < (1 << 12));
-
- /* The intrinsic is overloaded, we need to add a type suffix for overloading to work. */
- unsigned func = CLAMP(num_channels, 1, 3) - 1;
- const char *types[] = {"i32", "v2i32", "v4i32"};
char name[256];
- snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]);
+ snprintf(name, sizeof(name), "llvm.amdgcn.tbuffer.store.%s",
+ types[CLAMP(num_channels, 1, 3) - 1]);
ac_build_intrinsic(ctx, name, ctx->voidt,
args, ARRAY_SIZE(args),
- AC_FUNC_ATTR_LEGACY);
+ ac_get_store_intr_attribs(writeonly_memory));
}
static LLVMValueRef
{
LLVMValueRef args[] = {
LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
- vindex ? vindex : LLVMConstInt(ctx->i32, 0, 0),
+ vindex ? vindex : ctx->i32_0,
voffset,
LLVMConstInt(ctx->i1, glc, 0),
LLVMConstInt(ctx->i1, slc, 0)
ac_get_load_intr_attribs(can_speculate));
}
+static LLVMValueRef
+ac_build_llvm8_buffer_load_common(struct ac_llvm_context *ctx,
+ LLVMValueRef rsrc,
+ LLVMValueRef vindex,
+ LLVMValueRef voffset,
+ LLVMValueRef soffset,
+ unsigned num_channels,
+ bool glc,
+ bool slc,
+ bool can_speculate,
+ bool use_format,
+ bool structurized)
+{
+ LLVMValueRef args[5];
+ int idx = 0;
+ args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
+ if (structurized)
+ args[idx++] = vindex ? vindex : ctx->i32_0;
+ args[idx++] = voffset ? voffset : ctx->i32_0;
+ args[idx++] = soffset ? soffset : ctx->i32_0;
+ args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
+ unsigned func = CLAMP(num_channels, 1, 3) - 1;
+
+ LLVMTypeRef types[] = {ctx->f32, ctx->v2f32, ctx->v4f32};
+ const char *type_names[] = {"f32", "v2f32", "v4f32"};
+ const char *indexing_kind = structurized ? "struct" : "raw";
+ char name[256];
+
+ if (use_format) {
+ snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s",
+ indexing_kind, type_names[func]);
+ } else {
+ snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s",
+ indexing_kind, type_names[func]);
+ }
+
+ return ac_build_intrinsic(ctx, name, types[func], args,
+ idx,
+ ac_get_load_intr_attribs(can_speculate));
+}
+
LLVMValueRef
ac_build_buffer_load(struct ac_llvm_context *ctx,
LLVMValueRef rsrc,
if (soffset)
offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");
- /* TODO: VI and later generations can use SMEM with GLC=1.*/
- if (allow_smem && !glc && !slc) {
+ if (allow_smem && !slc &&
+ (!glc || (HAVE_LLVM >= 0x0800 && ctx->chip_class >= VI))) {
assert(vindex == NULL);
LLVMValueRef result[8];
offset = LLVMBuildAdd(ctx->builder, offset,
LLVMConstInt(ctx->i32, 4, 0), "");
}
- LLVMValueRef args[2] = {rsrc, offset};
- result[i] = ac_build_intrinsic(ctx, "llvm.SI.load.const.v4i32",
- ctx->f32, args, 2,
+ const char *intrname =
+ HAVE_LLVM >= 0x0800 ? "llvm.amdgcn.s.buffer.load.f32"
+ : "llvm.SI.load.const.v4i32";
+ unsigned num_args = HAVE_LLVM >= 0x0800 ? 3 : 2;
+ LLVMValueRef args[3] = {
+ rsrc,
+ offset,
+ glc ? ctx->i32_1 : ctx->i32_0,
+ };
+ result[i] = ac_build_intrinsic(ctx, intrname,
+ ctx->f32, args, num_args,
AC_FUNC_ATTR_READNONE |
- AC_FUNC_ATTR_LEGACY);
+ (HAVE_LLVM < 0x0800 ? AC_FUNC_ATTR_LEGACY : 0));
}
if (num_channels == 1)
return result[0];
return ac_build_gather_values(ctx, result, num_channels);
}
+ if (HAVE_LLVM >= 0x0800) {
+ return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex,
+ offset, ctx->i32_0,
+ num_channels, glc, slc,
+ can_speculate, false,
+ false);
+ }
+
return ac_build_buffer_load_common(ctx, rsrc, vindex, offset,
num_channels, glc, slc,
can_speculate, false);
bool glc,
bool can_speculate)
{
+ if (HAVE_LLVM >= 0x800) {
+ return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0,
+ num_channels, glc, false,
+ can_speculate, true, true);
+ }
return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset,
num_channels, glc, false,
can_speculate, true);
bool glc,
bool can_speculate)
{
+ if (HAVE_LLVM >= 0x800) {
+ return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0,
+ num_channels, glc, false,
+ can_speculate, true, true);
+ }
+
LLVMValueRef elem_count = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 2, 0), "");
- LLVMValueRef stride = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 1, 0), "");
+ LLVMValueRef stride = LLVMBuildExtractElement(ctx->builder, rsrc, ctx->i32_1, "");
stride = LLVMBuildLShr(ctx->builder, stride, LLVMConstInt(ctx->i32, 16, 0), "");
LLVMValueRef new_elem_count = LLVMBuildSelect(ctx->builder,
can_speculate, true);
}
+static LLVMValueRef
+ac_build_llvm8_tbuffer_load(struct ac_llvm_context *ctx,
+ LLVMValueRef rsrc,
+ LLVMValueRef vindex,
+ LLVMValueRef voffset,
+ LLVMValueRef soffset,
+ unsigned num_channels,
+ unsigned dfmt,
+ unsigned nfmt,
+ bool glc,
+ bool slc,
+ bool can_speculate,
+ bool structurized)
+{
+ LLVMValueRef args[6];
+ int idx = 0;
+ args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
+ if (structurized)
+ args[idx++] = vindex ? vindex : ctx->i32_0;
+ args[idx++] = voffset ? voffset : ctx->i32_0;
+ args[idx++] = soffset ? soffset : ctx->i32_0;
+ args[idx++] = LLVMConstInt(ctx->i32, dfmt | (nfmt << 4), 0);
+ args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
+ unsigned func = CLAMP(num_channels, 1, 3) - 1;
+
+ LLVMTypeRef types[] = {ctx->i32, ctx->v2i32, ctx->v4i32};
+ const char *type_names[] = {"i32", "v2i32", "v4i32"};
+ const char *indexing_kind = structurized ? "struct" : "raw";
+ char name[256];
+
+ snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s",
+ indexing_kind, type_names[func]);
+
+ return ac_build_intrinsic(ctx, name, types[func], args,
+ idx,
+ ac_get_load_intr_attribs(can_speculate));
+}
+
+static LLVMValueRef
+ac_build_tbuffer_load(struct ac_llvm_context *ctx,
+ LLVMValueRef rsrc,
+ LLVMValueRef vindex,
+ LLVMValueRef voffset,
+ LLVMValueRef soffset,
+ LLVMValueRef immoffset,
+ unsigned num_channels,
+ unsigned dfmt,
+ unsigned nfmt,
+ bool glc,
+ bool slc,
+ bool can_speculate,
+ bool structurized) /* only matters for LLVM 8+ */
+{
+ if (HAVE_LLVM >= 0x800) {
+ voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");
+
+ return ac_build_llvm8_tbuffer_load(ctx, rsrc, vindex, voffset,
+ soffset, num_channels,
+ dfmt, nfmt, glc, slc,
+ can_speculate, structurized);
+ }
+
+ LLVMValueRef args[] = {
+ rsrc,
+ vindex ? vindex : ctx->i32_0,
+ voffset,
+ soffset,
+ immoffset,
+ LLVMConstInt(ctx->i32, dfmt, false),
+ LLVMConstInt(ctx->i32, nfmt, false),
+ LLVMConstInt(ctx->i32, glc, false),
+ LLVMConstInt(ctx->i32, slc, false),
+ };
+ unsigned func = CLAMP(num_channels, 1, 3) - 1;
+ LLVMTypeRef types[] = {ctx->i32, ctx->v2i32, ctx->v4i32};
+ const char *type_names[] = {"i32", "v2i32", "v4i32"};
+ char name[256];
+
+ snprintf(name, sizeof(name), "llvm.amdgcn.tbuffer.load.%s",
+ type_names[func]);
+
+ return ac_build_intrinsic(ctx, name, types[func], args, 9,
+ ac_get_load_intr_attribs(can_speculate));
+}
+
+LLVMValueRef
+ac_build_struct_tbuffer_load(struct ac_llvm_context *ctx,
+ LLVMValueRef rsrc,
+ LLVMValueRef vindex,
+ LLVMValueRef voffset,
+ LLVMValueRef soffset,
+ LLVMValueRef immoffset,
+ unsigned num_channels,
+ unsigned dfmt,
+ unsigned nfmt,
+ bool glc,
+ bool slc,
+ bool can_speculate)
+{
+ return ac_build_tbuffer_load(ctx, rsrc, vindex, voffset, soffset,
+ immoffset, num_channels, dfmt, nfmt, glc,
+ slc, can_speculate, true);
+}
+
+LLVMValueRef
+ac_build_raw_tbuffer_load(struct ac_llvm_context *ctx,
+ LLVMValueRef rsrc,
+ LLVMValueRef voffset,
+ LLVMValueRef soffset,
+ LLVMValueRef immoffset,
+ unsigned num_channels,
+ unsigned dfmt,
+ unsigned nfmt,
+ bool glc,
+ bool slc,
+ bool can_speculate)
+{
+ return ac_build_tbuffer_load(ctx, rsrc, NULL, voffset, soffset,
+ immoffset, num_channels, dfmt, nfmt, glc,
+ slc, can_speculate, false);
+}
+
+LLVMValueRef
+ac_build_tbuffer_load_short(struct ac_llvm_context *ctx,
+ LLVMValueRef rsrc,
+ LLVMValueRef voffset,
+ LLVMValueRef soffset,
+ LLVMValueRef immoffset,
+ bool glc)
+{
+ unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16;
+ unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
+ LLVMValueRef res;
+
+ res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset,
+ immoffset, 1, dfmt, nfmt, glc, false,
+ false);
+
+ return LLVMBuildTrunc(ctx->builder, res, ctx->i16, "");
+}
+
+static void
+ac_build_llvm8_tbuffer_store(struct ac_llvm_context *ctx,
+ LLVMValueRef rsrc,
+ LLVMValueRef vdata,
+ LLVMValueRef vindex,
+ LLVMValueRef voffset,
+ LLVMValueRef soffset,
+ unsigned num_channels,
+ unsigned dfmt,
+ unsigned nfmt,
+ bool glc,
+ bool slc,
+ bool writeonly_memory,
+ bool structurized)
+{
+ LLVMValueRef args[7];
+ int idx = 0;
+ args[idx++] = vdata;
+ args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
+ if (structurized)
+ args[idx++] = vindex ? vindex : ctx->i32_0;
+ args[idx++] = voffset ? voffset : ctx->i32_0;
+ args[idx++] = soffset ? soffset : ctx->i32_0;
+ args[idx++] = LLVMConstInt(ctx->i32, dfmt | (nfmt << 4), 0);
+ args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
+ unsigned func = CLAMP(num_channels, 1, 3) - 1;
+
+ const char *type_names[] = {"i32", "v2i32", "v4i32"};
+ const char *indexing_kind = structurized ? "struct" : "raw";
+ char name[256];
+
+ snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.store.%s",
+ indexing_kind, type_names[func]);
+
+ ac_build_intrinsic(ctx, name, ctx->voidt, args, idx,
+ ac_get_store_intr_attribs(writeonly_memory));
+}
+
+static void
+ac_build_tbuffer_store(struct ac_llvm_context *ctx,
+ LLVMValueRef rsrc,
+ LLVMValueRef vdata,
+ LLVMValueRef vindex,
+ LLVMValueRef voffset,
+ LLVMValueRef soffset,
+ LLVMValueRef immoffset,
+ unsigned num_channels,
+ unsigned dfmt,
+ unsigned nfmt,
+ bool glc,
+ bool slc,
+ bool writeonly_memory,
+ bool structurized) /* only matters for LLVM 8+ */
+{
+ if (HAVE_LLVM >= 0x800) {
+ voffset = LLVMBuildAdd(ctx->builder,
+ voffset ? voffset : ctx->i32_0,
+ immoffset, "");
+
+ ac_build_llvm8_tbuffer_store(ctx, rsrc, vdata, vindex, voffset,
+ soffset, num_channels, dfmt, nfmt,
+ glc, slc, writeonly_memory,
+ structurized);
+ } else {
+ LLVMValueRef params[] = {
+ vdata,
+ rsrc,
+ vindex ? vindex : ctx->i32_0,
+ voffset ? voffset : ctx->i32_0,
+ soffset ? soffset : ctx->i32_0,
+ immoffset,
+ LLVMConstInt(ctx->i32, dfmt, false),
+ LLVMConstInt(ctx->i32, nfmt, false),
+ LLVMConstInt(ctx->i32, glc, false),
+ LLVMConstInt(ctx->i32, slc, false),
+ };
+ unsigned func = CLAMP(num_channels, 1, 3) - 1;
+ const char *type_names[] = {"i32", "v2i32", "v4i32"};
+ char name[256];
+
+ snprintf(name, sizeof(name), "llvm.amdgcn.tbuffer.store.%s",
+ type_names[func]);
+
+ ac_build_intrinsic(ctx, name, ctx->voidt, params, 10,
+ ac_get_store_intr_attribs(writeonly_memory));
+ }
+}
+
+void
+ac_build_struct_tbuffer_store(struct ac_llvm_context *ctx,
+ LLVMValueRef rsrc,
+ LLVMValueRef vdata,
+ LLVMValueRef vindex,
+ LLVMValueRef voffset,
+ LLVMValueRef soffset,
+ LLVMValueRef immoffset,
+ unsigned num_channels,
+ unsigned dfmt,
+ unsigned nfmt,
+ bool glc,
+ bool slc,
+ bool writeonly_memory)
+{
+ ac_build_tbuffer_store(ctx, rsrc, vdata, vindex, voffset, soffset,
+ immoffset, num_channels, dfmt, nfmt, glc, slc,
+ writeonly_memory, true);
+}
+
+void
+ac_build_raw_tbuffer_store(struct ac_llvm_context *ctx,
+ LLVMValueRef rsrc,
+ LLVMValueRef vdata,
+ LLVMValueRef voffset,
+ LLVMValueRef soffset,
+ LLVMValueRef immoffset,
+ unsigned num_channels,
+ unsigned dfmt,
+ unsigned nfmt,
+ bool glc,
+ bool slc,
+ bool writeonly_memory)
+{
+ ac_build_tbuffer_store(ctx, rsrc, vdata, NULL, voffset, soffset,
+ immoffset, num_channels, dfmt, nfmt, glc, slc,
+ writeonly_memory, false);
+}
+
+void
+ac_build_tbuffer_store_short(struct ac_llvm_context *ctx,
+ LLVMValueRef rsrc,
+ LLVMValueRef vdata,
+ LLVMValueRef voffset,
+ LLVMValueRef soffset,
+ bool glc,
+ bool writeonly_memory)
+{
+ unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16;
+ unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
+
+ vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, "");
+ vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, "");
+
+ ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset,
+ ctx->i32_0, 1, dfmt, nfmt, glc, false,
+ writeonly_memory);
+}
+
/**
* Set range metadata on an instruction. This can only be used on load and
* call instructions. If you know an instruction can only produce the values
LLVMValueRef tid_args[2];
tid_args[0] = LLVMConstInt(ctx->i32, 0xffffffff, false);
- tid_args[1] = LLVMConstInt(ctx->i32, 0, false);
+ tid_args[1] = ctx->i32_0;
tid_args[1] = ac_build_intrinsic(ctx,
"llvm.amdgcn.mbcnt.lo", ctx->i32,
tid_args, 2, AC_FUNC_ATTR_READNONE);
int idx,
LLVMValueRef val)
{
- LLVMValueRef tl, trbl, args[2];
+ unsigned tl_lanes[4], trbl_lanes[4];
+ LLVMValueRef tl, trbl;
LLVMValueRef result;
- if (ctx->chip_class >= VI) {
- LLVMValueRef thread_id, tl_tid, trbl_tid;
- thread_id = ac_get_thread_id(ctx);
-
- tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
- LLVMConstInt(ctx->i32, mask, false), "");
-
- trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
- LLVMConstInt(ctx->i32, idx, false), "");
-
- args[0] = LLVMBuildMul(ctx->builder, tl_tid,
- LLVMConstInt(ctx->i32, 4, false), "");
- args[1] = val;
- tl = ac_build_intrinsic(ctx,
- "llvm.amdgcn.ds.bpermute", ctx->i32,
- args, 2,
- AC_FUNC_ATTR_READNONE |
- AC_FUNC_ATTR_CONVERGENT);
-
- args[0] = LLVMBuildMul(ctx->builder, trbl_tid,
- LLVMConstInt(ctx->i32, 4, false), "");
- trbl = ac_build_intrinsic(ctx,
- "llvm.amdgcn.ds.bpermute", ctx->i32,
- args, 2,
- AC_FUNC_ATTR_READNONE |
- AC_FUNC_ATTR_CONVERGENT);
- } else {
- uint32_t masks[2] = {};
-
- switch (mask) {
- case AC_TID_MASK_TOP_LEFT:
- masks[0] = 0x8000;
- if (idx == 1)
- masks[1] = 0x8055;
- else
- masks[1] = 0x80aa;
-
- break;
- case AC_TID_MASK_TOP:
- masks[0] = 0x8044;
- masks[1] = 0x80ee;
- break;
- case AC_TID_MASK_LEFT:
- masks[0] = 0x80a0;
- masks[1] = 0x80f5;
- break;
- default:
- assert(0);
- }
-
- args[0] = val;
- args[1] = LLVMConstInt(ctx->i32, masks[0], false);
-
- tl = ac_build_intrinsic(ctx,
- "llvm.amdgcn.ds.swizzle", ctx->i32,
- args, 2,
- AC_FUNC_ATTR_READNONE |
- AC_FUNC_ATTR_CONVERGENT);
-
- args[1] = LLVMConstInt(ctx->i32, masks[1], false);
- trbl = ac_build_intrinsic(ctx,
- "llvm.amdgcn.ds.swizzle", ctx->i32,
- args, 2,
- AC_FUNC_ATTR_READNONE |
- AC_FUNC_ATTR_CONVERGENT);
+ for (unsigned i = 0; i < 4; ++i) {
+ tl_lanes[i] = i & mask;
+ trbl_lanes[i] = (i & mask) + idx;
}
+ tl = ac_build_quad_swizzle(ctx, val,
+ tl_lanes[0], tl_lanes[1],
+ tl_lanes[2], tl_lanes[3]);
+ trbl = ac_build_quad_swizzle(ctx, val,
+ trbl_lanes[0], trbl_lanes[1],
+ trbl_lanes[2], trbl_lanes[3]);
+
tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, "");
trbl = LLVMBuildBitCast(ctx->builder, trbl, ctx->f32, "");
result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
- if (HAVE_LLVM >= 0x0700) {
- result = ac_build_intrinsic(ctx,
- "llvm.amdgcn.wqm.f32", ctx->f32,
- &result, 1, 0);
- }
+ result = ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.f32", ctx->f32,
+ &result, 1, 0);
return result;
}
LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true);
LLVMValueRef cond = LLVMBuildOr(ctx->builder,
LLVMBuildICmp(ctx->builder, LLVMIntEQ,
- arg, LLVMConstInt(ctx->i32, 0, 0), ""),
+ arg, ctx->i32_0, ""),
LLVMBuildICmp(ctx->builder, LLVMIntEQ,
arg, all_ones, ""), "");
LLVMTypeRef type;
LLVMValueRef highest_bit;
LLVMValueRef zero;
+ unsigned bitsize;
- if (ac_get_elem_bits(ctx, LLVMTypeOf(arg)) == 64) {
+ bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg));
+ switch (bitsize) {
+ case 64:
intrin_name = "llvm.ctlz.i64";
type = ctx->i64;
highest_bit = LLVMConstInt(ctx->i64, 63, false);
zero = ctx->i64_0;
- } else {
+ break;
+ case 32:
intrin_name = "llvm.ctlz.i32";
type = ctx->i32;
highest_bit = LLVMConstInt(ctx->i32, 31, false);
zero = ctx->i32_0;
+ break;
+ case 16:
+ intrin_name = "llvm.ctlz.i16";
+ type = ctx->i16;
+ highest_bit = LLVMConstInt(ctx->i16, 15, false);
+ zero = ctx->i16_0;
+ break;
+ default:
+ unreachable(!"invalid bitsize");
+ break;
}
LLVMValueRef params[2] = {
LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a,
LLVMValueRef b)
{
+ char name[64];
+ snprintf(name, sizeof(name), "llvm.minnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a)));
LLVMValueRef args[2] = {a, b};
- return ac_build_intrinsic(ctx, "llvm.minnum.f32", ctx->f32, args, 2,
+ return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2,
AC_FUNC_ATTR_READNONE);
}
LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a,
LLVMValueRef b)
{
+ char name[64];
+ snprintf(name, sizeof(name), "llvm.maxnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a)));
LLVMValueRef args[2] = {a, b};
- return ac_build_intrinsic(ctx, "llvm.maxnum.f32", ctx->f32, args, 2,
+ return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2,
AC_FUNC_ATTR_READNONE);
}
LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
{
- if (HAVE_LLVM >= 0x0500) {
- return ac_build_fmin(ctx, ac_build_fmax(ctx, value, ctx->f32_0),
- ctx->f32_1);
- }
-
- LLVMValueRef args[3] = {
- value,
- LLVMConstReal(ctx->f32, 0),
- LLVMConstReal(ctx->f32, 1),
- };
-
- return ac_build_intrinsic(ctx, "llvm.AMDGPU.clamp.", ctx->f32, args, 3,
- AC_FUNC_ATTR_READNONE |
- AC_FUNC_ATTR_LEGACY);
+ LLVMTypeRef t = LLVMTypeOf(value);
+ return ac_build_fmin(ctx, ac_build_fmax(ctx, value, LLVMConstReal(t, 0.0)),
+ LLVMConstReal(t, 1.0));
}
void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
{
LLVMValueRef args[9];
- if (HAVE_LLVM >= 0x0500) {
- args[0] = LLVMConstInt(ctx->i32, a->target, 0);
- args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
-
- if (a->compr) {
- LLVMTypeRef i16 = LLVMInt16TypeInContext(ctx->context);
- LLVMTypeRef v2i16 = LLVMVectorType(i16, 2);
-
- args[2] = LLVMBuildBitCast(ctx->builder, a->out[0],
- v2i16, "");
- args[3] = LLVMBuildBitCast(ctx->builder, a->out[1],
- v2i16, "");
- args[4] = LLVMConstInt(ctx->i1, a->done, 0);
- args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
-
- ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16",
- ctx->voidt, args, 6, 0);
- } else {
- args[2] = a->out[0];
- args[3] = a->out[1];
- args[4] = a->out[2];
- args[5] = a->out[3];
- args[6] = LLVMConstInt(ctx->i1, a->done, 0);
- args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
-
- ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32",
- ctx->voidt, args, 8, 0);
- }
- return;
- }
+ args[0] = LLVMConstInt(ctx->i32, a->target, 0);
+ args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
+
+ if (a->compr) {
+ LLVMTypeRef i16 = LLVMInt16TypeInContext(ctx->context);
+ LLVMTypeRef v2i16 = LLVMVectorType(i16, 2);
- args[0] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
- args[1] = LLVMConstInt(ctx->i32, a->valid_mask, 0);
- args[2] = LLVMConstInt(ctx->i32, a->done, 0);
- args[3] = LLVMConstInt(ctx->i32, a->target, 0);
- args[4] = LLVMConstInt(ctx->i32, a->compr, 0);
- memcpy(args + 5, a->out, sizeof(a->out[0]) * 4);
+ args[2] = LLVMBuildBitCast(ctx->builder, a->out[0],
+ v2i16, "");
+ args[3] = LLVMBuildBitCast(ctx->builder, a->out[1],
+ v2i16, "");
+ args[4] = LLVMConstInt(ctx->i1, a->done, 0);
+ args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
- ac_build_intrinsic(ctx, "llvm.SI.export", ctx->voidt, args, 9,
- AC_FUNC_ATTR_LEGACY);
+ ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16",
+ ctx->voidt, args, 6, 0);
+ } else {
+ args[2] = a->out[0];
+ args[3] = a->out[1];
+ args[4] = a->out[2];
+ args[5] = a->out[3];
+ args[6] = LLVMConstInt(ctx->i1, a->done, 0);
+ args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
+
+ ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32",
+ ctx->voidt, args, 8, 0);
+ }
}
void ac_build_export_null(struct ac_llvm_context *ctx)
}
}
+static const char *get_atomic_name(enum ac_atomic_op op)
+{
+ switch (op) {
+ case ac_atomic_swap: return "swap";
+ case ac_atomic_add: return "add";
+ case ac_atomic_sub: return "sub";
+ case ac_atomic_smin: return "smin";
+ case ac_atomic_umin: return "umin";
+ case ac_atomic_smax: return "smax";
+ case ac_atomic_umax: return "umax";
+ case ac_atomic_and: return "and";
+ case ac_atomic_or: return "or";
+ case ac_atomic_xor: return "xor";
+ }
+ unreachable("bad atomic op");
+}
+
LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx,
struct ac_image_args *a)
{
- LLVMValueRef args[16];
- LLVMTypeRef retty = ctx->v4f32;
- const char *name = NULL;
- const char *atomic_subop = "";
- char intr_name[128], coords_type[64];
+ const char *overload[3] = { "", "", "" };
+ unsigned num_overloads = 0;
+ LLVMValueRef args[18];
+ unsigned num_args = 0;
+ enum ac_image_dim dim = a->dim;
assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 ||
!a->level_zero);
assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
a->opcode != ac_image_store_mip) ||
a->lod);
+ assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
+ (!a->compare && !a->offset));
+ assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
+ a->opcode == ac_image_get_lod) ||
+ !a->bias);
assert((a->bias ? 1 : 0) +
(a->lod ? 1 : 0) +
(a->level_zero ? 1 : 0) +
(a->derivs[0] ? 1 : 0) <= 1);
- bool sample = a->opcode == ac_image_sample ||
- a->opcode == ac_image_gather4 ||
- a->opcode == ac_image_get_lod;
- bool atomic = a->opcode == ac_image_atomic ||
- a->opcode == ac_image_atomic_cmpswap;
- bool da = a->dim == ac_image_cube ||
- a->dim == ac_image_1darray ||
- a->dim == ac_image_2darray ||
- a->dim == ac_image_2darraymsaa;
- if (a->opcode == ac_image_get_lod)
- da = false;
-
- unsigned num_coords =
- a->opcode != ac_image_get_resinfo ? ac_num_coords(a->dim) : 0;
- LLVMValueRef addr;
- unsigned num_addr = 0;
-
if (a->opcode == ac_image_get_lod) {
- switch (a->dim) {
+ switch (dim) {
case ac_image_1darray:
- num_coords = 1;
+ dim = ac_image_1d;
break;
case ac_image_2darray:
case ac_image_cube:
- num_coords = 2;
+ dim = ac_image_2d;
break;
default:
break;
}
}
- if (a->offset)
- args[num_addr++] = ac_to_integer(ctx, a->offset);
- if (a->bias)
- args[num_addr++] = ac_to_integer(ctx, a->bias);
- if (a->compare)
- args[num_addr++] = ac_to_integer(ctx, a->compare);
- if (a->derivs[0]) {
- unsigned num_derivs = ac_num_derivs(a->dim);
- for (unsigned i = 0; i < num_derivs; ++i)
- args[num_addr++] = ac_to_integer(ctx, a->derivs[i]);
- }
- for (unsigned i = 0; i < num_coords; ++i)
- args[num_addr++] = ac_to_integer(ctx, a->coords[i]);
- if (a->lod)
- args[num_addr++] = ac_to_integer(ctx, a->lod);
-
- unsigned pad_goal = util_next_power_of_two(num_addr);
- while (num_addr < pad_goal)
- args[num_addr++] = LLVMGetUndef(ctx->i32);
-
- addr = ac_build_gather_values(ctx, args, num_addr);
+ bool sample = a->opcode == ac_image_sample ||
+ a->opcode == ac_image_gather4 ||
+ a->opcode == ac_image_get_lod;
+ bool atomic = a->opcode == ac_image_atomic ||
+ a->opcode == ac_image_atomic_cmpswap;
+ LLVMTypeRef coord_type = sample ? ctx->f32 : ctx->i32;
- unsigned num_args = 0;
if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
args[num_args++] = a->data[0];
if (a->opcode == ac_image_atomic_cmpswap)
args[num_args++] = a->data[1];
}
- unsigned coords_arg = num_args;
- if (sample)
- args[num_args++] = ac_to_float(ctx, addr);
- else
- args[num_args++] = ac_to_integer(ctx, addr);
+ if (!atomic)
+ args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, false);
+
+ if (a->offset)
+ args[num_args++] = ac_to_integer(ctx, a->offset);
+ if (a->bias) {
+ args[num_args++] = ac_to_float(ctx, a->bias);
+ overload[num_overloads++] = ".f32";
+ }
+ if (a->compare)
+ args[num_args++] = ac_to_float(ctx, a->compare);
+ if (a->derivs[0]) {
+ unsigned count = ac_num_derivs(dim);
+ for (unsigned i = 0; i < count; ++i)
+ args[num_args++] = ac_to_float(ctx, a->derivs[i]);
+ overload[num_overloads++] = ".f32";
+ }
+ unsigned num_coords =
+ a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0;
+ for (unsigned i = 0; i < num_coords; ++i)
+ args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");
+ if (a->lod)
+ args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");
+ overload[num_overloads++] = sample ? ".f32" : ".i32";
args[num_args++] = a->resource;
- if (sample)
+ if (sample) {
args[num_args++] = a->sampler;
- if (!atomic) {
- args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, 0);
- if (sample)
- args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, 0);
- args[num_args++] = a->cache_policy & ac_glc ? ctx->i1true : ctx->i1false;
- args[num_args++] = a->cache_policy & ac_slc ? ctx->i1true : ctx->i1false;
- args[num_args++] = ctx->i1false; /* lwe */
- args[num_args++] = LLVMConstInt(ctx->i1, da, 0);
- } else {
- args[num_args++] = ctx->i1false; /* r128 */
- args[num_args++] = LLVMConstInt(ctx->i1, da, 0);
- args[num_args++] = a->cache_policy & ac_slc ? ctx->i1true : ctx->i1false;
+ args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);
}
+ args[num_args++] = ctx->i32_0; /* texfailctrl */
+ args[num_args++] = LLVMConstInt(ctx->i32, a->cache_policy, false);
+
+ const char *name;
+ const char *atomic_subop = "";
switch (a->opcode) {
- case ac_image_sample:
- name = "llvm.amdgcn.image.sample";
- break;
- case ac_image_gather4:
- name = "llvm.amdgcn.image.gather4";
- break;
- case ac_image_load:
- name = "llvm.amdgcn.image.load";
- break;
- case ac_image_load_mip:
- name = "llvm.amdgcn.image.load.mip";
- break;
- case ac_image_store:
- name = "llvm.amdgcn.image.store";
- retty = ctx->voidt;
- break;
- case ac_image_store_mip:
- name = "llvm.amdgcn.image.store.mip";
- retty = ctx->voidt;
- break;
+ case ac_image_sample: name = "sample"; break;
+ case ac_image_gather4: name = "gather4"; break;
+ case ac_image_load: name = "load"; break;
+ case ac_image_load_mip: name = "load.mip"; break;
+ case ac_image_store: name = "store"; break;
+ case ac_image_store_mip: name = "store.mip"; break;
case ac_image_atomic:
- case ac_image_atomic_cmpswap:
- name = "llvm.amdgcn.image.atomic.";
- retty = ctx->i32;
- if (a->opcode == ac_image_atomic_cmpswap) {
- atomic_subop = "cmpswap";
- } else {
- switch (a->atomic) {
- case ac_atomic_swap: atomic_subop = "swap"; break;
- case ac_atomic_add: atomic_subop = "add"; break;
- case ac_atomic_sub: atomic_subop = "sub"; break;
- case ac_atomic_smin: atomic_subop = "smin"; break;
- case ac_atomic_umin: atomic_subop = "umin"; break;
- case ac_atomic_smax: atomic_subop = "smax"; break;
- case ac_atomic_umax: atomic_subop = "umax"; break;
- case ac_atomic_and: atomic_subop = "and"; break;
- case ac_atomic_or: atomic_subop = "or"; break;
- case ac_atomic_xor: atomic_subop = "xor"; break;
- }
- }
- break;
- case ac_image_get_lod:
- name = "llvm.amdgcn.image.getlod";
+ name = "atomic.";
+ atomic_subop = get_atomic_name(a->atomic);
break;
- case ac_image_get_resinfo:
- name = "llvm.amdgcn.image.getresinfo";
+ case ac_image_atomic_cmpswap:
+ name = "atomic.";
+ atomic_subop = "cmpswap";
break;
- default:
- unreachable("invalid image opcode");
+ case ac_image_get_lod: name = "getlod"; break;
+ case ac_image_get_resinfo: name = "getresinfo"; break;
+ default: unreachable("invalid image opcode");
}
- ac_build_type_name_for_intr(LLVMTypeOf(args[coords_arg]), coords_type,
- sizeof(coords_type));
-
- if (atomic) {
- snprintf(intr_name, sizeof(intr_name), "llvm.amdgcn.image.atomic.%s.%s",
- atomic_subop, coords_type);
- } else {
- bool lod_suffix =
- a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
-
- snprintf(intr_name, sizeof(intr_name), "%s%s%s%s.v4f32.%s.v8i32",
- name,
- a->compare ? ".c" : "",
- a->bias ? ".b" :
- lod_suffix ? ".l" :
- a->derivs[0] ? ".d" :
- a->level_zero ? ".lz" : "",
- a->offset ? ".o" : "",
- coords_type);
- }
+ const char *dimname;
+ switch (dim) {
+ case ac_image_1d: dimname = "1d"; break;
+ case ac_image_2d: dimname = "2d"; break;
+ case ac_image_3d: dimname = "3d"; break;
+ case ac_image_cube: dimname = "cube"; break;
+ case ac_image_1darray: dimname = "1darray"; break;
+ case ac_image_2darray: dimname = "2darray"; break;
+ case ac_image_2dmsaa: dimname = "2dmsaa"; break;
+ case ac_image_2darraymsaa: dimname = "2darraymsaa"; break;
+ default: unreachable("invalid dim");
+ }
+
+ bool lod_suffix =
+ a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
+ char intr_name[96];
+ snprintf(intr_name, sizeof(intr_name),
+ "llvm.amdgcn.image.%s%s" /* base name */
+ "%s%s%s" /* sample/gather modifiers */
+ ".%s.%s%s%s%s", /* dimension and type overloads */
+ name, atomic_subop,
+ a->compare ? ".c" : "",
+ a->bias ? ".b" :
+ lod_suffix ? ".l" :
+ a->derivs[0] ? ".d" :
+ a->level_zero ? ".lz" : "",
+ a->offset ? ".o" : "",
+ dimname,
+ atomic ? "i32" : "v4f32",
+ overload[0], overload[1], overload[2]);
+
+ LLVMTypeRef retty;
+ if (atomic)
+ retty = ctx->i32;
+ else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
+ retty = ctx->voidt;
+ else
+ retty = ctx->v4f32;
LLVMValueRef result =
ac_build_intrinsic(ctx, intr_name, retty, args, num_args,
LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx,
LLVMValueRef args[2])
{
- if (HAVE_LLVM >= 0x0500) {
- LLVMTypeRef v2f16 =
- LLVMVectorType(LLVMHalfTypeInContext(ctx->context), 2);
- LLVMValueRef res =
- ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz",
- v2f16, args, 2,
- AC_FUNC_ATTR_READNONE);
- return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
- }
-
- return ac_build_intrinsic(ctx, "llvm.SI.packf16", ctx->i32, args, 2,
- AC_FUNC_ATTR_READNONE |
- AC_FUNC_ATTR_LEGACY);
-}
+ LLVMTypeRef v2f16 =
+ LLVMVectorType(LLVMHalfTypeInContext(ctx->context), 2);
-/* Upper 16 bits must be zero. */
-static LLVMValueRef ac_llvm_pack_two_int16(struct ac_llvm_context *ctx,
- LLVMValueRef val[2])
-{
- return LLVMBuildOr(ctx->builder, val[0],
- LLVMBuildShl(ctx->builder, val[1],
- LLVMConstInt(ctx->i32, 16, 0),
- ""), "");
-}
-
-/* Upper 16 bits are ignored and will be dropped. */
-static LLVMValueRef ac_llvm_pack_two_int32_as_int16(struct ac_llvm_context *ctx,
- LLVMValueRef val[2])
-{
- LLVMValueRef v[2] = {
- LLVMBuildAnd(ctx->builder, val[0],
- LLVMConstInt(ctx->i32, 0xffff, 0), ""),
- val[1],
- };
- return ac_llvm_pack_two_int16(ctx, v);
+ return ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", v2f16,
+ args, 2, AC_FUNC_ATTR_READNONE);
}
LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx,
LLVMValueRef args[2])
{
- if (HAVE_LLVM >= 0x0600) {
- LLVMValueRef res =
- ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16",
- ctx->v2i16, args, 2,
- AC_FUNC_ATTR_READNONE);
- return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
- }
-
- LLVMValueRef val[2];
-
- for (int chan = 0; chan < 2; chan++) {
- /* Clamp between [-1, 1]. */
- val[chan] = ac_build_fmin(ctx, args[chan], ctx->f32_1);
- val[chan] = ac_build_fmax(ctx, val[chan], LLVMConstReal(ctx->f32, -1));
- /* Convert to a signed integer in [-32767, 32767]. */
- val[chan] = LLVMBuildFMul(ctx->builder, val[chan],
- LLVMConstReal(ctx->f32, 32767), "");
- /* If positive, add 0.5, else add -0.5. */
- val[chan] = LLVMBuildFAdd(ctx->builder, val[chan],
- LLVMBuildSelect(ctx->builder,
- LLVMBuildFCmp(ctx->builder, LLVMRealOGE,
- val[chan], ctx->f32_0, ""),
- LLVMConstReal(ctx->f32, 0.5),
- LLVMConstReal(ctx->f32, -0.5), ""), "");
- val[chan] = LLVMBuildFPToSI(ctx->builder, val[chan], ctx->i32, "");
- }
- return ac_llvm_pack_two_int32_as_int16(ctx, val);
+ LLVMValueRef res =
+ ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16",
+ ctx->v2i16, args, 2,
+ AC_FUNC_ATTR_READNONE);
+ return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
}
LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx,
LLVMValueRef args[2])
{
- if (HAVE_LLVM >= 0x0600) {
- LLVMValueRef res =
- ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16",
- ctx->v2i16, args, 2,
- AC_FUNC_ATTR_READNONE);
- return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
- }
-
- LLVMValueRef val[2];
-
- for (int chan = 0; chan < 2; chan++) {
- val[chan] = ac_build_clamp(ctx, args[chan]);
- val[chan] = LLVMBuildFMul(ctx->builder, val[chan],
- LLVMConstReal(ctx->f32, 65535), "");
- val[chan] = LLVMBuildFAdd(ctx->builder, val[chan],
- LLVMConstReal(ctx->f32, 0.5), "");
- val[chan] = LLVMBuildFPToUI(ctx->builder, val[chan],
- ctx->i32, "");
- }
- return ac_llvm_pack_two_int32_as_int16(ctx, val);
+ LLVMValueRef res =
+ ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16",
+ ctx->v2i16, args, 2,
+ AC_FUNC_ATTR_READNONE);
+ return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
}
/* The 8-bit and 10-bit clamping is for HW workarounds. */
bits != 10 ? max_rgb : ctx->i32_1;
LLVMValueRef min_alpha =
bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
- bool has_intrinsic = HAVE_LLVM >= 0x0600;
/* Clamp. */
- if (!has_intrinsic || bits != 16) {
+ if (bits != 16) {
for (int i = 0; i < 2; i++) {
bool alpha = hi && i == 1;
args[i] = ac_build_imin(ctx, args[i],
}
}
- if (has_intrinsic) {
- LLVMValueRef res =
- ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16",
- ctx->v2i16, args, 2,
- AC_FUNC_ATTR_READNONE);
- return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
- }
-
- return ac_llvm_pack_two_int32_as_int16(ctx, args);
+ LLVMValueRef res =
+ ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16",
+ ctx->v2i16, args, 2,
+ AC_FUNC_ATTR_READNONE);
+ return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
}
/* The 8-bit and 10-bit clamping is for HW workarounds. */
bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0);
LLVMValueRef max_alpha =
bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
- bool has_intrinsic = HAVE_LLVM >= 0x0600;
/* Clamp. */
- if (!has_intrinsic || bits != 16) {
+ if (bits != 16) {
for (int i = 0; i < 2; i++) {
bool alpha = hi && i == 1;
args[i] = ac_build_umin(ctx, args[i],
}
}
- if (has_intrinsic) {
- LLVMValueRef res =
- ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16",
- ctx->v2i16, args, 2,
- AC_FUNC_ATTR_READNONE);
- return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
- }
-
- return ac_llvm_pack_two_int16(ctx, args);
+ LLVMValueRef res =
+ ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16",
+ ctx->v2i16, args, 2,
+ AC_FUNC_ATTR_READNONE);
+ return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
}
LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1)
{
- assert(HAVE_LLVM >= 0x0600);
return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1,
&i1, 1, AC_FUNC_ATTR_READNONE);
}
void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1)
{
- if (HAVE_LLVM >= 0x0600) {
- ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt,
- &i1, 1, 0);
- return;
- }
-
- LLVMValueRef value = LLVMBuildSelect(ctx->builder, i1,
- LLVMConstReal(ctx->f32, 1),
- LLVMConstReal(ctx->f32, -1), "");
- ac_build_intrinsic(ctx, "llvm.AMDGPU.kill", ctx->voidt,
- &value, 1, AC_FUNC_ATTR_LEGACY);
+ ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt,
+ &i1, 1, 0);
}
LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input,
width,
};
- if (HAVE_LLVM >= 0x0500) {
- return ac_build_intrinsic(ctx,
- is_signed ? "llvm.amdgcn.sbfe.i32" :
- "llvm.amdgcn.ubfe.i32",
- ctx->i32, args, 3,
- AC_FUNC_ATTR_READNONE);
- }
-
return ac_build_intrinsic(ctx,
- is_signed ? "llvm.AMDGPU.bfe.i32" :
- "llvm.AMDGPU.bfe.u32",
+ is_signed ? "llvm.amdgcn.sbfe.i32" :
+ "llvm.amdgcn.ubfe.i32",
ctx->i32, args, 3,
- AC_FUNC_ATTR_READNONE |
- AC_FUNC_ATTR_LEGACY);
+ AC_FUNC_ATTR_READNONE);
+}
+
+LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0,
+ LLVMValueRef s1, LLVMValueRef s2)
+{
+ return LLVMBuildAdd(ctx->builder,
+ LLVMBuildMul(ctx->builder, s0, s1, ""), s2, "");
+}
+
+LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0,
+ LLVMValueRef s1, LLVMValueRef s2)
+{
+ return LLVMBuildFAdd(ctx->builder,
+ LLVMBuildFMul(ctx->builder, s0, s1, ""), s2, "");
}
void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned simm16)
char *intr;
if (bitsize == 32) {
- intr = "llvm.floor.f32";
+ intr = "llvm.amdgcn.fract.f32";
type = ctx->f32;
} else {
- intr = "llvm.floor.f64";
+ intr = "llvm.amdgcn.fract.f64";
type = ctx->f64;
}
LLVMValueRef params[] = {
src0,
};
- LLVMValueRef floor = ac_build_intrinsic(ctx, intr, type, params, 1,
- AC_FUNC_ATTR_READNONE);
- return LLVMBuildFSub(ctx->builder, src0, floor, "");
+ return ac_build_intrinsic(ctx, intr, type, params, 1,
+ AC_FUNC_ATTR_READNONE);
}
LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0,
unsigned bitsize)
{
- LLVMValueRef cmp, val, zero, one;
- LLVMTypeRef type;
-
- if (bitsize == 32) {
- type = ctx->i32;
- zero = ctx->i32_0;
- one = ctx->i32_1;
- } else {
- type = ctx->i64;
- zero = ctx->i64_0;
- one = ctx->i64_1;
- }
+ LLVMTypeRef type = LLVMIntTypeInContext(ctx->context, bitsize);
+ LLVMValueRef zero = LLVMConstInt(type, 0, false);
+ LLVMValueRef one = LLVMConstInt(type, 1, false);
+ LLVMValueRef cmp, val;
cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, src0, zero, "");
val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGE, val, zero, "");
return val;
}
-#define AC_EXP_TARGET (HAVE_LLVM >= 0x0500 ? 0 : 3)
-#define AC_EXP_ENABLED_CHANNELS (HAVE_LLVM >= 0x0500 ? 1 : 0)
-#define AC_EXP_OUT0 (HAVE_LLVM >= 0x0500 ? 2 : 5)
+LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)
+{
+ LLVMValueRef result;
+ unsigned bitsize;
+
+ bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
+
+ switch (bitsize) {
+ case 64:
+ result = ac_build_intrinsic(ctx, "llvm.ctpop.i64", ctx->i64,
+ (LLVMValueRef []) { src0 }, 1,
+ AC_FUNC_ATTR_READNONE);
+
+ result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
+ break;
+ case 32:
+ result = ac_build_intrinsic(ctx, "llvm.ctpop.i32", ctx->i32,
+ (LLVMValueRef []) { src0 }, 1,
+ AC_FUNC_ATTR_READNONE);
+ break;
+ case 16:
+ result = ac_build_intrinsic(ctx, "llvm.ctpop.i16", ctx->i16,
+ (LLVMValueRef []) { src0 }, 1,
+ AC_FUNC_ATTR_READNONE);
+ break;
+ default:
+ unreachable(!"invalid bitsize");
+ break;
+ }
+
+ return result;
+}
+
+LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx,
+ LLVMValueRef src0)
+{
+ LLVMValueRef result;
+ unsigned bitsize;
+
+ bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
+
+ switch (bitsize) {
+ case 32:
+ result = ac_build_intrinsic(ctx, "llvm.bitreverse.i32", ctx->i32,
+ (LLVMValueRef []) { src0 }, 1,
+ AC_FUNC_ATTR_READNONE);
+ break;
+ case 16:
+ result = ac_build_intrinsic(ctx, "llvm.bitreverse.i16", ctx->i16,
+ (LLVMValueRef []) { src0 }, 1,
+ AC_FUNC_ATTR_READNONE);
+ break;
+ default:
+ unreachable(!"invalid bitsize");
+ break;
+ }
+
+ return result;
+}
+
+#define AC_EXP_TARGET 0
+#define AC_EXP_ENABLED_CHANNELS 1
+#define AC_EXP_OUT0 2
enum ac_ir_type {
AC_IR_UNDEF,
{
unsigned lds_size = ctx->chip_class >= CIK ? 65536 : 32768;
ctx->lds = LLVMBuildIntToPtr(ctx->builder, ctx->i32_0,
- LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_LOCAL_ADDR_SPACE),
+ LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_ADDR_SPACE_LDS),
"lds");
}
const char *intrin_name;
LLVMTypeRef type;
LLVMValueRef zero;
- if (src0_bitsize == 64) {
+
+ switch (src0_bitsize) {
+ case 64:
intrin_name = "llvm.cttz.i64";
type = ctx->i64;
zero = ctx->i64_0;
- } else {
+ break;
+ case 32:
intrin_name = "llvm.cttz.i32";
type = ctx->i32;
zero = ctx->i32_0;
+ break;
+ case 16:
+ intrin_name = "llvm.cttz.i16";
+ type = ctx->i16;
+ zero = ctx->i16_0;
+ break;
+ default:
+ unreachable(!"invalid bitsize");
}
LLVMValueRef params[2] = {
*
* The hardware already implements the correct behavior.
*/
- LLVMConstInt(ctx->i1, 1, false),
+ ctx->i1true,
};
LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin_name, type,
LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type)
{
return LLVMPointerType(LLVMArrayType(elem_type, 0),
- AC_CONST_ADDR_SPACE);
+ AC_ADDR_SPACE_CONST);
}
LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type)
{
- if (!HAVE_32BIT_POINTERS)
- return ac_array_in_const_addr_space(elem_type);
-
return LLVMPointerType(LLVMArrayType(elem_type, 0),
- AC_CONST_32BIT_ADDR_SPACE);
+ AC_ADDR_SPACE_CONST_32BIT);
}
static struct ac_llvm_flow *
ctx->flow_depth--;
}
-static void if_cond_emit(struct ac_llvm_context *ctx, LLVMValueRef cond,
- int label_id)
+void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id)
{
struct ac_llvm_flow *flow = push_flow(ctx);
LLVMBasicBlockRef if_block;
{
LLVMValueRef cond = LLVMBuildFCmp(ctx->builder, LLVMRealUNE,
value, ctx->f32_0, "");
- if_cond_emit(ctx, cond, label_id);
+ ac_build_ifcc(ctx, cond, label_id);
}
void ac_build_uif(struct ac_llvm_context *ctx, LLVMValueRef value,
LLVMValueRef cond = LLVMBuildICmp(ctx->builder, LLVMIntNE,
ac_to_integer(ctx, value),
ctx->i32_0, "");
- if_cond_emit(ctx, cond, label_id);
+ ac_build_ifcc(ctx, cond, label_id);
}
-LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, LLVMTypeRef type,
+LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type,
const char *name)
{
LLVMBuilderRef builder = ac->builder;
}
res = LLVMBuildAlloca(first_builder, type, name);
- LLVMBuildStore(builder, LLVMConstNull(type), res);
-
LLVMDisposeBuilder(first_builder);
-
return res;
}
-LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac,
+LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac,
LLVMTypeRef type, const char *name)
{
- LLVMValueRef ptr = ac_build_alloca(ac, type, name);
- LLVMBuildStore(ac->builder, LLVMGetUndef(type), ptr);
+ LLVMValueRef ptr = ac_build_alloca_undef(ac, type, name);
+ LLVMBuildStore(ac->builder, LLVMConstNull(type), ptr);
return ptr;
}
if (count == num_components)
return value;
- LLVMValueRef masks[] = {
- LLVMConstInt(ctx->i32, 0, false), LLVMConstInt(ctx->i32, 1, false),
- LLVMConstInt(ctx->i32, 2, false), LLVMConstInt(ctx->i32, 3, false)};
+ LLVMValueRef masks[MAX2(count, 2)];
+ masks[0] = ctx->i32_0;
+ masks[1] = ctx->i32_1;
+ for (unsigned i = 2; i < count; i++)
+ masks[i] = LLVMConstInt(ctx->i32, i, false);
if (count == 1)
return LLVMBuildExtractElement(ctx->builder, value, masks[0],
final_sample = LLVMBuildMul(ac->builder, addr[sample_chan],
LLVMConstInt(ac->i32, 4, 0), "");
final_sample = LLVMBuildLShr(ac->builder, fmask_value, final_sample, "");
+ /* Mask the sample index by 0x7, because 0x8 means an unknown value
+ * with EQAA, so those will map to 0. */
final_sample = LLVMBuildAnd(ac->builder, final_sample,
- LLVMConstInt(ac->i32, 0xF, 0), "");
+ LLVMConstInt(ac->i32, 0x7, 0), "");
/* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
- * resource descriptor is 0 (invalid),
+ * resource descriptor is 0 (invalid).
*/
LLVMValueRef tmp;
tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, "");
ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src,
LLVMValueRef inactive)
{
- char name[32], type[8];
+ char name[33], type[8];
LLVMTypeRef src_type = LLVMTypeOf(src);
src = ac_to_integer(ctx, src);
inactive = ac_to_integer(ctx, inactive);
}
}
-/* TODO: add inclusive and excluse scan functions for SI chip class. */
+/**
+ * \param maxprefix specifies that the result only needs to be correct for a
+ * prefix of this many threads
+ *
+ * TODO: add inclusive and excluse scan functions for SI chip class.
+ */
static LLVMValueRef
-ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValueRef identity)
+ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValueRef identity,
+ unsigned maxprefix)
{
LLVMValueRef result, tmp;
result = src;
+ if (maxprefix <= 1)
+ return result;
tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
result = ac_build_alu_op(ctx, result, tmp, op);
+ if (maxprefix <= 2)
+ return result;
tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
result = ac_build_alu_op(ctx, result, tmp, op);
+ if (maxprefix <= 3)
+ return result;
tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
result = ac_build_alu_op(ctx, result, tmp, op);
+ if (maxprefix <= 4)
+ return result;
tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
result = ac_build_alu_op(ctx, result, tmp, op);
+ if (maxprefix <= 8)
+ return result;
tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
result = ac_build_alu_op(ctx, result, tmp, op);
+ if (maxprefix <= 16)
+ return result;
tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
result = ac_build_alu_op(ctx, result, tmp, op);
+ if (maxprefix <= 32)
+ return result;
tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
result = ac_build_alu_op(ctx, result, tmp, op);
return result;
LLVMValueRef
ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
- ac_build_optimization_barrier(ctx, &src);
LLVMValueRef result;
- LLVMValueRef identity = get_reduction_identity(ctx, op,
- ac_get_type_size(LLVMTypeOf(src)));
- result = LLVMBuildBitCast(ctx->builder,
- ac_build_set_inactive(ctx, src, identity),
- LLVMTypeOf(identity), "");
- result = ac_build_scan(ctx, op, result, identity);
+
+ if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
+ LLVMBuilderRef builder = ctx->builder;
+ src = LLVMBuildZExt(builder, src, ctx->i32, "");
+ result = ac_build_ballot(ctx, src);
+ result = ac_build_mbcnt(ctx, result);
+ result = LLVMBuildAdd(builder, result, src, "");
+ return result;
+ }
+
+ ac_build_optimization_barrier(ctx, &src);
+
+ LLVMValueRef identity =
+ get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
+ result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
+ LLVMTypeOf(identity), "");
+ result = ac_build_scan(ctx, op, result, identity, 64);
return ac_build_wwm(ctx, result);
}
LLVMValueRef
ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
- ac_build_optimization_barrier(ctx, &src);
LLVMValueRef result;
- LLVMValueRef identity = get_reduction_identity(ctx, op,
- ac_get_type_size(LLVMTypeOf(src)));
- result = LLVMBuildBitCast(ctx->builder,
- ac_build_set_inactive(ctx, src, identity),
- LLVMTypeOf(identity), "");
+
+ if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
+ LLVMBuilderRef builder = ctx->builder;
+ src = LLVMBuildZExt(builder, src, ctx->i32, "");
+ result = ac_build_ballot(ctx, src);
+ result = ac_build_mbcnt(ctx, result);
+ return result;
+ }
+
+ ac_build_optimization_barrier(ctx, &src);
+
+ LLVMValueRef identity =
+ get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
+ result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
+ LLVMTypeOf(identity), "");
result = ac_build_dpp(ctx, identity, result, dpp_wf_sr1, 0xf, 0xf, false);
- result = ac_build_scan(ctx, op, result, identity);
+ result = ac_build_scan(ctx, op, result, identity, 64);
return ac_build_wwm(ctx, result);
}
}
}
+/**
+ * "Top half" of a scan that reduces per-wave values across an entire
+ * workgroup.
+ *
+ * The source value must be present in the highest lane of the wave, and the
+ * highest lane must be live.
+ */
+void
+ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
+{
+ if (ws->maxwaves <= 1)
+ return;
+
+ const LLVMValueRef i32_63 = LLVMConstInt(ctx->i32, 63, false);
+ LLVMBuilderRef builder = ctx->builder;
+ LLVMValueRef tid = ac_get_thread_id(ctx);
+ LLVMValueRef tmp;
+
+ tmp = LLVMBuildICmp(builder, LLVMIntEQ, tid, i32_63, "");
+ ac_build_ifcc(ctx, tmp, 1000);
+ LLVMBuildStore(builder, ws->src, LLVMBuildGEP(builder, ws->scratch, &ws->waveidx, 1, ""));
+ ac_build_endif(ctx, 1000);
+}
+
+/**
+ * "Bottom half" of a scan that reduces per-wave values across an entire
+ * workgroup.
+ *
+ * The caller must place a barrier between the top and bottom halves.
+ */
+void
+ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
+{
+ const LLVMTypeRef type = LLVMTypeOf(ws->src);
+ const LLVMValueRef identity =
+ get_reduction_identity(ctx, ws->op, ac_get_type_size(type));
+
+ if (ws->maxwaves <= 1) {
+ ws->result_reduce = ws->src;
+ ws->result_inclusive = ws->src;
+ ws->result_exclusive = identity;
+ return;
+ }
+ assert(ws->maxwaves <= 32);
+
+ LLVMBuilderRef builder = ctx->builder;
+ LLVMValueRef tid = ac_get_thread_id(ctx);
+ LLVMBasicBlockRef bbs[2];
+ LLVMValueRef phivalues_scan[2];
+ LLVMValueRef tmp, tmp2;
+
+ bbs[0] = LLVMGetInsertBlock(builder);
+ phivalues_scan[0] = LLVMGetUndef(type);
+
+ if (ws->enable_reduce)
+ tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->numwaves, "");
+ else if (ws->enable_inclusive)
+ tmp = LLVMBuildICmp(builder, LLVMIntULE, tid, ws->waveidx, "");
+ else
+ tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->waveidx, "");
+ ac_build_ifcc(ctx, tmp, 1001);
+ {
+ tmp = LLVMBuildLoad(builder, LLVMBuildGEP(builder, ws->scratch, &tid, 1, ""), "");
+
+ ac_build_optimization_barrier(ctx, &tmp);
+
+ bbs[1] = LLVMGetInsertBlock(builder);
+ phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves);
+ }
+ ac_build_endif(ctx, 1001);
+
+ const LLVMValueRef scan = ac_build_phi(ctx, type, 2, phivalues_scan, bbs);
+
+ if (ws->enable_reduce) {
+ tmp = LLVMBuildSub(builder, ws->numwaves, ctx->i32_1, "");
+ ws->result_reduce = ac_build_readlane(ctx, scan, tmp);
+ }
+ if (ws->enable_inclusive)
+ ws->result_inclusive = ac_build_readlane(ctx, scan, ws->waveidx);
+ if (ws->enable_exclusive) {
+ tmp = LLVMBuildSub(builder, ws->waveidx, ctx->i32_1, "");
+ tmp = ac_build_readlane(ctx, scan, tmp);
+ tmp2 = LLVMBuildICmp(builder, LLVMIntEQ, ws->waveidx, ctx->i32_0, "");
+ ws->result_exclusive = LLVMBuildSelect(builder, tmp2, identity, tmp, "");
+ }
+}
+
+/**
+ * Inclusive scan of a per-wave value across an entire workgroup.
+ *
+ * This implies an s_barrier instruction.
+ *
+ * Unlike ac_build_inclusive_scan, the caller \em must ensure that all threads
+ * of the workgroup are live. (This requirement cannot easily be relaxed in a
+ * useful manner because of the barrier in the algorithm.)
+ */
+void
+ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
+{
+ ac_build_wg_wavescan_top(ctx, ws);
+ ac_build_s_barrier(ctx);
+ ac_build_wg_wavescan_bottom(ctx, ws);
+}
+
+/**
+ * "Top half" of a scan that reduces per-thread values across an entire
+ * workgroup.
+ *
+ * All lanes must be active when this code runs.
+ */
+void
+ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
+{
+ if (ws->enable_exclusive) {
+ ws->extra = ac_build_exclusive_scan(ctx, ws->src, ws->op);
+ if (LLVMTypeOf(ws->src) == ctx->i1 && ws->op == nir_op_iadd)
+ ws->src = LLVMBuildZExt(ctx->builder, ws->src, ctx->i32, "");
+ ws->src = ac_build_alu_op(ctx, ws->extra, ws->src, ws->op);
+ } else {
+ ws->src = ac_build_inclusive_scan(ctx, ws->src, ws->op);
+ }
+
+ bool enable_inclusive = ws->enable_inclusive;
+ bool enable_exclusive = ws->enable_exclusive;
+ ws->enable_inclusive = false;
+ ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
+ ac_build_wg_wavescan_top(ctx, ws);
+ ws->enable_inclusive = enable_inclusive;
+ ws->enable_exclusive = enable_exclusive;
+}
+
+/**
+ * "Bottom half" of a scan that reduces per-thread values across an entire
+ * workgroup.
+ *
+ * The caller must place a barrier between the top and bottom halves.
+ */
+void
+ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
+{
+ bool enable_inclusive = ws->enable_inclusive;
+ bool enable_exclusive = ws->enable_exclusive;
+ ws->enable_inclusive = false;
+ ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
+ ac_build_wg_wavescan_bottom(ctx, ws);
+ ws->enable_inclusive = enable_inclusive;
+ ws->enable_exclusive = enable_exclusive;
+
+ /* ws->result_reduce is already the correct value */
+ if (ws->enable_inclusive)
+ ws->result_inclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->src, ws->op);
+ if (ws->enable_exclusive)
+ ws->result_exclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->extra, ws->op);
+}
+
+/**
+ * A scan that reduces per-thread values across an entire workgroup.
+ *
+ * The caller must ensure that all lanes are active when this code runs
+ * (WWM is insufficient!), because there is an implied barrier.
+ */
+void
+ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
+{
+ ac_build_wg_scan_top(ctx, ws);
+ ac_build_s_barrier(ctx);
+ ac_build_wg_scan_bottom(ctx, ws);
+}
+
LLVMValueRef
ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3)
{
unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3);
- if (ctx->chip_class >= VI && HAVE_LLVM >= 0x0600) {
+ if (ctx->chip_class >= VI) {
return ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false);
} else {
return ac_build_ds_swizzle(ctx, src, (1 << 15) | mask);