#include "util/bitscan.h"
#include "util/macros.h"
#include "util/u_atomic.h"
+#include "util/u_math.h"
#include "sid.h"
#include "shader_enums.h"
+#define AC_LLVM_INITIAL_CF_DEPTH 4
+
+/* Data for if/else/endif and bgnloop/endloop control flow structures.
+ */
+struct ac_llvm_flow {
+ /* Loop exit or next part of if/else/endif. */
+ LLVMBasicBlockRef next_block;
+ LLVMBasicBlockRef loop_entry_block;
+};
+
/* Initialize module-independent parts of the context.
*
* The caller is responsible for initializing ctx::module and ctx::builder.
*/
void
-ac_llvm_context_init(struct ac_llvm_context *ctx, LLVMContextRef context,
+ac_llvm_context_init(struct ac_llvm_context *ctx,
enum chip_class chip_class, enum radeon_family family)
{
LLVMValueRef args[1];
+ ctx->context = LLVMContextCreate();
+
ctx->chip_class = chip_class;
ctx->family = family;
-
- ctx->context = context;
ctx->module = NULL;
ctx->builder = NULL;
ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
+ ctx->intptr = HAVE_32BIT_POINTERS ? ctx->i32 : ctx->i64;
ctx->f16 = LLVMHalfTypeInContext(ctx->context);
ctx->f32 = LLVMFloatTypeInContext(ctx->context);
ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);
}
+/* Free per-context resources allocated by the ac_llvm control-flow
+ * helpers (the if/else/loop stack). Safe to call repeatedly: the
+ * pointer is cleared and the capacity reset after freeing.
+ *
+ * NOTE(review): ctx->context created in ac_llvm_context_init is not
+ * disposed here — confirm callers release the LLVMContext separately.
+ */
+void
+ac_llvm_context_dispose(struct ac_llvm_context *ctx)
+{
+	free(ctx->flow);
+	ctx->flow = NULL;
+	ctx->flow_depth_max = 0;
+}
+
int
ac_get_llvm_num_components(LLVMValueRef value)
{
LLVMConstInt(ac->i32, index, false), "");
}
+/* Return the bit width of a scalar type, or of the element type if
+ * \p type is a vector. Integer widths are queried from LLVM directly;
+ * floating-point types are matched against the context's f16/f32/f64.
+ * Any other type kind is a caller bug (unreachable).
+ */
+int
+ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type)
+{
+	if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
+		type = LLVMGetElementType(type);
+
+	if (LLVMGetTypeKind(type) == LLVMIntegerTypeKind)
+		return LLVMGetIntTypeWidth(type);
+
+	if (type == ctx->f16)
+		return 16;
+	if (type == ctx->f32)
+		return 32;
+	if (type == ctx->f64)
+		return 64;
+
+	unreachable("Unhandled type kind in get_elem_bits");
+}
+
unsigned
ac_get_type_size(LLVMTypeRef type)
{
switch (kind) {
case LLVMIntegerTypeKind:
return LLVMGetIntTypeWidth(type) / 8;
+ case LLVMHalfTypeKind:
+ return 2;
case LLVMFloatTypeKind:
return 4;
case LLVMDoubleTypeKind:
+ return 8;
case LLVMPointerTypeKind:
+ if (LLVMGetPointerAddressSpace(type) == AC_CONST_32BIT_ADDR_SPACE)
+ return 4;
return 8;
case LLVMVectorTypeKind:
return LLVMGetVectorSize(type) *
case LLVMIntegerTypeKind:
snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
break;
+ case LLVMHalfTypeKind:
+ snprintf(buf, bufsize, "f16");
+ break;
case LLVMFloatTypeKind:
snprintf(buf, bufsize, "f32");
break;
*/
ac_build_optimization_barrier(ctx, &args[0]);
- if (LLVMTypeOf(args[0]) != ctx->i32)
- args[0] = LLVMBuildBitCast(ctx->builder, args[0], ctx->i32, "");
+ args[0] = ac_to_integer(ctx, args[0]);
return ac_build_intrinsic(ctx,
"llvm.amdgcn.icmp.i32",
bool writeonly_memory,
bool swizzle_enable_hint)
{
+	/* Split 3 channel stores, because LLVM doesn't support 3-channel
+	 * intrinsics. */
+ if (num_channels == 3) {
+ LLVMValueRef v[3], v01;
+
+ for (int i = 0; i < 3; i++) {
+ v[i] = LLVMBuildExtractElement(ctx->builder, vdata,
+ LLVMConstInt(ctx->i32, i, 0), "");
+ }
+ v01 = ac_build_gather_values(ctx, v, 2);
+
+ ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset,
+ soffset, inst_offset, glc, slc,
+ writeonly_memory, swizzle_enable_hint);
+ ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset,
+ soffset, inst_offset + 8,
+ glc, slc,
+ writeonly_memory, swizzle_enable_hint);
+ return;
+ }
+
/* SWIZZLE_ENABLE requires that soffset isn't folded into voffset
* (voffset is swizzled, but soffset isn't swizzled).
* llvm.amdgcn.buffer.store doesn't have a separate soffset parameter.
*/
if (!swizzle_enable_hint) {
- /* Split 3 channel stores, becase LLVM doesn't support 3-channel
- * intrinsics. */
- if (num_channels == 3) {
- LLVMValueRef v[3], v01;
-
- for (int i = 0; i < 3; i++) {
- v[i] = LLVMBuildExtractElement(ctx->builder, vdata,
- LLVMConstInt(ctx->i32, i, 0), "");
- }
- v01 = ac_build_gather_values(ctx, v, 2);
-
- ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset,
- soffset, inst_offset, glc, slc,
- writeonly_memory, swizzle_enable_hint);
- ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset,
- soffset, inst_offset + 8,
- glc, slc,
- writeonly_memory, swizzle_enable_hint);
- return;
- }
+ LLVMValueRef offset = soffset;
- unsigned func = CLAMP(num_channels, 1, 3) - 1;
static const char *types[] = {"f32", "v2f32", "v4f32"};
- char name[256];
- LLVMValueRef offset = soffset;
if (inst_offset)
offset = LLVMBuildAdd(ctx->builder, offset,
LLVMConstInt(ctx->i1, slc, 0),
};
+ char name[256];
snprintf(name, sizeof(name), "llvm.amdgcn.buffer.store.%s",
- types[func]);
+ types[CLAMP(num_channels, 1, 3) - 1]);
ac_build_intrinsic(ctx, name, ctx->voidt,
args, ARRAY_SIZE(args),
writeonly_memory ?
- AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY :
- AC_FUNC_ATTR_WRITEONLY);
+ AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY :
+ AC_FUNC_ATTR_WRITEONLY);
return;
}
- static unsigned dfmt[] = {
+ static const unsigned dfmt[] = {
V_008F0C_BUF_DATA_FORMAT_32,
V_008F0C_BUF_DATA_FORMAT_32_32,
V_008F0C_BUF_DATA_FORMAT_32_32_32,
V_008F0C_BUF_DATA_FORMAT_32_32_32_32
};
- assert(num_channels >= 1 && num_channels <= 4);
-
+ static const char *types[] = {"i32", "v2i32", "v4i32"};
LLVMValueRef args[] = {
- rsrc,
vdata,
- LLVMConstInt(ctx->i32, num_channels, 0),
- voffset ? voffset : LLVMGetUndef(ctx->i32),
+ LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
+ LLVMConstInt(ctx->i32, 0, 0),
+ voffset ? voffset : LLVMConstInt(ctx->i32, 0, 0),
soffset,
LLVMConstInt(ctx->i32, inst_offset, 0),
LLVMConstInt(ctx->i32, dfmt[num_channels - 1], 0),
LLVMConstInt(ctx->i32, V_008F0C_BUF_NUM_FORMAT_UINT, 0),
- LLVMConstInt(ctx->i32, voffset != NULL, 0),
- LLVMConstInt(ctx->i32, 0, 0), /* idxen */
- LLVMConstInt(ctx->i32, glc, 0),
- LLVMConstInt(ctx->i32, slc, 0),
- LLVMConstInt(ctx->i32, 0, 0), /* tfe*/
+ LLVMConstInt(ctx->i1, glc, 0),
+ LLVMConstInt(ctx->i1, slc, 0),
};
-
- /* The instruction offset field has 12 bits */
- assert(voffset || inst_offset < (1 << 12));
-
- /* The intrinsic is overloaded, we need to add a type suffix for overloading to work. */
- unsigned func = CLAMP(num_channels, 1, 3) - 1;
- const char *types[] = {"i32", "v2i32", "v4i32"};
char name[256];
- snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]);
+ snprintf(name, sizeof(name), "llvm.amdgcn.tbuffer.store.%s",
+ types[CLAMP(num_channels, 1, 3) - 1]);
ac_build_intrinsic(ctx, name, ctx->voidt,
args, ARRAY_SIZE(args),
- AC_FUNC_ATTR_LEGACY);
+ writeonly_memory ?
+ AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY :
+ AC_FUNC_ATTR_WRITEONLY);
}
static LLVMValueRef
can_speculate, true);
}
+/* Buffer-format load with a patched resource descriptor:
+ * dword 2 (element count) of \p rsrc is replaced by
+ * max(element count, stride taken from the high 16 bits of dword 1)
+ * before the load is issued.
+ *
+ * NOTE(review): the "gfx9_safe" name suggests this works around a GFX9
+ * descriptor/bounds-checking quirk — confirm against the callers and
+ * hardware docs; the code itself only shows the max() patching.
+ */
+LLVMValueRef ac_build_buffer_load_format_gfx9_safe(struct ac_llvm_context *ctx,
+                                                  LLVMValueRef rsrc,
+                                                  LLVMValueRef vindex,
+                                                  LLVMValueRef voffset,
+                                                  unsigned num_channels,
+                                                  bool glc,
+                                                  bool can_speculate)
+{
+	/* Extract the element count (dword 2) and the stride, which the
+	 * descriptor stores in the upper 16 bits of dword 1. */
+	LLVMValueRef elem_count = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 2, 0), "");
+	LLVMValueRef stride = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 1, 0), "");
+	stride = LLVMBuildLShr(ctx->builder, stride, LLVMConstInt(ctx->i32, 16, 0), "");
+
+	/* new_elem_count = max(elem_count, stride), computed as a select. */
+	LLVMValueRef new_elem_count = LLVMBuildSelect(ctx->builder,
+	                                              LLVMBuildICmp(ctx->builder, LLVMIntUGT, elem_count, stride, ""),
+	                                              elem_count, stride, "");
+
+	LLVMValueRef new_rsrc = LLVMBuildInsertElement(ctx->builder, rsrc, new_elem_count,
+	                                               LLVMConstInt(ctx->i32, 2, 0), "");
+
+	return ac_build_buffer_load_common(ctx, new_rsrc, vindex, voffset,
+	                                   num_channels, glc, false,
+	                                   can_speculate, true);
+}
+
+/* Load a single 16-bit unsigned value through the typed-buffer path
+ * (llvm.amdgcn.tbuffer.load.i32 with DATA_FORMAT_16 / NUM_FORMAT_UINT)
+ * and truncate the 32-bit result to i16.
+ */
+LLVMValueRef
+ac_build_tbuffer_load_short(struct ac_llvm_context *ctx,
+			    LLVMValueRef rsrc,
+			    LLVMValueRef vindex,
+			    LLVMValueRef voffset,
+				LLVMValueRef soffset,
+				LLVMValueRef immoffset)
+{
+	const char *name = "llvm.amdgcn.tbuffer.load.i32";
+	LLVMTypeRef type = ctx->i32;
+	LLVMValueRef params[] = {
+				rsrc,
+				vindex,
+				voffset,
+				soffset,
+				immoffset,
+				LLVMConstInt(ctx->i32, V_008F0C_BUF_DATA_FORMAT_16, false),
+				LLVMConstInt(ctx->i32, V_008F0C_BUF_NUM_FORMAT_UINT, false),
+				ctx->i1false,	/* glc */
+				ctx->i1false,	/* slc */
+	};
+	LLVMValueRef res = ac_build_intrinsic(ctx, name, type, params, 9, 0);
+	/* The intrinsic returns i32; only the low 16 bits are meaningful. */
+	return LLVMBuildTrunc(ctx->builder, res, ctx->i16, "");
+}
+
/**
* Set range metadata on an instruction. This can only be used on load and
* call instructions. If you know an instruction can only produce the values
LLVMValueRef tl, trbl, args[2];
LLVMValueRef result;
- if (ctx->chip_class >= VI) {
+ if (HAVE_LLVM >= 0x0700) {
+ unsigned tl_lanes[4], trbl_lanes[4];
+
+ for (unsigned i = 0; i < 4; ++i) {
+ tl_lanes[i] = i & mask;
+ trbl_lanes[i] = (i & mask) + idx;
+ }
+
+ tl = ac_build_quad_swizzle(ctx, val,
+ tl_lanes[0], tl_lanes[1],
+ tl_lanes[2], tl_lanes[3]);
+ trbl = ac_build_quad_swizzle(ctx, val,
+ trbl_lanes[0], trbl_lanes[1],
+ trbl_lanes[2], trbl_lanes[3]);
+ } else if (ctx->chip_class >= VI) {
LLVMValueRef thread_id, tl_tid, trbl_tid;
thread_id = ac_get_thread_id(ctx);
tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, "");
trbl = LLVMBuildBitCast(ctx->builder, trbl, ctx->f32, "");
result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
+
+ if (HAVE_LLVM >= 0x0700) {
+ result = ac_build_intrinsic(ctx,
+ "llvm.amdgcn.wqm.f32", ctx->f32,
+ &result, 1, 0);
+ }
+
return result;
}
LLVMValueRef arg,
LLVMTypeRef dst_type)
{
- LLVMValueRef args[2] = {
+ const char *intrin_name;
+ LLVMTypeRef type;
+ LLVMValueRef highest_bit;
+ LLVMValueRef zero;
+
+ if (ac_get_elem_bits(ctx, LLVMTypeOf(arg)) == 64) {
+ intrin_name = "llvm.ctlz.i64";
+ type = ctx->i64;
+ highest_bit = LLVMConstInt(ctx->i64, 63, false);
+ zero = ctx->i64_0;
+ } else {
+ intrin_name = "llvm.ctlz.i32";
+ type = ctx->i32;
+ highest_bit = LLVMConstInt(ctx->i32, 31, false);
+ zero = ctx->i32_0;
+ }
+
+ LLVMValueRef params[2] = {
arg,
ctx->i1true,
};
- LLVMValueRef msb = ac_build_intrinsic(ctx, "llvm.ctlz.i32",
- dst_type, args, ARRAY_SIZE(args),
+
+ LLVMValueRef msb = ac_build_intrinsic(ctx, intrin_name, type,
+ params, 2,
AC_FUNC_ATTR_READNONE);
/* The HW returns the last bit index from MSB, but TGSI/NIR wants
* the index from LSB. Invert it by doing "31 - msb". */
- msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false),
- msb, "");
+ msb = LLVMBuildSub(ctx->builder, highest_bit, msb, "");
+ msb = LLVMBuildTruncOrBitCast(ctx->builder, msb, ctx->i32, "");
/* check for zero */
return LLVMBuildSelect(ctx->builder,
- LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg,
- LLVMConstInt(ctx->i32, 0, 0), ""),
+ LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, zero, ""),
LLVMConstInt(ctx->i32, -1, true), msb, "");
}
LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
{
- if (HAVE_LLVM >= 0x0500) {
- return ac_build_fmin(ctx, ac_build_fmax(ctx, value, ctx->f32_0),
- ctx->f32_1);
- }
-
- LLVMValueRef args[3] = {
- value,
- LLVMConstReal(ctx->f32, 0),
- LLVMConstReal(ctx->f32, 1),
- };
-
- return ac_build_intrinsic(ctx, "llvm.AMDGPU.clamp.", ctx->f32, args, 3,
- AC_FUNC_ATTR_READNONE |
- AC_FUNC_ATTR_LEGACY);
+ return ac_build_fmin(ctx, ac_build_fmax(ctx, value, ctx->f32_0),
+ ctx->f32_1);
}
void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
{
LLVMValueRef args[9];
- if (HAVE_LLVM >= 0x0500) {
- args[0] = LLVMConstInt(ctx->i32, a->target, 0);
- args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
+ args[0] = LLVMConstInt(ctx->i32, a->target, 0);
+ args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
- if (a->compr) {
- LLVMTypeRef i16 = LLVMInt16TypeInContext(ctx->context);
- LLVMTypeRef v2i16 = LLVMVectorType(i16, 2);
+ if (a->compr) {
+ LLVMTypeRef i16 = LLVMInt16TypeInContext(ctx->context);
+ LLVMTypeRef v2i16 = LLVMVectorType(i16, 2);
- args[2] = LLVMBuildBitCast(ctx->builder, a->out[0],
- v2i16, "");
- args[3] = LLVMBuildBitCast(ctx->builder, a->out[1],
- v2i16, "");
- args[4] = LLVMConstInt(ctx->i1, a->done, 0);
- args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
+ args[2] = LLVMBuildBitCast(ctx->builder, a->out[0],
+ v2i16, "");
+ args[3] = LLVMBuildBitCast(ctx->builder, a->out[1],
+ v2i16, "");
+ args[4] = LLVMConstInt(ctx->i1, a->done, 0);
+ args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
- ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16",
- ctx->voidt, args, 6, 0);
- } else {
- args[2] = a->out[0];
- args[3] = a->out[1];
- args[4] = a->out[2];
- args[5] = a->out[3];
- args[6] = LLVMConstInt(ctx->i1, a->done, 0);
- args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
-
- ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32",
- ctx->voidt, args, 8, 0);
- }
- return;
+ ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16",
+ ctx->voidt, args, 6, 0);
+ } else {
+ args[2] = a->out[0];
+ args[3] = a->out[1];
+ args[4] = a->out[2];
+ args[5] = a->out[3];
+ args[6] = LLVMConstInt(ctx->i1, a->done, 0);
+ args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
+
+ ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32",
+ ctx->voidt, args, 8, 0);
+ }
+}
+
+/* Emit a null export (SQ_EXP_NULL target, no channels enabled, DONE set).
+ * The color values are undef since nothing is written.
+ */
+void ac_build_export_null(struct ac_llvm_context *ctx)
+{
+	struct ac_export_args args;
+
+	args.enabled_channels = 0x0; /* enabled channels */
+	args.valid_mask = 1; /* whether the EXEC mask is valid */
+	args.done = 1; /* DONE bit */
+	args.target = V_008DFC_SQ_EXP_NULL;
+	args.compr = 0; /* COMPR flag (0 = 32-bit export) */
+	args.out[0] = LLVMGetUndef(ctx->f32); /* R */
+	args.out[1] = LLVMGetUndef(ctx->f32); /* G */
+	args.out[2] = LLVMGetUndef(ctx->f32); /* B */
+	args.out[3] = LLVMGetUndef(ctx->f32); /* A */
+
+	ac_build_export(ctx, &args);
+}
+
+/* Number of address coordinates (including array layer and/or sample
+ * index) needed for an image of the given dimensionality.
+ */
+static unsigned ac_num_coords(enum ac_image_dim dim)
+{
+	switch (dim) {
+	case ac_image_1d:
+		return 1;
+	case ac_image_2d:
+	case ac_image_1darray:
+		 return 2;
+	case ac_image_3d:
+	case ac_image_cube:
+	case ac_image_2darray:
+	case ac_image_2dmsaa:
+		return 3;
+	case ac_image_2darraymsaa:
+		return 4;
+	default:
+		unreachable("ac_num_coords: bad dim");
	}
+}
- args[0] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
- args[1] = LLVMConstInt(ctx->i32, a->valid_mask, 0);
- args[2] = LLVMConstInt(ctx->i32, a->done, 0);
- args[3] = LLVMConstInt(ctx->i32, a->target, 0);
- args[4] = LLVMConstInt(ctx->i32, a->compr, 0);
- memcpy(args + 5, a->out, sizeof(a->out[0]) * 4);
+/* Number of derivative values (ddx/ddy per coordinate) consumed by a
+ * sample.d-style instruction for the given image dimensionality.
+ * MSAA images never take derivatives.
+ */
+static unsigned ac_num_derivs(enum ac_image_dim dim)
+{
+	switch (dim) {
+	case ac_image_1d:
+	case ac_image_1darray:
+		return 2;
+	case ac_image_2d:
+	case ac_image_2darray:
+	case ac_image_cube:
+		return 4;
+	case ac_image_3d:
+		return 6;
+	case ac_image_2dmsaa:
+	case ac_image_2darraymsaa:
+	default:
+		unreachable("derivatives not supported");
+	}
+}
- ac_build_intrinsic(ctx, "llvm.SI.export", ctx->voidt, args, 9,
- AC_FUNC_ATTR_LEGACY);
+/* Map an ac_atomic_op to the sub-opcode string used in the
+ * llvm.amdgcn.image.atomic.* intrinsic names.
+ */
+static const char *get_atomic_name(enum ac_atomic_op op)
+{
+	switch (op) {
+	case ac_atomic_swap: return "swap";
+	case ac_atomic_add: return "add";
+	case ac_atomic_sub: return "sub";
+	case ac_atomic_smin: return "smin";
+	case ac_atomic_umin: return "umin";
+	case ac_atomic_smax: return "smax";
+	case ac_atomic_umax: return "umax";
+	case ac_atomic_and: return "and";
+	case ac_atomic_or: return "or";
+	case ac_atomic_xor: return "xor";
+	}
+	unreachable("bad atomic op");
}
-LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx,
- struct ac_image_args *a)
+/* LLVM 6 and older */
+static LLVMValueRef ac_build_image_opcode_llvm6(struct ac_llvm_context *ctx,
+ struct ac_image_args *a)
{
- LLVMValueRef args[11];
- unsigned num_args = 0;
+ LLVMValueRef args[16];
+ LLVMTypeRef retty = ctx->v4f32;
const char *name = NULL;
- char intr_name[128], type[64];
+ const char *atomic_subop = "";
+ char intr_name[128], coords_type[64];
bool sample = a->opcode == ac_image_sample ||
a->opcode == ac_image_gather4 ||
a->opcode == ac_image_get_lod;
+ bool atomic = a->opcode == ac_image_atomic ||
+ a->opcode == ac_image_atomic_cmpswap;
+ bool da = a->dim == ac_image_cube ||
+ a->dim == ac_image_1darray ||
+ a->dim == ac_image_2darray ||
+ a->dim == ac_image_2darraymsaa;
+ if (a->opcode == ac_image_get_lod)
+ da = false;
+
+ unsigned num_coords =
+ a->opcode != ac_image_get_resinfo ? ac_num_coords(a->dim) : 0;
+ LLVMValueRef addr;
+ unsigned num_addr = 0;
+
+ if (a->opcode == ac_image_get_lod) {
+ switch (a->dim) {
+ case ac_image_1darray:
+ num_coords = 1;
+ break;
+ case ac_image_2darray:
+ case ac_image_cube:
+ num_coords = 2;
+ break;
+ default:
+ break;
+ }
+ }
+ if (a->offset)
+ args[num_addr++] = ac_to_integer(ctx, a->offset);
+ if (a->bias)
+ args[num_addr++] = ac_to_integer(ctx, a->bias);
+ if (a->compare)
+ args[num_addr++] = ac_to_integer(ctx, a->compare);
+ if (a->derivs[0]) {
+ unsigned num_derivs = ac_num_derivs(a->dim);
+ for (unsigned i = 0; i < num_derivs; ++i)
+ args[num_addr++] = ac_to_integer(ctx, a->derivs[i]);
+ }
+ for (unsigned i = 0; i < num_coords; ++i)
+ args[num_addr++] = ac_to_integer(ctx, a->coords[i]);
+ if (a->lod)
+ args[num_addr++] = ac_to_integer(ctx, a->lod);
+
+ unsigned pad_goal = util_next_power_of_two(num_addr);
+ while (num_addr < pad_goal)
+ args[num_addr++] = LLVMGetUndef(ctx->i32);
+
+ addr = ac_build_gather_values(ctx, args, num_addr);
+
+ unsigned num_args = 0;
+ if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
+ args[num_args++] = a->data[0];
+ if (a->opcode == ac_image_atomic_cmpswap)
+ args[num_args++] = a->data[1];
+ }
+
+ unsigned coords_arg = num_args;
if (sample)
- args[num_args++] = ac_to_float(ctx, a->addr);
+ args[num_args++] = ac_to_float(ctx, addr);
else
- args[num_args++] = a->addr;
+ args[num_args++] = ac_to_integer(ctx, addr);
args[num_args++] = a->resource;
if (sample)
args[num_args++] = a->sampler;
- args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, 0);
- if (sample)
- args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, 0);
- args[num_args++] = ctx->i1false; /* glc */
- args[num_args++] = ctx->i1false; /* slc */
- args[num_args++] = ctx->i1false; /* lwe */
- args[num_args++] = LLVMConstInt(ctx->i1, a->da, 0);
+ if (!atomic) {
+ args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, 0);
+ if (sample)
+ args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, 0);
+ args[num_args++] = a->cache_policy & ac_glc ? ctx->i1true : ctx->i1false;
+ args[num_args++] = a->cache_policy & ac_slc ? ctx->i1true : ctx->i1false;
+ args[num_args++] = ctx->i1false; /* lwe */
+ args[num_args++] = LLVMConstInt(ctx->i1, da, 0);
+ } else {
+ args[num_args++] = ctx->i1false; /* r128 */
+ args[num_args++] = LLVMConstInt(ctx->i1, da, 0);
+ args[num_args++] = a->cache_policy & ac_slc ? ctx->i1true : ctx->i1false;
+ }
switch (a->opcode) {
case ac_image_sample:
case ac_image_load_mip:
name = "llvm.amdgcn.image.load.mip";
break;
+ case ac_image_store:
+ name = "llvm.amdgcn.image.store";
+ retty = ctx->voidt;
+ break;
+ case ac_image_store_mip:
+ name = "llvm.amdgcn.image.store.mip";
+ retty = ctx->voidt;
+ break;
+ case ac_image_atomic:
+ case ac_image_atomic_cmpswap:
+ name = "llvm.amdgcn.image.atomic.";
+ retty = ctx->i32;
+ if (a->opcode == ac_image_atomic_cmpswap) {
+ atomic_subop = "cmpswap";
+ } else {
+ atomic_subop = get_atomic_name(a->atomic);
+ }
+ break;
case ac_image_get_lod:
name = "llvm.amdgcn.image.getlod";
break;
unreachable("invalid image opcode");
}
- ac_build_type_name_for_intr(LLVMTypeOf(args[0]), type,
- sizeof(type));
+ ac_build_type_name_for_intr(LLVMTypeOf(args[coords_arg]), coords_type,
+ sizeof(coords_type));
- snprintf(intr_name, sizeof(intr_name), "%s%s%s%s.v4f32.%s.v8i32",
- name,
- a->compare ? ".c" : "",
- a->bias ? ".b" :
- a->lod ? ".l" :
- a->deriv ? ".d" :
- a->level_zero ? ".lz" : "",
- a->offset ? ".o" : "",
- type);
+ if (atomic) {
+ snprintf(intr_name, sizeof(intr_name), "llvm.amdgcn.image.atomic.%s.%s",
+ atomic_subop, coords_type);
+ } else {
+ bool lod_suffix =
+ a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
+
+ snprintf(intr_name, sizeof(intr_name), "%s%s%s%s.v4f32.%s.v8i32",
+ name,
+ a->compare ? ".c" : "",
+ a->bias ? ".b" :
+ lod_suffix ? ".l" :
+ a->derivs[0] ? ".d" :
+ a->level_zero ? ".lz" : "",
+ a->offset ? ".o" : "",
+ coords_type);
+ }
LLVMValueRef result =
- ac_build_intrinsic(ctx, intr_name,
- ctx->v4f32, args, num_args,
- AC_FUNC_ATTR_READNONE);
- if (!sample) {
+ ac_build_intrinsic(ctx, intr_name, retty, args, num_args,
+ a->attributes);
+ if (!sample && retty == ctx->v4f32) {
+ result = LLVMBuildBitCast(ctx->builder, result,
+ ctx->v4i32, "");
+ }
+ return result;
+}
+
+LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx,
+ struct ac_image_args *a)
+{
+ const char *overload[3] = { "", "", "" };
+ unsigned num_overloads = 0;
+ LLVMValueRef args[18];
+ unsigned num_args = 0;
+ enum ac_image_dim dim = a->dim;
+
+ assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 ||
+ !a->level_zero);
+ assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
+ a->opcode != ac_image_store_mip) ||
+ a->lod);
+ assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
+ (!a->compare && !a->offset));
+ assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
+ a->opcode == ac_image_get_lod) ||
+ !a->bias);
+ assert((a->bias ? 1 : 0) +
+ (a->lod ? 1 : 0) +
+ (a->level_zero ? 1 : 0) +
+ (a->derivs[0] ? 1 : 0) <= 1);
+
+ if (HAVE_LLVM < 0x0700)
+ return ac_build_image_opcode_llvm6(ctx, a);
+
+ if (a->opcode == ac_image_get_lod) {
+ switch (dim) {
+ case ac_image_1darray:
+ dim = ac_image_1d;
+ break;
+ case ac_image_2darray:
+ case ac_image_cube:
+ dim = ac_image_2d;
+ break;
+ default:
+ break;
+ }
+ }
+
+ bool sample = a->opcode == ac_image_sample ||
+ a->opcode == ac_image_gather4 ||
+ a->opcode == ac_image_get_lod;
+ bool atomic = a->opcode == ac_image_atomic ||
+ a->opcode == ac_image_atomic_cmpswap;
+ LLVMTypeRef coord_type = sample ? ctx->f32 : ctx->i32;
+
+ if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
+ args[num_args++] = a->data[0];
+ if (a->opcode == ac_image_atomic_cmpswap)
+ args[num_args++] = a->data[1];
+ }
+
+ if (!atomic)
+ args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, false);
+
+ if (a->offset)
+ args[num_args++] = ac_to_integer(ctx, a->offset);
+ if (a->bias) {
+ args[num_args++] = ac_to_float(ctx, a->bias);
+ overload[num_overloads++] = ".f32";
+ }
+ if (a->compare)
+ args[num_args++] = ac_to_float(ctx, a->compare);
+ if (a->derivs[0]) {
+ unsigned count = ac_num_derivs(dim);
+ for (unsigned i = 0; i < count; ++i)
+ args[num_args++] = ac_to_float(ctx, a->derivs[i]);
+ overload[num_overloads++] = ".f32";
+ }
+ unsigned num_coords =
+ a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0;
+ for (unsigned i = 0; i < num_coords; ++i)
+ args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");
+ if (a->lod)
+ args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");
+ overload[num_overloads++] = sample ? ".f32" : ".i32";
+
+ args[num_args++] = a->resource;
+ if (sample) {
+ args[num_args++] = a->sampler;
+ args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);
+ }
+
+ args[num_args++] = ctx->i32_0; /* texfailctrl */
+ args[num_args++] = LLVMConstInt(ctx->i32, a->cache_policy, false);
+
+ const char *name;
+ const char *atomic_subop = "";
+ switch (a->opcode) {
+ case ac_image_sample: name = "sample"; break;
+ case ac_image_gather4: name = "gather4"; break;
+ case ac_image_load: name = "load"; break;
+ case ac_image_load_mip: name = "load.mip"; break;
+ case ac_image_store: name = "store"; break;
+ case ac_image_store_mip: name = "store.mip"; break;
+ case ac_image_atomic:
+ name = "atomic.";
+ atomic_subop = get_atomic_name(a->atomic);
+ break;
+ case ac_image_atomic_cmpswap:
+ name = "atomic.";
+ atomic_subop = "cmpswap";
+ break;
+ case ac_image_get_lod: name = "getlod"; break;
+ case ac_image_get_resinfo: name = "getresinfo"; break;
+ default: unreachable("invalid image opcode");
+ }
+
+ const char *dimname;
+ switch (dim) {
+ case ac_image_1d: dimname = "1d"; break;
+ case ac_image_2d: dimname = "2d"; break;
+ case ac_image_3d: dimname = "3d"; break;
+ case ac_image_cube: dimname = "cube"; break;
+ case ac_image_1darray: dimname = "1darray"; break;
+ case ac_image_2darray: dimname = "2darray"; break;
+ case ac_image_2dmsaa: dimname = "2dmsaa"; break;
+ case ac_image_2darraymsaa: dimname = "2darraymsaa"; break;
+ default: unreachable("invalid dim");
+ }
+
+ bool lod_suffix =
+ a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
+ char intr_name[96];
+ snprintf(intr_name, sizeof(intr_name),
+ "llvm.amdgcn.image.%s%s" /* base name */
+ "%s%s%s" /* sample/gather modifiers */
+ ".%s.%s%s%s%s", /* dimension and type overloads */
+ name, atomic_subop,
+ a->compare ? ".c" : "",
+ a->bias ? ".b" :
+ lod_suffix ? ".l" :
+ a->derivs[0] ? ".d" :
+ a->level_zero ? ".lz" : "",
+ a->offset ? ".o" : "",
+ dimname,
+ atomic ? "i32" : "v4f32",
+ overload[0], overload[1], overload[2]);
+
+ LLVMTypeRef retty;
+ if (atomic)
+ retty = ctx->i32;
+ else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
+ retty = ctx->voidt;
+ else
+ retty = ctx->v4f32;
+
+ LLVMValueRef result =
+ ac_build_intrinsic(ctx, intr_name, retty, args, num_args,
+ a->attributes);
+ if (!sample && retty == ctx->v4f32) {
result = LLVMBuildBitCast(ctx->builder, result,
ctx->v4i32, "");
}
LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx,
LLVMValueRef args[2])
{
- if (HAVE_LLVM >= 0x0500) {
- LLVMTypeRef v2f16 =
- LLVMVectorType(LLVMHalfTypeInContext(ctx->context), 2);
- LLVMValueRef res =
- ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz",
- v2f16, args, 2,
- AC_FUNC_ATTR_READNONE);
- return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
- }
+ LLVMTypeRef v2f16 =
+ LLVMVectorType(LLVMHalfTypeInContext(ctx->context), 2);
- return ac_build_intrinsic(ctx, "llvm.SI.packf16", ctx->i32, args, 2,
- AC_FUNC_ATTR_READNONE |
- AC_FUNC_ATTR_LEGACY);
+ return ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", v2f16,
+ args, 2, AC_FUNC_ATTR_READNONE);
}
/* Upper 16 bits must be zero. */
width,
};
- if (HAVE_LLVM >= 0x0500) {
- return ac_build_intrinsic(ctx,
- is_signed ? "llvm.amdgcn.sbfe.i32" :
- "llvm.amdgcn.ubfe.i32",
- ctx->i32, args, 3,
- AC_FUNC_ATTR_READNONE);
- }
-
return ac_build_intrinsic(ctx,
- is_signed ? "llvm.AMDGPU.bfe.i32" :
- "llvm.AMDGPU.bfe.u32",
+ is_signed ? "llvm.amdgcn.sbfe.i32" :
+ "llvm.amdgcn.ubfe.i32",
ctx->i32, args, 3,
- AC_FUNC_ATTR_READNONE |
- AC_FUNC_ATTR_LEGACY);
+ AC_FUNC_ATTR_READNONE);
}
void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned simm16)
ctx->voidt, args, 1, 0);
}
-void ac_get_image_intr_name(const char *base_name,
- LLVMTypeRef data_type,
- LLVMTypeRef coords_type,
- LLVMTypeRef rsrc_type,
- char *out_name, unsigned out_len)
+/* fract(x) = x - floor(x), for 32- or 64-bit floats; \p bitsize selects
+ * the llvm.floor overload and result type.
+ */
+LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0,
+			    unsigned bitsize)
 {
-	char coords_type_name[8];
+	LLVMTypeRef type;
+	char *intr;
-	ac_build_type_name_for_intr(coords_type, coords_type_name,
-				    sizeof(coords_type_name));
+	if (bitsize == 32) {
+		intr = "llvm.floor.f32";
+		type = ctx->f32;
+	} else {
+		intr = "llvm.floor.f64";
+		type = ctx->f64;
+	}
-	char data_type_name[8];
-	char rsrc_type_name[8];
+	LLVMValueRef params[] = {
+		src0,
+	};
+	LLVMValueRef floor = ac_build_intrinsic(ctx, intr, type, params, 1,
+						AC_FUNC_ATTR_READNONE);
+	return LLVMBuildFSub(ctx->builder, src0, floor, "");
+}
- ac_build_type_name_for_intr(data_type, data_type_name,
- sizeof(data_type_name));
- ac_build_type_name_for_intr(rsrc_type, rsrc_type_name,
- sizeof(rsrc_type_name));
- snprintf(out_name, out_len, "%s.%s.%s.%s", base_name,
- data_type_name, coords_type_name, rsrc_type_name);
+/* Integer sign: returns 1 for src0 > 0, 0 for src0 == 0, -1 for
+ * src0 < 0, at the source's bit width (32 or 64).
+ */
+LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0,
+			    unsigned bitsize)
+{
+	LLVMValueRef cmp, val, zero, one;
+	LLVMTypeRef type;
+
+	if (bitsize == 32) {
+		type = ctx->i32;
+		zero = ctx->i32_0;
+		one = ctx->i32_1;
+	} else {
+		type = ctx->i64;
+		zero = ctx->i64_0;
+		one = ctx->i64_1;
+	}
+
+	/* Positive inputs collapse to 1; zero passes through unchanged. */
+	cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, src0, zero, "");
+	val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
+	/* Anything still negative collapses to -1. */
+	cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGE, val, zero, "");
+	val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstInt(type, -1, true), "");
+	return val;
}
-#define AC_EXP_TARGET (HAVE_LLVM >= 0x0500 ? 0 : 3)
-#define AC_EXP_OUT0 (HAVE_LLVM >= 0x0500 ? 2 : 5)
+/* Float sign: returns 1.0 for src0 > 0, src0 itself for +/-0, and -1.0
+ * for src0 < 0, at the source's bit width (32 or 64). Both compares are
+ * ordered, so a NaN input fails them and falls through to -1.0.
+ */
+LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src0,
+			    unsigned bitsize)
+{
+	LLVMValueRef cmp, val, zero, one;
+	LLVMTypeRef type;
+
+	if (bitsize == 32) {
+		type = ctx->f32;
+		zero = ctx->f32_0;
+		one = ctx->f32_1;
+	} else {
+		type = ctx->f64;
+		zero = ctx->f64_0;
+		one = ctx->f64_1;
+	}
+
+	/* Positive inputs collapse to 1.0; zero passes through unchanged. */
+	cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src0, zero, "");
+	val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
+	/* Remaining negatives (and NaN) collapse to -1.0. */
+	cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGE, val, zero, "");
+	val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstReal(type, -1.0), "");
+	return val;
+}
+
+#define AC_EXP_TARGET 0
+#define AC_EXP_ENABLED_CHANNELS 1
+#define AC_EXP_OUT0 2
enum ac_ir_type {
AC_IR_UNDEF,
return true;
}
-static bool ac_eliminate_duplicated_output(uint8_t *vs_output_param_offset,
+static bool ac_eliminate_duplicated_output(struct ac_llvm_context *ctx,
+ uint8_t *vs_output_param_offset,
uint32_t num_outputs,
struct ac_vs_exports *processed,
struct ac_vs_exp_inst *exp)
*/
struct ac_vs_exp_inst *match = &processed->exp[p];
+ /* Get current enabled channels mask. */
+ LLVMValueRef arg = LLVMGetOperand(match->inst, AC_EXP_ENABLED_CHANNELS);
+ unsigned enabled_channels = LLVMConstIntGetZExtValue(arg);
+
while (copy_back_channels) {
unsigned chan = u_bit_scan(©_back_channels);
LLVMSetOperand(match->inst, AC_EXP_OUT0 + chan,
exp->chan[chan].value);
match->chan[chan] = exp->chan[chan];
+
+ /* Update number of enabled channels because the original mask
+ * is not always 0xf.
+ */
+ enabled_channels |= (1 << chan);
+ LLVMSetOperand(match->inst, AC_EXP_ENABLED_CHANNELS,
+ LLVMConstInt(ctx->i32, enabled_channels, 0));
}
/* The PARAM export is duplicated. Kill it. */
/* Eliminate constant and duplicated PARAM exports. */
if (ac_eliminate_const_output(vs_output_param_offset,
num_outputs, &exp) ||
- ac_eliminate_duplicated_output(vs_output_param_offset,
+ ac_eliminate_duplicated_output(ctx,
+ vs_output_param_offset,
num_outputs, &exports,
&exp)) {
removed_any = true;
LLVMTypeRef dst_type,
LLVMValueRef src0)
{
+ unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
+ const char *intrin_name;
+ LLVMTypeRef type;
+ LLVMValueRef zero;
+ if (src0_bitsize == 64) {
+ intrin_name = "llvm.cttz.i64";
+ type = ctx->i64;
+ zero = ctx->i64_0;
+ } else {
+ intrin_name = "llvm.cttz.i32";
+ type = ctx->i32;
+ zero = ctx->i32_0;
+ }
+
LLVMValueRef params[2] = {
src0,
LLVMConstInt(ctx->i1, 1, false),
};
- LLVMValueRef lsb = ac_build_intrinsic(ctx, "llvm.cttz.i32", ctx->i32,
+ LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin_name, type,
params, 2,
AC_FUNC_ATTR_READNONE);
+ if (src0_bitsize == 64) {
+ lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, "");
+ }
+
/* TODO: We need an intrinsic to skip this conditional. */
/* Check for zero: */
return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder,
LLVMIntEQ, src0,
- ctx->i32_0, ""),
+ zero, ""),
LLVMConstInt(ctx->i32, -1, 0), lsb, "");
}
return LLVMPointerType(LLVMArrayType(elem_type, 0),
AC_CONST_ADDR_SPACE);
}
+
+/* Return a pointer type to an unsized array of elem_type in the 32-bit
+ * constant address space; falls back to the regular constant address
+ * space when 32-bit pointers are not in use.
+ */
+LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type)
+{
+ if (!HAVE_32BIT_POINTERS)
+ return ac_array_in_const_addr_space(elem_type);
+
+ return LLVMPointerType(LLVMArrayType(elem_type, 0),
+ AC_CONST_32BIT_ADDR_SPACE);
+}
+
+/* Return the innermost open control-flow entry, or NULL when the flow
+ * stack is empty.
+ */
+static struct ac_llvm_flow *
+get_current_flow(struct ac_llvm_context *ctx)
+{
+ if (ctx->flow_depth > 0)
+ return &ctx->flow[ctx->flow_depth - 1];
+ return NULL;
+}
+
+/* Walk the flow stack from the top and return the nearest enclosing
+ * loop (the first entry with a loop_entry_block), or NULL if none.
+ */
+static struct ac_llvm_flow *
+get_innermost_loop(struct ac_llvm_context *ctx)
+{
+ for (unsigned i = ctx->flow_depth; i > 0; --i) {
+ if (ctx->flow[i - 1].loop_entry_block)
+ return &ctx->flow[i - 1];
+ }
+ return NULL;
+}
+
+/* Push a cleared entry onto the flow stack, doubling the backing array
+ * as needed (initial capacity AC_LLVM_INITIAL_CF_DEPTH).
+ */
+static struct ac_llvm_flow *
+push_flow(struct ac_llvm_context *ctx)
+{
+ struct ac_llvm_flow *flow;
+
+ if (ctx->flow_depth >= ctx->flow_depth_max) {
+ unsigned new_max = MAX2(ctx->flow_depth << 1,
+ AC_LLVM_INITIAL_CF_DEPTH);
+
+ /* NOTE(review): realloc result is unchecked; on OOM this leaks
+ * the old array and dereferences NULL below. */
+ ctx->flow = realloc(ctx->flow, new_max * sizeof(*ctx->flow));
+ ctx->flow_depth_max = new_max;
+ }
+
+ flow = &ctx->flow[ctx->flow_depth];
+ ctx->flow_depth++;
+
+ flow->next_block = NULL;
+ flow->loop_entry_block = NULL;
+ return flow;
+}
+
+/* Rename a basic block to "<base><label_id>" for readable IR dumps. */
+static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base,
+ int label_id)
+{
+ char buf[32];
+ snprintf(buf, sizeof(buf), "%s%d", base, label_id);
+ LLVMSetValueName(LLVMBasicBlockAsValue(bb), buf);
+}
+
+/* Append a basic block at the level of the parent flow.
+ *
+ * When nested, the block is inserted before the parent flow's
+ * next_block so blocks stay in source order; at the outermost level it
+ * is appended to the current function.
+ */
+static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx,
+ const char *name)
+{
+ assert(ctx->flow_depth >= 1);
+
+ if (ctx->flow_depth >= 2) {
+ struct ac_llvm_flow *flow = &ctx->flow[ctx->flow_depth - 2];
+
+ return LLVMInsertBasicBlockInContext(ctx->context,
+ flow->next_block, name);
+ }
+
+ LLVMValueRef main_fn =
+ LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder));
+ return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name);
+}
+
+/* Emit a branch to the given default target for the current block if
+ * applicable -- that is, if the current block does not already contain a
+ * branch from a break or continue.
+ */
+static void emit_default_branch(LLVMBuilderRef builder,
+ LLVMBasicBlockRef target)
+{
+ if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder)))
+ LLVMBuildBr(builder, target);
+}
+
+/* Open a loop: push a flow entry, create the loop header ("loop<N>")
+ * and exit ("ENDLOOP") blocks, branch into the header and continue
+ * emitting there. Must be matched by ac_build_endloop().
+ */
+void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id)
+{
+ struct ac_llvm_flow *flow = push_flow(ctx);
+ flow->loop_entry_block = append_basic_block(ctx, "LOOP");
+ flow->next_block = append_basic_block(ctx, "ENDLOOP");
+ set_basicblock_name(flow->loop_entry_block, "loop", label_id);
+ LLVMBuildBr(ctx->builder, flow->loop_entry_block);
+ LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block);
+}
+
+/* Branch to the exit block of the innermost loop.
+ * NOTE(review): assumes an enclosing loop exists; get_innermost_loop()
+ * returns NULL otherwise and this would crash.
+ */
+void ac_build_break(struct ac_llvm_context *ctx)
+{
+ struct ac_llvm_flow *flow = get_innermost_loop(ctx);
+ LLVMBuildBr(ctx->builder, flow->next_block);
+}
+
+/* Branch back to the header block of the innermost loop.
+ * NOTE(review): same NULL-flow assumption as ac_build_break().
+ */
+void ac_build_continue(struct ac_llvm_context *ctx)
+{
+ struct ac_llvm_flow *flow = get_innermost_loop(ctx);
+ LLVMBuildBr(ctx->builder, flow->loop_entry_block);
+}
+
+/* Start the else branch of the current if: terminate the then-block
+ * with a fall-through branch to a fresh ENDIF block, move the builder
+ * into the pending ELSE block (renamed "else<N>"), and make ENDIF the
+ * flow's new join target.
+ */
+void ac_build_else(struct ac_llvm_context *ctx, int label_id)
+{
+ struct ac_llvm_flow *current_branch = get_current_flow(ctx);
+ LLVMBasicBlockRef endif_block;
+
+ assert(!current_branch->loop_entry_block);
+
+ endif_block = append_basic_block(ctx, "ENDIF");
+ emit_default_branch(ctx->builder, endif_block);
+
+ LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
+ set_basicblock_name(current_branch->next_block, "else", label_id);
+
+ current_branch->next_block = endif_block;
+}
+
+/* Close the current if/else: fall through into the join block (renamed
+ * "endif<N>"), continue emitting there, and pop the flow entry.
+ */
+void ac_build_endif(struct ac_llvm_context *ctx, int label_id)
+{
+ struct ac_llvm_flow *current_branch = get_current_flow(ctx);
+
+ assert(!current_branch->loop_entry_block);
+
+ emit_default_branch(ctx->builder, current_branch->next_block);
+ LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
+ set_basicblock_name(current_branch->next_block, "endif", label_id);
+
+ ctx->flow_depth--;
+}
+
+/* Close the current loop: emit the back-edge to the loop header unless
+ * the block already ends in a break/continue branch, then continue in
+ * the exit block (renamed "endloop<N>") and pop the flow entry.
+ */
+void ac_build_endloop(struct ac_llvm_context *ctx, int label_id)
+{
+ struct ac_llvm_flow *current_loop = get_current_flow(ctx);
+
+ assert(current_loop->loop_entry_block);
+
+ emit_default_branch(ctx->builder, current_loop->loop_entry_block);
+
+ LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block);
+ set_basicblock_name(current_loop->next_block, "endloop", label_id);
+ ctx->flow_depth--;
+}
+
+/* Shared tail of ac_build_if/ac_build_uif: push a flow entry, create
+ * "if<N>" and pending ELSE blocks, emit the conditional branch and
+ * position the builder in the then-block. The ELSE block doubles as the
+ * join target until ac_build_else() replaces it.
+ */
+static void if_cond_emit(struct ac_llvm_context *ctx, LLVMValueRef cond,
+ int label_id)
+{
+ struct ac_llvm_flow *flow = push_flow(ctx);
+ LLVMBasicBlockRef if_block;
+
+ if_block = append_basic_block(ctx, "IF");
+ flow->next_block = append_basic_block(ctx, "ELSE");
+ set_basicblock_name(if_block, "if", label_id);
+ LLVMBuildCondBr(ctx->builder, cond, if_block, flow->next_block);
+ LLVMPositionBuilderAtEnd(ctx->builder, if_block);
+}
+
+/* Open an if whose condition is a float: true when value != 0.0
+ * (unordered compare, so NaN also counts as true).
+ */
+void ac_build_if(struct ac_llvm_context *ctx, LLVMValueRef value,
+ int label_id)
+{
+ LLVMValueRef cond = LLVMBuildFCmp(ctx->builder, LLVMRealUNE,
+ value, ctx->f32_0, "");
+ if_cond_emit(ctx, cond, label_id);
+}
+
+/* Open an if whose condition is an integer (value bitcast to int and
+ * compared != 0).
+ */
+void ac_build_uif(struct ac_llvm_context *ctx, LLVMValueRef value,
+ int label_id)
+{
+ LLVMValueRef cond = LLVMBuildICmp(ctx->builder, LLVMIntNE,
+ ac_to_integer(ctx, value),
+ ctx->i32_0, "");
+ if_cond_emit(ctx, cond, label_id);
+}
+
+/* Create an alloca of the given type in the function's entry block (so
+ * LLVM can promote it to registers), zero-initialize it with a store at
+ * the current insertion point, and return the pointer.
+ */
+LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, LLVMTypeRef type,
+ const char *name)
+{
+ LLVMBuilderRef builder = ac->builder;
+ LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder);
+ LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
+ LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function);
+ LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block);
+ /* Temporary builder so the alloca lands before any entry-block code. */
+ LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context);
+ LLVMValueRef res;
+
+ if (first_instr) {
+ LLVMPositionBuilderBefore(first_builder, first_instr);
+ } else {
+ LLVMPositionBuilderAtEnd(first_builder, first_block);
+ }
+
+ res = LLVMBuildAlloca(first_builder, type, name);
+ LLVMBuildStore(builder, LLVMConstNull(type), res);
+
+ LLVMDisposeBuilder(first_builder);
+
+ return res;
+}
+
+/* Like ac_build_alloca(), but the contents end up undefined.
+ * NOTE(review): this emits a zero store (from ac_build_alloca) that is
+ * immediately overwritten by an undef store — redundant but harmless.
+ */
+LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac,
+ LLVMTypeRef type, const char *name)
+{
+ LLVMValueRef ptr = ac_build_alloca(ac, type, name);
+ LLVMBuildStore(ac->builder, LLVMGetUndef(type), ptr);
+ return ptr;
+}
+
+/* Bitcast a pointer to point at a different type while keeping its
+ * address space.
+ */
+LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr,
+ LLVMTypeRef type)
+{
+ int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
+ return LLVMBuildBitCast(ctx->builder, ptr,
+ LLVMPointerType(type, addr_space), "");
+}
+
+/* Reduce a vector to its first "count" components (count == 1 yields a
+ * scalar via extractelement). Vectors wider than 4 are only supported
+ * up to count == 4 (masks table has 4 entries).
+ */
+LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value,
+ unsigned count)
+{
+ unsigned num_components = ac_get_llvm_num_components(value);
+ if (count == num_components)
+ return value;
+
+ LLVMValueRef masks[] = {
+ LLVMConstInt(ctx->i32, 0, false), LLVMConstInt(ctx->i32, 1, false),
+ LLVMConstInt(ctx->i32, 2, false), LLVMConstInt(ctx->i32, 3, false)};
+
+ if (count == 1)
+ return LLVMBuildExtractElement(ctx->builder, value, masks[0],
+ "");
+
+ LLVMValueRef swizzle = LLVMConstVector(masks, count);
+ return LLVMBuildShuffleVector(ctx->builder, value, value, swizzle, "");
+}
+
+/* Extract the bitfield (param >> rshift) & ((1 << bitwidth) - 1) from a
+ * 32-bit SGPR param; the mask is skipped when the field reaches bit 31.
+ */
+LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param,
+ unsigned rshift, unsigned bitwidth)
+{
+ LLVMValueRef value = param;
+ if (rshift)
+ value = LLVMBuildLShr(ctx->builder, value,
+ LLVMConstInt(ctx->i32, rshift, false), "");
+
+ if (rshift + bitwidth < 32) {
+ unsigned mask = (1 << bitwidth) - 1;
+ value = LLVMBuildAnd(ctx->builder, value,
+ LLVMConstInt(ctx->i32, mask, false), "");
+ }
+ return value;
+}
+
+/* Adjust the sample index according to FMASK.
+ *
+ * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
+ * which is the identity mapping. Each nibble says which physical sample
+ * should be fetched to get that sample.
+ *
+ * For example, 0x11111100 means there are only 2 samples stored and
+ * the second sample covers 3/4 of the pixel. When reading samples 0
+ * and 1, return physical sample 0 (determined by the first two 0s
+ * in FMASK), otherwise return physical sample 1.
+ *
+ * The sample index should be adjusted as follows:
+ * addr[sample_index] = (fmask >> (addr[sample_index] * 4)) & 0xF;
+ */
+void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask,
+ LLVMValueRef *addr, bool is_array_tex)
+{
+ /* Load the x component of the FMASK value at the pixel's coords. */
+ struct ac_image_args fmask_load = {};
+ fmask_load.opcode = ac_image_load;
+ fmask_load.resource = fmask;
+ fmask_load.dmask = 0xf;
+ fmask_load.dim = is_array_tex ? ac_image_2darray : ac_image_2d;
+
+ fmask_load.coords[0] = addr[0];
+ fmask_load.coords[1] = addr[1];
+ if (is_array_tex)
+ fmask_load.coords[2] = addr[2];
+
+ LLVMValueRef fmask_value = ac_build_image_opcode(ac, &fmask_load);
+ fmask_value = LLVMBuildExtractElement(ac->builder, fmask_value,
+ ac->i32_0, "");
+
+ /* Apply the formula. */
+ unsigned sample_chan = is_array_tex ? 3 : 2;
+ LLVMValueRef final_sample;
+ final_sample = LLVMBuildMul(ac->builder, addr[sample_chan],
+ LLVMConstInt(ac->i32, 4, 0), "");
+ final_sample = LLVMBuildLShr(ac->builder, fmask_value, final_sample, "");
+ /* Mask the sample index by 0x7, because 0x8 means an unknown value
+ * with EQAA, so those will map to 0. */
+ final_sample = LLVMBuildAnd(ac->builder, final_sample,
+ LLVMConstInt(ac->i32, 0x7, 0), "");
+
+ /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
+ * resource descriptor is 0 (invalid).
+ */
+ LLVMValueRef tmp;
+ tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, "");
+ tmp = LLVMBuildExtractElement(ac->builder, tmp, ac->i32_1, "");
+ tmp = LLVMBuildICmp(ac->builder, LLVMIntNE, tmp, ac->i32_0, "");
+
+ /* Replace the MSAA sample index. */
+ addr[sample_chan] = LLVMBuildSelect(ac->builder, tmp, final_sample,
+ addr[sample_chan], "");
+}
+
+/* Emit a single 32-bit readlane/readfirstlane, with an optimization
+ * barrier so the compiler does not assume the value is uniform.
+ */
+static LLVMValueRef
+_ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
+{
+ ac_build_optimization_barrier(ctx, &src);
+ return ac_build_intrinsic(ctx,
+ lane == NULL ? "llvm.amdgcn.readfirstlane" : "llvm.amdgcn.readlane",
+ LLVMTypeOf(src), (LLVMValueRef []) {
+ src, lane },
+ lane == NULL ? 1 : 2,
+ AC_FUNC_ATTR_READNONE |
+ AC_FUNC_ATTR_CONVERGENT);
+}
+
+/**
+ * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic.
+ *
+ * Values wider than 32 bits are split into 32-bit chunks (so the width
+ * must be a multiple of 32) and read lane-by-lane, then reassembled and
+ * bitcast back to the original type.
+ *
+ * @param ctx
+ * @param src
+ * @param lane - id of the lane or NULL for the first active lane
+ * @return value of the lane
+ */
+LLVMValueRef
+ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
+{
+ LLVMTypeRef src_type = LLVMTypeOf(src);
+ src = ac_to_integer(ctx, src);
+ unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
+ LLVMValueRef ret;
+
+ if (bits == 32) {
+ ret = _ac_build_readlane(ctx, src, lane);
+ } else {
+ assert(bits % 32 == 0);
+ LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
+ LLVMValueRef src_vector =
+ LLVMBuildBitCast(ctx->builder, src, vec_type, "");
+ ret = LLVMGetUndef(vec_type);
+ for (unsigned i = 0; i < bits / 32; i++) {
+ src = LLVMBuildExtractElement(ctx->builder, src_vector,
+ LLVMConstInt(ctx->i32, i, 0), "");
+ LLVMValueRef ret_comp = _ac_build_readlane(ctx, src, lane);
+ ret = LLVMBuildInsertElement(ctx->builder, ret, ret_comp,
+ LLVMConstInt(ctx->i32, i, 0), "");
+ }
+ }
+ return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
+}
+
+/* Write "value" into "src" on the lane whose id equals "lane"; all
+ * other lanes keep src. Emulated with a thread-id compare + select.
+ */
+LLVMValueRef
+ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value, LLVMValueRef lane)
+{
+ /* TODO: Use the actual instruction when LLVM adds an intrinsic for it.
+ */
+ LLVMValueRef pred = LLVMBuildICmp(ctx->builder, LLVMIntEQ, lane,
+ ac_get_thread_id(ctx), "");
+ return LLVMBuildSelect(ctx->builder, pred, value, src, "");
+}
+
+/* Count the bits of the 64-bit "mask" that belong to lanes below the
+ * current one, by splitting it into two 32-bit halves and chaining
+ * mbcnt.lo into mbcnt.hi.
+ */
+LLVMValueRef
+ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask)
+{
+ LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask,
+ LLVMVectorType(ctx->i32, 2),
+ "");
+ LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec,
+ ctx->i32_0, "");
+ LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec,
+ ctx->i32_1, "");
+ LLVMValueRef val =
+ ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
+ (LLVMValueRef []) { mask_lo, ctx->i32_0 },
+ 2, AC_FUNC_ATTR_READNONE);
+ val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32,
+ (LLVMValueRef []) { mask_hi, val },
+ 2, AC_FUNC_ATTR_READNONE);
+ return val;
+}
+
+/* DPP control encodings; the _-prefixed values are base opcodes that
+ * take an immediate operand via the helpers below.
+ */
+enum dpp_ctrl {
+ _dpp_quad_perm = 0x000,
+ _dpp_row_sl = 0x100,
+ _dpp_row_sr = 0x110,
+ _dpp_row_rr = 0x120,
+ dpp_wf_sl1 = 0x130,
+ dpp_wf_rl1 = 0x134,
+ dpp_wf_sr1 = 0x138,
+ dpp_wf_rr1 = 0x13C,
+ dpp_row_mirror = 0x140,
+ dpp_row_half_mirror = 0x141,
+ dpp_row_bcast15 = 0x142,
+ dpp_row_bcast31 = 0x143
+};
+
+/* Encode a quad permutation: each of the 4 lane selectors (0..3) takes
+ * 2 bits of the control value.
+ */
+static inline enum dpp_ctrl
+dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3)
+{
+ assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4);
+ return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6);
+}
+
+/* Encode a row shift-left by 1..15 lanes. */
+static inline enum dpp_ctrl
+dpp_row_sl(unsigned amount)
+{
+ assert(amount > 0 && amount < 16);
+ return _dpp_row_sl | amount;
+}
+
+/* Encode a row shift-right by 1..15 lanes. */
+static inline enum dpp_ctrl
+dpp_row_sr(unsigned amount)
+{
+ assert(amount > 0 && amount < 16);
+ return _dpp_row_sr | amount;
+}
+
+/* Emit one 32-bit llvm.amdgcn.update.dpp.i32: lanes disabled by
+ * row_mask/bank_mask (or inactive, when !bound_ctrl) read from "old".
+ */
+static LLVMValueRef
+_ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
+ enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
+ bool bound_ctrl)
+{
+ return ac_build_intrinsic(ctx, "llvm.amdgcn.update.dpp.i32",
+ LLVMTypeOf(old),
+ (LLVMValueRef[]) {
+ old, src,
+ LLVMConstInt(ctx->i32, dpp_ctrl, 0),
+ LLVMConstInt(ctx->i32, row_mask, 0),
+ LLVMConstInt(ctx->i32, bank_mask, 0),
+ LLVMConstInt(ctx->i1, bound_ctrl, 0) },
+ 6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
+}
+
+/* Type-generic DPP: values wider than 32 bits (width must be a
+ * multiple of 32) are split into i32 chunks, moved chunk-by-chunk, and
+ * reassembled; the result is bitcast back to src's original type.
+ */
+static LLVMValueRef
+ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
+ enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
+ bool bound_ctrl)
+{
+ LLVMTypeRef src_type = LLVMTypeOf(src);
+ src = ac_to_integer(ctx, src);
+ old = ac_to_integer(ctx, old);
+ unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
+ LLVMValueRef ret;
+ if (bits == 32) {
+ ret = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask,
+ bank_mask, bound_ctrl);
+ } else {
+ assert(bits % 32 == 0);
+ LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
+ LLVMValueRef src_vector =
+ LLVMBuildBitCast(ctx->builder, src, vec_type, "");
+ LLVMValueRef old_vector =
+ LLVMBuildBitCast(ctx->builder, old, vec_type, "");
+ ret = LLVMGetUndef(vec_type);
+ for (unsigned i = 0; i < bits / 32; i++) {
+ src = LLVMBuildExtractElement(ctx->builder, src_vector,
+ LLVMConstInt(ctx->i32, i,
+ 0), "");
+ old = LLVMBuildExtractElement(ctx->builder, old_vector,
+ LLVMConstInt(ctx->i32, i,
+ 0), "");
+ LLVMValueRef ret_comp = _ac_build_dpp(ctx, old, src,
+ dpp_ctrl,
+ row_mask,
+ bank_mask,
+ bound_ctrl);
+ ret = LLVMBuildInsertElement(ctx->builder, ret,
+ ret_comp,
+ LLVMConstInt(ctx->i32, i,
+ 0), "");
+ }
+ }
+ return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
+}
+
+/* Encode a ds_swizzle bitmode pattern: out_lane =
+ * ((lane & and_mask) | or_mask) ^ xor_mask, each field 5 bits wide.
+ */
+static inline unsigned
+ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
+{
+ assert(and_mask < 32 && or_mask < 32 && xor_mask < 32);
+ return and_mask | (or_mask << 5) | (xor_mask << 10);
+}
+
+/* Emit one 32-bit llvm.amdgcn.ds.swizzle with the given pattern mask. */
+static LLVMValueRef
+_ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
+{
+ return ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle",
+ LLVMTypeOf(src), (LLVMValueRef []) {
+ src, LLVMConstInt(ctx->i32, mask, 0) },
+ 2, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
+}
+
+/* Type-generic ds_swizzle: like ac_build_dpp, wider-than-32-bit values
+ * (width a multiple of 32) are swizzled in 32-bit chunks and the result
+ * is bitcast back to the original type.
+ */
+LLVMValueRef
+ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
+{
+ LLVMTypeRef src_type = LLVMTypeOf(src);
+ src = ac_to_integer(ctx, src);
+ unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
+ LLVMValueRef ret;
+ if (bits == 32) {
+ ret = _ac_build_ds_swizzle(ctx, src, mask);
+ } else {
+ assert(bits % 32 == 0);
+ LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
+ LLVMValueRef src_vector =
+ LLVMBuildBitCast(ctx->builder, src, vec_type, "");
+ ret = LLVMGetUndef(vec_type);
+ for (unsigned i = 0; i < bits / 32; i++) {
+ src = LLVMBuildExtractElement(ctx->builder, src_vector,
+ LLVMConstInt(ctx->i32, i,
+ 0), "");
+ LLVMValueRef ret_comp = _ac_build_ds_swizzle(ctx, src,
+ mask);
+ ret = LLVMBuildInsertElement(ctx->builder, ret,
+ ret_comp,
+ LLVMConstInt(ctx->i32, i,
+ 0), "");
+ }
+ }
+ return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
+}
+
+/* Wrap "src" in the type-suffixed llvm.amdgcn.wwm intrinsic so the
+ * surrounding computation executes in whole-wave mode.
+ */
+static LLVMValueRef
+ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src)
+{
+ char name[32], type[8];
+ ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
+ snprintf(name, sizeof(name), "llvm.amdgcn.wwm.%s", type);
+ return ac_build_intrinsic(ctx, name, LLVMTypeOf(src),
+ (LLVMValueRef []) { src }, 1,
+ AC_FUNC_ATTR_READNONE);
+}
+
+/* Emit llvm.amdgcn.set.inactive: active lanes keep "src", inactive
+ * lanes read "inactive" (used to seed reduction identities). Operands
+ * are converted to integers for the intrinsic and the result is cast
+ * back to src's original type.
+ */
+static LLVMValueRef
+ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src,
+ LLVMValueRef inactive)
+{
+ char name[33], type[8];
+ LLVMTypeRef src_type = LLVMTypeOf(src);
+ src = ac_to_integer(ctx, src);
+ inactive = ac_to_integer(ctx, inactive);
+ ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
+ snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type);
+ LLVMValueRef ret =
+ ac_build_intrinsic(ctx, name,
+ LLVMTypeOf(src), (LLVMValueRef []) {
+ src, inactive }, 2,
+ AC_FUNC_ATTR_READNONE |
+ AC_FUNC_ATTR_CONVERGENT);
+ return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
+}
+
+/* Return the identity element for the given reduction op (the value
+ * that leaves any operand unchanged), as a 32-bit constant when
+ * type_size == 4, otherwise as the 64-bit equivalent.
+ */
+static LLVMValueRef
+get_reduction_identity(struct ac_llvm_context *ctx, nir_op op, unsigned type_size)
+{
+ if (type_size == 4) {
+ switch (op) {
+ case nir_op_iadd: return ctx->i32_0;
+ case nir_op_fadd: return ctx->f32_0;
+ case nir_op_imul: return ctx->i32_1;
+ case nir_op_fmul: return ctx->f32_1;
+ case nir_op_imin: return LLVMConstInt(ctx->i32, INT32_MAX, 0);
+ case nir_op_umin: return LLVMConstInt(ctx->i32, UINT32_MAX, 0);
+ case nir_op_fmin: return LLVMConstReal(ctx->f32, INFINITY);
+ case nir_op_imax: return LLVMConstInt(ctx->i32, INT32_MIN, 0);
+ case nir_op_umax: return ctx->i32_0;
+ case nir_op_fmax: return LLVMConstReal(ctx->f32, -INFINITY);
+ case nir_op_iand: return LLVMConstInt(ctx->i32, -1, 0);
+ case nir_op_ior: return ctx->i32_0;
+ case nir_op_ixor: return ctx->i32_0;
+ default:
+ unreachable("bad reduction intrinsic");
+ }
+ } else { /* type_size == 8 bytes (64 bit) */
+ switch (op) {
+ case nir_op_iadd: return ctx->i64_0;
+ case nir_op_fadd: return ctx->f64_0;
+ case nir_op_imul: return ctx->i64_1;
+ case nir_op_fmul: return ctx->f64_1;
+ case nir_op_imin: return LLVMConstInt(ctx->i64, INT64_MAX, 0);
+ case nir_op_umin: return LLVMConstInt(ctx->i64, UINT64_MAX, 0);
+ case nir_op_fmin: return LLVMConstReal(ctx->f64, INFINITY);
+ case nir_op_imax: return LLVMConstInt(ctx->i64, INT64_MIN, 0);
+ case nir_op_umax: return ctx->i64_0;
+ case nir_op_fmax: return LLVMConstReal(ctx->f64, -INFINITY);
+ case nir_op_iand: return LLVMConstInt(ctx->i64, -1, 0);
+ case nir_op_ior: return ctx->i64_0;
+ case nir_op_ixor: return ctx->i64_0;
+ default:
+ unreachable("bad reduction intrinsic");
+ }
+ }
+}
+
+/* Combine lhs and rhs with the scalar ALU operation matching the NIR
+ * reduction op; min/max use compare+select for integers and the LLVM
+ * minnum/maxnum intrinsics (f32 or f64 by operand size) for floats.
+ */
+static LLVMValueRef
+ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs, nir_op op)
+{
+ bool _64bit = ac_get_type_size(LLVMTypeOf(lhs)) == 8;
+ switch (op) {
+ case nir_op_iadd: return LLVMBuildAdd(ctx->builder, lhs, rhs, "");
+ case nir_op_fadd: return LLVMBuildFAdd(ctx->builder, lhs, rhs, "");
+ case nir_op_imul: return LLVMBuildMul(ctx->builder, lhs, rhs, "");
+ case nir_op_fmul: return LLVMBuildFMul(ctx->builder, lhs, rhs, "");
+ case nir_op_imin: return LLVMBuildSelect(ctx->builder,
+ LLVMBuildICmp(ctx->builder, LLVMIntSLT, lhs, rhs, ""),
+ lhs, rhs, "");
+ case nir_op_umin: return LLVMBuildSelect(ctx->builder,
+ LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, ""),
+ lhs, rhs, "");
+ case nir_op_fmin: return ac_build_intrinsic(ctx,
+ _64bit ? "llvm.minnum.f64" : "llvm.minnum.f32",
+ _64bit ? ctx->f64 : ctx->f32,
+ (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE);
+ case nir_op_imax: return LLVMBuildSelect(ctx->builder,
+ LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, ""),
+ lhs, rhs, "");
+ case nir_op_umax: return LLVMBuildSelect(ctx->builder,
+ LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, ""),
+ lhs, rhs, "");
+ case nir_op_fmax: return ac_build_intrinsic(ctx,
+ _64bit ? "llvm.maxnum.f64" : "llvm.maxnum.f32",
+ _64bit ? ctx->f64 : ctx->f32,
+ (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE);
+ case nir_op_iand: return LLVMBuildAnd(ctx->builder, lhs, rhs, "");
+ case nir_op_ior: return LLVMBuildOr(ctx->builder, lhs, rhs, "");
+ case nir_op_ixor: return LLVMBuildXor(ctx->builder, lhs, rhs, "");
+ default:
+ unreachable("bad reduction intrinsic");
+ }
+}
+
+/* TODO: add inclusive and exclusive scan functions for SI chip class. */
+/* Wave-wide scan built from DPP row shifts and broadcasts (VI+ style):
+ * shifts of 1/2/3 combine "src", then 4/8 combine partial results
+ * within a row, and bcast15/bcast31 propagate row totals across rows.
+ * "identity" fills lanes shifted in from outside the row.
+ */
+static LLVMValueRef
+ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValueRef identity)
+{
+ LLVMValueRef result, tmp;
+ result = src;
+ tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
+ result = ac_build_alu_op(ctx, result, tmp, op);
+ tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
+ result = ac_build_alu_op(ctx, result, tmp, op);
+ tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
+ result = ac_build_alu_op(ctx, result, tmp, op);
+ tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
+ result = ac_build_alu_op(ctx, result, tmp, op);
+ tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
+ result = ac_build_alu_op(ctx, result, tmp, op);
+ tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
+ result = ac_build_alu_op(ctx, result, tmp, op);
+ tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
+ result = ac_build_alu_op(ctx, result, tmp, op);
+ return result;
+}
+
+/* Inclusive wave scan: seed inactive lanes with the op's identity, run
+ * the DPP scan, and wrap the result in WWM so all lanes participate.
+ */
+LLVMValueRef
+ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
+{
+ ac_build_optimization_barrier(ctx, &src);
+ LLVMValueRef result;
+ LLVMValueRef identity = get_reduction_identity(ctx, op,
+ ac_get_type_size(LLVMTypeOf(src)));
+ result = LLVMBuildBitCast(ctx->builder,
+ ac_build_set_inactive(ctx, src, identity),
+ LLVMTypeOf(identity), "");
+ result = ac_build_scan(ctx, op, result, identity);
+
+ return ac_build_wwm(ctx, result);
+}
+
+/* Exclusive wave scan: like the inclusive variant, but first shift the
+ * whole wavefront right by one lane (dpp_wf_sr1) so each lane's result
+ * excludes its own input.
+ */
+LLVMValueRef
+ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
+{
+ ac_build_optimization_barrier(ctx, &src);
+ LLVMValueRef result;
+ LLVMValueRef identity = get_reduction_identity(ctx, op,
+ ac_get_type_size(LLVMTypeOf(src)));
+ result = LLVMBuildBitCast(ctx->builder,
+ ac_build_set_inactive(ctx, src, identity),
+ LLVMTypeOf(identity), "");
+ result = ac_build_dpp(ctx, identity, result, dpp_wf_sr1, 0xf, 0xf, false);
+ result = ac_build_scan(ctx, op, result, identity);
+
+ return ac_build_wwm(ctx, result);
+}
+
+/* Reduce "src" with "op" across clusters of cluster_size lanes
+ * (1, 2, 4, ..., 64) using a butterfly of quad swizzles, then DPP (VI+)
+ * or ds_swizzle (pre-VI) exchanges at growing distances; each lane of a
+ * cluster ends up with the cluster's reduction, wrapped in WWM.
+ */
+LLVMValueRef
+ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsigned cluster_size)
+{
+ if (cluster_size == 1) return src;
+ ac_build_optimization_barrier(ctx, &src);
+ LLVMValueRef result, swap;
+ LLVMValueRef identity = get_reduction_identity(ctx, op,
+ ac_get_type_size(LLVMTypeOf(src)));
+ result = LLVMBuildBitCast(ctx->builder,
+ ac_build_set_inactive(ctx, src, identity),
+ LLVMTypeOf(identity), "");
+ /* Distance 1: swap adjacent lanes within each quad. */
+ swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2);
+ result = ac_build_alu_op(ctx, result, swap, op);
+ if (cluster_size == 2) return ac_build_wwm(ctx, result);
+
+ /* Distance 2: swap lane pairs within each quad. */
+ swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1);
+ result = ac_build_alu_op(ctx, result, swap, op);
+ if (cluster_size == 4) return ac_build_wwm(ctx, result);
+
+ /* Distance 4 exchange. */
+ if (ctx->chip_class >= VI)
+ swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false);
+ else
+ swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04));
+ result = ac_build_alu_op(ctx, result, swap, op);
+ if (cluster_size == 8) return ac_build_wwm(ctx, result);
+
+ /* Distance 8 exchange. */
+ if (ctx->chip_class >= VI)
+ swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false);
+ else
+ swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08));
+ result = ac_build_alu_op(ctx, result, swap, op);
+ if (cluster_size == 16) return ac_build_wwm(ctx, result);
+
+ /* Distance 16 exchange (ds_swizzle when the full 32-lane exchange
+ * is needed, since bcast15 only feeds the upper half of each row). */
+ if (ctx->chip_class >= VI && cluster_size != 32)
+ swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
+ else
+ swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10));
+ result = ac_build_alu_op(ctx, result, swap, op);
+ if (cluster_size == 32) return ac_build_wwm(ctx, result);
+
+ if (ctx->chip_class >= VI) {
+ /* Combine the two 32-lane halves and read the final value
+ * from the last lane. */
+ swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
+ result = ac_build_alu_op(ctx, result, swap, op);
+ result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0));
+ return ac_build_wwm(ctx, result);
+ } else {
+ /* Pre-VI: read each half's total via readlane and combine. */
+ swap = ac_build_readlane(ctx, result, ctx->i32_0);
+ result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0));
+ result = ac_build_alu_op(ctx, result, swap, op);
+ return ac_build_wwm(ctx, result);
+ }
+}
+
+/* Permute lanes within each quad according to lane0..lane3, using DPP
+ * on VI+ with LLVM >= 6.0 and ds_swizzle (quad-perm mode, bit 15 set)
+ * otherwise.
+ */
+LLVMValueRef
+ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
+ unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3)
+{
+ unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3);
+ if (ctx->chip_class >= VI && HAVE_LLVM >= 0x0600) {
+ return ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false);
+ } else {
+ return ac_build_ds_swizzle(ctx, src, (1 << 15) | mask);
+ }
+}
+
+/* Read "src" from the lane given by "index" via ds_bpermute; the index
+ * is scaled by 4 (the intrinsic appears to take a byte address — TODO
+ * confirm against the ISA docs).
+ */
+LLVMValueRef
+ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index)
+{
+ index = LLVMBuildMul(ctx->builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
+ return ac_build_intrinsic(ctx,
+ "llvm.amdgcn.ds.bpermute", ctx->i32,
+ (LLVMValueRef []) {index, src}, 2,
+ AC_FUNC_ATTR_READNONE |
+ AC_FUNC_ATTR_CONVERGENT);
+}