+
+void ac_init_exec_full_mask(struct ac_llvm_context *ctx)
+{
+ LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
+ ac_build_intrinsic(ctx,
+ "llvm.amdgcn.init.exec", ctx->voidt,
+ &full_mask, 1, AC_FUNC_ATTR_CONVERGENT);
+}
+
+void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx)
+{
+ unsigned lds_size = ctx->chip_class >= CIK ? 65536 : 32768;
+ ctx->lds = LLVMBuildIntToPtr(ctx->builder, ctx->i32_0,
+ LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_LOCAL_ADDR_SPACE),
+ "lds");
+}
+
+LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx,
+ LLVMValueRef dw_addr)
+{
+ return ac_build_load(ctx, ctx->lds, dw_addr);
+}
+
+void ac_lds_store(struct ac_llvm_context *ctx,
+ LLVMValueRef dw_addr,
+ LLVMValueRef value)
+{
+ value = ac_to_integer(ctx, value);
+ ac_build_indexed_store(ctx, ctx->lds,
+ dw_addr, value);
+}
+
+LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx,
+ LLVMTypeRef dst_type,
+ LLVMValueRef src0)
+{
+ unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
+ const char *intrin_name;
+ LLVMTypeRef type;
+ LLVMValueRef zero;
+ if (src0_bitsize == 64) {
+ intrin_name = "llvm.cttz.i64";
+ type = ctx->i64;
+ zero = ctx->i64_0;
+ } else {
+ intrin_name = "llvm.cttz.i32";
+ type = ctx->i32;
+ zero = ctx->i32_0;
+ }
+
+ LLVMValueRef params[2] = {
+ src0,
+
+ /* The value of 1 means that ffs(x=0) = undef, so LLVM won't
+ * add special code to check for x=0. The reason is that
+ * the LLVM behavior for x=0 is different from what we
+ * need here. However, LLVM also assumes that ffs(x) is
+ * in [0, 31], but GLSL expects that ffs(0) = -1, so
+ * a conditional assignment to handle 0 is still required.
+ *
+ * The hardware already implements the correct behavior.
+ */
+ LLVMConstInt(ctx->i1, 1, false),
+ };
+
+ LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin_name, type,
+ params, 2,
+ AC_FUNC_ATTR_READNONE);
+
+ if (src0_bitsize == 64) {
+ lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, "");
+ }
+
+ /* TODO: We need an intrinsic to skip this conditional. */
+ /* Check for zero: */
+ return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder,
+ LLVMIntEQ, src0,
+ zero, ""),
+ LLVMConstInt(ctx->i32, -1, 0), lsb, "");
+}
+
+LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type)
+{
+ return LLVMPointerType(LLVMArrayType(elem_type, 0),
+ AC_CONST_ADDR_SPACE);
+}
+
+LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type)
+{
+ if (!HAVE_32BIT_POINTERS)
+ return ac_array_in_const_addr_space(elem_type);
+
+ return LLVMPointerType(LLVMArrayType(elem_type, 0),
+ AC_CONST_32BIT_ADDR_SPACE);
+}
+
+static struct ac_llvm_flow *
+get_current_flow(struct ac_llvm_context *ctx)
+{
+ if (ctx->flow_depth > 0)
+ return &ctx->flow[ctx->flow_depth - 1];
+ return NULL;
+}
+
+static struct ac_llvm_flow *
+get_innermost_loop(struct ac_llvm_context *ctx)
+{
+ for (unsigned i = ctx->flow_depth; i > 0; --i) {
+ if (ctx->flow[i - 1].loop_entry_block)
+ return &ctx->flow[i - 1];
+ }
+ return NULL;
+}
+
/* Push a fresh, cleared entry onto the control-flow stack, growing the
 * backing array by doubling when it is full.
 *
 * NOTE(review): the realloc result is not checked; on allocation
 * failure this would dereference NULL below. Presumably the project
 * policy is abort-on-OOM — confirm before adding error handling.
 */
static struct ac_llvm_flow *
push_flow(struct ac_llvm_context *ctx)
{
	struct ac_llvm_flow *flow;

	if (ctx->flow_depth >= ctx->flow_depth_max) {
		unsigned new_max = MAX2(ctx->flow_depth << 1,
					AC_LLVM_INITIAL_CF_DEPTH);

		ctx->flow = realloc(ctx->flow, new_max * sizeof(*ctx->flow));
		ctx->flow_depth_max = new_max;
	}

	flow = &ctx->flow[ctx->flow_depth];
	ctx->flow_depth++;

	/* The caller fills these in as appropriate (if vs. loop). */
	flow->next_block = NULL;
	flow->loop_entry_block = NULL;
	return flow;
}
+
/* Rename a basic block to "<base><label_id>", e.g. "loop12", for
 * readable IR dumps. */
static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base,
				int label_id)
{
	char buf[32];
	snprintf(buf, sizeof(buf), "%s%d", base, label_id);
	LLVMSetValueName(LLVMBasicBlockAsValue(bb), buf);
}
+
/* Append a basic block at the level of the parent flow.
 *
 * When at least two flows are open, the new block is inserted right
 * before the parent flow's merge block (next_block) so that blocks
 * stay in source order; at the outermost flow level it is simply
 * appended to the end of the current function.
 */
static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx,
					    const char *name)
{
	assert(ctx->flow_depth >= 1);

	if (ctx->flow_depth >= 2) {
		struct ac_llvm_flow *flow = &ctx->flow[ctx->flow_depth - 2];

		return LLVMInsertBasicBlockInContext(ctx->context,
						     flow->next_block, name);
	}

	LLVMValueRef main_fn =
		LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder));
	return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name);
}
+
/* Emit a branch to the given default target for the current block if
 * applicable -- that is, if the current block does not already contain a
 * branch from a break or continue.
 */
static void emit_default_branch(LLVMBuilderRef builder,
				LLVMBasicBlockRef target)
{
	/* A block may only have one terminator; skip if one exists. */
	if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder)))
		LLVMBuildBr(builder, target);
}
+
/* Open a structured loop: create the "loop<id>" entry block and its
 * "ENDLOOP" merge block, branch into the loop, and position the
 * builder inside the entry block. Must be closed with
 * ac_build_endloop() using the same label_id. */
void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id)
{
	struct ac_llvm_flow *flow = push_flow(ctx);
	flow->loop_entry_block = append_basic_block(ctx, "LOOP");
	flow->next_block = append_basic_block(ctx, "ENDLOOP");
	set_basicblock_name(flow->loop_entry_block, "loop", label_id);
	LLVMBuildBr(ctx->builder, flow->loop_entry_block);
	LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block);
}
+
/* Branch to the merge block of the innermost enclosing loop.
 * NOTE(review): assumes we are inside a loop — get_innermost_loop()
 * returns NULL otherwise, which would crash here; callers must
 * guarantee proper nesting. */
void ac_build_break(struct ac_llvm_context *ctx)
{
	struct ac_llvm_flow *flow = get_innermost_loop(ctx);
	LLVMBuildBr(ctx->builder, flow->next_block);
}
+
/* Branch back to the entry block of the innermost enclosing loop.
 * Same nesting assumption as ac_build_break(). */
void ac_build_continue(struct ac_llvm_context *ctx)
{
	struct ac_llvm_flow *flow = get_innermost_loop(ctx);
	LLVMBuildBr(ctx->builder, flow->loop_entry_block);
}
+
/* Switch from the "then" side of the current if to its "else" side.
 * The current block falls through to a newly created ENDIF block
 * (unless it already ended in a break/continue), emission resumes in
 * the former merge block (renamed "else<id>"), and ENDIF becomes the
 * new merge block. */
void ac_build_else(struct ac_llvm_context *ctx, int label_id)
{
	struct ac_llvm_flow *current_branch = get_current_flow(ctx);
	LLVMBasicBlockRef endif_block;

	/* Must be inside an if/else flow, not a loop. */
	assert(!current_branch->loop_entry_block);

	endif_block = append_basic_block(ctx, "ENDIF");
	emit_default_branch(ctx->builder, endif_block);

	LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
	set_basicblock_name(current_branch->next_block, "else", label_id);

	current_branch->next_block = endif_block;
}
+
/* Close the current if/else: fall through to the merge block (unless
 * already terminated), continue emitting there as "endif<id>", and pop
 * the flow entry. */
void ac_build_endif(struct ac_llvm_context *ctx, int label_id)
{
	struct ac_llvm_flow *current_branch = get_current_flow(ctx);

	/* Must be inside an if/else flow, not a loop. */
	assert(!current_branch->loop_entry_block);

	emit_default_branch(ctx->builder, current_branch->next_block);
	LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
	set_basicblock_name(current_branch->next_block, "endif", label_id);

	ctx->flow_depth--;
}
+
/* Close the current loop: branch back to the loop entry (unless the
 * block already ended in a break/continue), continue emitting in the
 * merge block as "endloop<id>", and pop the flow entry. */
void ac_build_endloop(struct ac_llvm_context *ctx, int label_id)
{
	struct ac_llvm_flow *current_loop = get_current_flow(ctx);

	/* Must match a preceding ac_build_bgnloop(). */
	assert(current_loop->loop_entry_block);

	emit_default_branch(ctx->builder, current_loop->loop_entry_block);

	LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block);
	set_basicblock_name(current_loop->next_block, "endloop", label_id);
	ctx->flow_depth--;
}
+
/* Shared tail of ac_build_if/ac_build_uif: push a flow entry, create
 * the IF and ELSE blocks, emit the conditional branch, and position
 * the builder inside the IF block. The ELSE block serves as the merge
 * block until ac_build_else() replaces it with an ENDIF block. */
static void if_cond_emit(struct ac_llvm_context *ctx, LLVMValueRef cond,
			 int label_id)
{
	struct ac_llvm_flow *flow = push_flow(ctx);
	LLVMBasicBlockRef if_block;

	if_block = append_basic_block(ctx, "IF");
	flow->next_block = append_basic_block(ctx, "ELSE");
	set_basicblock_name(if_block, "if", label_id);
	LLVMBuildCondBr(ctx->builder, cond, if_block, flow->next_block);
	LLVMPositionBuilderAtEnd(ctx->builder, if_block);
}
+
+void ac_build_if(struct ac_llvm_context *ctx, LLVMValueRef value,
+ int label_id)
+{
+ LLVMValueRef cond = LLVMBuildFCmp(ctx->builder, LLVMRealUNE,
+ value, ctx->f32_0, "");
+ if_cond_emit(ctx, cond, label_id);
+}
+
+void ac_build_uif(struct ac_llvm_context *ctx, LLVMValueRef value,
+ int label_id)
+{
+ LLVMValueRef cond = LLVMBuildICmp(ctx->builder, LLVMIntNE,
+ ac_to_integer(ctx, value),
+ ctx->i32_0, "");
+ if_cond_emit(ctx, cond, label_id);
+}
+
/* Create a stack variable of the given type, zero-initialized.
 *
 * The alloca instruction itself is placed at the top of the function's
 * entry block (LLVM's mem2reg pass only promotes entry-block allocas
 * to SSA registers); the zero store is emitted at the current
 * insertion point.
 */
LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, LLVMTypeRef type,
			     const char *name)
{
	LLVMBuilderRef builder = ac->builder;
	LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder);
	LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
	LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function);
	LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block);
	/* Temporary builder positioned in the entry block. */
	LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context);
	LLVMValueRef res;

	if (first_instr) {
		LLVMPositionBuilderBefore(first_builder, first_instr);
	} else {
		LLVMPositionBuilderAtEnd(first_builder, first_block);
	}

	res = LLVMBuildAlloca(first_builder, type, name);
	/* Zero-initialize at the caller's current insertion point. */
	LLVMBuildStore(builder, LLVMConstNull(type), res);

	LLVMDisposeBuilder(first_builder);

	return res;
}
+
/* Like ac_build_alloca, but the variable's initial contents are undef:
 * the zero store emitted by ac_build_alloca is immediately followed by
 * a store of undef to the same slot. */
LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac,
				   LLVMTypeRef type, const char *name)
{
	LLVMValueRef ptr = ac_build_alloca(ac, type, name);
	LLVMBuildStore(ac->builder, LLVMGetUndef(type), ptr);
	return ptr;
}
+
+LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr,
+ LLVMTypeRef type)
+{
+ int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
+ return LLVMBuildBitCast(ctx->builder, ptr,
+ LLVMPointerType(type, addr_space), "");
+}
+
+LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value,
+ unsigned count)
+{
+ unsigned num_components = ac_get_llvm_num_components(value);
+ if (count == num_components)
+ return value;
+
+ LLVMValueRef masks[] = {
+ LLVMConstInt(ctx->i32, 0, false), LLVMConstInt(ctx->i32, 1, false),
+ LLVMConstInt(ctx->i32, 2, false), LLVMConstInt(ctx->i32, 3, false)};
+
+ if (count == 1)
+ return LLVMBuildExtractElement(ctx->builder, value, masks[0],
+ "");
+
+ LLVMValueRef swizzle = LLVMConstVector(masks, count);
+ return LLVMBuildShuffleVector(ctx->builder, value, value, swizzle, "");
+}
+
+LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param,
+ unsigned rshift, unsigned bitwidth)
+{
+ LLVMValueRef value = param;
+ if (rshift)
+ value = LLVMBuildLShr(ctx->builder, value,
+ LLVMConstInt(ctx->i32, rshift, false), "");
+
+ if (rshift + bitwidth < 32) {
+ unsigned mask = (1 << bitwidth) - 1;
+ value = LLVMBuildAnd(ctx->builder, value,
+ LLVMConstInt(ctx->i32, mask, false), "");
+ }
+ return value;
+}
+
/* Adjust the sample index according to FMASK.
 *
 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
 * which is the identity mapping. Each nibble says which physical sample
 * should be fetched to get that sample.
 *
 * For example, 0x11111100 means there are only 2 samples stored and
 * the second sample covers 3/4 of the pixel. When reading samples 0
 * and 1, return physical sample 0 (determined by the first two 0s
 * in FMASK), otherwise return physical sample 1.
 *
 * The sample index should be adjusted as follows:
 *   addr[sample_index] = (fmask >> (addr[sample_index] * 4)) & 0xF;
 */
void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask,
			      LLVMValueRef *addr, bool is_array_tex)
{
	/* Load the FMASK dword for this texel. */
	struct ac_image_args fmask_load = {};
	fmask_load.opcode = ac_image_load;
	fmask_load.resource = fmask;
	fmask_load.dmask = 0xf;
	fmask_load.dim = is_array_tex ? ac_image_2darray : ac_image_2d;

	fmask_load.coords[0] = addr[0];
	fmask_load.coords[1] = addr[1];
	if (is_array_tex)
		fmask_load.coords[2] = addr[2];

	LLVMValueRef fmask_value = ac_build_image_opcode(ac, &fmask_load);
	fmask_value = LLVMBuildExtractElement(ac->builder, fmask_value,
					      ac->i32_0, "");

	/* Apply the formula. */
	/* The sample index lives after x,y (and layer for arrays). */
	unsigned sample_chan = is_array_tex ? 3 : 2;
	LLVMValueRef final_sample;
	final_sample = LLVMBuildMul(ac->builder, addr[sample_chan],
				    LLVMConstInt(ac->i32, 4, 0), "");
	final_sample = LLVMBuildLShr(ac->builder, fmask_value, final_sample, "");
	/* Mask the sample index by 0x7, because 0x8 means an unknown value
	 * with EQAA, so those will map to 0. */
	final_sample = LLVMBuildAnd(ac->builder, final_sample,
				    LLVMConstInt(ac->i32, 0x7, 0), "");

	/* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
	 * resource descriptor is 0 (invalid).
	 */
	LLVMValueRef tmp;
	tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, "");
	tmp = LLVMBuildExtractElement(ac->builder, tmp, ac->i32_1, "");
	tmp = LLVMBuildICmp(ac->builder, LLVMIntNE, tmp, ac->i32_0, "");

	/* Replace the MSAA sample index. */
	addr[sample_chan] = LLVMBuildSelect(ac->builder, tmp, final_sample,
					    addr[sample_chan], "");
}
+
/* Emit readlane (lane != NULL) or readfirstlane (lane == NULL) for a
 * single 32-bit value. The optimization barrier is presumably there to
 * keep LLVM from moving/rematerializing src across the convergent
 * intrinsic — see ac_build_optimization_barrier. */
static LLVMValueRef
_ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
{
	ac_build_optimization_barrier(ctx, &src);
	return ac_build_intrinsic(ctx,
				  lane == NULL ? "llvm.amdgcn.readfirstlane" : "llvm.amdgcn.readlane",
				  LLVMTypeOf(src), (LLVMValueRef []) {
				  src, lane },
				  lane == NULL ? 1 : 2,
				  AC_FUNC_ATTR_READNONE |
				  AC_FUNC_ATTR_CONVERGENT);
}
+
/**
 * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic.
 * @param ctx
 * @param src value to read; any type whose bit size is a multiple of 32
 * @param lane - id of the lane or NULL for the first active lane
 * @return value of the lane, with the same type as src
 */
LLVMValueRef
ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
{
	LLVMTypeRef src_type = LLVMTypeOf(src);
	src = ac_to_integer(ctx, src);
	unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
	LLVMValueRef ret;

	if (bits == 32) {
		ret = _ac_build_readlane(ctx, src, lane);
	} else {
		/* Wider values: split into 32-bit chunks, read each chunk
		 * lane-wise and reassemble the vector. */
		assert(bits % 32 == 0);
		LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
		LLVMValueRef src_vector =
			LLVMBuildBitCast(ctx->builder, src, vec_type, "");
		ret = LLVMGetUndef(vec_type);
		for (unsigned i = 0; i < bits / 32; i++) {
			src = LLVMBuildExtractElement(ctx->builder, src_vector,
						LLVMConstInt(ctx->i32, i, 0), "");
			LLVMValueRef ret_comp = _ac_build_readlane(ctx, src, lane);
			ret = LLVMBuildInsertElement(ctx->builder, ret, ret_comp,
						LLVMConstInt(ctx->i32, i, 0), "");
		}
	}
	/* Restore the caller's original type. */
	return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
}
+
+LLVMValueRef
+ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value, LLVMValueRef lane)
+{
+ /* TODO: Use the actual instruction when LLVM adds an intrinsic for it.
+ */
+ LLVMValueRef pred = LLVMBuildICmp(ctx->builder, LLVMIntEQ, lane,
+ ac_get_thread_id(ctx), "");
+ return LLVMBuildSelect(ctx->builder, pred, value, src, "");
+}
+
/* Count, for each lane, the bits of the 64-bit `mask` belonging to
 * lanes with a lower id (mbcnt.lo for bits 0-31, then mbcnt.hi adds
 * bits 32-63 on top of the low count). With mask = EXEC this yields a
 * thread's index among the active lanes. */
LLVMValueRef
ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask)
{
	/* Split the 64-bit mask into its low and high dwords. */
	LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask,
						 LLVMVectorType(ctx->i32, 2),
						 "");
	LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec,
						       ctx->i32_0, "");
	LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec,
						       ctx->i32_1, "");
	LLVMValueRef val =
		ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
				   (LLVMValueRef []) { mask_lo, ctx->i32_0 },
				   2, AC_FUNC_ATTR_READNONE);
	/* The second operand is the accumulator: start the high count
	 * from the low count. */
	val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32,
				 (LLVMValueRef []) { mask_hi, val },
				 2, AC_FUNC_ATTR_READNONE);
	return val;
}
+
/* DPP control encodings for llvm.amdgcn.update.dpp. The quad_perm,
 * row_sl and row_sr base values carry an immediate in their low bits;
 * the helpers below compose those composite controls. */
enum dpp_ctrl {
	_dpp_quad_perm = 0x000,
	_dpp_row_sl = 0x100,
	_dpp_row_sr = 0x110,
	_dpp_row_rr = 0x120,
	dpp_wf_sl1 = 0x130,
	dpp_wf_rl1 = 0x134,
	dpp_wf_sr1 = 0x138,
	dpp_wf_rr1 = 0x13C,
	dpp_row_mirror = 0x140,
	dpp_row_half_mirror = 0x141,
	dpp_row_bcast15 = 0x142,
	dpp_row_bcast31 = 0x143
};

/* Encode a per-quad lane permutation: 2 bits per destination lane,
 * lane i of each quad reads from lane `lane_i`. */
static inline enum dpp_ctrl
dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3)
{
	assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4);
	unsigned sel = lane0;
	sel |= lane1 << 2;
	sel |= lane2 << 4;
	sel |= lane3 << 6;
	return _dpp_quad_perm | sel;
}

/* Encode "row shift left by `amount` lanes" (1..15). */
static inline enum dpp_ctrl
dpp_row_sl(unsigned amount)
{
	assert(amount > 0 && amount < 16);
	return _dpp_row_sl | amount;
}

/* Encode "row shift right by `amount` lanes" (1..15). */
static inline enum dpp_ctrl
dpp_row_sr(unsigned amount)
{
	assert(amount > 0 && amount < 16);
	return _dpp_row_sr | amount;
}
+
/* Emit llvm.amdgcn.update.dpp.i32 on a single 32-bit value.
 * `old` supplies the result for lanes the DPP move does not write
 * (e.g. lanes masked off by row_mask/bank_mask); dpp_ctrl selects the
 * lane permutation; bound_ctrl controls out-of-bounds lane reads. */
static LLVMValueRef
_ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
	      enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
	      bool bound_ctrl)
{
	return ac_build_intrinsic(ctx, "llvm.amdgcn.update.dpp.i32",
				  LLVMTypeOf(old),
				  (LLVMValueRef[]) {
					old, src,
					LLVMConstInt(ctx->i32, dpp_ctrl, 0),
					LLVMConstInt(ctx->i32, row_mask, 0),
					LLVMConstInt(ctx->i32, bank_mask, 0),
					LLVMConstInt(ctx->i1, bound_ctrl, 0) },
				  6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
}
+
/* Type-generic DPP: like _ac_build_dpp but accepts any type whose bit
 * size is a multiple of 32. Wider values are split into 32-bit chunks
 * which are moved independently, then reassembled; the result has the
 * same type as src. */
static LLVMValueRef
ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
	     enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
	     bool bound_ctrl)
{
	LLVMTypeRef src_type = LLVMTypeOf(src);
	src = ac_to_integer(ctx, src);
	old = ac_to_integer(ctx, old);
	unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
	LLVMValueRef ret;
	if (bits == 32) {
		ret = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask,
				    bank_mask, bound_ctrl);
	} else {
		assert(bits % 32 == 0);
		LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
		LLVMValueRef src_vector =
			LLVMBuildBitCast(ctx->builder, src, vec_type, "");
		LLVMValueRef old_vector =
			LLVMBuildBitCast(ctx->builder, old, vec_type, "");
		ret = LLVMGetUndef(vec_type);
		/* Move each 32-bit chunk separately. */
		for (unsigned i = 0; i < bits / 32; i++) {
			src = LLVMBuildExtractElement(ctx->builder, src_vector,
						      LLVMConstInt(ctx->i32, i,
								   0), "");
			old = LLVMBuildExtractElement(ctx->builder, old_vector,
						      LLVMConstInt(ctx->i32, i,
								   0), "");
			LLVMValueRef ret_comp = _ac_build_dpp(ctx, old, src,
							      dpp_ctrl,
							      row_mask,
							      bank_mask,
							      bound_ctrl);
			ret = LLVMBuildInsertElement(ctx->builder, ret,
						     ret_comp,
						     LLVMConstInt(ctx->i32, i,
								  0), "");
		}
	}
	/* Restore the caller's original type. */
	return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
}
+
/* Pack a DS_SWIZZLE_B32 bit-mode pattern: bits [4:0] = and_mask,
 * [9:5] = or_mask, [14:10] = xor_mask. Each mask covers 5 lane-index
 * bits, hence the < 32 limits. */
static inline unsigned
ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
{
	assert(and_mask < 32 && or_mask < 32 && xor_mask < 32);
	unsigned pattern = xor_mask << 10;
	pattern |= or_mask << 5;
	pattern |= and_mask;
	return pattern;
}
+
/* Emit llvm.amdgcn.ds.swizzle on a single 32-bit value; `mask` is the
 * raw swizzle offset encoding (see ds_pattern_bitmode for bit mode). */
static LLVMValueRef
_ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
{
	return ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle",
				  LLVMTypeOf(src), (LLVMValueRef []) {
				  src, LLVMConstInt(ctx->i32, mask, 0) },
				  2, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
}
+
/* Type-generic ds_swizzle: accepts any type whose bit size is a
 * multiple of 32, swizzling each 32-bit chunk independently and
 * returning a value of the original type. */
LLVMValueRef
ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
{
	LLVMTypeRef src_type = LLVMTypeOf(src);
	src = ac_to_integer(ctx, src);
	unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
	LLVMValueRef ret;
	if (bits == 32) {
		ret = _ac_build_ds_swizzle(ctx, src, mask);
	} else {
		assert(bits % 32 == 0);
		LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
		LLVMValueRef src_vector =
			LLVMBuildBitCast(ctx->builder, src, vec_type, "");
		ret = LLVMGetUndef(vec_type);
		/* Swizzle each 32-bit chunk separately. */
		for (unsigned i = 0; i < bits / 32; i++) {
			src = LLVMBuildExtractElement(ctx->builder, src_vector,
						      LLVMConstInt(ctx->i32, i,
								   0), "");
			LLVMValueRef ret_comp = _ac_build_ds_swizzle(ctx, src,
								     mask);
			ret = LLVMBuildInsertElement(ctx->builder, ret,
						     ret_comp,
						     LLVMConstInt(ctx->i32, i,
								  0), "");
		}
	}
	/* Restore the caller's original type. */
	return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
}
+
/* Wrap a value in llvm.amdgcn.wwm (whole wave mode) so that the
 * computation feeding it is executed for all lanes, including inactive
 * ones. The intrinsic name is suffixed with the value's type
 * (e.g. ".i32") via ac_build_type_name_for_intr. */
static LLVMValueRef
ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src)
{
	char name[32], type[8];
	ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
	snprintf(name, sizeof(name), "llvm.amdgcn.wwm.%s", type);
	return ac_build_intrinsic(ctx, name, LLVMTypeOf(src),
				  (LLVMValueRef []) { src }, 1,
				  AC_FUNC_ATTR_READNONE);
}
+
/* Emit llvm.amdgcn.set.inactive.<type>: active lanes keep `src`,
 * inactive lanes are given `inactive` (used to seed reductions/scans
 * with the identity value). Both operands are converted to integers;
 * the result is bitcast back to src's original type. */
static LLVMValueRef
ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src,
		      LLVMValueRef inactive)
{
	char name[33], type[8];
	LLVMTypeRef src_type = LLVMTypeOf(src);
	src = ac_to_integer(ctx, src);
	inactive = ac_to_integer(ctx, inactive);
	ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
	snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type);
	LLVMValueRef ret =
		ac_build_intrinsic(ctx, name,
					LLVMTypeOf(src), (LLVMValueRef []) {
					src, inactive }, 2,
					AC_FUNC_ATTR_READNONE |
					AC_FUNC_ATTR_CONVERGENT);
	return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
}
+
/* Return the identity element of the given reduction operator for the
 * element size in bytes (4 or anything else meaning 8): 0 for add/or/
 * xor, 1 for mul, extreme values for min/max, all-ones for and. */
static LLVMValueRef
get_reduction_identity(struct ac_llvm_context *ctx, nir_op op, unsigned type_size)
{
	if (type_size == 4) {
		switch (op) {
		case nir_op_iadd: return ctx->i32_0;
		case nir_op_fadd: return ctx->f32_0;
		case nir_op_imul: return ctx->i32_1;
		case nir_op_fmul: return ctx->f32_1;
		case nir_op_imin: return LLVMConstInt(ctx->i32, INT32_MAX, 0);
		case nir_op_umin: return LLVMConstInt(ctx->i32, UINT32_MAX, 0);
		case nir_op_fmin: return LLVMConstReal(ctx->f32, INFINITY);
		case nir_op_imax: return LLVMConstInt(ctx->i32, INT32_MIN, 0);
		/* Unsigned max: the smallest unsigned value is 0. */
		case nir_op_umax: return ctx->i32_0;
		case nir_op_fmax: return LLVMConstReal(ctx->f32, -INFINITY);
		/* -1 == all bits set, the identity of bitwise and. */
		case nir_op_iand: return LLVMConstInt(ctx->i32, -1, 0);
		case nir_op_ior: return ctx->i32_0;
		case nir_op_ixor: return ctx->i32_0;
		default:
			unreachable("bad reduction intrinsic");
		}
	} else { /* type_size == 64bit */
		switch (op) {
		case nir_op_iadd: return ctx->i64_0;
		case nir_op_fadd: return ctx->f64_0;
		case nir_op_imul: return ctx->i64_1;
		case nir_op_fmul: return ctx->f64_1;
		case nir_op_imin: return LLVMConstInt(ctx->i64, INT64_MAX, 0);
		case nir_op_umin: return LLVMConstInt(ctx->i64, UINT64_MAX, 0);
		case nir_op_fmin: return LLVMConstReal(ctx->f64, INFINITY);
		case nir_op_imax: return LLVMConstInt(ctx->i64, INT64_MIN, 0);
		case nir_op_umax: return ctx->i64_0;
		case nir_op_fmax: return LLVMConstReal(ctx->f64, -INFINITY);
		case nir_op_iand: return LLVMConstInt(ctx->i64, -1, 0);
		case nir_op_ior: return ctx->i64_0;
		case nir_op_ixor: return ctx->i64_0;
		default:
			unreachable("bad reduction intrinsic");
		}
	}
}
+
/* Apply the NIR reduction operator to two scalar operands.
 * Integer min/max are open-coded as compare + select; float min/max
 * use llvm.minnum/llvm.maxnum, picking the f32 or f64 variant from
 * the operand size. */
static LLVMValueRef
ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs, nir_op op)
{
	bool _64bit = ac_get_type_size(LLVMTypeOf(lhs)) == 8;
	switch (op) {
	case nir_op_iadd: return LLVMBuildAdd(ctx->builder, lhs, rhs, "");
	case nir_op_fadd: return LLVMBuildFAdd(ctx->builder, lhs, rhs, "");
	case nir_op_imul: return LLVMBuildMul(ctx->builder, lhs, rhs, "");
	case nir_op_fmul: return LLVMBuildFMul(ctx->builder, lhs, rhs, "");
	case nir_op_imin: return LLVMBuildSelect(ctx->builder,
					LLVMBuildICmp(ctx->builder, LLVMIntSLT, lhs, rhs, ""),
					lhs, rhs, "");
	case nir_op_umin: return LLVMBuildSelect(ctx->builder,
					LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, ""),
					lhs, rhs, "");
	case nir_op_fmin: return ac_build_intrinsic(ctx,
					_64bit ? "llvm.minnum.f64" : "llvm.minnum.f32",
					_64bit ? ctx->f64 : ctx->f32,
					(LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE);
	case nir_op_imax: return LLVMBuildSelect(ctx->builder,
					LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, ""),
					lhs, rhs, "");
	case nir_op_umax: return LLVMBuildSelect(ctx->builder,
					LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, ""),
					lhs, rhs, "");
	case nir_op_fmax: return ac_build_intrinsic(ctx,
					_64bit ? "llvm.maxnum.f64" : "llvm.maxnum.f32",
					_64bit ? ctx->f64 : ctx->f32,
					(LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE);
	case nir_op_iand: return LLVMBuildAnd(ctx->builder, lhs, rhs, "");
	case nir_op_ior: return LLVMBuildOr(ctx->builder, lhs, rhs, "");
	case nir_op_ixor: return LLVMBuildXor(ctx->builder, lhs, rhs, "");
	default:
		unreachable("bad reduction intrinsic");
	}
}
+
/* TODO: add inclusive and excluse scan functions for SI chip class. */
/* Inclusive scan over a 64-lane wave using the standard DPP ladder:
 * combine with copies shifted right by 1, 2 and 3 lanes, then by 4 and
 * 8 within a row (masking off banks that would wrap), and finally
 * broadcast lanes 15 and 31 across the row/half-wave boundaries.
 * `identity` is supplied as the DPP "old" value so lanes shifted in
 * from outside contribute the identity of `op`. */
static LLVMValueRef
ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValueRef identity)
{
	LLVMValueRef result, tmp;
	result = src;
	tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
	result = ac_build_alu_op(ctx, result, tmp, op);
	tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
	result = ac_build_alu_op(ctx, result, tmp, op);
	tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
	result = ac_build_alu_op(ctx, result, tmp, op);
	tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
	result = ac_build_alu_op(ctx, result, tmp, op);
	tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
	result = ac_build_alu_op(ctx, result, tmp, op);
	tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
	result = ac_build_alu_op(ctx, result, tmp, op);
	tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
	result = ac_build_alu_op(ctx, result, tmp, op);
	return result;
}
+
/* Wave-wide inclusive scan of `src` with the reduction operator `op`.
 * Inactive lanes are seeded with the identity, the DPP scan ladder is
 * run in whole-wave mode, and the result is returned via wwm. */
LLVMValueRef
ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
	ac_build_optimization_barrier(ctx, &src);
	LLVMValueRef result;
	LLVMValueRef identity = get_reduction_identity(ctx, op,
							ac_get_type_size(LLVMTypeOf(src)));
	result = LLVMBuildBitCast(ctx->builder,
					ac_build_set_inactive(ctx, src, identity),
					LLVMTypeOf(identity), "");
	result = ac_build_scan(ctx, op, result, identity);

	return ac_build_wwm(ctx, result);
}
+
/* Wave-wide exclusive scan of `src` with the reduction operator `op`.
 * Same as the inclusive scan, except the whole wave is first shifted
 * right by one lane (dpp_wf_sr1), so each lane scans only the lanes
 * before it; lane 0 receives the identity. */
LLVMValueRef
ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
	ac_build_optimization_barrier(ctx, &src);
	LLVMValueRef result;
	LLVMValueRef identity = get_reduction_identity(ctx, op,
							ac_get_type_size(LLVMTypeOf(src)));
	result = LLVMBuildBitCast(ctx->builder,
					ac_build_set_inactive(ctx, src, identity),
					LLVMTypeOf(identity), "");
	result = ac_build_dpp(ctx, identity, result, dpp_wf_sr1, 0xf, 0xf, false);
	result = ac_build_scan(ctx, op, result, identity);

	return ac_build_wwm(ctx, result);
}
+
/* Horizontal reduction of `src` over clusters of `cluster_size` lanes
 * (power of two, up to the 64-lane wave). Each step folds the partial
 * result with a copy fetched from the other half of the cluster,
 * doubling the cluster size covered. VI+ uses DPP row operations where
 * possible; older chips fall back to ds_swizzle and, for the final
 * 64-lane step, readlane. The result is returned in whole-wave mode. */
LLVMValueRef
ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsigned cluster_size)
{
	if (cluster_size == 1) return src;
	ac_build_optimization_barrier(ctx, &src);
	LLVMValueRef result, swap;
	/* Inactive lanes contribute the identity so they don't corrupt
	 * the reduction when swizzled in. */
	LLVMValueRef identity = get_reduction_identity(ctx, op,
							ac_get_type_size(LLVMTypeOf(src)));
	result = LLVMBuildBitCast(ctx->builder,
					ac_build_set_inactive(ctx, src, identity),
					LLVMTypeOf(identity), "");
	/* Fold adjacent lanes (0<->1, 2<->3): clusters of 2. */
	swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2);
	result = ac_build_alu_op(ctx, result, swap, op);
	if (cluster_size == 2) return ac_build_wwm(ctx, result);

	/* Fold lane pairs within each quad: clusters of 4. */
	swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1);
	result = ac_build_alu_op(ctx, result, swap, op);
	if (cluster_size == 4) return ac_build_wwm(ctx, result);

	/* Fold half-quads: clusters of 8. */
	if (ctx->chip_class >= VI)
		swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false);
	else
		swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04));
	result = ac_build_alu_op(ctx, result, swap, op);
	if (cluster_size == 8) return ac_build_wwm(ctx, result);

	/* Fold row halves: clusters of 16. */
	if (ctx->chip_class >= VI)
		swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false);
	else
		swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08));
	result = ac_build_alu_op(ctx, result, swap, op);
	if (cluster_size == 16) return ac_build_wwm(ctx, result);

	/* Fold rows: clusters of 32. */
	if (ctx->chip_class >= VI && cluster_size != 32)
		swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
	else
		swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10));
	result = ac_build_alu_op(ctx, result, swap, op);
	if (cluster_size == 32) return ac_build_wwm(ctx, result);

	/* Final fold across the full 64-lane wave. */
	if (ctx->chip_class >= VI) {
		swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
		result = ac_build_alu_op(ctx, result, swap, op);
		result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0));
		return ac_build_wwm(ctx, result);
	} else {
		/* No DPP: combine the results of lanes 0 and 32 via
		 * readlane. */
		swap = ac_build_readlane(ctx, result, ctx->i32_0);
		result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0));
		result = ac_build_alu_op(ctx, result, swap, op);
		return ac_build_wwm(ctx, result);
	}
}
+
/* Permute lanes within each aligned group of 4: destination lane i of
 * each quad reads from lane `lane_i`. Uses DPP quad_perm on VI+ with
 * LLVM >= 6.0, otherwise ds_swizzle (bit 15 presumably selects the
 * quad-perm swizzle mode — see the GCN ISA DS_SWIZZLE encoding). */
LLVMValueRef
ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
		      unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3)
{
	unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3);
	if (ctx->chip_class >= VI && HAVE_LLVM >= 0x0600) {
		return ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false);
	} else {
		return ac_build_ds_swizzle(ctx, src, (1 << 15) | mask);
	}
}
+
/* Wave-wide variable shuffle: each lane reads `src` from the lane
 * whose id is given by `index`, via ds.bpermute. The intrinsic
 * addresses lanes in bytes, hence the multiply by 4. */
LLVMValueRef
ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index)
{
	index = LLVMBuildMul(ctx->builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
	return ac_build_intrinsic(ctx,
				  "llvm.amdgcn.ds.bpermute", ctx->i32,
				  (LLVMValueRef []) {index, src}, 2,
				  AC_FUNC_ATTR_READNONE |
				  AC_FUNC_ATTR_CONVERGENT);
}