-static LLVMValueRef
-ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValueRef identity,
- unsigned maxprefix, bool inclusive)
-{
- LLVMValueRef result, tmp;
-
- if (inclusive) {
- result = src;
- } else if (ctx->chip_class >= GFX10) {
- result = identity;
- } else if (ctx->chip_class >= GFX8) {
- src = ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false);
- result = src;
- } else {
- /* wavefront shift_right by 1 on SI/CI */
- LLVMValueRef active, tmp1, tmp2;
- LLVMValueRef tid = ac_get_thread_id(ctx);
- tmp1 = ac_build_ds_swizzle(ctx, src, (1 << 15) | dpp_quad_perm(0, 0, 1, 2));
- tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x18, 0x03, 0x00));
- active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
- LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x7, 0), ""),
- LLVMConstInt(ctx->i32, 0x4, 0), "");
- tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
- tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x10, 0x07, 0x00));
- active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
- LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0xf, 0), ""),
- LLVMConstInt(ctx->i32, 0x8, 0), "");
- tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
- tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x00, 0x0f, 0x00));
- active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
- LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, 0), ""),
- LLVMConstInt(ctx->i32, 0x10, 0), "");
- tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
- tmp2 = ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, 0));
- active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, 0), "");
- tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
- active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 0, 0), "");
- src = LLVMBuildSelect(ctx->builder, active, identity, tmp1, "");
- result = src;
- }
-
- if (ctx->chip_class <= GFX7) {
- assert(maxprefix == 64);
- LLVMValueRef tid = ac_get_thread_id(ctx);
- LLVMValueRef active;
- tmp = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x1e, 0x00, 0x00));
- active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
- LLVMBuildAnd(ctx->builder, tid, ctx->i32_1, ""),
- ctx->i32_0, "");
- tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
- result = ac_build_alu_op(ctx, result, tmp, op);
- tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1c, 0x01, 0x00));
- active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
- LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 2, 0), ""),
- ctx->i32_0, "");
- tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
- result = ac_build_alu_op(ctx, result, tmp, op);
- tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x18, 0x03, 0x00));
- active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
- LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 4, 0), ""),
- ctx->i32_0, "");
- tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
- result = ac_build_alu_op(ctx, result, tmp, op);
- tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x10, 0x07, 0x00));
- active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
- LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 8, 0), ""),
- ctx->i32_0, "");
- tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
- result = ac_build_alu_op(ctx, result, tmp, op);
- tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x00, 0x0f, 0x00));
- active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
- LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, 0), ""),
- ctx->i32_0, "");
- tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
- result = ac_build_alu_op(ctx, result, tmp, op);
- tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, 0));
- active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
- LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 32, 0), ""),
- ctx->i32_0, "");
- tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
- result = ac_build_alu_op(ctx, result, tmp, op);
- return result;
- }
-
- if (maxprefix <= 1)
- return result;
- tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
- result = ac_build_alu_op(ctx, result, tmp, op);
- if (maxprefix <= 2)
- return result;
- tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
- result = ac_build_alu_op(ctx, result, tmp, op);
- if (maxprefix <= 3)
- return result;
- tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
- result = ac_build_alu_op(ctx, result, tmp, op);
- if (maxprefix <= 4)
- return result;
- tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
- result = ac_build_alu_op(ctx, result, tmp, op);
- if (maxprefix <= 8)
- return result;
- tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
- result = ac_build_alu_op(ctx, result, tmp, op);
- if (maxprefix <= 16)
- return result;
-
- if (ctx->chip_class >= GFX10) {
- /* dpp_row_bcast{15,31} are not supported on gfx10. */
- LLVMBuilderRef builder = ctx->builder;
- LLVMValueRef tid = ac_get_thread_id(ctx);
- LLVMValueRef cc;
- /* TODO-GFX10: Can we get better code-gen by putting this into
- * a branch so that LLVM generates EXEC mask manipulations? */
- if (inclusive)
- tmp = result;
- else
- tmp = ac_build_alu_op(ctx, result, src, op);
- tmp = ac_build_permlane16(ctx, tmp, ~(uint64_t)0, true, false);
- tmp = ac_build_alu_op(ctx, result, tmp, op);
- cc = LLVMBuildAnd(builder, tid, LLVMConstInt(ctx->i32, 16, false), "");
- cc = LLVMBuildICmp(builder, LLVMIntNE, cc, ctx->i32_0, "");
- result = LLVMBuildSelect(builder, cc, tmp, result, "");
- if (maxprefix <= 32)
- return result;
-
- if (inclusive)
- tmp = result;
- else
- tmp = ac_build_alu_op(ctx, result, src, op);
- tmp = ac_build_readlane(ctx, tmp, LLVMConstInt(ctx->i32, 31, false));
- tmp = ac_build_alu_op(ctx, result, tmp, op);
- cc = LLVMBuildICmp(builder, LLVMIntUGE, tid,
- LLVMConstInt(ctx->i32, 32, false), "");
- result = LLVMBuildSelect(builder, cc, tmp, result, "");
- return result;
- }
-
- tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
- result = ac_build_alu_op(ctx, result, tmp, op);
- if (maxprefix <= 32)
- return result;
- tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
- result = ac_build_alu_op(ctx, result, tmp, op);
- return result;
-}
-
-LLVMValueRef
-ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
-{
- LLVMValueRef result;
-
- if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
- LLVMBuilderRef builder = ctx->builder;
- src = LLVMBuildZExt(builder, src, ctx->i32, "");
- result = ac_build_ballot(ctx, src);
- result = ac_build_mbcnt(ctx, result);
- result = LLVMBuildAdd(builder, result, src, "");
- return result;
- }
-
- ac_build_optimization_barrier(ctx, &src);
-
- LLVMValueRef identity =
- get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
- result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
- LLVMTypeOf(identity), "");
- result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, true);
-
- return ac_build_wwm(ctx, result);
-}
-
-LLVMValueRef
-ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
-{
- LLVMValueRef result;
-
- if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
- LLVMBuilderRef builder = ctx->builder;
- src = LLVMBuildZExt(builder, src, ctx->i32, "");
- result = ac_build_ballot(ctx, src);
- result = ac_build_mbcnt(ctx, result);
- return result;
- }
-
- ac_build_optimization_barrier(ctx, &src);
-
- LLVMValueRef identity =
- get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
- result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
- LLVMTypeOf(identity), "");
- result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, false);
-
- return ac_build_wwm(ctx, result);
-}
-
-LLVMValueRef
-ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsigned cluster_size)
-{
- if (cluster_size == 1) return src;
- ac_build_optimization_barrier(ctx, &src);
- LLVMValueRef result, swap;
- LLVMValueRef identity = get_reduction_identity(ctx, op,
- ac_get_type_size(LLVMTypeOf(src)));
- result = LLVMBuildBitCast(ctx->builder,
- ac_build_set_inactive(ctx, src, identity),
- LLVMTypeOf(identity), "");
- swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2);
- result = ac_build_alu_op(ctx, result, swap, op);
- if (cluster_size == 2) return ac_build_wwm(ctx, result);
-
- swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1);
- result = ac_build_alu_op(ctx, result, swap, op);
- if (cluster_size == 4) return ac_build_wwm(ctx, result);
-
- if (ctx->chip_class >= GFX8)
- swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false);
- else
- swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04));
- result = ac_build_alu_op(ctx, result, swap, op);
- if (cluster_size == 8) return ac_build_wwm(ctx, result);
-
- if (ctx->chip_class >= GFX8)
- swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false);
- else
- swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08));
- result = ac_build_alu_op(ctx, result, swap, op);
- if (cluster_size == 16) return ac_build_wwm(ctx, result);
-
- if (ctx->chip_class >= GFX10)
- swap = ac_build_permlane16(ctx, result, 0, true, false);
- else if (ctx->chip_class >= GFX8 && cluster_size != 32)
- swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
- else
- swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10));
- result = ac_build_alu_op(ctx, result, swap, op);
- if (cluster_size == 32) return ac_build_wwm(ctx, result);
-
- if (ctx->chip_class >= GFX8) {
- if (ctx->chip_class >= GFX10)
- swap = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
- else
- swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
- result = ac_build_alu_op(ctx, result, swap, op);
- result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0));
- return ac_build_wwm(ctx, result);
- } else {
- swap = ac_build_readlane(ctx, result, ctx->i32_0);
- result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0));
- result = ac_build_alu_op(ctx, result, swap, op);
- return ac_build_wwm(ctx, result);
- }
+static LLVMValueRef ac_wavefront_shift_right_1(struct ac_llvm_context *ctx, LLVMValueRef src,
+ LLVMValueRef identity, unsigned maxprefix)
+{
+ if (ctx->chip_class >= GFX10) {
+ /* wavefront shift_right by 1 on GFX10 (emulate dpp_wf_sr1) */
+ LLVMValueRef active, tmp1, tmp2;
+ LLVMValueRef tid = ac_get_thread_id(ctx);
+
+ tmp1 = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
+
+ tmp2 = ac_build_permlane16(ctx, src, (uint64_t)~0, true, false);
+
+ if (maxprefix > 32) {
+ active =
+ LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, false), "");
+
+ tmp2 = LLVMBuildSelect(ctx->builder, active,
+ ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, false)),
+ tmp2, "");
+
+ active = LLVMBuildOr(
+ ctx->builder, active,
+ LLVMBuildICmp(ctx->builder, LLVMIntEQ,
+ LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, false), ""),
+ LLVMConstInt(ctx->i32, 0x10, false), ""),
+ "");
+ return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
+ } else if (maxprefix > 16) {
+ active =
+ LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 16, false), "");
+
+ return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
+ }
+ } else if (ctx->chip_class >= GFX8) {
+ return ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false);
+ }
+
+ /* wavefront shift_right by 1 on SI/CI */
+ LLVMValueRef active, tmp1, tmp2;
+ LLVMValueRef tid = ac_get_thread_id(ctx);
+ tmp1 = ac_build_ds_swizzle(ctx, src, (1 << 15) | dpp_quad_perm(0, 0, 1, 2));
+ tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x18, 0x03, 0x00));
+ active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
+ LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x7, 0), ""),
+ LLVMConstInt(ctx->i32, 0x4, 0), "");
+ tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
+ tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x10, 0x07, 0x00));
+ active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
+ LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0xf, 0), ""),
+ LLVMConstInt(ctx->i32, 0x8, 0), "");
+ tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
+ tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x00, 0x0f, 0x00));
+ active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
+ LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, 0), ""),
+ LLVMConstInt(ctx->i32, 0x10, 0), "");
+ tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
+ tmp2 = ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, 0));
+ active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, 0), "");
+ tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
+ active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 0, 0), "");
+ return LLVMBuildSelect(ctx->builder, active, identity, tmp1, "");
+}
+
+/**
+ * \param maxprefix specifies that the result only needs to be correct for a
+ * prefix of this many threads
+ */
+static LLVMValueRef ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src,
+ LLVMValueRef identity, unsigned maxprefix, bool inclusive)
+{
+ LLVMValueRef result, tmp;
+
+ if (!inclusive)
+ src = ac_wavefront_shift_right_1(ctx, src, identity, maxprefix);
+
+ result = src;
+
+ if (ctx->chip_class <= GFX7) {
+ assert(maxprefix == 64);
+ LLVMValueRef tid = ac_get_thread_id(ctx);
+ LLVMValueRef active;
+ tmp = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x1e, 0x00, 0x00));
+ active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
+ LLVMBuildAnd(ctx->builder, tid, ctx->i32_1, ""), ctx->i32_0, "");
+ tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
+ result = ac_build_alu_op(ctx, result, tmp, op);
+ tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1c, 0x01, 0x00));
+ active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
+ LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 2, 0), ""),
+ ctx->i32_0, "");
+ tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
+ result = ac_build_alu_op(ctx, result, tmp, op);
+ tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x18, 0x03, 0x00));
+ active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
+ LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 4, 0), ""),
+ ctx->i32_0, "");
+ tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
+ result = ac_build_alu_op(ctx, result, tmp, op);
+ tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x10, 0x07, 0x00));
+ active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
+ LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 8, 0), ""),
+ ctx->i32_0, "");
+ tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
+ result = ac_build_alu_op(ctx, result, tmp, op);
+ tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x00, 0x0f, 0x00));
+ active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
+ LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, 0), ""),
+ ctx->i32_0, "");
+ tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
+ result = ac_build_alu_op(ctx, result, tmp, op);
+ tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, 0));
+ active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
+ LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 32, 0), ""),
+ ctx->i32_0, "");
+ tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
+ result = ac_build_alu_op(ctx, result, tmp, op);
+ return result;
+ }
+
+ if (maxprefix <= 1)
+ return result;
+ tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
+ result = ac_build_alu_op(ctx, result, tmp, op);
+ if (maxprefix <= 2)
+ return result;
+ tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
+ result = ac_build_alu_op(ctx, result, tmp, op);
+ if (maxprefix <= 3)
+ return result;
+ tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
+ result = ac_build_alu_op(ctx, result, tmp, op);
+ if (maxprefix <= 4)
+ return result;
+ tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
+ result = ac_build_alu_op(ctx, result, tmp, op);
+ if (maxprefix <= 8)
+ return result;
+ tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
+ result = ac_build_alu_op(ctx, result, tmp, op);
+ if (maxprefix <= 16)
+ return result;
+
+ if (ctx->chip_class >= GFX10) {
+ LLVMValueRef tid = ac_get_thread_id(ctx);
+ LLVMValueRef active;
+
+ tmp = ac_build_permlane16(ctx, result, ~(uint64_t)0, true, false);
+
+ active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
+ LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, false), ""),
+ ctx->i32_0, "");
+
+ tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
+
+ result = ac_build_alu_op(ctx, result, tmp, op);
+
+ if (maxprefix <= 32)
+ return result;
+
+ tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
+
+ active = LLVMBuildICmp(ctx->builder, LLVMIntUGE, tid, LLVMConstInt(ctx->i32, 32, false), "");
+
+ tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
+
+ result = ac_build_alu_op(ctx, result, tmp, op);
+ return result;
+ }
+
+ tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
+ result = ac_build_alu_op(ctx, result, tmp, op);
+ if (maxprefix <= 32)
+ return result;
+ tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
+ result = ac_build_alu_op(ctx, result, tmp, op);
+ return result;
+}
+
+LLVMValueRef ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
+{
+ LLVMValueRef result;
+
+ if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
+ LLVMBuilderRef builder = ctx->builder;
+ src = LLVMBuildZExt(builder, src, ctx->i32, "");
+ result = ac_build_ballot(ctx, src);
+ result = ac_build_mbcnt(ctx, result);
+ result = LLVMBuildAdd(builder, result, src, "");
+ return result;
+ }
+
+ ac_build_optimization_barrier(ctx, &src);
+
+ LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
+ result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
+ LLVMTypeOf(identity), "");
+ result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, true);
+
+ return ac_build_wwm(ctx, result);
+}
+
+LLVMValueRef ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
+{
+ LLVMValueRef result;
+
+ if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
+ LLVMBuilderRef builder = ctx->builder;
+ src = LLVMBuildZExt(builder, src, ctx->i32, "");
+ result = ac_build_ballot(ctx, src);
+ result = ac_build_mbcnt(ctx, result);
+ return result;
+ }
+
+ ac_build_optimization_barrier(ctx, &src);
+
+ LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
+ result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
+ LLVMTypeOf(identity), "");
+ result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, false);
+
+ return ac_build_wwm(ctx, result);
+}
+
+LLVMValueRef ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op,
+ unsigned cluster_size)
+{
+ if (cluster_size == 1)
+ return src;
+ ac_build_optimization_barrier(ctx, &src);
+ LLVMValueRef result, swap;
+ LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
+ result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
+ LLVMTypeOf(identity), "");
+ swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2);
+ result = ac_build_alu_op(ctx, result, swap, op);
+ if (cluster_size == 2)
+ return ac_build_wwm(ctx, result);
+
+ swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1);
+ result = ac_build_alu_op(ctx, result, swap, op);
+ if (cluster_size == 4)
+ return ac_build_wwm(ctx, result);
+
+ if (ctx->chip_class >= GFX8)
+ swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false);
+ else
+ swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04));
+ result = ac_build_alu_op(ctx, result, swap, op);
+ if (cluster_size == 8)
+ return ac_build_wwm(ctx, result);
+
+ if (ctx->chip_class >= GFX8)
+ swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false);
+ else
+ swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08));
+ result = ac_build_alu_op(ctx, result, swap, op);
+ if (cluster_size == 16)
+ return ac_build_wwm(ctx, result);
+
+ if (ctx->chip_class >= GFX10)
+ swap = ac_build_permlane16(ctx, result, 0, true, false);
+ else if (ctx->chip_class >= GFX8 && cluster_size != 32)
+ swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
+ else
+ swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10));
+ result = ac_build_alu_op(ctx, result, swap, op);
+ if (cluster_size == 32)
+ return ac_build_wwm(ctx, result);
+
+ if (ctx->chip_class >= GFX8) {
+ if (ctx->wave_size == 64) {
+ if (ctx->chip_class >= GFX10)
+ swap = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
+ else
+ swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
+ result = ac_build_alu_op(ctx, result, swap, op);
+ result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0));
+ }
+
+ return ac_build_wwm(ctx, result);
+ } else {
+ swap = ac_build_readlane(ctx, result, ctx->i32_0);
+ result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0));
+ result = ac_build_alu_op(ctx, result, swap, op);
+ return ac_build_wwm(ctx, result);
+ }