radv/gfx10: fix implementation of exclusive scans
authorSamuel Pitoiset <samuel.pitoiset@gmail.com>
Fri, 23 Aug 2019 15:53:05 +0000 (17:53 +0200)
committerSamuel Pitoiset <samuel.pitoiset@gmail.com>
Wed, 27 Nov 2019 06:39:26 +0000 (07:39 +0100)
This implementation is loosely based on ROCm.
https://github.com/RadeonOpenCompute/ROCm-Device-Libs/blob/master/ockl/src/wfredscan.cl

This fixes dEQP-VK.subgroups.arithmetic.*.subgroupexclusive* on GFX10.

Fixes: 227c29a80de ("amd/common/gfx10: implement scan & reduce operations")
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
src/amd/llvm/ac_llvm_build.c

index 1cd687366d724a06d793cc081b8b743a8b022f28..54513d79922baf9cb55cb18aade6b235a16dfb32 100644 (file)
@@ -4049,7 +4049,37 @@ ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValu
        if (inclusive) {
                result = src;
        } else if (ctx->chip_class >= GFX10) {
-               result = identity;
+               /* wavefront shift_right by 1 on GFX10 (emulate dpp_wf_sr1) */
+               LLVMValueRef active, tmp1, tmp2;
+               LLVMValueRef tid = ac_get_thread_id(ctx);
+
+               tmp1 = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
+
+               tmp2 = ac_build_permlane16(ctx, src, (uint64_t)~0, true, false);
+
+               if (maxprefix > 32) {
+                       active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid,
+                                              LLVMConstInt(ctx->i32, 32, false), "");
+
+                       tmp2 = LLVMBuildSelect(ctx->builder, active,
+                                              ac_build_readlane(ctx, src,
+                                                                LLVMConstInt(ctx->i32, 31, false)),
+                                              tmp2, "");
+
+                       active = LLVMBuildOr(ctx->builder, active,
+                                            LLVMBuildICmp(ctx->builder, LLVMIntEQ,
+                                                          LLVMBuildAnd(ctx->builder, tid,
+                                                                       LLVMConstInt(ctx->i32, 0x1f, false), ""),
+                                                          LLVMConstInt(ctx->i32, 0x10, false), ""), "");
+                       src = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
+               } else if (maxprefix > 16) {
+                       active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid,
+                                              LLVMConstInt(ctx->i32, 16, false), "");
+
+                       src = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
+               }
+
+               result = src;
        } else if (ctx->chip_class >= GFX8) {
                src = ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false);
                result = src;
@@ -4148,33 +4178,31 @@ ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValu
                return result;
 
        if (ctx->chip_class >= GFX10) {
-               /* dpp_row_bcast{15,31} are not supported on gfx10. */
-               LLVMBuilderRef builder = ctx->builder;
                LLVMValueRef tid = ac_get_thread_id(ctx);
-               LLVMValueRef cc;
-               /* TODO-GFX10: Can we get better code-gen by putting this into
-                * a branch so that LLVM generates EXEC mask manipulations? */
-               if (inclusive)
-                       tmp = result;
-               else
-                       tmp = ac_build_alu_op(ctx, result, src, op);
-               tmp = ac_build_permlane16(ctx, tmp, ~(uint64_t)0, true, false);
-               tmp = ac_build_alu_op(ctx, result, tmp, op);
-               cc = LLVMBuildAnd(builder, tid, LLVMConstInt(ctx->i32, 16, false), "");
-               cc = LLVMBuildICmp(builder, LLVMIntNE, cc, ctx->i32_0, "");
-               result = LLVMBuildSelect(builder, cc, tmp, result, "");
+               LLVMValueRef active;
+
+               tmp = ac_build_permlane16(ctx, result, ~(uint64_t)0, true, false);
+
+               active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
+                                      LLVMBuildAnd(ctx->builder, tid,
+                                                   LLVMConstInt(ctx->i32, 16, false), ""),
+                                      ctx->i32_0, "");
+
+               tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
+
+               result = ac_build_alu_op(ctx, result, tmp, op);
+
                if (maxprefix <= 32)
                        return result;
 
-               if (inclusive)
-                       tmp = result;
-               else
-                       tmp = ac_build_alu_op(ctx, result, src, op);
-               tmp = ac_build_readlane(ctx, tmp, LLVMConstInt(ctx->i32, 31, false));
-               tmp = ac_build_alu_op(ctx, result, tmp, op);
-               cc = LLVMBuildICmp(builder, LLVMIntUGE, tid,
-                                  LLVMConstInt(ctx->i32, 32, false), "");
-               result = LLVMBuildSelect(builder, cc, tmp, result, "");
+               tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
+
+               active = LLVMBuildICmp(ctx->builder, LLVMIntUGE, tid,
+                                      LLVMConstInt(ctx->i32, 32, false), "");
+
+               tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
+
+               result = ac_build_alu_op(ctx, result, tmp, op);
                return result;
        }