radv/llvm: fix subgroup shuffle for chips without bpermute
author: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Mon, 23 Mar 2020 11:02:15 +0000 (12:02 +0100)
committer: Marge Bot <eric+marge@anholt.net>
Mon, 23 Mar 2020 14:19:03 +0000 (14:19 +0000)
bpermute only exists on GFX8+, and on GFX10 only in Wave32 mode. On
every other configuration we instead have to use readlane inside a
waterfall loop to defeat the LLVM backend.

This fixes DOOM Eternal which requires subgroup shuffle.

Cc: <mesa-stable@lists.freedesktop.org>
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Tested-by: Marge Bot <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4284>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4284>

src/amd/llvm/ac_nir_to_llvm.c
src/amd/vulkan/radv_device.c

index 49627990163e27ca4daeaa468b3d439fbe5dedf0..871c6abc17fb4c5b8afe4de521c84ad532dbcb0b 100644 (file)
@@ -3950,8 +3950,33 @@ static void visit_intrinsic(struct ac_nir_context *ctx,
                break;
        }
        case nir_intrinsic_shuffle:
-               result = ac_build_shuffle(&ctx->ac, get_src(ctx, instr->src[0]),
-                               get_src(ctx, instr->src[1]));
+               if (ctx->ac.chip_class == GFX8 ||
+                   ctx->ac.chip_class == GFX9 ||
+                   (ctx->ac.chip_class == GFX10 && ctx->ac.wave_size == 32)) {
+                       result = ac_build_shuffle(&ctx->ac, get_src(ctx, instr->src[0]),
+                                                 get_src(ctx, instr->src[1]));
+               } else {
+                       LLVMValueRef src = get_src(ctx, instr->src[0]);
+                       LLVMValueRef index = get_src(ctx, instr->src[1]);
+                       LLVMTypeRef type = LLVMTypeOf(src);
+                       struct waterfall_context wctx;
+                       LLVMValueRef index_val;
+
+                       index_val = enter_waterfall(ctx, &wctx, index, true);
+
+                       src = LLVMBuildZExt(ctx->ac.builder, src,
+                                           ctx->ac.i32, "");
+
+                       result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.readlane",
+                                                   ctx->ac.i32,
+                                                   (LLVMValueRef []) { src, index_val }, 2,
+                                                   AC_FUNC_ATTR_READNONE |
+                                                   AC_FUNC_ATTR_CONVERGENT);
+
+                       result = LLVMBuildTrunc(ctx->ac.builder, result, type, "");
+
+                       result = exit_waterfall(ctx, &wctx, result);
+               }
                break;
        case nir_intrinsic_reduce:
                result = ac_build_reduce(&ctx->ac,
index 2f44b279ac18f72327b053e8cb5f084cf656583d..1ecac0c1b553c369e489a963fcff339e620caaed 100644 (file)
@@ -1481,9 +1481,9 @@ radv_get_physical_device_properties_1_1(struct radv_physical_device *pdevice,
                                         VK_SUBGROUP_FEATURE_CLUSTERED_BIT |
                                         VK_SUBGROUP_FEATURE_QUAD_BIT;
 
-       if (pdevice->rad_info.chip_class == GFX8 ||
-           pdevice->rad_info.chip_class == GFX9 ||
-           (pdevice->rad_info.chip_class == GFX10 && pdevice->use_aco)) {
+       if (((pdevice->rad_info.chip_class == GFX6 ||
+             pdevice->rad_info.chip_class == GFX7) && !pdevice->use_aco) ||
+           pdevice->rad_info.chip_class >= GFX8) {
                p->subgroupSupportedOperations |= VK_SUBGROUP_FEATURE_SHUFFLE_BIT |
                                                  VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT;
        }