ac: Use DPP for build_ddxy where possible.
author Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Wed, 23 May 2018 09:34:15 +0000 (11:34 +0200)
committer Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Wed, 23 May 2018 19:02:45 +0000 (21:02 +0200)
WQM is pretty reliable now on LLVM 7, so let us just use
DPP + WQM.

This gives approximately a 1.5% performance increase on the
vrcompositor built-in benchmark.

v2: Use ac_build_quad_swizzle.

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
src/amd/common/ac_llvm_build.c

index 36c1d62637b15a87d3f7a8b453af3a38cb30a74c..4eebbbd4d9d0c55021ad9dd72db8cec8693151b8 100644 (file)
@@ -1170,7 +1170,21 @@ ac_build_ddxy(struct ac_llvm_context *ctx,
        LLVMValueRef tl, trbl, args[2];
        LLVMValueRef result;
 
-       if (ctx->chip_class >= VI) {
+       if (HAVE_LLVM >= 0x0700) {
+               unsigned tl_lanes[4], trbl_lanes[4];
+
+               for (unsigned i = 0; i < 4; ++i) {
+                       tl_lanes[i] = i & mask;
+                       trbl_lanes[i] = (i & mask) + idx;
+               }
+
+               tl = ac_build_quad_swizzle(ctx, val,
+                                          tl_lanes[0], tl_lanes[1],
+                                          tl_lanes[2], tl_lanes[3]);
+               trbl = ac_build_quad_swizzle(ctx, val,
+                                            trbl_lanes[0], trbl_lanes[1],
+                                            trbl_lanes[2], trbl_lanes[3]);
+       } else if (ctx->chip_class >= VI) {
                LLVMValueRef thread_id, tl_tid, trbl_tid;
                thread_id = ac_get_thread_id(ctx);