From f4821daed1ec185d16f9ee3cc0951d306ce6e2b9 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Thu, 22 Dec 2016 03:55:17 +0100
Subject: [PATCH] llvmpipe: do transpose/untwiddle after conversion for 8bit
 formats

Generally we should do tranpose after conversion, if the format has less than
32 bits per channel (if it has 32 bits, conversion is going to be a no-op
anyway...). This is obviously because there's less vectors to deal with.
Though the advantage for 16 bit formats isn't that big, and in fact with AVX
there isn't really any (as the 32bit unpacks can be done with 256bit, but
the smaller ones cannot, although that would change again with proper AVX2
support).
Only makes sense for 2d and not 1d cases. And to keep things easy, only handle
1,2 and 4 channels (rgbx is just fine).
For rgba unorm8 format the backend conversion sums up to these instruction
totals (not counting the movs for SSE2 due to 2-op syntax - generally every 2
unpacks need an additional mov).
                     SSE2                    AVX
transpose:           32 unpack               16 unpack
untwiddle:           0                       8 (128bit low/high permutes)
convert:             16 mul + 16 cvt         8 mul + 8 cvt
32->8bit:            12 pack                 8 (128bit extract) + 12 pack

When doing transpose/untwiddle afterwards we get:
convert:             16 mul + 16 cvt         8 mul + 8 cvt
32->8bit:            12 pack                 8 (128bit extract) + 12 pack
transpose/untwiddle  12 unpack               12 unpack

So for SSE2, this drops 20 unpacks (total instruction count 76->56)
whereas for AVX it replaces the 16 256bit unpacks with 8 128bit ones
and drops the 8 lo/hi permutes (in total 60->48). (Albeit to be fair,
the permutes could be dropped even when doing the transpose first,
they are extremely pointless but we'd need to be able to tell
lp_build_conv to reorder the vectors, for AVX2 we're going to need to
be able to tell lp_build_conv about ordering in any case.)

(With different ordering going into conversion, it would be possible
to do 4 unpacks + 4 pshufbs instead of 12 unpacks, but that might not
be better, and not all cpus can do it. Proper AVX2 support should eliminate
the 8 128bit extracts, reduce these 12 packs to 6 and the 12 unpacks to 2
pshufb + 2 permq ideally (+ 2 final 128bit extracts).)

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 src/gallium/drivers/llvmpipe/lp_state_fs.c | 150 ++++++++++++++++++++-
 1 file changed, 143 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 2c0339cad60..af47b5280ce 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -734,6 +734,10 @@ generate_fs_twiddle(struct gallivm_state *gallivm,
       }
    } else if (twiddle) {
       /* Twiddle pixels across elements of array */
+      /*
+       * XXX: we should avoid this in some cases, but would need to tell
+       * lp_build_conv to reorder (or deal with it ourselves).
+       */
       lp_bld_quad_twiddle(gallivm, type, src, src_count, dst);
    } else {
       /* Do nothing */
@@ -764,6 +768,94 @@ generate_fs_twiddle(struct gallivm_state *gallivm,
 }
 
 
+/*
+ * Untwiddle and transpose, much like the above.
+ * However, this is after conversion, so we get packed vectors.
+ * At this time only handle 4x16i8 rgba / 2x16i8 rg / 1x16i8 r data,
+ * the vectors will look like:
+ * r0r1r4r5r2r3r6r7r8r9r12... (albeit color channels may
+ * be swizzled here). Extending to 16bit should be trivial.
+ * Should also be extended to handle twice wide vectors with AVX2...
+ */
+static void
+fs_twiddle_transpose(struct gallivm_state *gallivm,
+                     struct lp_type type,
+                     LLVMValueRef *src,
+                     unsigned src_count,
+                     LLVMValueRef *dst)
+{
+   unsigned i, j;
+   struct lp_type type64, type16, type32;
+   LLVMTypeRef type64_t, type8_t, type16_t, type32_t;
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef tmp[4], shuf[8];
+   for (j = 0; j < 2; j++) {
+      shuf[j*4 + 0] = lp_build_const_int32(gallivm, j*4 + 0);
+      shuf[j*4 + 1] = lp_build_const_int32(gallivm, j*4 + 2);
+      shuf[j*4 + 2] = lp_build_const_int32(gallivm, j*4 + 1);
+      shuf[j*4 + 3] = lp_build_const_int32(gallivm, j*4 + 3);
+   }
+
+   assert(src_count == 4 || src_count == 2 || src_count == 1);
+   assert(type.width == 8);
+   assert(type.length == 16);
+
+   type8_t = lp_build_vec_type(gallivm, type);
+
+   type64 = type;
+   type64.length /= 8;
+   type64.width *= 8;
+   type64_t = lp_build_vec_type(gallivm, type64);
+
+   type16 = type;
+   type16.length /= 2;
+   type16.width *= 2;
+   type16_t = lp_build_vec_type(gallivm, type16);
+
+   type32 = type;
+   type32.length /= 4;
+   type32.width *= 4;
+   type32_t = lp_build_vec_type(gallivm, type32);
+
+   lp_build_transpose_aos_n(gallivm, type, src, src_count, tmp);
+
+   if (src_count == 1) {
+      /* transpose was no-op, just untwiddle */
+      LLVMValueRef shuf_vec;
+      shuf_vec = LLVMConstVector(shuf, 8);
+      tmp[0] = LLVMBuildBitCast(builder, src[0], type16_t, "");
+      tmp[0] = LLVMBuildShuffleVector(builder, tmp[0], tmp[0], shuf_vec, "");
+      dst[0] = LLVMBuildBitCast(builder, tmp[0], type8_t, "");
+   } else if (src_count == 2) {
+      LLVMValueRef shuf_vec;
+      shuf_vec = LLVMConstVector(shuf, 4);
+
+      for (i = 0; i < 2; i++) {
+         tmp[i] = LLVMBuildBitCast(builder, tmp[i], type32_t, "");
+         tmp[i] = LLVMBuildShuffleVector(builder, tmp[i], tmp[i], shuf_vec, "");
+         dst[i] = LLVMBuildBitCast(builder, tmp[i], type8_t, "");
+      }
+   } else {
+      for (j = 0; j < 2; j++) {
+         LLVMValueRef lo, hi, lo2, hi2;
+          /*
+          * Note that if we only really have 3 valid channels (rgb)
+          * and we don't need alpha we could substitute a undef here
+          * for the respective channel (causing llvm to drop conversion
+          * for alpha).
+          */
+         /* we now have rgba0rgba1rgba4rgba5 etc, untwiddle */
+         lo2 = LLVMBuildBitCast(builder, tmp[j*2], type64_t, "");
+         hi2 = LLVMBuildBitCast(builder, tmp[j*2 + 1], type64_t, "");
+         lo = lp_build_interleave2(gallivm, type64, lo2, hi2, 0);
+         hi = lp_build_interleave2(gallivm, type64, lo2, hi2, 1);
+         dst[j*2] = LLVMBuildBitCast(builder, lo, type8_t, "");
+         dst[j*2 + 1] = LLVMBuildBitCast(builder, hi, type8_t, "");
+      }
+   }
+}
+
+
 /**
  * Load an unswizzled block of pixels from memory
  */
@@ -1656,6 +1748,7 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
                                      util_blend_state_is_dual(&variant->key.blend, 0);
 
    const boolean is_1d = variant->key.resource_1d;
+   boolean twiddle_after_convert = FALSE;
    unsigned num_fullblock_fs = is_1d ? 2 * num_fs : num_fs;
    LLVMValueRef fpstate = 0;
 
@@ -1872,14 +1965,45 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
       }
    }
 
+   /*
+    * We actually should generally do conversion first (for non-1d cases)
+    * when the blend format is 8 or 16 bits. The reason is obvious,
+    * there's 2 or 4 times less vectors to deal with for the interleave...
+    * Albeit for the AVX (not AVX2) case there's no benefit with 16 bit
+    * vectors (as it can do 32bit unpack with 256bit vectors, but 8/16bit
+    * unpack only with 128bit vectors).
+    * Note: for 16bit sizes really need matching pack conversion code
+    */
+   if (!is_1d && dst_channels != 3 && dst_type.width == 8) {
+      twiddle_after_convert = TRUE;
+   }
+
    /*
     * Pixel twiddle from fragment shader order to memory order
     */
-   src_count = generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs,
-                                   dst_channels, fs_src, src, pad_inline);
-   if (dual_source_blend) {
-      generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, dst_channels,
-                          fs_src1, src1, pad_inline);
+   if (!twiddle_after_convert) {
+      src_count = generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs,
+                                      dst_channels, fs_src, src, pad_inline);
+      if (dual_source_blend) {
+         generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, dst_channels,
+                             fs_src1, src1, pad_inline);
+      }
+   } else {
+      src_count = num_fullblock_fs * dst_channels;
+      /*
+       * We reorder things a bit here, so the cases for 4-wide and 8-wide
+       * (AVX) turn out the same later when untwiddling/transpose (albeit
+       * for true AVX2 path untwiddle needs to be different).
+       * For now just order by colors first (so we can use unpack later).
+       */
+      for (j = 0; j < num_fullblock_fs; j++) {
+         for (i = 0; i < dst_channels; i++) {
+            src[i*num_fullblock_fs + j] = fs_src[j][i];
+            if (dual_source_blend) {
+               src1[i*num_fullblock_fs + j] = fs_src1[j][i];
+            }
+         }
+      }
    }
 
    src_channels = dst_channels < 3 ? dst_channels : 4;
@@ -1923,6 +2047,12 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
       assert(bits == 128 || bits == 256);
    }
 
+   if (twiddle_after_convert) {
+      fs_twiddle_transpose(gallivm, row_type, src, src_count, src);
+      if (dual_source_blend) {
+         fs_twiddle_transpose(gallivm, row_type, src1, src_count, src1);
+      }
+   }
 
    /*
     * Blend Colour conversion
@@ -2008,13 +2138,19 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
          mask_type.length = pixels;
          mask_type.width = row_type.width * dst_channels;
 
-         src_mask[i] = LLVMBuildIntCast(builder, src_mask[i], lp_build_int_vec_type(gallivm, mask_type), "");
+         /*
+          * If mask_type width is smaller than 32bit, this doesn't quite
+          * generate the most efficient code (could use some pack).
+          */
+         src_mask[i] = LLVMBuildIntCast(builder, src_mask[i],
+                                        lp_build_int_vec_type(gallivm, mask_type), "");
 
          mask_type.length *= dst_channels;
          mask_type.width /= dst_channels;
       }
 
-      src_mask[i] = LLVMBuildBitCast(builder, src_mask[i], lp_build_int_vec_type(gallivm, mask_type), "");
+      src_mask[i] = LLVMBuildBitCast(builder, src_mask[i],
+                                     lp_build_int_vec_type(gallivm, mask_type), "");
       src_mask[i] = lp_build_pad_vector(gallivm, src_mask[i], row_type.length);
    }
 
-- 
2.30.2