From bc86e829a5c87714a7f3798fe9096c75692e5157 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger
Date: Wed, 21 Dec 2016 04:55:34 +0100
Subject: [PATCH] gallivm: optimize lp_build_unpack_arith_rgba_aos slightly

This code uses a vector shift which has to be emulated on x86 unless
there's AVX2. Luckily, in some cases we can actually avoid the shift
altogether, so do that.
Also make sure we hit the fast lp_build_conv() path when applicable,
albeit that's quite the hack...
That said, this path is taken for AoS sampling of small unorm (smaller
than rgba8) formats, and it is completely hopeless even with those
changes, with or without AVX. (We probably should have some code
similar to that in the llvmpipe fs backend, using bit replication to
extend to rgba8888 - the rounding is not quite 100% accurate, but if
it's good enough there it should be good enough here as well.)

Reviewed-by: Jose Fonseca
---
 .../auxiliary/gallivm/lp_bld_format_aos.c     | 116 +++++++++++++++---
 1 file changed, 97 insertions(+), 19 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
index 322e7b817db..574bb64c917 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -38,6 +38,7 @@
 #include "util/u_math.h"
 #include "util/u_pointer.h"
 #include "util/u_string.h"
+#include "util/u_cpu_detect.h"
 
 #include "lp_bld_arit.h"
 #include "lp_bld_init.h"
@@ -49,6 +50,7 @@
 #include "lp_bld_gather.h"
 #include "lp_bld_debug.h"
 #include "lp_bld_format.h"
+#include "lp_bld_pack.h"
 #include "lp_bld_intr.h"
@@ -156,6 +158,7 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
    LLVMValueRef shifts[4];
    LLVMValueRef masks[4];
    LLVMValueRef scales[4];
+   LLVMTypeRef vec32_type;
 
    boolean normalized;
    boolean needs_uitofp;
@@ -171,19 +174,17 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
     * matches floating point size */
    assert (LLVMTypeOf(packed) == LLVMInt32TypeInContext(gallivm->context));
 
+   vec32_type = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4);
+
    /* Broadcast the packed value to all four channels
     * before: packed = BGRA
     * after: packed = {BGRA, BGRA, BGRA, BGRA}
     */
-   packed = LLVMBuildInsertElement(builder,
-                                   LLVMGetUndef(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4)),
-                                   packed,
+   packed = LLVMBuildInsertElement(builder, LLVMGetUndef(vec32_type), packed,
                                    LLVMConstNull(LLVMInt32TypeInContext(gallivm->context)),
                                    "");
-   packed = LLVMBuildShuffleVector(builder,
-                                   packed,
-                                   LLVMGetUndef(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4)),
-                                   LLVMConstNull(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4)),
+   packed = LLVMBuildShuffleVector(builder, packed, LLVMGetUndef(vec32_type),
+                                   LLVMConstNull(vec32_type),
                                    "");
 
    /* Initialize vector constants */
@@ -224,9 +225,40 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
    /* Ex: convert packed = {XYZW, XYZW, XYZW, XYZW}
     * into masked = {X, Y, Z, W}
     */
-   /* Note: we cannot do this shift on x86 natively until AVX2. */
-   shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
-   masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
+   if (desc->block.bits < 32 && normalized) {
+      /*
+       * Note: we cannot do the shift below on x86 natively until AVX2.
+       *
+       * Old llvm versions will resort to scalar extract/shift/insert,
+       * which is definitely terrible; new versions will just do
+       * several vector shifts and shuffle/blend the results together.
+       * We could turn this into a variable left shift plus a constant
+       * right shift, and llvm would then turn the variable left shift
+       * into a mul for us (albeit without sse41 the mul needs emulation
+       * too...). However, since we're going to do a float mul anyway,
+       * we just adjust that mul instead (plus the mask), skipping the
+       * shift completely.
+       * We could also use an extra mul when the format isn't normalized
+       * and we don't have AVX2 support, but don't bother for now.
+       * Unfortunately, this strategy doesn't work for 32bit formats
+       * (such as rgb10a2 or even rgba8 if it ends up here), as that
+       * would require UIToFP, albeit that would be fixable with an easy
+       * 16bit shuffle (unless there are channels crossing 16bit
+       * boundaries).
+       */
+      for (i = 0; i < 4; ++i) {
+         if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
+            unsigned bits = desc->channel[i].size;
+            unsigned shift = desc->channel[i].shift;
+            unsigned long long mask = ((1ULL << bits) - 1) << shift;
+            scales[i] = lp_build_const_float(gallivm, 1.0 / mask);
+            masks[i] = lp_build_const_int32(gallivm, mask);
+         }
+      }
+      masked = LLVMBuildAnd(builder, packed, LLVMConstVector(masks, 4), "");
+   } else {
+      shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
+      masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
+   }
 
    if (!needs_uitofp) {
       /* UIToFP can't be expressed in SSE2 */
@@ -235,8 +267,10 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
       casted = LLVMBuildUIToFP(builder, masked,
                LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), "");
    }
 
-   /* At this point 'casted' may be a vector of floats such as
-    * {255.0, 255.0, 255.0, 255.0}. Next, if the pixel values are normalized
+   /*
+    * At this point 'casted' may be a vector of floats such as
+    * {255.0, 255.0, 255.0, 255.0}. (Normalized values may be multiplied
+    * by powers of two.) Next, if the pixel values are normalized
     * we'll scale this to {1.0, 1.0, 1.0, 1.0}.
     */
@@ -392,6 +426,7 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
 
    if (format_matches_type(format_desc, type) &&
        format_desc->block.bits <= type.width * 4 &&
+       /* XXX this shouldn't be needed */
       util_is_power_of_two(format_desc->block.bits)) {
      LLVMValueRef packed;
      LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, type);
@@ -424,6 +459,7 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
       format_desc->block.width == 1 &&
       format_desc->block.height == 1 &&
+      /* XXX this shouldn't be needed */
       util_is_power_of_two(format_desc->block.bits) &&
       format_desc->block.bits <= 32 &&
       format_desc->is_bitmask &&
@@ -433,8 +469,24 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
       !format_desc->channel[0].pure_integer) {
 
       LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4];
-      LLVMValueRef res;
-      unsigned k;
+      LLVMValueRef res[LP_MAX_VECTOR_WIDTH / 128];
+      struct lp_type conv_type;
+      unsigned k, num_conv_src, num_conv_dst;
+
+      /*
+       * XXX: We end up here for AoS unorm8 sampling (if the format wasn't
+       * some 888(8) variant), so things like rgb565. This is _really_
+       * suboptimal. Not only do we handle a single pixel at a time, but
+       * we convert to float, do a normalize mul, an un-normalize mul,
+       * convert back to int, and finally pack down to 8 bits. At the end,
+       * throw in a couple of shifts/ands/ors for the aos swizzle (rgb565
+       * is ok, but bgrx5551 isn't, for instance) for good measure. (And
+       * if we're not extra careful, we get some pointless min/max too for
+       * clamping values to range.)
+       * This is a disaster of epic proportions; simply forcing SoA
+       * sampling would be way faster (even when we don't have AVX
+       * support).
+       * We should make sure we cannot hit this code path for anything
+       * but single pixels.
+       */
 
       /*
       * Unpack a pixel at a time into a <4 x float> RGBA vector
@@ -464,12 +516,38 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
                         __FUNCTION__, format_desc->short_name);
      }
 
-      lp_build_conv(gallivm,
-                    lp_float32_vec4_type(),
-                    type,
-                    tmps, num_pixels, &res, 1);
+      conv_type = lp_float32_vec4_type();
+      num_conv_src = num_pixels;
+      num_conv_dst = 1;
+
+      if (num_pixels % 8 == 0) {
+         lp_build_concat_n(gallivm, lp_float32_vec4_type(),
+                           tmps, num_pixels, tmps, num_pixels / 2);
+         conv_type.length *= num_pixels / 4;
+         num_conv_src = 4 * num_pixels / 8;
+         if (type.width == 8 && type.floating == 0 && type.fixed == 0) {
+            /*
+             * FIXME: The fast float->unorm path (which is basically
+             * skipping the MIN/MAX, which are extremely pointless in any
+             * case) requires that there are 2 destinations...
+             * In any case, we really should make sure we don't hit this
+             * code with multiple pixels for unorm8 dst types; it's
+             * completely hopeless even if we do hit the right conversion.
+             */
+            type.length /= num_pixels / 4;
+            num_conv_dst = num_pixels / 4;
+         }
+      }
+
+      lp_build_conv(gallivm, conv_type, type,
+                    tmps, num_conv_src, res, num_conv_dst);
+
+      if (num_pixels % 8 == 0 &&
+          (type.width == 8 && type.floating == 0 && type.fixed == 0)) {
+         lp_build_concat_n(gallivm, type, res, num_conv_dst, res, 1);
+      }
 
-      return lp_build_format_swizzle_aos(format_desc, &bld, res);
+      return lp_build_format_swizzle_aos(format_desc, &bld, res[0]);
    }
 
    /* If all channels are of same type and we are not using half-floats */
-- 
2.30.2
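
For illustration only, not part of the patch: the shift-skipping trick is
easy to verify standalone. Masking a channel in place and folding the shift
into the float scale gives the same value as shifting first, because the
scale constant is just divided by the same power of two the shift would have
removed. A minimal C sketch follows (the function names are made up for the
example; the patch builds the equivalent LLVM IR and computes the scale as
1.0 / mask in double):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Old path: shift the channel down, mask it, normalize by its maximum. */
static float unpack_shift_then_scale(uint32_t packed, unsigned shift,
                                     unsigned bits)
{
   uint32_t chan_max = (1u << bits) - 1;
   return (float)((packed >> shift) & chan_max) * (1.0f / chan_max);
}

/* New path: mask in place, fold the shift into the scale constant. */
static float unpack_mask_then_scale(uint32_t packed, unsigned shift,
                                    unsigned bits)
{
   uint32_t mask = ((1u << bits) - 1) << shift;
   return (float)(packed & mask) * (1.0f / mask);
}

int main(void)
{
   uint32_t texel = (31u << 11) | (17u << 5) | 5u; /* b5g6r5: r=31 g=17 b=5 */
   unsigned shifts[3] = { 11, 5, 0 };
   unsigned bits[3] = { 5, 6, 5 };
   for (unsigned i = 0; i < 3; i++) {
      float a = unpack_shift_then_scale(texel, shifts[i], bits[i]);
      float b = unpack_mask_then_scale(texel, shifts[i], bits[i]);
      printf("channel %u: %f %f\n", i, a, b);
      /* 1/(chan_max << shift) is 1/chan_max scaled by an exact power of
       * two, so both paths round identically. */
      assert(a == b);
   }
   return 0;
}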
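
Also for illustration: the bit replication the commit message points to (as
used in the llvmpipe fs backend) widens an n-bit unorm channel to 8 bits by
copying its top bits into the freed-up low bits, so 0 stays 0 and the channel
maximum becomes 255 with no float math at all. This is a generic sketch of
the idea, not the actual backend code; one copy suffices for channels of 4 or
more bits, narrower ones would need a further replication step:

#include <stdint.h>
#include <stdio.h>

static uint8_t replicate_to_unorm8(uint32_t v, unsigned bits)
{
   v <<= 8 - bits;                    /* move channel to the top of the byte */
   return (uint8_t)(v | (v >> bits)); /* fill low bits with a copy of the top */
}

int main(void)
{
   /* 5-bit: 0 -> 0 and 31 -> 255 exactly; 3 -> 24 where exact rounding
    * would give 25 - the slight inaccuracy the commit message mentions. */
   printf("%u %u %u %u\n",
          replicate_to_unorm8(0, 5), replicate_to_unorm8(3, 5),
          replicate_to_unorm8(31, 5), replicate_to_unorm8(63, 6));
   return 0;
}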
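
Finally, the shape juggling around the lp_build_conv() call is easier to
follow with concrete numbers. The sketch below mirrors the patch's arithmetic
for num_pixels == 8 with an unorm8 destination (where type.length starts out
as 4 * num_pixels); plain integers only, no gallivm calls, so the values can
be checked by hand:

#include <stdio.h>

int main(void)
{
   unsigned num_pixels = 8;
   unsigned conv_length = 4;           /* lp_float32_vec4_type(): <4 x float> */
   unsigned num_conv_src = num_pixels; /* one vec4 per unpacked pixel */
   unsigned num_conv_dst = 1;
   unsigned dst_length = 4 * num_pixels; /* 32 unorm8 channels overall */

   if (num_pixels % 8 == 0) {
      /* lp_build_concat_n() merges pairs of vec4s into vec8s... */
      conv_length *= num_pixels / 4;     /* 4 -> 8 floats per source */
      num_conv_src = 4 * num_pixels / 8; /* 8 vec4s -> 4 vec8s */
      /* ...and the destination is split so lp_build_conv() can take its
       * fast float->unorm path, which wants 2 destinations. */
      dst_length /= num_pixels / 4;
      num_conv_dst = num_pixels / 4;
   }

   /* Prints: src: 4 x <8 x float>, dst: 2 x <16 x unorm8> -- 32 channels
    * on both sides, as required. */
   printf("src: %u x <%u x float>, dst: %u x <%u x unorm8>\n",
          num_conv_src, conv_length, num_conv_dst, dst_length);
   return 0;
}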