#include "util/u_math.h"
#include "util/u_pointer.h"
#include "util/u_string.h"
+#include "util/u_cpu_detect.h"
#include "lp_bld_arit.h"
#include "lp_bld_init.h"
#include "lp_bld_gather.h"
#include "lp_bld_debug.h"
#include "lp_bld_format.h"
+#include "lp_bld_pack.h"
#include "lp_bld_intr.h"
LLVMValueRef shifts[4];
LLVMValueRef masks[4];
LLVMValueRef scales[4];
+ LLVMTypeRef vec32_type;
boolean normalized;
boolean needs_uitofp;
* matches floating point size */
assert (LLVMTypeOf(packed) == LLVMInt32TypeInContext(gallivm->context));
+ vec32_type = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4);
+
/* Broadcast the packed value to all four channels
* before: packed = BGRA
* after: packed = {BGRA, BGRA, BGRA, BGRA}
*/
- packed = LLVMBuildInsertElement(builder,
- LLVMGetUndef(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4)),
- packed,
+ packed = LLVMBuildInsertElement(builder, LLVMGetUndef(vec32_type), packed,
LLVMConstNull(LLVMInt32TypeInContext(gallivm->context)),
"");
- packed = LLVMBuildShuffleVector(builder,
- packed,
- LLVMGetUndef(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4)),
- LLVMConstNull(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4)),
+ packed = LLVMBuildShuffleVector(builder, packed, LLVMGetUndef(vec32_type),
+ LLVMConstNull(vec32_type),
"");
/* Initialize vector constants */
/* Ex: convert packed = {XYZW, XYZW, XYZW, XYZW}
* into masked = {X, Y, Z, W}
*/
- /* Note: we cannot do this shift on x86 natively until AVX2. */
- shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
- masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
+ if (desc->block.bits < 32 && normalized) {
+ /*
+ * Note: we cannot do the shift below on x86 natively until AVX2.
+ *
+ * Old llvm versions will resort to scalar extract/shift insert,
+ * which is definitely terrible, new versions will just do
+ * several vector shifts and shuffle/blend results together.
+ * We could turn this into a variable left shift plus a constant
+ * right shift, and llvm would then turn the variable left shift
+ * into a mul for us (albeit without sse41 the mul needs emulation
+ * too...). However, since we're going to do a float mul
+ * anyway, we just adjust that mul instead (plus the mask), skipping
+ * the shift completely.
+ * We could also use an extra mul when the format isn't normalized and
+ * we don't have AVX2 support, but don't bother for now. Unfortunately,
+ * this strategy doesn't work for 32bit formats (such as rgb10a2 or even
+ * rgba8 if it ends up here), as that would require UIToFP, albeit that
+ * would be fixable with an easy 16bit shuffle (unless there are channels
+ * crossing 16bit boundaries).
+ */
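+ /*
+ * E.g. for rgb565's green channel (bits == 6, shift == 5) this gives
+ * mask == ((1ULL << 6) - 1) << 5 == 0x7e0 and a scale of 1.0/0x7e0,
+ * so (packed & 0x7e0) * scale is already the normalized value, with
+ * no shift required.
+ */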
+ for (i = 0; i < 4; ++i) {
+ if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
+ unsigned bits = desc->channel[i].size;
+ unsigned shift = desc->channel[i].shift;
+ unsigned long long mask = ((1ULL << bits) - 1) << shift;
+ scales[i] = lp_build_const_float(gallivm, 1.0 / mask);
+ masks[i] = lp_build_const_int32(gallivm, mask);
+ }
+ }
+ masked = LLVMBuildAnd(builder, packed, LLVMConstVector(masks, 4), "");
+ } else {
+ shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
+ masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
+ }
if (!needs_uitofp) {
/* UIToFP can't be expressed in SSE2, but the masked values fit in
* 31 bits here, so a signed conversion gives the same result */
casted = LLVMBuildSIToFP(builder, masked, LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), "");
} else {
casted = LLVMBuildUIToFP(builder, masked, LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), "");
}
- /* At this point 'casted' may be a vector of floats such as
- * {255.0, 255.0, 255.0, 255.0}. Next, if the pixel values are normalized
+ /*
+ * At this point 'casted' may be a vector of floats such as
+ * {255.0, 255.0, 255.0, 255.0}. (In the normalized path above the mask
+ * keeps the channel shift, so values may additionally be scaled by
+ * powers of two.) Next, if the pixel values are normalized
* we'll scale this to {1.0, 1.0, 1.0, 1.0}.
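+ * (E.g. plain rgba8 gives scales of 1/255 here, so 255.0 maps to 1.0,
+ * while the masked rgb565 case above maps its shifted channel ranges
+ * to 1.0 via the adjusted scales.)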
*/
if (format_matches_type(format_desc, type) &&
format_desc->block.bits <= type.width * 4 &&
+ /* XXX this shouldn't be needed */
util_is_power_of_two(format_desc->block.bits)) {
LLVMValueRef packed;
LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, type);
format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
format_desc->block.width == 1 &&
format_desc->block.height == 1 &&
+ /* XXX this shouldn't be needed */
util_is_power_of_two(format_desc->block.bits) &&
format_desc->block.bits <= 32 &&
format_desc->is_bitmask &&
!format_desc->channel[0].pure_integer) {
LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4];
- LLVMValueRef res;
- unsigned k;
+ LLVMValueRef res[LP_MAX_VECTOR_WIDTH / 128];
+ struct lp_type conv_type;
+ unsigned k, num_conv_src, num_conv_dst;
+
+ /*
+ * XXX: We end up here for the AoS unorm8 sampling (if the format wasn't some
+ * 888(8) variant), so things like rgb565. This is _really_ suboptimal.
+ * Not only do we handle a single pixel at a time, but we convert to float,
+ * do a normalize mul, an un-normalize mul, convert back to int, and finally
+ * pack down to 8 bits. At the end we throw in a couple of shifts/ands/ors
+ * for the aos swizzle (well, rgb565 is ok, but bgrx5551 isn't, for
+ * instance) for good measure. (And if we're not extra careful we also get
+ * some pointless min/max for clamping values to range.) This is a disaster
+ * of epic proportions; simply forcing SoA sampling would be way faster
+ * (even when we don't have AVX support).
+ * We should make sure we cannot hit this code path for anything but single
+ * pixels.
+ */
/*
* Unpack a pixel at a time into a <4 x float> RGBA vector
*/
__FUNCTION__, format_desc->short_name);
}
- lp_build_conv(gallivm,
- lp_float32_vec4_type(),
- type,
- tmps, num_pixels, &res, 1);
+ conv_type = lp_float32_vec4_type();
+ num_conv_src = num_pixels;
+ num_conv_dst = 1;
+
+ if (num_pixels % 8 == 0) {
+ lp_build_concat_n(gallivm, lp_float32_vec4_type(),
+ tmps, num_pixels, tmps, num_pixels / 2);
+ conv_type.length *= num_pixels / 4;
+ num_conv_src = 4 * num_pixels / 8;
+ if (type.width == 8 && type.floating == 0 && type.fixed == 0) {
+ /*
+ * FIXME: The fast float->unorm path (which is basically
+ * skipping the MIN/MAX which are extremely pointless in any
+ * case) requires that there's 2 destinations...
+ * In any case, we really should make sure we don't hit this
+ * code with multiple pixels for unorm8 dst types, it's
+ * completely hopeless even if we do hit the right conversion.
+ */
+ type.length /= num_pixels / 4;
+ num_conv_dst = num_pixels / 4;
+ }
+ }
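+
+ /*
+ * E.g. for num_pixels == 8: the eight <4 x float> pixels get
+ * concatenated into four <8 x float> vectors (num_conv_src == 4), and
+ * for a unorm8 destination type.length is halved so num_conv_dst == 2,
+ * which is the two-destination shape the fast float->unorm path wants;
+ * the two results are concatenated back into one below.
+ */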
+
+ lp_build_conv(gallivm, conv_type, type,
+ tmps, num_conv_src, res, num_conv_dst);
+
+ if (num_pixels % 8 == 0 &&
+ (type.width == 8 && type.floating == 0 && type.fixed == 0)) {
+ lp_build_concat_n(gallivm, type, res, num_conv_dst, res, 1);
+ }
- return lp_build_format_swizzle_aos(format_desc, &bld, res);
+ return lp_build_format_swizzle_aos(format_desc, &bld, res[0]);
}
/* If all channels are of same type and we are not using half-floats */