gallivm: optimize gather a bit, by using supplied destination type

author Roland Scheidegger <sroland@vmware.com>

Sun, 11 Dec 2016 22:39:22 +0000 (23:39 +0100)

committer Roland Scheidegger <sroland@vmware.com>

Wed, 21 Dec 2016 03:48:24 +0000 (04:48 +0100)
author Roland Scheidegger <sroland@vmware.com>
Sun, 11 Dec 2016 22:39:22 +0000 (23:39 +0100)
committer Roland Scheidegger <sroland@vmware.com>
Wed, 21 Dec 2016 03:48:24 +0000 (04:48 +0100)
diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c

index c5485728e42824ea40905a1fc0bdd564337f3882..19b75a5003bfd21c579e12f090d666372ad10c8f 100644 (file)
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -1864,7 +1864,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
                                         LLVMPointerType(LLVMInt8TypeInContext(context),
                                                         0), "");
           tmp = lp_build_gather(gallivm, vs_type.length,
-                               32, 32, TRUE,
+                               32, bld.type, TRUE,
                                 fetch_elts, tmp, FALSE);
           LLVMBuildStore(builder, tmp, index_store);
        }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c

index 9f6b9e9fb6f060f938cb95e647d6a72a44112460..322e7b817dbae30cb1a90b0a4eb3ad32131badb7 100644 (file)
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -224,6 +224,7 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
     /* Ex: convert packed = {XYZW, XYZW, XYZW, XYZW}
      * into masked = {X, Y, Z, W}
      */
+   /* Note: we cannot do this shift on x86 natively until AVX2. */
     shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
     masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
  
@@ -394,6 +395,7 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
         util_is_power_of_two(format_desc->block.bits)) {
        LLVMValueRef packed;
        LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, type);
+      struct lp_type fetch_type;
        unsigned vec_len = type.width * type.length;
  
        /*
@@ -401,8 +403,9 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
         * scaling or converting.
         */
  
+      fetch_type = lp_type_uint(type.width*4);
        packed = lp_build_gather(gallivm, type.length/4,
-                               format_desc->block.bits, type.width*4,
+                               format_desc->block.bits, fetch_type,
                                 aligned, base_ptr, offset, TRUE);
  
        assert(format_desc->block.bits <= vec_len);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c

index 8cad3a6fc651f37cae99798b803b87d05e57a83a..636a4a6238fde24897cc5d72022bb2f0685f0380 100644 (file)
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c
@@ -70,7 +70,14 @@ lp_build_fetch_rgba_aos_array(struct gallivm_state *gallivm,
  
     src_vec_type  = lp_build_vec_type(gallivm,  src_type);
  
-   /* Read whole vector from memory, unaligned */
+   /*
+    * Read whole vector from memory, unaligned.
+    * XXX: Note it's actually aligned to element type. Not sure if all
+    * callers are able to guarantee that (whereas for others, we should
+    * be able to use full alignment when there's 2 or 4 channels).
+    * (If all callers can guarantee element type alignment, we should
+    * relax alignment restrictions elsewhere.)
+    */
     ptr = LLVMBuildGEP(builder, base_ptr, &offset, 1, "");
     ptr = LLVMBuildPointerCast(builder, ptr, LLVMPointerType(src_vec_type, 0), "");
     res = LLVMBuildLoad(builder, ptr, "");
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c

index a48d71f09034dad44a71160a31409560e3f467e9..b3bc15552c52d165920cc8befe8d00e6a9ea0024 100644 (file)
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -405,6 +405,8 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
                          LLVMValueRef rgba_out[4])
  {
     LLVMBuilderRef builder = gallivm->builder;
+   enum pipe_format format = format_desc->format;
+   struct lp_type fetch_type;
  
     if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
         (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
@@ -430,10 +432,11 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
         * Ex: packed = {XYZW, XYZW, XYZW, XYZW}
         */
        assert(format_desc->block.bits <= type.width);
+      fetch_type = lp_type_uint(type.width);
        packed = lp_build_gather(gallivm,
                                 type.length,
                                 format_desc->block.bits,
-                               type.width,
+                               fetch_type,
                                 aligned,
                                 base_ptr, offset, FALSE);
  
@@ -447,22 +450,23 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
        return;
     }
  
-   if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT ||
-       format_desc->format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
+   if (format == PIPE_FORMAT_R11G11B10_FLOAT ||
+       format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
        /*
         * similar conceptually to above but requiring special
         * AoS packed -> SoA float conversion code.
         */
        LLVMValueRef packed;
+      struct lp_type fetch_type = lp_type_uint(type.width);
  
        assert(type.floating);
        assert(type.width == 32);
  
        packed = lp_build_gather(gallivm, type.length,
                                 format_desc->block.bits,
-                               type.width, aligned,
+                               fetch_type, aligned,
                                 base_ptr, offset, FALSE);
-      if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT) {
+      if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
           lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);
        }
        else {
@@ -478,8 +482,9 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
         * 32bit (or 8bit) from each block.
         */
        LLVMValueRef packed;
+      struct lp_type fetch_type = lp_type_uint(type.width);
  
-      if (format_desc->format == PIPE_FORMAT_X32_S8X24_UINT) {
+      if (format == PIPE_FORMAT_X32_S8X24_UINT) {
           /*
            * for stencil simply fix up offsets - could in fact change
            * base_ptr instead even outside the shader.
@@ -487,14 +492,14 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
           unsigned mask = (1 << 8) - 1;
           LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4);
           offset = LLVMBuildAdd(builder, offset, s_offset, "");
-         packed = lp_build_gather(gallivm, type.length, 32, type.width,
+         packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
                                    aligned, base_ptr, offset, FALSE);
           packed = LLVMBuildAnd(builder, packed,
                                 lp_build_const_int_vec(gallivm, type, mask), "");
        }
        else {
-         assert (format_desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
-         packed = lp_build_gather(gallivm, type.length, 32, type.width,
+         assert (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
+         packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
                                    aligned, base_ptr, offset, TRUE);
           packed = LLVMBuildBitCast(builder, packed,
                                     lp_build_vec_type(gallivm, type), "");
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c

index fa0e8b656bb133fd2613cc54785a392165ccee34..d6d7552986e4d957301cebfae6844bd4686781b5 100644 (file)
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
@@ -491,13 +491,15 @@ lp_build_fetch_subsampled_rgba_aos(struct gallivm_state *gallivm,
  {
     LLVMValueRef packed;
     LLVMValueRef rgba;
+   struct lp_type fetch_type;
  
     assert(format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED);
     assert(format_desc->block.bits == 32);
     assert(format_desc->block.width == 2);
     assert(format_desc->block.height == 1);
  
-   packed = lp_build_gather(gallivm, n, 32, 32, TRUE, base_ptr, offset, FALSE);
+   fetch_type = lp_type_uint(32);
+   packed = lp_build_gather(gallivm, n, 32, fetch_type, TRUE, base_ptr, offset, FALSE);
  
     (void)j;
  
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_gather.c b/src/gallium/auxiliary/gallivm/lp_bld_gather.c

index 1f7ba927bc40f593284931b7c3692ce54850046b..7654ba01d8873dc75c81405587c4db925bbbacaa 100644 (file)
--- a/src/gallium/auxiliary/gallivm/lp_bld_gather.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.c
@@ -28,6 +28,7 @@
  
  #include "util/u_debug.h"
  #include "util/u_cpu_detect.h"
+#include "util/u_math.h"
  #include "lp_bld_debug.h"
  #include "lp_bld_const.h"
  #include "lp_bld_format.h"
@@ -36,6 +37,7 @@
  #include "lp_bld_type.h"
  #include "lp_bld_init.h"
  #include "lp_bld_intr.h"
+#include "lp_bld_pack.h"
  
  
  /**
@@ -114,14 +116,29 @@ lp_build_gather_elem(struct gallivm_state *gallivm,
      * translation of offsets to first_elem in sampler_views it actually seems
      * gallium could not do anything else except 16 no matter what...
      */
-  if (!aligned) {
+   if (!aligned) {
        LLVMSetAlignment(res, 1);
+   } else if (!util_is_power_of_two(src_width)) {
+      /*
+       * Full alignment is impossible, assume the caller really meant
+       * the individual elements were aligned (e.g. 3x32bit format).
+       * And yes the generated code may otherwise crash, llvm will
+       * really assume 128bit alignment with a 96bit fetch (I suppose
+       * that makes sense as it can just assume the upper 32bit to be
+       * whatever).
+       * Maybe the caller should be able to explicitly set this, but
+       * this should cover all the 3-channel formats.
+       */
+      if (((src_width / 24) * 24 == src_width) &&
+           util_is_power_of_two(src_width / 24)) {
+          LLVMSetAlignment(res, src_width / 24);
+      } else {
+         LLVMSetAlignment(res, 1);
+      }
     }
  
     assert(src_width <= dst_width);
-   if (src_width > dst_width) {
-      res = LLVMBuildTrunc(gallivm->builder, res, dst_elem_type, "");
-   } else if (src_width < dst_width) {
+   if (src_width < dst_width) {
        res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, "");
        if (vector_justify) {
  #ifdef PIPE_ARCH_BIG_ENDIAN
@@ -135,28 +152,134 @@ lp_build_gather_elem(struct gallivm_state *gallivm,
  }
  
  
+/**
+ * Gather one element from scatter positions in memory.
+ * Nearly the same as above, however the individual elements
+ * may be vectors themselves, and fetches may be float type.
+ * Can also do pad vector instead of ZExt.
+ *
+ * @sa lp_build_gather()
+ */
+static LLVMValueRef
+lp_build_gather_elem_vec(struct gallivm_state *gallivm,
+                         unsigned length,
+                         unsigned src_width,
+                         LLVMTypeRef src_type,
+                         struct lp_type dst_type,
+                         boolean aligned,
+                         LLVMValueRef base_ptr,
+                         LLVMValueRef offsets,
+                         unsigned i,
+                         boolean vector_justify)
+{
+   LLVMValueRef ptr, res;
+   LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
+   assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
+
+   ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i);
+   ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, "");
+   res = LLVMBuildLoad(gallivm->builder, ptr, "");
+
+   /* XXX
+    * On some archs we probably really want to avoid having to deal
+    * with alignments lower than 4 bytes (if fetch size is a power of
+    * two >= 32). On x86 it doesn't matter, however.
+    * We should be able to guarantee full alignment for any kind of texture
+    * fetch (except ARB_texture_buffer_range, oops), but not vertex fetch
+    * (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends
+    * but I don't think that's quite what we wanted).
+    * For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT
+    * looks like a good fit, but it seems this cap bit (and OpenGL) aren't
+    * enforcing what we want (which is what d3d10 does, the offset needs to
+    * be aligned to element size, but GL has bytes regardless of element
+    * size which would only leave us with minimum alignment restriction of 16
+    * which doesn't make much sense if the type isn't 4x32bit). Due to
+    * translation of offsets to first_elem in sampler_views it actually seems
+    * gallium could not do anything else except 16 no matter what...
+    */
+   if (!aligned) {
+      LLVMSetAlignment(res, 1);
+   } else if (!util_is_power_of_two(src_width)) {
+      /*
+       * Full alignment is impossible, assume the caller really meant
+       * the individual elements were aligned (e.g. 3x32bit format).
+       * And yes the generated code may otherwise crash, llvm will
+       * really assume 128bit alignment with a 96bit fetch (I suppose
+       * that makes sense as it can just assume the upper 32bit to be
+       * whatever).
+       * Maybe the caller should be able to explicitly set this, but
+       * this should cover all the 3-channel formats.
+       */
+      if (((src_width / 24) * 24 == src_width) &&
+           util_is_power_of_two(src_width / 24)) {
+          LLVMSetAlignment(res, src_width / 24);
+      } else {
+         LLVMSetAlignment(res, 1);
+      }
+   }
+
+   assert(src_width <= dst_type.width * dst_type.length);
+   if (src_width < dst_type.width * dst_type.length) {
+      if (dst_type.length > 1) {
+         res = lp_build_pad_vector(gallivm, res, dst_type.length);
+         /*
+          * vector_justify hopefully a non-issue since we only deal
+          * with src_width >= 32 here?
+          */
+      } else {
+         /*
+          * Only valid if src_ptr_type is int type...
+          */
+         res = LLVMBuildZExt(gallivm->builder, res,
+                             lp_build_vec_type(gallivm, dst_type), "");
+         if (vector_justify) {
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         res = LLVMBuildShl(gallivm->builder, res,
+                            LLVMConstInt(dst_elem_type,
+                                         dst_type.width - src_width, 0), "");
+#endif
+         }
+      }
+   }
+   return res;
+}
+
+
+
+
  static LLVMValueRef
  lp_build_gather_avx2(struct gallivm_state *gallivm,
                       unsigned length,
                       unsigned src_width,
-                     unsigned dst_width,
+                     struct lp_type dst_type,
                       LLVMValueRef base_ptr,
                       LLVMValueRef offsets)
  {
     LLVMBuilderRef builder = gallivm->builder;
-   LLVMTypeRef dst_type = LLVMIntTypeInContext(gallivm->context, dst_width);
-   LLVMTypeRef dst_vec_type = LLVMVectorType(dst_type, length);
-   LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, src_width);
-   LLVMTypeRef src_vec_type = LLVMVectorType(src_type, length);
+   LLVMTypeRef src_type, src_vec_type;
     LLVMValueRef res;
+   struct lp_type res_type = dst_type;
+   res_type.length *= length;
  
+   if (dst_type.floating) {
+      src_type = src_width == 64 ? LLVMDoubleTypeInContext(gallivm->context) :
+                                   LLVMFloatTypeInContext(gallivm->context);
+   } else {
+      src_type = LLVMIntTypeInContext(gallivm->context, src_width);
+   }
+   src_vec_type = LLVMVectorType(src_type, length);
+
+   /* XXX should allow hw scaling (can handle i8, i16, i32, i64 for x86) */
     assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
  
     if (0) {
        /*
         * XXX: This will cause LLVM pre 3.7 to hang; it works on LLVM 3.8 but
-       * will not use the AVX2 gather instrinsics.  See
+       * will not use the AVX2 gather instrinsics (even with llvm 4.0), at
+       * least with Haswell. See
         * http://lists.llvm.org/pipermail/llvm-dev/2016-January/094448.html
+       * And the generated code doing the emulation is quite a bit worse
+       * than what we get by doing it ourselves too.
         */
        LLVMTypeRef i32_type = LLVMIntTypeInContext(gallivm->context, 32);
        LLVMTypeRef i32_vec_type = LLVMVectorType(i32_type, length);
@@ -176,7 +299,8 @@ lp_build_gather_avx2(struct gallivm_state *gallivm,
        src_ptr = LLVMBuildGEP(builder, base_ptr, &offsets, 1, "vector-gep");
  
        char intrinsic[64];
-      util_snprintf(intrinsic, sizeof intrinsic, "llvm.masked.gather.v%ui%u", length, src_width);
+      util_snprintf(intrinsic, sizeof intrinsic, "llvm.masked.gather.v%u%s%u",
+                    length, dst_type.floating ? "f" : "i", src_width);
        LLVMValueRef alignment = LLVMConstInt(i32_type, src_width/8, 0);
        LLVMValueRef mask = LLVMConstAllOnes(i1_vec_type);
        LLVMValueRef passthru = LLVMGetUndef(src_vec_type);
@@ -185,27 +309,36 @@ lp_build_gather_avx2(struct gallivm_state *gallivm,
  
        res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 4, 0);
     } else {
-      assert(src_width == 32);
-
        LLVMTypeRef i8_type = LLVMIntTypeInContext(gallivm->context, 8);
-
-      /*
-       * We should get the caller to give more type information so we can use
-       * the intrinsics for the right int/float domain.  Int should be the most
-       * common.
-       */
        const char *intrinsic = NULL;
-      switch (length) {
-      case 4:
-         intrinsic = "llvm.x86.avx2.gather.d.d";
-         break;
-      case 8:
-         intrinsic = "llvm.x86.avx2.gather.d.d.256";
-         break;
-      default:
-         assert(0);
+      unsigned l_idx = 0;
+
+      assert(src_width == 32 || src_width == 64);
+      if (src_width == 32) {
+         assert(length == 4 || length == 8);
+      } else {
+         assert(length == 2 || length == 4);
        }
  
+      static const char *intrinsics[2][2][2] = {
+
+         {{"llvm.x86.avx2.gather.d.d",
+           "llvm.x86.avx2.gather.d.d.256"},
+          {"llvm.x86.avx2.gather.d.q",
+           "llvm.x86.avx2.gather.d.q.256"}},
+
+         {{"llvm.x86.avx2.gather.d.ps",
+           "llvm.x86.avx2.gather.d.ps.256"},
+          {"llvm.x86.avx2.gather.d.pd",
+           "llvm.x86.avx2.gather.d.pd.256"}},
+      };
+
+      if ((src_width == 32 && length == 8) ||
+          (src_width == 64 && length == 4)) {
+         l_idx = 1;
+      }
+      intrinsic = intrinsics[dst_type.floating][src_width == 64][l_idx];
+
        LLVMValueRef passthru = LLVMGetUndef(src_vec_type);
        LLVMValueRef mask = LLVMConstAllOnes(src_vec_type);
        mask = LLVMConstBitCast(mask, src_vec_type);
@@ -215,12 +348,7 @@ lp_build_gather_avx2(struct gallivm_state *gallivm,
  
        res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 5, 0);
     }
-
-   if (src_width > dst_width) {
-      res = LLVMBuildTrunc(builder, res, dst_vec_type, "");
-   } else if (src_width < dst_width) {
-      res = LLVMBuildZExt(builder, res, dst_vec_type, "");
-   }
+   res = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, res_type), "");
  
     return res;
  }
@@ -241,9 +369,11 @@ lp_build_gather_avx2(struct gallivm_state *gallivm,
   *
   * @param length length of the offsets
   * @param src_width src element width in bits
- * @param dst_width result element width in bits (src will be expanded to fit)
+ * @param dst_type result element type (src will be expanded to fit,
+ *        but truncation is not allowed)
+ *        (this may be a vector, must be pot sized)
   * @param aligned whether the data is guaranteed to be aligned (to src_width)
- * @param base_ptr base pointer, should be a i8 pointer type.
+ * @param base_ptr base pointer, needs to be a i8 pointer type.
   * @param offsets vector with offsets
   * @param vector_justify select vector rather than integer justification
   */
@@ -251,41 +381,121 @@ LLVMValueRef
  lp_build_gather(struct gallivm_state *gallivm,
                  unsigned length,
                  unsigned src_width,
-                unsigned dst_width,
+                struct lp_type dst_type,
                  boolean aligned,
                  LLVMValueRef base_ptr,
                  LLVMValueRef offsets,
                  boolean vector_justify)
  {
     LLVMValueRef res;
+   boolean need_expansion = src_width < dst_type.width * dst_type.length;
+   boolean vec_fetch;
+   struct lp_type fetch_type, fetch_dst_type;
+   LLVMTypeRef src_type;
+
+   assert(src_width <= dst_type.width * dst_type.length);
+
+   /*
+    * This is quite a mess...
+    * Figure out if the fetch should be done as:
+    * a) scalar or vector
+    * b) float or int
+    *
+    * As an example, for a 96bit fetch expanded into 4x32bit, it is better
+    * to use (3x32bit) vector type (then pad the vector). Otherwise, the
+    * zext will cause extra instructions.
+    * However, the same isn't true for 3x16bit (the codegen for that is
+    * completely worthless on x86 simd, and for 3x8bit is is way worse
+    * still, don't try that... (To get really good code out of llvm for
+    * these cases, the only way is to decompose the fetches manually
+    * into 1x32bit/1x16bit, or 1x16/1x8bit respectively, although the latter
+    * case requires sse41, otherwise simple scalar zext is way better.
+    * But probably not important enough, so don't bother.)
+    * Also, we try to honor the floating bit of destination (but isn't
+    * possible if caller asks for instance for 2x32bit dst_type with
+    * 48bit fetch - the idea would be to use 3x16bit fetch, pad and
+    * cast to 2x32f type, so the fetch is always int and on top of that
+    * we avoid the vec pad and use scalar zext due the above mentioned
+    * issue).
+    * Note this is optimized for x86 sse2 and up backend. Could be tweaked
+    * for other archs if necessary...
+    */
+   if (((src_width % 32) == 0) && ((src_width % dst_type.width) == 0) &&
+       (dst_type.length > 1)) {
+      /* use vector fetch (if dst_type is vector) */
+      vec_fetch = TRUE;
+      if (dst_type.floating) {
+         fetch_type = lp_type_float_vec(dst_type.width, src_width);
+      } else {
+         fetch_type = lp_type_int_vec(dst_type.width, src_width);
+      }
+      /* intentionally not using lp_build_vec_type here */
+      src_type = LLVMVectorType(lp_build_elem_type(gallivm, fetch_type),
+                                fetch_type.length);
+      fetch_dst_type = fetch_type;
+      fetch_dst_type.length = dst_type.length;
+    } else {
+      /* use scalar fetch */
+      vec_fetch = FALSE;
+      if (dst_type.floating && ((src_width == 32) || (src_width == 64))) {
+         fetch_type = lp_type_float(src_width);
+      } else {
+         fetch_type = lp_type_int(src_width);
+      }
+      src_type = lp_build_vec_type(gallivm, fetch_type);
+      fetch_dst_type = fetch_type;
+      fetch_dst_type.width = dst_type.width * dst_type.length;
+   }
  
     if (length == 1) {
        /* Scalar */
-      return lp_build_gather_elem(gallivm, length,
-                                  src_width, dst_width, aligned,
-                                  base_ptr, offsets, 0, vector_justify);
-   } else if (util_cpu_caps.has_avx2 && src_width == 32 && (length == 4 || length == 8)) {
-      return lp_build_gather_avx2(gallivm, length, src_width, dst_width, base_ptr, offsets);
+      res = lp_build_gather_elem_vec(gallivm, length,
+                                     src_width, src_type, fetch_dst_type,
+                                     aligned, base_ptr, offsets, 0,
+                                     vector_justify);
+      return LLVMBuildBitCast(gallivm->builder, res,
+                              lp_build_vec_type(gallivm, dst_type), "");
+      /*
+       * Excluding expansion from these paths because if you need it for
+       * 32bit/64bit fetches you're doing it wrong (this is gather, not
+       * conversion) and it would be awkward for floats.
+       */
+   } else if (util_cpu_caps.has_avx2 && !need_expansion &&
+              src_width == 32 && (length == 4 || length == 8)) {
+      return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
+                                  base_ptr, offsets);
+   /*
+    * This looks bad on paper wrt throughtput/latency on Haswell.
+    * Even on Broadwell it doesn't look stellar.
+    * Albeit no measurements were done (but tested to work).
+    * Should definitely enable on Skylake.
+    * (In general, should be more of a win if the fetch is 256bit wide -
+    * this is true for the 32bit case above too.)
+    */
+   } else if (0 && util_cpu_caps.has_avx2 && !need_expansion &&
+              src_width == 64 && (length == 2 || length == 4)) {
+      return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
+                                  base_ptr, offsets);
     } else {
        /* Vector */
  
-      LLVMTypeRef dst_elem_type = LLVMIntTypeInContext(gallivm->context, dst_width);
-      LLVMTypeRef dst_vec_type = LLVMVectorType(dst_elem_type, length);
-      LLVMTypeRef gather_vec_type = dst_vec_type;
+      LLVMValueRef elems[LP_MAX_VECTOR_WIDTH / 8];
        unsigned i;
        boolean vec_zext = FALSE;
-      unsigned gather_width = dst_width;
+      struct lp_type res_type, gather_res_type;
+      LLVMTypeRef res_t, gather_res_t;
  
+      res_type = fetch_dst_type;
+      res_type.length *= length;
+      gather_res_type = res_type;
  
-      if (src_width == 16 && dst_width == 32) {
-         LLVMTypeRef g_elem_type = LLVMIntTypeInContext(gallivm->context, dst_width / 2);
-         gather_vec_type = LLVMVectorType(g_elem_type, length);
+      if (src_width == 16 && dst_type.width == 32 && dst_type.length == 1) {
           /*
            * Note that llvm is never able to optimize zext/insert combos
            * directly (i.e. zero the simd reg, then place the elements into
-          * the appropriate place directly). And 16->32bit zext simd loads
+          * the appropriate place directly). (I think this has to do with
+          * scalar/vector transition.) And scalar 16->32bit zext simd loads
            * aren't possible (instead loading to scalar reg first).
-          * (I think this has to do with scalar/vector transition.)
            * No idea about other archs...
            * We could do this manually, but instead we just use a vector
            * zext, which is simple enough (and, in fact, llvm might optimize
@@ -293,30 +503,53 @@ lp_build_gather(struct gallivm_state *gallivm,
            * (We're not trying that with other bit widths as that might not be
            * easier, in particular with 8 bit values at least with only sse2.)
            */
+         assert(vec_fetch == FALSE);
+         gather_res_type.width /= 2;
+         fetch_dst_type = fetch_type;
+         src_type = lp_build_vec_type(gallivm, fetch_type);
           vec_zext = TRUE;
-         gather_width = 16;
        }
-      res = LLVMGetUndef(gather_vec_type);
+      res_t = lp_build_vec_type(gallivm, res_type);
+      gather_res_t = lp_build_vec_type(gallivm, gather_res_type);
+      res = LLVMGetUndef(gather_res_t);
        for (i = 0; i < length; ++i) {
           LLVMValueRef index = lp_build_const_int32(gallivm, i);
-         LLVMValueRef elem;
-         elem = lp_build_gather_elem(gallivm, length,
-                                     src_width, gather_width, aligned,
-                                     base_ptr, offsets, i, vector_justify);
-         res = LLVMBuildInsertElement(gallivm->builder, res, elem, index, "");
+         elems[i] = lp_build_gather_elem_vec(gallivm, length,
+                                             src_width, src_type, fetch_dst_type,
+                                             aligned, base_ptr, offsets, i,
+                                             vector_justify);
+         if (!vec_fetch) {
+            res = LLVMBuildInsertElement(gallivm->builder, res, elems[i], index, "");
+         }
        }
        if (vec_zext) {
-         res = LLVMBuildZExt(gallivm->builder, res, dst_vec_type, "");
+         res = LLVMBuildZExt(gallivm->builder, res, res_t, "");
           if (vector_justify) {
  #if PIPE_ARCH_BIG_ENDIAN
-            struct lp_type dst_type;
-            unsigned sv = dst_width - src_width;
-            dst_type = lp_type_uint_vec(dst_width, dst_width * length);
+            unsigned sv = dst_type.width - src_width;
              res = LLVMBuildShl(gallivm->builder, res,
-                               lp_build_const_int_vec(gallivm, dst_type, sv), "");
+                               lp_build_const_int_vec(gallivm, res_type, sv), "");
  #endif
           }
        }
+      if (vec_fetch) {
+         /*
+          * Do bitcast now otherwise llvm might get some funny ideas wrt
+          * float/int types...
+          */
+         for (i = 0; i < length; i++) {
+            elems[i] = LLVMBuildBitCast(gallivm->builder, elems[i],
+                                        lp_build_vec_type(gallivm, dst_type), "");
+         }
+         res = lp_build_concat(gallivm, elems, dst_type, length);
+      } else {
+         struct lp_type really_final_type = dst_type;
+         assert(res_type.length * res_type.width ==
+                dst_type.length * dst_type.width * length);
+         really_final_type.length *= length;
+         res = LLVMBuildBitCast(gallivm->builder, res,
+                                lp_build_vec_type(gallivm, really_final_type), "");
+      }
     }
  
     return res;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_gather.h b/src/gallium/auxiliary/gallivm/lp_bld_gather.h

index 3ede4763a702393160e99ea889ee8bfde40a80b6..7930864e61101a62f9e413a1882d32cd7100b03e 100644 (file)
--- a/src/gallium/auxiliary/gallivm/lp_bld_gather.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.h
@@ -55,7 +55,7 @@ LLVMValueRef
  lp_build_gather(struct gallivm_state *gallivm,
                  unsigned length,
                  unsigned src_width,
-                unsigned dst_width,
+                struct lp_type dst_type,
                  boolean aligned,
                  LLVMValueRef base_ptr,
                  LLVMValueRef offsets,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c

index f91b761dc11e809680bec260b5601756182973cb..c46749dbac8e9b4b08d0fa5d1fb22d3ba9410a4f 100644 (file)
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
@@ -579,10 +579,12 @@ lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld,
     LLVMValueRef rgba8;
     struct lp_build_context u8n;
     LLVMTypeRef u8n_vec_type;
+   struct lp_type fetch_type;
  
     lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width));
     u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);
  
+   fetch_type = lp_type_uint(bld->texel_type.width);
     if (util_format_is_rgba8_variant(bld->format_desc)) {
        /*
         * Given the format is a rgba8, just read the pixels as is,
@@ -591,7 +593,7 @@ lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld,
        rgba8 = lp_build_gather(bld->gallivm,
                                bld->texel_type.length,
                                bld->format_desc->block.bits,
-                              bld->texel_type.width,
+                              fetch_type,
                                TRUE,
                                data_ptr, offset, TRUE);
  
@@ -925,14 +927,16 @@ lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld,
              LLVMValueRef rgba8;
  
              if (util_format_is_rgba8_variant(bld->format_desc)) {
+               struct lp_type fetch_type;
                 /*
                  * Given the format is a rgba8, just read the pixels as is,
                  * without any swizzling. Swizzling will be done later.
                  */
+               fetch_type = lp_type_uint(bld->texel_type.width);
                 rgba8 = lp_build_gather(bld->gallivm,
                                         bld->texel_type.length,
                                         bld->format_desc->block.bits,
-                                       bld->texel_type.width,
+                                       fetch_type,
                                         TRUE,
                                         data_ptr, offset[k][j][i], TRUE);
author	Roland Scheidegger <sroland@vmware.com>
	Sun, 11 Dec 2016 22:39:22 +0000 (23:39 +0100)
committer	Roland Scheidegger <sroland@vmware.com>
	Wed, 21 Dec 2016 03:48:24 +0000 (04:48 +0100)
src/gallium/auxiliary/draw/draw_llvm.c		patch \| blob \| history
src/gallium/auxiliary/gallivm/lp_bld_format_aos.c		patch \| blob \| history
src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c		patch \| blob \| history
src/gallium/auxiliary/gallivm/lp_bld_format_soa.c		patch \| blob \| history
src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c		patch \| blob \| history
src/gallium/auxiliary/gallivm/lp_bld_gather.c		patch \| blob \| history
src/gallium/auxiliary/gallivm/lp_bld_gather.h		patch \| blob \| history
src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c		patch \| blob \| history