X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fauxiliary%2Fgallivm%2Flp_bld_format_soa.c;h=c5962a666998d48d62e70337b5f2b99d67212665;hb=7730d583c207002e14ca2e95d30cab181db20082;hp=2b66162eb40fdb0fd29a15bd0b56b0b9b3324eb3;hpb=8f3bdeaad610d7d5a5c6e73e1e9c721219595754;p=mesa.git diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c index 2b66162eb40..c5962a66699 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c @@ -26,64 +26,256 @@ **************************************************************************/ -#include "util/u_format.h" +#include "pipe/p_defines.h" + +#include "util/format/u_format.h" #include "util/u_memory.h" #include "util/u_string.h" +#include "util/u_math.h" #include "lp_bld_type.h" #include "lp_bld_const.h" #include "lp_bld_conv.h" -#include "lp_bld_sample.h" /* for lp_build_gather */ -#include "lp_bld_init.h" +#include "lp_bld_swizzle.h" +#include "lp_bld_gather.h" +#include "lp_bld_debug.h" #include "lp_bld_format.h" +#include "lp_bld_arit.h" +#include "lp_bld_pack.h" +#include "lp_bld_flow.h" +#include "lp_bld_printf.h" +#include "lp_bld_intr.h" - -static LLVMValueRef -lp_build_format_swizzle_chan_soa(struct lp_type type, - const LLVMValueRef *unswizzled, - enum util_format_swizzle swizzle) +static void +convert_to_soa(struct gallivm_state *gallivm, + LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32], + LLVMValueRef dst_soa[4], + const struct lp_type soa_type) { - switch (swizzle) { - case UTIL_FORMAT_SWIZZLE_X: - case UTIL_FORMAT_SWIZZLE_Y: - case UTIL_FORMAT_SWIZZLE_Z: - case UTIL_FORMAT_SWIZZLE_W: - return unswizzled[swizzle]; - case UTIL_FORMAT_SWIZZLE_0: - return lp_build_zero(type); - case UTIL_FORMAT_SWIZZLE_1: - return lp_build_one(type); - case UTIL_FORMAT_SWIZZLE_NONE: - return lp_build_undef(type); - default: - assert(0); - return lp_build_undef(type); + unsigned j, k; + struct lp_type aos_channel_type = soa_type; + + LLVMValueRef aos_channels[4]; + unsigned pixels_per_channel = soa_type.length / 4; + + debug_assert((soa_type.length % 4) == 0); + + aos_channel_type.length >>= 1; + + for (j = 0; j < 4; ++j) { + LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 }; + + assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH); + + for (k = 0; k < pixels_per_channel; ++k) { + channel[k] = src_aos[j + 4 * k]; + } + + aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel); } + + lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa); } void lp_build_format_swizzle_soa(const struct util_format_description *format_desc, - struct lp_type type, + struct lp_build_context *bld, const LLVMValueRef *unswizzled, - LLVMValueRef *swizzled) + LLVMValueRef swizzled_out[4]) { - if(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) { - enum util_format_swizzle swizzle = format_desc->swizzle[0]; - LLVMValueRef depth = lp_build_format_swizzle_chan_soa(type, unswizzled, swizzle); - swizzled[2] = swizzled[1] = swizzled[0] = depth; - swizzled[3] = lp_build_one(type); + if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) { + enum pipe_swizzle swizzle; + LLVMValueRef depth_or_stencil; + + if (util_format_has_stencil(format_desc) && + !util_format_has_depth(format_desc)) { + assert(!bld->type.floating); + swizzle = format_desc->swizzle[1]; + } + else { + assert(bld->type.floating); + swizzle = format_desc->swizzle[0]; + } + /* + * Return zzz1 or sss1 for depth-stencil formats here. 
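+       * E.g. for PIPE_FORMAT_Z24_UNORM_S8_UINT sampled as depth this picks
+       * the Z channel and replicates it into X/Y/Z, with W forced to one.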
+ * Correct swizzling will be handled by apply_sampler_swizzle() later. + */ + depth_or_stencil = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle); + + swizzled_out[2] = swizzled_out[1] = swizzled_out[0] = depth_or_stencil; + swizzled_out[3] = bld->one; } else { unsigned chan; for (chan = 0; chan < 4; ++chan) { - enum util_format_swizzle swizzle = format_desc->swizzle[chan]; - swizzled[chan] = lp_build_format_swizzle_chan_soa(type, unswizzled, swizzle); + enum pipe_swizzle swizzle = format_desc->swizzle[chan]; + swizzled_out[chan] = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle); } } } + +static LLVMValueRef +lp_build_extract_soa_chan(struct lp_build_context *bld, + unsigned blockbits, + boolean srgb_chan, + struct util_format_channel_description chan_desc, + LLVMValueRef packed) +{ + struct gallivm_state *gallivm = bld->gallivm; + LLVMBuilderRef builder = gallivm->builder; + struct lp_type type = bld->type; + LLVMValueRef input = packed; + const unsigned width = chan_desc.size; + const unsigned start = chan_desc.shift; + const unsigned stop = start + width; + + /* Decode the input vector component */ + + switch(chan_desc.type) { + case UTIL_FORMAT_TYPE_VOID: + input = bld->undef; + break; + + case UTIL_FORMAT_TYPE_UNSIGNED: + /* + * Align the LSB + */ + if (start) { + input = LLVMBuildLShr(builder, input, + lp_build_const_int_vec(gallivm, type, start), ""); + } + + /* + * Zero the MSBs + */ + if (stop < blockbits) { + unsigned mask = ((unsigned long long)1 << width) - 1; + input = LLVMBuildAnd(builder, input, + lp_build_const_int_vec(gallivm, type, mask), ""); + } + + /* + * Type conversion + */ + if (type.floating) { + if (srgb_chan) { + struct lp_type conv_type = lp_uint_type(type); + input = lp_build_srgb_to_linear(gallivm, conv_type, width, input); + } + else { + if(chan_desc.normalized) + input = lp_build_unsigned_norm_to_float(gallivm, width, type, input); + else + input = LLVMBuildSIToFP(builder, input, bld->vec_type, ""); + } + } + else if (chan_desc.pure_integer) { + /* Nothing to do */ + } else { + /* FIXME */ + assert(0); + } + break; + + case UTIL_FORMAT_TYPE_SIGNED: + /* + * Align the sign bit first. + */ + if (stop < type.width) { + unsigned bits = type.width - stop; + LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits); + input = LLVMBuildShl(builder, input, bits_val, ""); + } + + /* + * Align the LSB (with an arithmetic shift to preserve the sign) + */ + if (chan_desc.size < type.width) { + unsigned bits = type.width - chan_desc.size; + LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits); + input = LLVMBuildAShr(builder, input, bits_val, ""); + } + + /* + * Type conversion + */ + if (type.floating) { + input = LLVMBuildSIToFP(builder, input, bld->vec_type, ""); + if (chan_desc.normalized) { + double scale = 1.0 / ((1 << (chan_desc.size - 1)) - 1); + LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale); + input = LLVMBuildFMul(builder, input, scale_val, ""); + /* + * The formula above will produce value below -1.0 for most negative + * value but everything seems happy with that hence disable for now. 
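+             * (E.g. an 8-bit snorm channel uses scale = 1/127, so the most
+             * negative value -128 maps to -128/127 ~= -1.008 rather than -1.0.)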
+ */ + if (0) + input = lp_build_max(bld, input, + lp_build_const_vec(gallivm, type, -1.0f)); + } + } + else if (chan_desc.pure_integer) { + /* Nothing to do */ + } else { + /* FIXME */ + assert(0); + } + break; + + case UTIL_FORMAT_TYPE_FLOAT: + if (type.floating) { + if (chan_desc.size == 16) { + struct lp_type f16i_type = type; + f16i_type.width /= 2; + f16i_type.floating = 0; + if (start) { + input = LLVMBuildLShr(builder, input, + lp_build_const_int_vec(gallivm, type, start), ""); + } + input = LLVMBuildTrunc(builder, input, + lp_build_vec_type(gallivm, f16i_type), ""); + input = lp_build_half_to_float(gallivm, input); + } else { + assert(start == 0); + assert(stop == 32); + assert(type.width == 32); + } + input = LLVMBuildBitCast(builder, input, bld->vec_type, ""); + } + else { + /* FIXME */ + assert(0); + input = bld->undef; + } + break; + + case UTIL_FORMAT_TYPE_FIXED: + if (type.floating) { + double scale = 1.0 / ((1 << (chan_desc.size/2)) - 1); + LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale); + input = LLVMBuildSIToFP(builder, input, bld->vec_type, ""); + input = LLVMBuildFMul(builder, input, scale_val, ""); + } + else { + /* FIXME */ + assert(0); + input = bld->undef; + } + break; + + default: + assert(0); + input = bld->undef; + break; + } + + return input; +} + + /** * Unpack several pixels in SoA. * @@ -101,16 +293,21 @@ lp_build_format_swizzle_soa(const struct util_format_description *format_desc, * It requires that a packed pixel fits into an element of the output * channels. The common case is when converting pixel with a depth of 32 bit or * less into floats. + * + * \param format_desc the format of the 'packed' incoming pixel vector + * \param type the desired type for rgba_out (type.length = n, above) + * \param packed the incoming vector of packed pixels + * \param rgba_out returns the SoA R,G,B,A vectors */ void -lp_build_unpack_rgba_soa(LLVMBuilderRef builder, +lp_build_unpack_rgba_soa(struct gallivm_state *gallivm, const struct util_format_description *format_desc, struct lp_type type, LLVMValueRef packed, - LLVMValueRef *rgba) + LLVMValueRef rgba_out[4]) { + struct lp_build_context bld; LLVMValueRef inputs[4]; - unsigned start; unsigned chan; assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN); @@ -118,171 +315,132 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder, assert(format_desc->block.height == 1); assert(format_desc->block.bits <= type.width); /* FIXME: Support more output types */ - assert(type.floating); assert(type.width == 32); + lp_build_context_init(&bld, gallivm, type); + /* Decode the input vector components */ - start = 0; for (chan = 0; chan < format_desc->nr_channels; ++chan) { - unsigned width = format_desc->channel[chan].size; - unsigned stop = start + width; - LLVMValueRef input; - - input = packed; - - switch(format_desc->channel[chan].type) { - case UTIL_FORMAT_TYPE_VOID: - input = lp_build_undef(type); - break; - - case UTIL_FORMAT_TYPE_UNSIGNED: - /* - * Align the LSB - */ - - if (start) { - input = LLVMBuildLShr(builder, input, lp_build_const_int_vec(type, start), ""); - } - - /* - * Zero the MSBs - */ - - if (stop < format_desc->block.bits) { - unsigned mask = ((unsigned long long)1 << width) - 1; - input = LLVMBuildAnd(builder, input, lp_build_const_int_vec(type, mask), ""); - } + struct util_format_channel_description chan_desc = format_desc->channel[chan]; + boolean srgb_chan = FALSE; - /* - * Type conversion - */ - - if (type.floating) { - if(format_desc->channel[chan].normalized) - input = 
lp_build_unsigned_norm_to_float(builder, width, type, input); - else - input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(type), ""); - } - else { - /* FIXME */ - assert(0); - input = lp_build_undef(type); - } + if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && + format_desc->swizzle[3] != chan) { + srgb_chan = TRUE; + } - break; + inputs[chan] = lp_build_extract_soa_chan(&bld, + format_desc->block.bits, + srgb_chan, + chan_desc, + packed); + } - case UTIL_FORMAT_TYPE_SIGNED: - /* - * Align the sign bit first. - */ + lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out); +} - if (stop < type.width) { - unsigned bits = type.width - stop; - LLVMValueRef bits_val = lp_build_const_int_vec(type, bits); - input = LLVMBuildShl(builder, input, bits_val, ""); - } - /* - * Align the LSB (with an arithmetic shift to preserve the sign) - */ - - if (format_desc->channel[chan].size < type.width) { - unsigned bits = type.width - format_desc->channel[chan].size; - LLVMValueRef bits_val = lp_build_const_int_vec(type, bits); - input = LLVMBuildAShr(builder, input, bits_val, ""); - } +/** + * Convert a vector of rgba8 values into 32bit wide SoA vectors. + * + * \param dst_type The desired return type. For pure integer formats + * this should be a 32bit wide int or uint vector type, + * otherwise a float vector type. + * + * \param packed The rgba8 values to pack. + * + * \param rgba The 4 SoA return vectors. + */ +void +lp_build_rgba8_to_fi32_soa(struct gallivm_state *gallivm, + struct lp_type dst_type, + LLVMValueRef packed, + LLVMValueRef *rgba) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef mask = lp_build_const_int_vec(gallivm, dst_type, 0xff); + unsigned chan; - /* - * Type conversion - */ + /* XXX technically shouldn't use that for uint dst_type */ + packed = LLVMBuildBitCast(builder, packed, + lp_build_int_vec_type(gallivm, dst_type), ""); - if (type.floating) { - input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(type), ""); - if (format_desc->channel[chan].normalized) { - double scale = 1.0 / ((1 << (format_desc->channel[chan].size - 1)) - 1); - LLVMValueRef scale_val = lp_build_const_vec(type, scale); - input = LLVMBuildMul(builder, input, scale_val, ""); - } - } - else { - /* FIXME */ - assert(0); - input = lp_build_undef(type); - } + /* Decode the input vector components */ + for (chan = 0; chan < 4; ++chan) { +#if UTIL_ARCH_LITTLE_ENDIAN + unsigned start = chan*8; +#else + unsigned start = (3-chan)*8; +#endif + unsigned stop = start + 8; + LLVMValueRef input; - break; + input = packed; - case UTIL_FORMAT_TYPE_FLOAT: - if (type.floating) { - assert(start == 0); - assert(stop == 32); - assert(type.width == 32); - input = LLVMBuildBitCast(builder, input, lp_build_vec_type(type), ""); - } - else { - /* FIXME */ - assert(0); - input = lp_build_undef(type); - } - break; - - case UTIL_FORMAT_TYPE_FIXED: - if (type.floating) { - double scale = 1.0 / ((1 << (format_desc->channel[chan].size/2)) - 1); - LLVMValueRef scale_val = lp_build_const_vec(type, scale); - input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(type), ""); - input = LLVMBuildMul(builder, input, scale_val, ""); - } - else { - /* FIXME */ - assert(0); - input = lp_build_undef(type); - } - break; + if (start) + input = LLVMBuildLShr(builder, input, + lp_build_const_int_vec(gallivm, dst_type, start), ""); - default: - assert(0); - input = lp_build_undef(type); - break; - } + if (stop < 32) + input = LLVMBuildAnd(builder, input, mask, ""); - inputs[chan] = input; + if 
(dst_type.floating) + input = lp_build_unsigned_norm_to_float(gallivm, 8, dst_type, input); - start = stop; + rgba[chan] = input; } - - lp_build_format_swizzle_soa(format_desc, type, inputs, rgba); } + /** - * Fetch a pixel into a SoA. + * Fetch a texels from a texture, returning them in SoA layout. + * + * \param type the desired return type for 'rgba'. The vector length + * is the number of texels to fetch + * \param aligned if the offset is guaranteed to be aligned to element width * - * i and j are the sub-block pixel coordinates. + * \param base_ptr points to the base of the texture mip tree. + * \param offset offset to start of the texture image block. For non- + * compressed formats, this simply is an offset to the texel. + * For compressed formats, it is an offset to the start of the + * compressed data block. + * + * \param i, j the sub-block pixel coordinates. For non-compressed formats + * these will always be (0,0). For compressed formats, i will + * be in [0, block_width-1] and j will be in [0, block_height-1]. + * \param cache optional value pointing to a lp_build_format_cache structure */ void -lp_build_fetch_rgba_soa(LLVMBuilderRef builder, +lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, const struct util_format_description *format_desc, struct lp_type type, + boolean aligned, LLVMValueRef base_ptr, LLVMValueRef offset, LLVMValueRef i, LLVMValueRef j, - LLVMValueRef *rgba) + LLVMValueRef cache, + LLVMValueRef rgba_out[4]) { + LLVMBuilderRef builder = gallivm->builder; + enum pipe_format format = format_desc->format; + struct lp_type fetch_type; if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB || + format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB || format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) && format_desc->block.width == 1 && format_desc->block.height == 1 && format_desc->block.bits <= type.width && (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT || - format_desc->channel[0].size == 32)) + format_desc->channel[0].size == 32 || + format_desc->channel[0].size == 16)) { /* * The packed pixel fits into an element of the destination format. Put - * the packed pixels into a vector and estract each component for all + * the packed pixels into a vector and extract each component for all * vector elements in parallel. 
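+       * E.g. for PIPE_FORMAT_B8G8R8A8_UNORM with 4-wide 32-bit lanes, four
+       * whole texels are gathered into a single <4 x i32> vector and each
+       * 8-bit channel is then shifted/masked out of all lanes at once.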
*/ @@ -290,107 +448,643 @@ lp_build_fetch_rgba_soa(LLVMBuilderRef builder, /* * gather the texels from the texture + * Ex: packed = {XYZW, XYZW, XYZW, XYZW} */ - packed = lp_build_gather(builder, + assert(format_desc->block.bits <= type.width); + fetch_type = lp_type_uint(type.width); + packed = lp_build_gather(gallivm, type.length, format_desc->block.bits, - type.width, - base_ptr, offset); + fetch_type, + aligned, + base_ptr, offset, FALSE); /* * convert texels to float rgba */ - lp_build_unpack_rgba_soa(builder, + lp_build_unpack_rgba_soa(gallivm, format_desc, type, - packed, rgba); + packed, rgba_out); + return; } - else { + + + if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && + (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) && + format_desc->block.width == 1 && + format_desc->block.height == 1 && + format_desc->block.bits > type.width && + ((format_desc->block.bits <= type.width * type.length && + format_desc->channel[0].size <= type.width) || + (format_desc->channel[0].size == 64 && + format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT && + type.floating))) + { + /* + * Similar to above, but the packed pixel is larger than what fits + * into an element of the destination format. The packed pixels will be + * shuffled into SoA vectors appropriately, and then the extraction will + * be done in parallel as much as possible. + * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so + * the gathered vectors can be shuffled easily (even with avx). + * 64xn float -> 32xn float is handled too but it's a bit special as + * it does the conversion pre-shuffle. + */ + + LLVMValueRef packed[4], dst[4], output[4], shuffles[LP_MAX_VECTOR_WIDTH/32]; + struct lp_type fetch_type, gather_type = type; + unsigned num_gather, fetch_width, i, j; + struct lp_build_context bld; + boolean fp64 = format_desc->channel[0].size == 64; + + lp_build_context_init(&bld, gallivm, type); + + assert(type.width == 32); + assert(format_desc->block.bits > type.width); + /* - * Fallback to calling util_format_description::fetch_rgba_float for each - * pixel. - * - * This is definitely not the most efficient way of fetching pixels, as - * we miss the opportunity to do vectorization, but this it is a - * convenient for formats or scenarios for which there was no opportunity - * or incentive to optimize. + * First, figure out fetch order. */ + fetch_width = util_next_power_of_two(format_desc->block.bits); + /* + * fp64 are treated like fp32 except we fetch twice wide values + * (as we shuffle after trunc). The shuffles for that work out + * mostly fine (slightly suboptimal for 4-wide, perfect for AVX) + * albeit we miss the potential opportunity for hw gather (as it + * only handles native size). 
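+       * E.g. PIPE_FORMAT_R16G16B16A16_UNORM has block.bits == 64, so with
+       * 32-bit lanes fetch_width is 64 and num_gather is 2: each gather
+       * loads whole 64-bit texels for half of the lanes, and the shuffles
+       * below redistribute them into one SoA vector per 32-bit dword.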
+ */ + num_gather = fetch_width / type.width; + gather_type.width *= num_gather; + if (fp64) { + num_gather /= 2; + } + gather_type.length /= num_gather; + + for (i = 0; i < num_gather; i++) { + LLVMValueRef offsetr, shuf_vec; + if(num_gather == 4) { + for (j = 0; j < gather_type.length; j++) { + unsigned idx = i + 4*j; + shuffles[j] = lp_build_const_int32(gallivm, idx); + } + shuf_vec = LLVMConstVector(shuffles, gather_type.length); + offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, ""); - LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(builder))); - char name[256]; - LLVMValueRef function; - LLVMValueRef tmp; - unsigned k, chan; + } + else if (num_gather == 2) { + assert(num_gather == 2); + for (j = 0; j < gather_type.length; j++) { + unsigned idx = i*2 + (j%2) + (j/2)*4; + shuffles[j] = lp_build_const_int32(gallivm, idx); + } + shuf_vec = LLVMConstVector(shuffles, gather_type.length); + offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, ""); + } + else { + assert(num_gather == 1); + offsetr = offset; + } + if (gather_type.length == 1) { + LLVMValueRef zero = lp_build_const_int32(gallivm, 0); + offsetr = LLVMBuildExtractElement(builder, offsetr, zero, ""); + } + + /* + * Determine whether to use float or int loads. This is mostly + * to outsmart the (stupid) llvm int/float shuffle logic, we + * don't really care much if the data is floats or ints... + * But llvm will refuse to use single float shuffle with int data + * and instead use 3 int shuffles instead, the code looks atrocious. + * (Note bitcasts often won't help, as llvm is too smart to be + * fooled by that.) + * Nobody cares about simd float<->int domain transition penalties, + * which usually don't even exist for shuffles anyway. + * With 4x32bit (and 3x32bit) fetch, we use float vec (the data is + * going into transpose, which is unpacks, so doesn't really matter + * much). + * With 2x32bit or 4x16bit fetch, we use float vec, since those + * go into the weird channel separation shuffle. With floats, + * this is (with 128bit vectors): + * - 2 movq, 2 movhpd, 2 shufps + * With ints it would be: + * - 4 movq, 2 punpcklqdq, 4 pshufd, 2 blendw + * I've seen texture functions increase in code size by 15% just due + * to that (there's lots of such fetches in them...) + * (We could chose a different gather order to improve this somewhat + * for the int path, but it would basically just drop the blends, + * so the float path with this order really is optimal.) + * Albeit it is tricky sometimes llvm doesn't ignore the float->int + * casts so must avoid them until we're done with the float shuffle... + * 3x16bit formats (the same is also true for 3x8) are pretty bad but + * there's nothing we can do about them (we could overallocate by + * those couple bytes and use unaligned but pot sized load). + * Note that this is very much x86 specific. I don't know if this + * affect other archs at all. + */ + if (num_gather > 1) { + /* + * We always want some float type here (with x86) + * due to shuffles being float ones afterwards (albeit for + * the num_gather == 4 case int should work fine too + * (unless there's some problems with avx but not avx2). 
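+          * (Concretely: for the fp64 case the loads really are doubles and
+          * are fptrunc'd right after gathering; for everything else the
+          * 32-bit int elements are only reshuffled, never converted.)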
+ */ + if (format_desc->channel[0].size == 64) { + fetch_type = lp_type_float_vec(64, gather_type.width); + } else { + fetch_type = lp_type_int_vec(32, gather_type.width); + } + } + else { + /* type doesn't matter much */ + if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT && + (format_desc->channel[0].size == 32 || + format_desc->channel[0].size == 64)) { + fetch_type = lp_type_float(gather_type.width); + } else { + fetch_type = lp_type_uint(gather_type.width); + } + } + + /* Now finally gather the values */ + packed[i] = lp_build_gather(gallivm, gather_type.length, + format_desc->block.bits, + fetch_type, aligned, + base_ptr, offsetr, FALSE); + if (fp64) { + struct lp_type conv_type = type; + conv_type.width *= 2; + packed[i] = LLVMBuildBitCast(builder, packed[i], + lp_build_vec_type(gallivm, conv_type), ""); + packed[i] = LLVMBuildFPTrunc(builder, packed[i], bld.vec_type, ""); + } + } + + /* shuffle the gathered values to SoA */ + if (num_gather == 2) { + for (i = 0; i < num_gather; i++) { + for (j = 0; j < type.length; j++) { + unsigned idx = (j%2)*2 + (j/4)*4 + i; + if ((j/2)%2) + idx += type.length; + shuffles[j] = lp_build_const_int32(gallivm, idx); + } + dst[i] = LLVMBuildShuffleVector(builder, packed[0], packed[1], + LLVMConstVector(shuffles, type.length), ""); + } + } + else if (num_gather == 4) { + lp_build_transpose_aos(gallivm, lp_int_type(type), packed, dst); + } + else { + assert(num_gather == 1); + dst[0] = packed[0]; + } + + /* + * And finally unpack exactly as above, except that + * chan shift is adjusted and the right vector selected. + */ + if (!fp64) { + for (i = 0; i < num_gather; i++) { + dst[i] = LLVMBuildBitCast(builder, dst[i], bld.int_vec_type, ""); + } + for (i = 0; i < format_desc->nr_channels; i++) { + struct util_format_channel_description chan_desc = format_desc->channel[i]; + unsigned blockbits = type.width; + unsigned vec_nr; + +#if UTIL_ARCH_BIG_ENDIAN + vec_nr = (format_desc->block.bits - (chan_desc.shift + chan_desc.size)) / type.width; +#else + vec_nr = chan_desc.shift / type.width; +#endif + chan_desc.shift %= type.width; + + output[i] = lp_build_extract_soa_chan(&bld, + blockbits, + FALSE, + chan_desc, + dst[vec_nr]); + } + } + else { + for (i = 0; i < format_desc->nr_channels; i++) { + output[i] = dst[i]; + } + } + + lp_build_format_swizzle_soa(format_desc, &bld, output, rgba_out); + return; + } + + if (format == PIPE_FORMAT_R11G11B10_FLOAT || + format == PIPE_FORMAT_R9G9B9E5_FLOAT) { + /* + * similar conceptually to above but requiring special + * AoS packed -> SoA float conversion code. + */ + LLVMValueRef packed; + struct lp_type fetch_type = lp_type_uint(type.width); assert(type.floating); + assert(type.width == 32); - util_snprintf(name, sizeof name, "util_format_%s_fetch_rgba_float", format_desc->short_name); + packed = lp_build_gather(gallivm, type.length, + format_desc->block.bits, + fetch_type, aligned, + base_ptr, offset, FALSE); + if (format == PIPE_FORMAT_R11G11B10_FLOAT) { + lp_build_r11g11b10_to_float(gallivm, packed, rgba_out); + } + else { + lp_build_rgb9e5_to_float(gallivm, packed, rgba_out); + } + return; + } + if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS && + format_desc->block.bits == 64) { /* - * Declare and bind format_desc->fetch_rgba_float(). + * special case the format is 64 bits but we only require + * 32bit (or 8bit) from each block. 
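+       * I.e. PIPE_FORMAT_Z32_FLOAT_S8X24_UINT and its X32_S8X24_UINT
+       * stencil-only view: depth reads the dword at the start of each
+       * 8-byte block, stencil reads byte 4 (hence the offset + 4 and
+       * 0xff mask below).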
*/ + LLVMValueRef packed; + struct lp_type fetch_type = lp_type_uint(type.width); + + if (format == PIPE_FORMAT_X32_S8X24_UINT) { + /* + * for stencil simply fix up offsets - could in fact change + * base_ptr instead even outside the shader. + */ + unsigned mask = (1 << 8) - 1; + LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4); + offset = LLVMBuildAdd(builder, offset, s_offset, ""); + packed = lp_build_gather(gallivm, type.length, 32, fetch_type, + aligned, base_ptr, offset, FALSE); + packed = LLVMBuildAnd(builder, packed, + lp_build_const_int_vec(gallivm, type, mask), ""); + } + else { + assert (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT); + packed = lp_build_gather(gallivm, type.length, 32, fetch_type, + aligned, base_ptr, offset, TRUE); + packed = LLVMBuildBitCast(builder, packed, + lp_build_vec_type(gallivm, type), ""); + } + /* for consistency with lp_build_unpack_rgba_soa() return sss1 or zzz1 */ + rgba_out[0] = rgba_out[1] = rgba_out[2] = packed; + rgba_out[3] = lp_build_const_vec(gallivm, type, 1.0f); + return; + } + + /* + * Try calling lp_build_fetch_rgba_aos for all pixels. + * Should only really hit subsampled, compressed + * (for s3tc srgb too, for rgtc the unorm ones only) by now. + * (This is invalid for plain 8unorm formats because we're lazy with + * the swizzle since some results would arrive swizzled, some not.) + */ + + if ((format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) && + (util_format_fits_8unorm(format_desc) || + format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) && + type.floating && type.width == 32 && + (type.length == 1 || (type.length % 4 == 0))) { + struct lp_type tmp_type; + struct lp_build_context bld; + LLVMValueRef packed, rgba[4]; + const struct util_format_description *flinear_desc; + const struct util_format_description *frgba8_desc; + unsigned chan; - function = LLVMGetNamedFunction(module, name); - if (!function) { - LLVMTypeRef ret_type; - LLVMTypeRef arg_types[4]; - LLVMTypeRef function_type; + lp_build_context_init(&bld, gallivm, type); - ret_type = LLVMVoidType(); - arg_types[0] = LLVMPointerType(LLVMFloatType(), 0); - arg_types[1] = LLVMPointerType(LLVMInt8Type(), 0); - arg_types[3] = arg_types[2] = LLVMIntType(sizeof(unsigned) * 8); - function_type = LLVMFunctionType(ret_type, arg_types, Elements(arg_types), 0); - function = LLVMAddFunction(module, name, function_type); + /* + * Make sure the conversion in aos really only does convert to rgba8 + * and not anything more (so use linear format, adjust type). + */ + flinear_desc = util_format_description(util_format_linear(format)); + memset(&tmp_type, 0, sizeof tmp_type); + tmp_type.width = 8; + tmp_type.length = type.length * 4; + tmp_type.norm = TRUE; - LLVMSetFunctionCallConv(function, LLVMCCallConv); - LLVMSetLinkage(function, LLVMExternalLinkage); + packed = lp_build_fetch_rgba_aos(gallivm, flinear_desc, tmp_type, + aligned, base_ptr, offset, i, j, cache); + packed = LLVMBuildBitCast(builder, packed, bld.int_vec_type, ""); - assert(LLVMIsDeclaration(function)); + /* + * The values are now packed so they match ordinary (srgb) RGBA8 format, + * hence need to use matching format for unpack. 
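+       * E.g. an srgb DXT1 block was decompressed by the AoS path above to
+       * plain 8-bit rgba, so unpacking it as R8G8B8A8_SRGB here is what
+       * performs the srgb-to-linear conversion on the still-encoded bytes.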
+ */ + frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_UNORM); + if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) { + assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC); + frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB); + } + lp_build_unpack_rgba_soa(gallivm, + frgba8_desc, + type, + packed, rgba); - LLVMAddGlobalMapping(lp_build_engine, function, format_desc->fetch_rgba_float); + /* + * We converted 4 channels. Make sure llvm can drop unneeded ones + * (luckily the rgba order is fixed, only LA needs special case). + */ + for (chan = 0; chan < 4; chan++) { + enum pipe_swizzle swizzle = format_desc->swizzle[chan]; + if (chan == 3 && util_format_is_luminance_alpha(format)) { + swizzle = PIPE_SWIZZLE_W; + } + rgba_out[chan] = lp_build_swizzle_soa_channel(&bld, rgba, swizzle); } + return; + } - for (chan = 0; chan < 4; ++chan) { - rgba[chan] = lp_build_undef(type); + + /* + * Fallback to calling lp_build_fetch_rgba_aos for each pixel. + * + * This is not the most efficient way of fetching pixels, as we + * miss some opportunities to do vectorization, but this is + * convenient for formats or scenarios for which there was no + * opportunity or incentive to optimize. + * + * We do NOT want to end up here, this typically is quite terrible, + * in particular if the formats have less than 4 channels. + * + * Right now, this should only be hit for: + * - RGTC snorm formats + * (those miss fast fetch functions hence they are terrible anyway) + */ + + { + unsigned k; + struct lp_type tmp_type; + LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32]; + + if (gallivm_debug & GALLIVM_DEBUG_PERF) { + debug_printf("%s: AoS fetch fallback for %s\n", + __FUNCTION__, format_desc->short_name); } - tmp = LLVMBuildArrayAlloca(builder, - LLVMFloatType(), - LLVMConstInt(LLVMInt32Type(), 4, 0), - ""); + tmp_type = type; + tmp_type.length = 4; /* - * Invoke format_desc->fetch_rgba_float() for each pixel and insert the result - * in the SoA vectors. + * Note that vector transpose can be worse compared to insert/extract + * for aos->soa conversion (for formats with 1 or 2 channels). However, + * we should try to avoid getting here for just about all formats, so + * don't bother. 
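+       * (A type.length of 8 thus costs 8 scalar AoS fetches of one
+       * 4-channel texel each, which convert_to_soa() then transposes
+       * into the four channel vectors.)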
*/ + /* loop over number of pixels */ for(k = 0; k < type.length; ++k) { - LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), k, 0); + LLVMValueRef index = lp_build_const_int32(gallivm, k); LLVMValueRef offset_elem; - LLVMValueRef ptr; LLVMValueRef i_elem, j_elem; - LLVMValueRef args[4]; - offset_elem = LLVMBuildExtractElement(builder, offset, index, ""); - ptr = LLVMBuildGEP(builder, base_ptr, &offset_elem, 1, ""); + offset_elem = LLVMBuildExtractElement(builder, offset, + index, ""); i_elem = LLVMBuildExtractElement(builder, i, index, ""); j_elem = LLVMBuildExtractElement(builder, j, index, ""); - args[0] = tmp; - args[1] = ptr; - args[2] = i_elem; - args[3] = j_elem; + /* Get a single float[4]={R,G,B,A} pixel */ + aos_fetch[k] = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type, + aligned, base_ptr, offset_elem, + i_elem, j_elem, cache); + + } + convert_to_soa(gallivm, aos_fetch, rgba_out, type); + } +} + +static void +lp_build_insert_soa_chan(struct lp_build_context *bld, + unsigned blockbits, + struct util_format_channel_description chan_desc, + LLVMValueRef *output, + LLVMValueRef rgba) +{ + struct gallivm_state *gallivm = bld->gallivm; + LLVMBuilderRef builder = gallivm->builder; + struct lp_type type = bld->type; + const unsigned width = chan_desc.size; + const unsigned start = chan_desc.shift; + const unsigned stop = start + width; + LLVMValueRef chan; + switch(chan_desc.type) { + case UTIL_FORMAT_TYPE_UNSIGNED: - LLVMBuildCall(builder, function, args, 4, ""); + if (chan_desc.pure_integer) + chan = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, ""); + else if (type.floating) { + if (chan_desc.normalized) + chan = lp_build_clamped_float_to_unsigned_norm(gallivm, type, width, rgba); + else + chan = LLVMBuildFPToSI(builder, rgba, bld->vec_type, ""); + } + if (start) + chan = LLVMBuildShl(builder, chan, + lp_build_const_int_vec(gallivm, type, start), ""); + if (!*output) + *output = chan; + else + *output = LLVMBuildOr(builder, *output, chan, ""); + break; + case UTIL_FORMAT_TYPE_SIGNED: + if (chan_desc.pure_integer) + chan = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, ""); + else if (type.floating) { + uint32_t mask_val = (1UL << chan_desc.size) - 1; + if (chan_desc.normalized) { + char intrin[32]; + double scale = ((1 << (chan_desc.size - 1)) - 1); + LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale); + rgba = lp_build_clamp(bld, rgba, lp_build_negate(bld, bld->one), bld->one); + rgba = LLVMBuildFMul(builder, rgba, scale_val, ""); + lp_format_intrinsic(intrin, sizeof intrin, "llvm.rint", bld->vec_type); + rgba = lp_build_intrinsic_unary(builder, intrin, bld->vec_type, rgba); + } + chan = LLVMBuildFPToSI(builder, rgba, bld->int_vec_type, ""); + chan = LLVMBuildAnd(builder, chan, lp_build_const_int_vec(gallivm, type, mask_val), ""); + } + if (start) + chan = LLVMBuildShl(builder, chan, + lp_build_const_int_vec(gallivm, type, start), ""); + if (!*output) + *output = chan; + else + *output = LLVMBuildOr(builder, *output, chan, ""); + break; + case UTIL_FORMAT_TYPE_FLOAT: + if (type.floating) { + if (chan_desc.size == 16) { + chan = lp_build_float_to_half(gallivm, rgba); + chan = LLVMBuildZExt(builder, chan, bld->int_vec_type, ""); + if (start) + chan = LLVMBuildShl(builder, chan, + lp_build_const_int_vec(gallivm, type, start), ""); + if (!*output) + *output = chan; + else + *output = LLVMBuildOr(builder, *output, chan, ""); + } else { + assert(start == 0); + assert(stop == 32); + assert(type.width == 32); + *output = LLVMBuildBitCast(builder, rgba, 
bld->int_vec_type, ""); + } + } else + assert(0); + break; + default: + assert(0); + *output = bld->undef; + } +} - for (chan = 0; chan < 4; ++chan) { - LLVMValueRef chan_val = LLVMConstInt(LLVMInt32Type(), chan, 0), - tmp_chan = LLVMBuildGEP(builder, tmp, &chan_val, 1, ""); - tmp_chan = LLVMBuildLoad(builder, tmp_chan, ""); - rgba[chan] = LLVMBuildInsertElement(builder, rgba[chan], tmp_chan, index, ""); - } +static void +lp_build_pack_rgba_soa(struct gallivm_state *gallivm, + const struct util_format_description *format_desc, + struct lp_type type, + const LLVMValueRef rgba_in[4], + LLVMValueRef *packed) +{ + unsigned chan; + struct lp_build_context bld; + assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN); + assert(format_desc->block.width == 1); + assert(format_desc->block.height == 1); + assert(format_desc->block.bits <= type.width); + /* FIXME: Support more output types */ + assert(type.width == 32); + + lp_build_context_init(&bld, gallivm, type); + for (chan = 0; chan < format_desc->nr_channels; ++chan) { + struct util_format_channel_description chan_desc = format_desc->channel[chan]; + + lp_build_insert_soa_chan(&bld, format_desc->block.bits, + chan_desc, + packed, + rgba_in[chan]); + } +} + +void +lp_build_store_rgba_soa(struct gallivm_state *gallivm, + const struct util_format_description *format_desc, + struct lp_type type, + LLVMValueRef exec_mask, + LLVMValueRef base_ptr, + LLVMValueRef offset, + LLVMValueRef out_of_bounds, + const LLVMValueRef rgba_in[4]) +{ + enum pipe_format format = format_desc->format; + LLVMValueRef packed[4]; + unsigned num_stores; + + memset(packed, 0, sizeof(LLVMValueRef) * 4); + if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && + format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB && + format_desc->block.width == 1 && + format_desc->block.height == 1 && + format_desc->block.bits <= type.width && + (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT || + format_desc->channel[0].size == 32 || + format_desc->channel[0].size == 16)) + { + lp_build_pack_rgba_soa(gallivm, format_desc, type, rgba_in, &packed[0]); + + num_stores = 1; + } else if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && + (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) && + format_desc->block.width == 1 && + format_desc->block.height == 1 && + format_desc->block.bits > type.width && + ((format_desc->block.bits <= type.width * type.length && + format_desc->channel[0].size <= type.width) || + (format_desc->channel[0].size == 64 && + format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT && + type.floating))) + { + /* + * Similar to above, but the packed pixel is larger than what fits + * into an element of the destination format. The packed pixels will be + * shuffled into SoA vectors appropriately, and then the extraction will + * be done in parallel as much as possible. + * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so + * the gathered vectors can be shuffled easily (even with avx). + * 64xn float -> 32xn float is handled too but it's a bit special as + * it does the conversion pre-shuffle. 
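+       * (On the store side there is no gather/shuffle step, though: each
+       * channel is OR'd into packed[chan_desc.shift / 32] below and the
+       * resulting dwords are written out with num_stores stores per pixel.)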
+ */ + struct lp_build_context bld; + + lp_build_context_init(&bld, gallivm, type); + assert(type.width == 32); + assert(format_desc->block.bits > type.width); + + unsigned store_width = util_next_power_of_two(format_desc->block.bits); + num_stores = store_width / type.width; + for (unsigned i = 0; i < format_desc->nr_channels; i++) { + struct util_format_channel_description chan_desc = format_desc->channel[i]; + unsigned blockbits = type.width; + unsigned vec_nr; + + vec_nr = chan_desc.shift / type.width; + chan_desc.shift %= type.width; + + lp_build_insert_soa_chan(&bld, blockbits, + chan_desc, + &packed[vec_nr], + rgba_in[i]); } + + assert(num_stores == 4 || num_stores == 2); + /* we can transpose and store at the same time */ + } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) { + packed[0] = lp_build_float_to_r11g11b10(gallivm, rgba_in); + num_stores = 1; + } else + assert(0); + + assert(exec_mask); + + LLVMTypeRef int32_ptr_type = LLVMPointerType(LLVMInt32TypeInContext(gallivm->context), 0); + LLVMTypeRef int16_ptr_type = LLVMPointerType(LLVMInt16TypeInContext(gallivm->context), 0); + LLVMTypeRef int8_ptr_type = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0); + + LLVMValueRef should_store_mask = LLVMBuildAnd(gallivm->builder, exec_mask, LLVMBuildNot(gallivm->builder, out_of_bounds, ""), "store_mask"); + should_store_mask = LLVMBuildICmp(gallivm->builder, LLVMIntNE, should_store_mask, lp_build_const_int_vec(gallivm, type, 0), ""); + for (unsigned i = 0; i < num_stores; i++) { + struct lp_build_loop_state loop_state; + + LLVMValueRef store_offset = LLVMBuildAdd(gallivm->builder, offset, lp_build_const_int_vec(gallivm, type, i * 4), ""); + store_offset = LLVMBuildGEP(gallivm->builder, base_ptr, &store_offset, 1, ""); + + lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0)); + + struct lp_build_if_state ifthen; + LLVMValueRef cond = LLVMBuildExtractElement(gallivm->builder, should_store_mask, loop_state.counter, ""); + lp_build_if(&ifthen, gallivm, cond); + + LLVMValueRef data = LLVMBuildExtractElement(gallivm->builder, packed[i], loop_state.counter, ""); + LLVMValueRef this_offset = LLVMBuildExtractElement(gallivm->builder, store_offset, loop_state.counter, ""); + + if (format_desc->block.bits == 8) { + this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int8_ptr_type, ""); + data = LLVMBuildTrunc(gallivm->builder, data, LLVMInt8TypeInContext(gallivm->context), ""); + } else if (format_desc->block.bits == 16) { + this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int16_ptr_type, ""); + data = LLVMBuildTrunc(gallivm->builder, data, LLVMInt16TypeInContext(gallivm->context), ""); + } else + this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int32_ptr_type, ""); + LLVMBuildStore(gallivm->builder, data, this_offset); + lp_build_endif(&ifthen); + lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, type.length), + NULL, LLVMIntUGE); } }