gallivm: optimize gather a bit, by using supplied destination type

[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_format_soa.c
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c

index c724a4453e6e0ad2b11417d7c4d68f4aff15a8d7..b3bc15552c52d165920cc8befe8d00e6a9ea0024 100644 (file)
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -37,7 +37,42 @@
  #include "lp_bld_conv.h"
  #include "lp_bld_swizzle.h"
  #include "lp_bld_gather.h"
+#include "lp_bld_debug.h"
  #include "lp_bld_format.h"
+#include "lp_bld_arit.h"
+#include "lp_bld_pack.h"
+
+
+static void
+convert_to_soa(struct gallivm_state *gallivm,
+               LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32],
+               LLVMValueRef dst_soa[4],
+               const struct lp_type soa_type)
+{
+   unsigned j, k;
+   struct lp_type aos_channel_type = soa_type;
+
+   LLVMValueRef aos_channels[4];
+   unsigned pixels_per_channel = soa_type.length / 4;
+
+   debug_assert((soa_type.length % 4) == 0);
+
+   aos_channel_type.length >>= 1;
+
+   for (j = 0; j < 4; ++j) {
+      LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 };
+
+      assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);
+
+      for (k = 0; k < pixels_per_channel; ++k) {
+         channel[k] = src_aos[j + 4 * k];
+      }
+
+      aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel);
+   }
+
+   lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa);
+}
  
  
  void
@@ -46,26 +81,32 @@ lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
                              const LLVMValueRef *unswizzled,
                              LLVMValueRef swizzled_out[4])
  {
-   assert(UTIL_FORMAT_SWIZZLE_0 == PIPE_SWIZZLE_ZERO);
-   assert(UTIL_FORMAT_SWIZZLE_1 == PIPE_SWIZZLE_ONE);
-
     if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
+      enum pipe_swizzle swizzle;
+      LLVMValueRef depth_or_stencil;
+
+      if (util_format_has_stencil(format_desc) &&
+          !util_format_has_depth(format_desc)) {
+         assert(!bld->type.floating);
+         swizzle = format_desc->swizzle[1];
+      }
+      else {
+         assert(bld->type.floating);
+         swizzle = format_desc->swizzle[0];
+      }
        /*
-       * Return zzz1 for depth-stencil formats.
-       *
-       * XXX: Allow to control the depth swizzle with an additional parameter,
-       * as the caller may wish another depth swizzle, or retain the stencil
-       * value.
+       * Return zzz1 or sss1 for depth-stencil formats here.
+       * Correct swizzling will be handled by apply_sampler_swizzle() later.
         */
-      enum util_format_swizzle swizzle = format_desc->swizzle[0];
-      LLVMValueRef depth = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
-      swizzled_out[2] = swizzled_out[1] = swizzled_out[0] = depth;
+      depth_or_stencil = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
+
+      swizzled_out[2] = swizzled_out[1] = swizzled_out[0] = depth_or_stencil;
        swizzled_out[3] = bld->one;
     }
     else {
        unsigned chan;
        for (chan = 0; chan < 4; ++chan) {
-         enum util_format_swizzle swizzle = format_desc->swizzle[chan];
+         enum pipe_swizzle swizzle = format_desc->swizzle[chan];
           swizzled_out[chan] = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
        }
     }
@@ -96,15 +137,15 @@ lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
   * \param rgba_out  returns the SoA R,G,B,A vectors
   */
  void
-lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
+lp_build_unpack_rgba_soa(struct gallivm_state *gallivm,
                           const struct util_format_description *format_desc,
                           struct lp_type type,
                           LLVMValueRef packed,
                           LLVMValueRef rgba_out[4])
  {
+   LLVMBuilderRef builder = gallivm->builder;
     struct lp_build_context bld;
     LLVMValueRef inputs[4];
-   unsigned start;
     unsigned chan;
  
     assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
@@ -112,15 +153,14 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
     assert(format_desc->block.height == 1);
     assert(format_desc->block.bits <= type.width);
     /* FIXME: Support more output types */
-   assert(type.floating);
     assert(type.width == 32);
  
-   lp_build_context_init(&bld, builder, type);
+   lp_build_context_init(&bld, gallivm, type);
  
     /* Decode the input vector components */
-   start = 0;
     for (chan = 0; chan < format_desc->nr_channels; ++chan) {
        const unsigned width = format_desc->channel[chan].size;
+      const unsigned start = format_desc->channel[chan].shift;
        const unsigned stop = start + width;
        LLVMValueRef input;
  
@@ -128,7 +168,7 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
  
        switch(format_desc->channel[chan].type) {
        case UTIL_FORMAT_TYPE_VOID:
-         input = lp_build_undef(type);
+         input = lp_build_undef(gallivm, type);
           break;
  
        case UTIL_FORMAT_TYPE_UNSIGNED:
@@ -137,7 +177,7 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
            */
  
           if (start) {
-            input = LLVMBuildLShr(builder, input, lp_build_const_int_vec(type, start), "");
+            input = LLVMBuildLShr(builder, input, lp_build_const_int_vec(gallivm, type, start), "");
           }
  
           /*
@@ -146,7 +186,7 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
  
           if (stop < format_desc->block.bits) {
              unsigned mask = ((unsigned long long)1 << width) - 1;
-            input = LLVMBuildAnd(builder, input, lp_build_const_int_vec(type, mask), "");
+            input = LLVMBuildAnd(builder, input, lp_build_const_int_vec(gallivm, type, mask), "");
           }
  
           /*
@@ -154,15 +194,28 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
            */
  
           if (type.floating) {
-            if(format_desc->channel[chan].normalized)
-               input = lp_build_unsigned_norm_to_float(builder, width, type, input);
-            else
-               input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(type), "");
+            if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
+               if (format_desc->swizzle[3] == chan) {
+                  input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
+               }
+               else {
+                  struct lp_type conv_type = lp_uint_type(type);
+                  input = lp_build_srgb_to_linear(gallivm, conv_type, width, input);
+               }
+            }
+            else {
+               if(format_desc->channel[chan].normalized)
+                  input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
+               else
+                  input = LLVMBuildSIToFP(builder, input,
+                                          lp_build_vec_type(gallivm, type), "");
+            }
           }
-         else {
-            /* FIXME */
-            assert(0);
-            input = lp_build_undef(type);
+         else if (format_desc->channel[chan].pure_integer) {
+            /* Nothing to do */
+         } else {
+             /* FIXME */
+             assert(0);
           }
  
           break;
@@ -174,7 +227,7 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
  
           if (stop < type.width) {
              unsigned bits = type.width - stop;
-            LLVMValueRef bits_val = lp_build_const_int_vec(type, bits);
+            LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
              input = LLVMBuildShl(builder, input, bits_val, "");
           }
  
@@ -184,7 +237,7 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
  
           if (format_desc->channel[chan].size < type.width) {
              unsigned bits = type.width - format_desc->channel[chan].size;
-            LLVMValueRef bits_val = lp_build_const_int_vec(type, bits);
+            LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
              input = LLVMBuildAShr(builder, input, bits_val, "");
           }
  
@@ -193,79 +246,113 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
            */
  
           if (type.floating) {
-            input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(type), "");
+            input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(gallivm, type), "");
              if (format_desc->channel[chan].normalized) {
                 double scale = 1.0 / ((1 << (format_desc->channel[chan].size - 1)) - 1);
-               LLVMValueRef scale_val = lp_build_const_vec(type, scale);
+               LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
                 input = LLVMBuildFMul(builder, input, scale_val, "");
+               /* The formula above produces a value below -1.0 for the most negative
+                * input, but everything seems happy with that, hence disabled for now. */
+               if (0)
+                  input = lp_build_max(&bld, input,
+                                       lp_build_const_vec(gallivm, type, -1.0f));
              }
           }
-         else {
-            /* FIXME */
-            assert(0);
-            input = lp_build_undef(type);
+         else if (format_desc->channel[chan].pure_integer) {
+            /* Nothing to do */
+         } else {
+             /* FIXME */
+             assert(0);
           }
  
           break;
  
        case UTIL_FORMAT_TYPE_FLOAT:
           if (type.floating) {
-            assert(start == 0);
-            assert(stop == 32);
-            assert(type.width == 32);
-            input = LLVMBuildBitCast(builder, input, lp_build_vec_type(type), "");
+            if (format_desc->channel[chan].size == 16) {
+               struct lp_type f16i_type = type;
+               f16i_type.width /= 2;
+               f16i_type.floating = 0;
+               if (start) {
+                  input = LLVMBuildLShr(builder, input,
+                             lp_build_const_int_vec(gallivm, type, start), "");
+               }
+               input = LLVMBuildTrunc(builder, input,
+                                      lp_build_vec_type(gallivm, f16i_type), "");
+               input = lp_build_half_to_float(gallivm, input);
+            } else {
+               assert(start == 0);
+               assert(stop == 32);
+               assert(type.width == 32);
+            }
+            input = LLVMBuildBitCast(builder, input, lp_build_vec_type(gallivm, type), "");
           }
           else {
              /* FIXME */
              assert(0);
-            input = lp_build_undef(type);
+            input = lp_build_undef(gallivm, type);
           }
           break;
  
        case UTIL_FORMAT_TYPE_FIXED:
           if (type.floating) {
              double scale = 1.0 / ((1 << (format_desc->channel[chan].size/2)) - 1);
-            LLVMValueRef scale_val = lp_build_const_vec(type, scale);
-            input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(type), "");
+            LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
+            input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(gallivm, type), "");
              input = LLVMBuildFMul(builder, input, scale_val, "");
           }
           else {
              /* FIXME */
              assert(0);
-            input = lp_build_undef(type);
+            input = lp_build_undef(gallivm, type);
           }
           break;
  
        default:
           assert(0);
-         input = lp_build_undef(type);
+         input = lp_build_undef(gallivm, type);
           break;
        }
  
        inputs[chan] = input;
-
-      start = stop;
     }
  
     lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out);
  }
  
  
+/**
+ * Convert a vector of rgba8 values into 32bit wide SoA vectors.
+ *
+ * \param dst_type  The desired return type. For pure integer formats
+ *                  this should be a 32bit wide int or uint vector type,
+ *                  otherwise a float vector type.
+ *
+ * \param packed    The packed rgba8 values to unpack.
+ *
+ * \param rgba      The 4 SoA return vectors.
+ */
  void
-lp_build_rgba8_to_f32_soa(LLVMBuilderRef builder,
-                          struct lp_type dst_type,
-                          LLVMValueRef packed,
-                          LLVMValueRef *rgba)
+lp_build_rgba8_to_fi32_soa(struct gallivm_state *gallivm,
+                           struct lp_type dst_type,
+                           LLVMValueRef packed,
+                           LLVMValueRef *rgba)
  {
-   LLVMValueRef mask = lp_build_const_int_vec(dst_type, 0xff);
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef mask = lp_build_const_int_vec(gallivm, dst_type, 0xff);
     unsigned chan;
  
+   /* XXX technically shouldn't use that for uint dst_type */
     packed = LLVMBuildBitCast(builder, packed,
-                             lp_build_int_vec_type(dst_type), "");
+                             lp_build_int_vec_type(gallivm, dst_type), "");
  
     /* Decode the input vector components */
     for (chan = 0; chan < 4; ++chan) {
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
        unsigned start = chan*8;
+#else
+      unsigned start = (3-chan)*8;
+#endif
        unsigned stop = start + 8;
        LLVMValueRef input;
  
@@ -273,12 +360,13 @@ lp_build_rgba8_to_f32_soa(LLVMBuilderRef builder,
  
        if (start)
           input = LLVMBuildLShr(builder, input,
-                               lp_build_const_int_vec(dst_type, start), "");
+                               lp_build_const_int_vec(gallivm, dst_type, start), "");
  
        if (stop < 32)
           input = LLVMBuildAnd(builder, input, mask, "");
  
-      input = lp_build_unsigned_norm_to_float(builder, 8, dst_type, input);
+      if (dst_type.floating)
+         input = lp_build_unsigned_norm_to_float(gallivm, 8, dst_type, input);
  
        rgba[chan] = input;
     }
@@ -291,35 +379,45 @@ lp_build_rgba8_to_f32_soa(LLVMBuilderRef builder,
   *
   * \param type  the desired return type for 'rgba'.  The vector length
   *              is the number of texels to fetch
+ * \param aligned if the offset is guaranteed to be aligned to element width
   *
- * \param base_ptr  points to start of the texture image block.  For non-
- *                  compressed formats, this simply points to the texel.
- *                  For compressed formats, it points to the start of the
+ * \param base_ptr  points to the base of the texture mip tree.
+ * \param offset    offset to start of the texture image block.  For non-
+ *                  compressed formats, this simply is an offset to the texel.
+ *                  For compressed formats, it is an offset to the start of the
   *                  compressed data block.
   *
   * \param i, j  the sub-block pixel coordinates.  For non-compressed formats
   *              these will always be (0,0).  For compressed formats, i will
   *              be in [0, block_width-1] and j will be in [0, block_height-1].
+ * \param cache  optional value pointing to a lp_build_format_cache structure
   */
  void
-lp_build_fetch_rgba_soa(LLVMBuilderRef builder,
+lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
                          const struct util_format_description *format_desc,
                          struct lp_type type,
+                        boolean aligned,
                          LLVMValueRef base_ptr,
                          LLVMValueRef offset,
                          LLVMValueRef i,
                          LLVMValueRef j,
+                        LLVMValueRef cache,
                          LLVMValueRef rgba_out[4])
  {
+   LLVMBuilderRef builder = gallivm->builder;
+   enum pipe_format format = format_desc->format;
+   struct lp_type fetch_type;
  
     if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
         (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
+        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB ||
          format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
         format_desc->block.width == 1 &&
         format_desc->block.height == 1 &&
         format_desc->block.bits <= type.width &&
         (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
-        format_desc->channel[0].size == 32))
+        format_desc->channel[0].size == 32 ||
+        format_desc->channel[0].size == 16))
     {
        /*
         * The packed pixel fits into an element of the destination format. Put
@@ -331,30 +429,94 @@ lp_build_fetch_rgba_soa(LLVMBuilderRef builder,
  
        /*
         * gather the texels from the texture
-       * Ex: packed = {BGRA, BGRA, BGRA, BGRA}.
+       * Ex: packed = {XYZW, XYZW, XYZW, XYZW}
         */
-      packed = lp_build_gather(builder,
+      assert(format_desc->block.bits <= type.width);
+      fetch_type = lp_type_uint(type.width);
+      packed = lp_build_gather(gallivm,
                                 type.length,
                                 format_desc->block.bits,
-                               type.width,
-                               base_ptr, offset);
+                               fetch_type,
+                               aligned,
+                               base_ptr, offset, FALSE);
  
        /*
         * convert texels to float rgba
         */
-      lp_build_unpack_rgba_soa(builder,
+      lp_build_unpack_rgba_soa(gallivm,
                                 format_desc,
                                 type,
                                 packed, rgba_out);
        return;
     }
  
+   if (format == PIPE_FORMAT_R11G11B10_FLOAT ||
+       format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
+      /*
+       * similar conceptually to above but requiring special
+       * AoS packed -> SoA float conversion code.
+       */
+      LLVMValueRef packed;
+      struct lp_type fetch_type = lp_type_uint(type.width);
+
+      assert(type.floating);
+      assert(type.width == 32);
+
+      packed = lp_build_gather(gallivm, type.length,
+                               format_desc->block.bits,
+                               fetch_type, aligned,
+                               base_ptr, offset, FALSE);
+      if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
+         lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);
+      }
+      else {
+         lp_build_rgb9e5_to_float(gallivm, packed, rgba_out);
+      }
+      return;
+   }
+
+   if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS &&
+       format_desc->block.bits == 64) {
+      /*
+       * Special case: the format is 64 bits, but we only require
+       * 32 bits (or 8 bits) from each block.
+       */
+      LLVMValueRef packed;
+      struct lp_type fetch_type = lp_type_uint(type.width);
+
+      if (format == PIPE_FORMAT_X32_S8X24_UINT) {
+         /*
+          * for stencil simply fix up offsets - could in fact change
+          * base_ptr instead even outside the shader.
+          */
+         unsigned mask = (1 << 8) - 1;
+         LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4);
+         offset = LLVMBuildAdd(builder, offset, s_offset, "");
+         packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
+                                  aligned, base_ptr, offset, FALSE);
+         packed = LLVMBuildAnd(builder, packed,
+                               lp_build_const_int_vec(gallivm, type, mask), "");
+      }
+      else {
+         assert (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
+         packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
+                                  aligned, base_ptr, offset, TRUE);
+         packed = LLVMBuildBitCast(builder, packed,
+                                   lp_build_vec_type(gallivm, type), "");
+      }
+      /* for consistency with lp_build_unpack_rgba_soa() return sss1 or zzz1 */
+      rgba_out[0] = rgba_out[1] = rgba_out[2] = packed;
+      rgba_out[3] = lp_build_const_vec(gallivm, type, 1.0f);
+      return;
+   }
+
     /*
      * Try calling lp_build_fetch_rgba_aos for all pixels.
      */
  
     if (util_format_fits_8unorm(format_desc) &&
-       type.floating && type.width == 32 && type.length == 4) {
+       type.floating && type.width == 32 &&
+       (type.length == 1 || (type.length % 4 == 0))) {
        struct lp_type tmp_type;
        LLVMValueRef tmp;
  
@@ -363,10 +525,10 @@ lp_build_fetch_rgba_soa(LLVMBuilderRef builder,
        tmp_type.length = type.length * 4;
        tmp_type.norm = TRUE;
  
-      tmp = lp_build_fetch_rgba_aos(builder, format_desc, tmp_type,
-                                    base_ptr, offset, i, j);
+      tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
+                                    aligned, base_ptr, offset, i, j, cache);
  
-      lp_build_rgba8_to_f32_soa(builder,
+      lp_build_rgba8_to_fi32_soa(gallivm,
                                  type,
                                  tmp,
                                  rgba_out);
@@ -374,6 +536,40 @@ lp_build_fetch_rgba_soa(LLVMBuilderRef builder,
        return;
     }
  
+   if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC &&
+       /* non-srgb case is already handled above */
+       format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB &&
+       type.floating && type.width == 32 &&
+       (type.length == 1 || (type.length % 4 == 0)) &&
+       cache) {
+      const struct util_format_description *format_decompressed;
+      const struct util_format_description *flinear_desc;
+      LLVMValueRef packed;
+      flinear_desc = util_format_description(util_format_linear(format_desc->format));
+      /* This probably only works with aligned data */
+      packed = lp_build_fetch_cached_texels(gallivm,
+                                            flinear_desc,
+                                            type.length,
+                                            base_ptr,
+                                            offset,
+                                            i, j,
+                                            cache);
+      packed = LLVMBuildBitCast(builder, packed,
+                                lp_build_int_vec_type(gallivm, type), "");
+      /*
+       * The values are now packed so they match ordinary srgb RGBA8 format,
+       * hence need to use matching format for unpack.
+       */
+      format_decompressed = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB);
+
+      lp_build_unpack_rgba_soa(gallivm,
+                               format_decompressed,
+                               type,
+                               packed, rgba_out);
+
+      return;
+   }
+
     /*
      * Fallback to calling lp_build_fetch_rgba_aos for each pixel.
      *
@@ -384,43 +580,43 @@ lp_build_fetch_rgba_soa(LLVMBuilderRef builder,
      */
  
     {
-      unsigned k, chan;
+      unsigned k;
        struct lp_type tmp_type;
+      LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32];
+
+      if (gallivm_debug & GALLIVM_DEBUG_PERF) {
+         debug_printf("%s: AoS fetch fallback for %s\n",
+                      __FUNCTION__, format_desc->short_name);
+      }
  
        tmp_type = type;
        tmp_type.length = 4;
  
-      for (chan = 0; chan < 4; ++chan) {
-         rgba_out[chan] = lp_build_undef(type);
-      }
+      /*
+       * Note that vector transpose can be worse compared to insert/extract
+       * for aos->soa conversion (for formats with 1 or 2 channels). However,
+       * we should try to avoid getting here for just about all formats, so
+       * don't bother.
+       */
  
        /* loop over number of pixels */
        for(k = 0; k < type.length; ++k) {
-         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), k, 0);
+         LLVMValueRef index = lp_build_const_int32(gallivm, k);
           LLVMValueRef offset_elem;
           LLVMValueRef i_elem, j_elem;
-         LLVMValueRef tmp;
  
-         offset_elem = LLVMBuildExtractElement(builder, offset, index, "");
+         offset_elem = LLVMBuildExtractElement(builder, offset,
+                                               index, "");
  
           i_elem = LLVMBuildExtractElement(builder, i, index, "");
           j_elem = LLVMBuildExtractElement(builder, j, index, "");
  
           /* Get a single float[4]={R,G,B,A} pixel */
-         tmp = lp_build_fetch_rgba_aos(builder, format_desc, tmp_type,
-                                       base_ptr, offset_elem,
-                                       i_elem, j_elem);
+         aos_fetch[k] = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
+                                                aligned, base_ptr, offset_elem,
+                                                i_elem, j_elem, cache);
  
-         /*
-          * Insert the AoS tmp value channels into the SoA result vectors at
-          * position = 'index'.
-          */
-         for (chan = 0; chan < 4; ++chan) {
-            LLVMValueRef chan_val = LLVMConstInt(LLVMInt32Type(), chan, 0),
-            tmp_chan = LLVMBuildExtractElement(builder, tmp, chan_val, "");
-            rgba_out[chan] = LLVMBuildInsertElement(builder, rgba_out[chan],
-                                                    tmp_chan, index, "");
-         }
        }
+      convert_to_soa(gallivm, aos_fetch, rgba_out, type);
     }
  }