gallivm/sample: change texture function generator api

[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_sample_soa.c
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c

index 378c562bb9fe43067b6c88618fef7945b8b2f9f8..86c70bc3088306045f33cb015bb6157d3d8ef241 100644 (file)
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -40,9 +40,9 @@
  #include "util/u_dump.h"
  #include "util/u_memory.h"
  #include "util/u_math.h"
-#include "util/u_format.h"
+#include "util/format/u_format.h"
  #include "util/u_cpu_detect.h"
-#include "util/u_format_rgb9e5.h"
+#include "util/format_rgb9e5.h"
  #include "lp_bld_debug.h"
  #include "lp_bld_type.h"
  #include "lp_bld_const.h"
@@ -60,6 +60,8 @@
  #include "lp_bld_struct.h"
  #include "lp_bld_quad.h"
  #include "lp_bld_pack.h"
+#include "lp_bld_intr.h"
+#include "lp_bld_misc.h"
  
  
  /**
@@ -158,9 +160,10 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
  
     lp_build_fetch_rgba_soa(bld->gallivm,
                             bld->format_desc,
-                           bld->texel_type,
+                           bld->texel_type, TRUE,
                             data_ptr, offset,
                             i, j,
+                           bld->cache,
                             texel_out);
  
     /*
@@ -192,9 +195,17 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
        for (chan = 0; chan < 4; chan++) {
           unsigned chan_s;
           /* reverse-map channel... */
-         for (chan_s = 0; chan_s < 4; chan_s++) {
-            if (chan_s == format_desc->swizzle[chan]) {
+         if (util_format_has_stencil(format_desc)) {
+            if (chan == 0)
+               chan_s = 0;
+            else
                 break;
+         }
+         else {
+            for (chan_s = 0; chan_s < 4; chan_s++) {
+               if (chan_s == format_desc->swizzle[chan]) {
+                  break;
+               }
              }
           }
           if (chan_s <= 3) {
@@ -216,29 +227,42 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
  
  
  /**
- * Helper to compute the mirror function for the PIPE_WRAP_MIRROR modes.
+ * Helper to compute the mirror function for the PIPE_WRAP_MIRROR_REPEAT mode.
+ * (Note that with pot sizes could do this much more easily post-scale
+ * with some bit arithmetic.)
   */
  static LLVMValueRef
  lp_build_coord_mirror(struct lp_build_sample_context *bld,
-                      LLVMValueRef coord)
+                      LLVMValueRef coord, boolean posOnly)
  {
     struct lp_build_context *coord_bld = &bld->coord_bld;
-   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
-   LLVMValueRef fract, flr, isOdd;
-
-   lp_build_ifloor_fract(coord_bld, coord, &flr, &fract);
-
-   /* isOdd = flr & 1 */
-   isOdd = LLVMBuildAnd(bld->gallivm->builder, flr, int_coord_bld->one, "");
+   LLVMValueRef fract;
+   LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
  
-   /* make coord positive or negative depending on isOdd */
-   coord = lp_build_set_sign(coord_bld, fract, isOdd);
+   /*
+    * We can just use 2*(x - round(0.5*x)) to do all the mirroring,
+    * it all works out. (The result is in range [-1, 1.0], negative if
+    * the coord is in the "odd" section, otherwise positive.)
+    */
  
-   /* convert isOdd to float */
-   isOdd = lp_build_int_to_float(coord_bld, isOdd);
+   coord = lp_build_mul(coord_bld, coord, half);
+   fract = lp_build_round(coord_bld, coord);
+   fract = lp_build_sub(coord_bld, coord, fract);
+   coord = lp_build_add(coord_bld, fract, fract);
  
-   /* add isOdd to coord */
-   coord = lp_build_add(coord_bld, coord, isOdd);
+   if (posOnly) {
+      /*
+       * Theoretically it's not quite 100% accurate because the spec says
+       * that ultimately a scaled coord of -x.0 should map to int coord
+       * -x + 1 with mirroring, not -x (this does not matter for bilinear
+       * filtering).
+       */
+      coord = lp_build_abs(coord_bld, coord);
+      /* kill off NaNs */
+      /* XXX: not safe without arch rounding, fract can be anything. */
+      coord = lp_build_max_ext(coord_bld, coord, coord_bld->zero,
+                               GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
+   }
  
     return coord;
  }
@@ -271,10 +295,15 @@ lp_build_coord_repeat_npot_linear(struct lp_build_sample_context *bld,
      * we avoided the 0.5/length division before the repeat wrap,
      * now need to fix up edge cases with selects
      */
+   /*
+    * Note we do a float (unordered) compare so we can eliminate NaNs.
+    * (Otherwise would need fract_safe above).
+    */
+   mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
+                           PIPE_FUNC_LESS, coord_f, coord_bld->zero);
+
     /* convert to int, compute lerp weight */
     lp_build_ifloor_fract(coord_bld, coord_f, coord0_i, weight_f);
-   mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
-                           PIPE_FUNC_LESS, *coord0_i, int_coord_bld->zero);
     *coord0_i = lp_build_select(int_coord_bld, mask, length_minus_one, *coord0_i);
  }
  
@@ -287,6 +316,7 @@ lp_build_coord_repeat_npot_linear(struct lp_build_sample_context *bld,
   */
  static void
  lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
+                            boolean is_gather,
                              LLVMValueRef coord,
                              LLVMValueRef length,
                              LLVMValueRef length_f,
@@ -349,7 +379,13 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
           coord = lp_build_add(coord_bld, coord, offset);
        }
  
-      /* clamp to [0, length] */
+      /*
+       * clamp to [0, length]
+       *
+       * Unlike some other wrap modes, this should be correct for gather
+       * too. GL_CLAMP explicitly does this clamp on the coord prior to
+       * actual wrapping (which is per sample).
+       */
        coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, length_f);
  
        coord = lp_build_sub(coord_bld, coord, half);
@@ -374,14 +410,31 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
           }
  
           /* clamp to length max */
-         coord = lp_build_min(coord_bld, coord, length_f);
-         /* subtract 0.5 */
-         coord = lp_build_sub(coord_bld, coord, half);
-         /* clamp to [0, length - 0.5] */
-         coord = lp_build_max(coord_bld, coord, coord_bld->zero);
-         /* convert to int, compute lerp weight */
-         lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
-         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+         coord = lp_build_min_ext(coord_bld, coord, length_f,
+                                  GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
+         if (!is_gather) {
+            /* subtract 0.5 */
+            coord = lp_build_sub(coord_bld, coord, half);
+            /* clamp to [0, length - 0.5] */
+            coord = lp_build_max(coord_bld, coord, coord_bld->zero);
+            /* convert to int, compute lerp weight */
+            lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
+            coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+         } else {
+            /*
+             * The non-gather path will end up with coords 0, 1 if coord was
+             * smaller than 0.5 (with corresponding weight 0.0 so it doesn't
+             * really matter what the second coord is). But for gather, we
+             * really need to end up with coords 0, 0.
+             */
+            coord = lp_build_max(coord_bld, coord, coord_bld->zero);
+            coord0 = lp_build_sub(coord_bld, coord, half);
+            coord1 = lp_build_add(coord_bld, coord, half);
+            /* Values range ([-0.5, length_f - 0.5], [0.5, length_f + 0.5] */
+            coord0 = lp_build_itrunc(coord_bld, coord0);
+            coord1 = lp_build_itrunc(coord_bld, coord1);
+            weight = coord_bld->undef;
+         }
           /* coord1 = min(coord1, length-1) */
           coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
           break;
@@ -396,8 +449,13 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
           offset = lp_build_int_to_float(coord_bld, offset);
           coord = lp_build_add(coord_bld, coord, offset);
        }
-      /* was: clamp to [-0.5, length + 0.5], then sub 0.5 */
-      /* can skip clamp (though might not work for very large coord values */
+      /*
+       * We don't need any clamp. Technically, for very large (pos or neg)
+       * (or infinite) values, clamp against [-length, length] would be
+       * correct, but we don't need to guarantee any specific
+       * result for such coords (the ifloor will be undefined, but for modes
+       * requiring border all resulting coords are safe).
+       */
        coord = lp_build_sub(coord_bld, coord, half);
        /* convert to int, compute lerp weight */
        lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
@@ -405,25 +463,69 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
        break;
  
     case PIPE_TEX_WRAP_MIRROR_REPEAT:
-      /* compute mirror function */
-      coord = lp_build_coord_mirror(bld, coord);
-
-      /* scale coord to length */
-      coord = lp_build_mul(coord_bld, coord, length_f);
-      coord = lp_build_sub(coord_bld, coord, half);
        if (offset) {
           offset = lp_build_int_to_float(coord_bld, offset);
+         offset = lp_build_div(coord_bld, offset, length_f);
           coord = lp_build_add(coord_bld, coord, offset);
        }
+      if (!is_gather) {
+         /* compute mirror function */
+         coord = lp_build_coord_mirror(bld, coord, TRUE);
  
-      /* convert to int, compute lerp weight */
-      lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
-      coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+         /* scale coord to length */
+         coord = lp_build_mul(coord_bld, coord, length_f);
+         coord = lp_build_sub(coord_bld, coord, half);
  
-      /* coord0 = max(coord0, 0) */
-      coord0 = lp_build_max(int_coord_bld, coord0, int_coord_bld->zero);
-      /* coord1 = min(coord1, length-1) */
-      coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
+         /* convert to int, compute lerp weight */
+         lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
+         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+
+         /* coord0 = max(coord0, 0) */
+         coord0 = lp_build_max(int_coord_bld, coord0, int_coord_bld->zero);
+         /* coord1 = min(coord1, length-1) */
+         coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
+      } else {
+         /*
+          * This is pretty reasonable in the end,  all what the tests care
+          * about is nasty edge cases (scaled coords x.5, so the individual
+          * coords are actually integers, which is REALLY tricky to get right
+          * due to this working differently both for negative numbers as well
+          * as for even/odd cases). But with enough magic it's not too complex
+          * after all.
+          * Maybe should try a bit arithmetic one though for POT textures...
+          */
+         LLVMValueRef isNeg;
+         /*
+          * Wrapping just once still works, even though it means we can
+          * get "wrong" sign due to performing mirror in the middle of the
+          * two coords (because this can only happen very near the odd/even
+          * edges, so both coords will actually end up as 0 or length - 1
+          * in the end).
+          * For GL4 gather with per-sample offsets we'd need to the mirroring
+          * per coord too.
+          */
+         coord = lp_build_coord_mirror(bld, coord, FALSE);
+         coord = lp_build_mul(coord_bld, coord, length_f);
+
+         /*
+          * NaNs should be safe here, we'll do away with them with
+          * the ones' complement plus min.
+          */
+         coord0 = lp_build_sub(coord_bld, coord, half);
+         coord0 = lp_build_ifloor(coord_bld, coord0);
+         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+         /* ones complement for neg numbers (mirror(negX) = X - 1)  */
+         isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS,
+                              coord0, int_coord_bld->zero);
+         coord0 = lp_build_xor(int_coord_bld, coord0, isNeg);
+         isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS,
+                              coord1, int_coord_bld->zero);
+         coord1 = lp_build_xor(int_coord_bld, coord1, isNeg);
+         coord0 = lp_build_min(int_coord_bld, coord0, length_minus_one);
+         coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
+
+         weight = coord_bld->undef;
+      }
        break;
  
     case PIPE_TEX_WRAP_MIRROR_CLAMP:
@@ -435,10 +537,19 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
           offset = lp_build_int_to_float(coord_bld, offset);
           coord = lp_build_add(coord_bld, coord, offset);
        }
+      /*
+       * XXX: probably not correct for gather, albeit I'm not
+       * entirely sure as it's poorly specified. The wrapping looks
+       * correct according to the spec which is against gl 1.2.1,
+       * however negative values will be swapped - gl re-specified
+       * wrapping with newer versions (no more pre-clamp except with
+       * GL_CLAMP).
+       */
        coord = lp_build_abs(coord_bld, coord);
  
        /* clamp to [0, length] */
-      coord = lp_build_min(coord_bld, coord, length_f);
+      coord = lp_build_min_ext(coord_bld, coord, length_f,
+                               GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
  
        coord = lp_build_sub(coord_bld, coord, half);
  
@@ -460,20 +571,59 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
              offset = lp_build_int_to_float(coord_bld, offset);
              coord = lp_build_add(coord_bld, coord, offset);
           }
-         coord = lp_build_abs(coord_bld, coord);
-
-         /* clamp to length max */
-         coord = lp_build_min(coord_bld, coord, length_f);
-         /* subtract 0.5 */
-         coord = lp_build_sub(coord_bld, coord, half);
-         /* clamp to [0, length - 0.5] */
-         coord = lp_build_max(coord_bld, coord, coord_bld->zero);
+         if (!is_gather) {
+            coord = lp_build_abs(coord_bld, coord);
+
+            /* clamp to length max */
+            coord = lp_build_min_ext(coord_bld, coord, length_f,
+                                     GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
+            /* subtract 0.5 */
+            coord = lp_build_sub(coord_bld, coord, half);
+            /* clamp to [0, length - 0.5] */
+            coord = lp_build_max(coord_bld, coord, coord_bld->zero);
+
+            /* convert to int, compute lerp weight */
+            lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
+            coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+            /* coord1 = min(coord1, length-1) */
+            coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
+         } else {
+            /*
+             * The non-gather path will swap coord0/1 if coord was negative,
+             * which is ok for filtering since the filter weight matches
+             * accordingly. Also, if coord is close to zero, coord0/1 will
+             * be 0 and 1, instead of 0 and 0 (again ok due to filter
+             * weight being 0.0). Both issues need to be fixed for gather.
+             */
+            LLVMValueRef isNeg;
  
-         /* convert to int, compute lerp weight */
-         lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
-         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
-         /* coord1 = min(coord1, length-1) */
-         coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
+            /*
+             * Actually wanted to cheat here and use:
+             * coord1 = lp_build_iround(coord_bld, coord);
+             * but it's not good enough for some tests (even piglit
+             * textureGather is set up in a way so the coords area always
+             * .5, that is right at the crossover points).
+             * So do ordinary sub/floor, then do ones' complement
+             * for negative numbers.
+             * (Note can't just do sub|add/abs/itrunc per coord neither -
+             * because the spec demands that mirror(3.0) = 3 but
+             * mirror(-3.0) = 2.)
+             */
+            coord = lp_build_sub(coord_bld, coord, half);
+            coord0 = lp_build_ifloor(coord_bld, coord);
+            coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+            isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, coord0,
+                                 int_coord_bld->zero);
+            coord0 = lp_build_xor(int_coord_bld, isNeg, coord0);
+            coord0 = lp_build_min(int_coord_bld, coord0, length_minus_one);
+
+            isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, coord1,
+                                 int_coord_bld->zero);
+            coord1 = lp_build_xor(int_coord_bld, isNeg, coord1);
+            coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
+
+            weight = coord_bld->undef;
+         }
        }
        break;
  
@@ -487,11 +637,20 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
              offset = lp_build_int_to_float(coord_bld, offset);
              coord = lp_build_add(coord_bld, coord, offset);
           }
+         /*
+          * XXX: probably not correct for gather due to swapped
+          * order if coord is negative (same rationale as for
+          * MIRROR_CLAMP).
+          */
           coord = lp_build_abs(coord_bld, coord);
  
-         /* was: clamp to [-0.5, length + 0.5] then sub 0.5 */
-         /* skip clamp - always positive, and other side
-            only potentially matters for very large coords */
+         /*
+          * We don't need any clamp. Technically, for very large
+          * (or infinite) values, clamp against length would be
+          * correct, but we don't need to guarantee any specific
+          * result for such coords (the ifloor will be undefined, but
+          * for modes requiring border all resulting coords are safe).
+          */
           coord = lp_build_sub(coord_bld, coord, half);
  
           /* convert to int, compute lerp weight */
@@ -567,12 +726,13 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
           coord = lp_build_mul(coord_bld, coord, length_f);
        }
  
+      if (offset) {
+         offset = lp_build_int_to_float(coord_bld, offset);
+         coord = lp_build_add(coord_bld, coord, offset);
+      }
        /* floor */
        /* use itrunc instead since we clamp to 0 anyway */
        icoord = lp_build_itrunc(coord_bld, coord);
-      if (offset) {
-         icoord = lp_build_add(int_coord_bld, icoord, offset);
-      }
  
        /* clamp to [0, length - 1]. */
        icoord = lp_build_clamp(int_coord_bld, icoord, int_coord_bld->zero,
@@ -598,7 +758,7 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
           coord = lp_build_add(coord_bld, coord, offset);
        }
        /* compute mirror function */
-      coord = lp_build_coord_mirror(bld, coord);
+      coord = lp_build_coord_mirror(bld, coord, TRUE);
  
        /* scale coord to length */
        assert(bld->static_sampler_state->normalized_coords);
@@ -625,9 +785,15 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
  
        /* itrunc == ifloor here */
        icoord = lp_build_itrunc(coord_bld, coord);
-
-      /* clamp to [0, length - 1] */
-      icoord = lp_build_min(int_coord_bld, icoord, length_minus_one);
+      /*
+       * Use unsigned min due to possible undef values (NaNs, overflow)
+       */
+      {
+         struct lp_build_context abs_coord_bld = *int_coord_bld;
+         abs_coord_bld.type.sign = FALSE;
+         /* clamp to [0, length - 1] */
+         icoord = lp_build_min(&abs_coord_bld, icoord, length_minus_one);
+      }
        break;
  
     case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
@@ -700,7 +866,7 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
                                LLVMValueRef img_stride_vec,
                                LLVMValueRef data_ptr,
                                LLVMValueRef mipoffsets,
-                              LLVMValueRef *coords,
+                              const LLVMValueRef *coords,
                                const LLVMValueRef *offsets,
                                LLVMValueRef colors_out[4])
  {
@@ -847,7 +1013,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
                               LLVMValueRef img_stride_vec,
                               LLVMValueRef data_ptr,
                               LLVMValueRef mipoffsets,
-                             LLVMValueRef *coords,
+                             const LLVMValueRef *coords,
                               const LLVMValueRef *offsets,
                               LLVMValueRef colors_out[4])
  {
@@ -873,20 +1039,29 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
     LLVMValueRef neighbors[2][2][4];
     int chan, texel_index;
     boolean seamless_cube_filter, accurate_cube_corners;
+   unsigned chan_swiz = bld->static_texture_state->swizzle_r;
+
+   if (is_gather) {
+      switch (bld->gather_comp) {
+      case 0: chan_swiz = bld->static_texture_state->swizzle_r; break;
+      case 1: chan_swiz = bld->static_texture_state->swizzle_g; break;
+      case 2: chan_swiz = bld->static_texture_state->swizzle_b; break;
+      case 3: chan_swiz = bld->static_texture_state->swizzle_a; break;
+      default:
+        break;
+      }
+   }
  
     seamless_cube_filter = (bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
                             bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
                            bld->static_sampler_state->seamless_cube_map;
+
     /*
-    * XXX I don't know how this is really supposed to work with gather. From GL
-    * spec wording (not gather specific) it sounds like the 4th missing texel
-    * should be an average of the other 3, hence for gather could return this.
-    * This is however NOT how the code here works, which just fixes up the
-    * weights used for filtering instead. And of course for gather there is
-    * no filter to tweak...
+    * Disable accurate cube corners for integer textures, which should only
+    * get here in the gather path.
      */
     accurate_cube_corners = ACCURATE_CUBE_CORNERS && seamless_cube_filter &&
-                           !is_gather;
+     !util_format_is_pure_integer(bld->static_texture_state->format);
  
     lp_build_extract_image_sizes(bld,
                                  &bld->int_size_bld,
@@ -907,7 +1082,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
      */
  
     if (!seamless_cube_filter) {
-      lp_build_sample_wrap_linear(bld, coords[0], width_vec,
+      lp_build_sample_wrap_linear(bld, is_gather, coords[0], width_vec,
                                    flt_width_vec, offsets[0],
                                    bld->static_texture_state->pot_width,
                                    bld->static_sampler_state->wrap_s,
@@ -918,7 +1093,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
        x11 = x01;
  
        if (dims >= 2) {
-         lp_build_sample_wrap_linear(bld, coords[1], height_vec,
+         lp_build_sample_wrap_linear(bld, is_gather, coords[1], height_vec,
                                       flt_height_vec, offsets[1],
                                       bld->static_texture_state->pot_height,
                                       bld->static_sampler_state->wrap_t,
@@ -929,7 +1104,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
           y11 = y10;
  
           if (dims == 3) {
-            lp_build_sample_wrap_linear(bld, coords[2], depth_vec,
+            lp_build_sample_wrap_linear(bld, is_gather, coords[2], depth_vec,
                                          flt_depth_vec, offsets[2],
                                          bld->static_texture_state->pot_depth,
                                          bld->static_sampler_state->wrap_r,
@@ -956,7 +1131,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
        struct lp_build_if_state edge_if;
        LLVMTypeRef int1t;
        LLVMValueRef new_faces[4], new_xcoords[4][2], new_ycoords[4][2];
-      LLVMValueRef coord, have_edge, have_corner;
+      LLVMValueRef coord0, coord1, have_edge, have_corner;
        LLVMValueRef fall_off_ym_notxm, fall_off_ym_notxp, fall_off_x, fall_off_y;
        LLVMValueRef fall_off_yp_notxm, fall_off_yp_notxp;
        LLVMValueRef x0, x1, y0, y1, y0_clamped, y1_clamped;
@@ -973,16 +1148,27 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
         */
        /* should always have normalized coords, and offsets are undefined */
        assert(bld->static_sampler_state->normalized_coords);
-      coord = lp_build_mul(coord_bld, coords[0], flt_width_vec);
+      /*
+       * The coords should all be between [0,1] however we can have NaNs,
+       * which will wreak havoc. In particular the y1_clamped value below
+       * can be -INT_MAX (on x86) and be propagated right through (probably
+       * other values might be bogus in the end too).
+       * So kill off the NaNs here.
+       */
+      coord0 = lp_build_max_ext(coord_bld, coords[0], coord_bld->zero,
+                                GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
+      coord0 = lp_build_mul(coord_bld, coord0, flt_width_vec);
        /* instead of clamp, build mask if overflowed */
-      coord = lp_build_sub(coord_bld, coord, half);
+      coord0 = lp_build_sub(coord_bld, coord0, half);
        /* convert to int, compute lerp weight */
        /* not ideal with AVX (and no AVX2) */
-      lp_build_ifloor_fract(coord_bld, coord, &x0, &s_fpart);
+      lp_build_ifloor_fract(coord_bld, coord0, &x0, &s_fpart);
        x1 = lp_build_add(ivec_bld, x0, ivec_bld->one);
-      coord = lp_build_mul(coord_bld, coords[1], flt_height_vec);
-      coord = lp_build_sub(coord_bld, coord, half);
-      lp_build_ifloor_fract(coord_bld, coord, &y0, &t_fpart);
+      coord1 = lp_build_max_ext(coord_bld, coords[1], coord_bld->zero,
+                                GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
+      coord1 = lp_build_mul(coord_bld, coord1, flt_height_vec);
+      coord1 = lp_build_sub(coord_bld, coord1, half);
+      lp_build_ifloor_fract(coord_bld, coord1, &y0, &t_fpart);
        y1 = lp_build_add(ivec_bld, y0, ivec_bld->one);
  
        fall_off[0] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, x0, ivec_bld->zero);
@@ -1214,94 +1400,191 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
         * as well) here.
         */
        if (accurate_cube_corners) {
-         LLVMValueRef w00, w01, w10, w11, wx0, wy0;
-         LLVMValueRef c_weight, c00, c01, c10, c11;
-         LLVMValueRef have_corner, one_third, tmp;
+         LLVMValueRef c00, c01, c10, c11, c00f, c01f, c10f, c11f;
+         LLVMValueRef have_corner, one_third;
  
-         colorss[0] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs");
-         colorss[1] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs");
-         colorss[2] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs");
-         colorss[3] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs");
+         colorss[0] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs0");
+         colorss[1] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs1");
+         colorss[2] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs2");
+         colorss[3] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs3");
  
           have_corner = LLVMBuildLoad(builder, have_corners, "");
  
           lp_build_if(&corner_if, bld->gallivm, have_corner);
  
-         /*
-          * we can't use standard 2d lerp as we need per-element weight
-          * in case of corners, so just calculate bilinear result as
-          * w00*s00 + w01*s01 + w10*s10 + w11*s11.
-          * (This is actually less work than using 2d lerp, 7 vs. 9 instructions,
-          * however calculating the weights needs another 6, so actually probably
-          * not slower than 2d lerp only for 4 channels as weights only need
-          * to be calculated once - of course fixing the weights has additional cost.)
-          */
-         wx0 = lp_build_sub(coord_bld, coord_bld->one, s_fpart);
-         wy0 = lp_build_sub(coord_bld, coord_bld->one, t_fpart);
-         w00 = lp_build_mul(coord_bld, wx0, wy0);
-         w01 = lp_build_mul(coord_bld, s_fpart, wy0);
-         w10 = lp_build_mul(coord_bld, wx0, t_fpart);
-         w11 = lp_build_mul(coord_bld, s_fpart, t_fpart);
-
-         /* find corner weight */
+         one_third = lp_build_const_vec(bld->gallivm, coord_bld->type,
+                                        1.0f/3.0f);
+
+         /* find corner */
           c00 = lp_build_and(ivec_bld, fall_off[0], fall_off[2]);
-         c_weight = lp_build_select(coord_bld, c00, w00, coord_bld->zero);
+         c00f = LLVMBuildBitCast(builder, c00, coord_bld->vec_type, "");
           c01 = lp_build_and(ivec_bld, fall_off[1], fall_off[2]);
-         c_weight = lp_build_select(coord_bld, c01, w01, c_weight);
+         c01f = LLVMBuildBitCast(builder, c01, coord_bld->vec_type, "");
           c10 = lp_build_and(ivec_bld, fall_off[0], fall_off[3]);
-         c_weight = lp_build_select(coord_bld, c10, w10, c_weight);
+         c10f = LLVMBuildBitCast(builder, c10, coord_bld->vec_type, "");
           c11 = lp_build_and(ivec_bld, fall_off[1], fall_off[3]);
-         c_weight = lp_build_select(coord_bld, c11, w11, c_weight);
+         c11f = LLVMBuildBitCast(builder, c11, coord_bld->vec_type, "");
  
-         /*
-          * add 1/3 of the corner weight to each of the 3 other samples
-          * and null out corner weight
-          */
-         one_third = lp_build_const_vec(bld->gallivm, coord_bld->type, 1.0f/3.0f);
-         c_weight = lp_build_mul(coord_bld, c_weight, one_third);
-         w00 = lp_build_add(coord_bld, w00, c_weight);
-         c00 = LLVMBuildBitCast(builder, c00, coord_bld->vec_type, "");
-         w00 = lp_build_andnot(coord_bld, w00, c00);
-         w01 = lp_build_add(coord_bld, w01, c_weight);
-         c01 = LLVMBuildBitCast(builder, c01, coord_bld->vec_type, "");
-         w01 = lp_build_andnot(coord_bld, w01, c01);
-         w10 = lp_build_add(coord_bld, w10, c_weight);
-         c10 = LLVMBuildBitCast(builder, c10, coord_bld->vec_type, "");
-         w10 = lp_build_andnot(coord_bld, w10, c10);
-         w11 = lp_build_add(coord_bld, w11, c_weight);
-         c11 = LLVMBuildBitCast(builder, c11, coord_bld->vec_type, "");
-         w11 = lp_build_andnot(coord_bld, w11, c11);
+         if (!is_gather) {
+            /*
+             * we can't use standard 2d lerp as we need per-element weight
+             * in case of corners, so just calculate bilinear result as
+             * w00*s00 + w01*s01 + w10*s10 + w11*s11.
+             * (This is actually less work than using 2d lerp, 7 vs. 9
+             * instructions, however calculating the weights needs another 6,
+             * so actually probably not slower than 2d lerp only for 4 channels
+             * as weights only need to be calculated once - of course fixing
+             * the weights has additional cost.)
+             */
+            LLVMValueRef w00, w01, w10, w11, wx0, wy0, c_weight, tmp;
+            wx0 = lp_build_sub(coord_bld, coord_bld->one, s_fpart);
+            wy0 = lp_build_sub(coord_bld, coord_bld->one, t_fpart);
+            w00 = lp_build_mul(coord_bld, wx0, wy0);
+            w01 = lp_build_mul(coord_bld, s_fpart, wy0);
+            w10 = lp_build_mul(coord_bld, wx0, t_fpart);
+            w11 = lp_build_mul(coord_bld, s_fpart, t_fpart);
+
+            /* find corner weight */
+            c_weight = lp_build_select(coord_bld, c00, w00, coord_bld->zero);
+            c_weight = lp_build_select(coord_bld, c01, w01, c_weight);
+            c_weight = lp_build_select(coord_bld, c10, w10, c_weight);
+            c_weight = lp_build_select(coord_bld, c11, w11, c_weight);
  
-         if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
-            for (chan = 0; chan < 4; chan++) {
-               colors0[chan] = lp_build_mul(coord_bld, w00, neighbors[0][0][chan]);
-               tmp = lp_build_mul(coord_bld, w01, neighbors[0][1][chan]);
-               colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
-               tmp = lp_build_mul(coord_bld, w10, neighbors[1][0][chan]);
-               colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
-               tmp = lp_build_mul(coord_bld, w11, neighbors[1][1][chan]);
-               colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
+            /*
+             * add 1/3 of the corner weight to the weight of the 3 other
+             * samples and null out corner weight.
+             */
+            c_weight = lp_build_mul(coord_bld, c_weight, one_third);
+            w00 = lp_build_add(coord_bld, w00, c_weight);
+            w00 = lp_build_andnot(coord_bld, w00, c00f);
+            w01 = lp_build_add(coord_bld, w01, c_weight);
+            w01 = lp_build_andnot(coord_bld, w01, c01f);
+            w10 = lp_build_add(coord_bld, w10, c_weight);
+            w10 = lp_build_andnot(coord_bld, w10, c10f);
+            w11 = lp_build_add(coord_bld, w11, c_weight);
+            w11 = lp_build_andnot(coord_bld, w11, c11f);
+
+            if (bld->static_sampler_state->compare_mode ==
+                PIPE_TEX_COMPARE_NONE) {
+               for (chan = 0; chan < 4; chan++) {
+                  colors0[chan] = lp_build_mul(coord_bld, w00,
+                                               neighbors[0][0][chan]);
+                  tmp = lp_build_mul(coord_bld, w01, neighbors[0][1][chan]);
+                  colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
+                  tmp = lp_build_mul(coord_bld, w10, neighbors[1][0][chan]);
+                  colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
+                  tmp = lp_build_mul(coord_bld, w11, neighbors[1][1][chan]);
+                  colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
+               }
+            }
+            else {
+               LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
+               cmpval00 = lp_build_sample_comparefunc(bld, coords[4],
+                                                      neighbors[0][0][0]);
+               cmpval01 = lp_build_sample_comparefunc(bld, coords[4],
+                                                      neighbors[0][1][0]);
+               cmpval10 = lp_build_sample_comparefunc(bld, coords[4],
+                                                      neighbors[1][0][0]);
+               cmpval11 = lp_build_sample_comparefunc(bld, coords[4],
+                                                      neighbors[1][1][0]);
+               /*
+                * inputs to interpolation are just masks so just add
+                * masked weights together
+                */
+               cmpval00 = LLVMBuildBitCast(builder, cmpval00,
+                                           coord_bld->vec_type, "");
+               cmpval01 = LLVMBuildBitCast(builder, cmpval01,
+                                           coord_bld->vec_type, "");
+               cmpval10 = LLVMBuildBitCast(builder, cmpval10,
+                                           coord_bld->vec_type, "");
+               cmpval11 = LLVMBuildBitCast(builder, cmpval11,
+                                           coord_bld->vec_type, "");
+               colors0[0] = lp_build_and(coord_bld, w00, cmpval00);
+               tmp = lp_build_and(coord_bld, w01, cmpval01);
+               colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
+               tmp = lp_build_and(coord_bld, w10, cmpval10);
+               colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
+               tmp = lp_build_and(coord_bld, w11, cmpval11);
+               colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
+               colors0[1] = colors0[2] = colors0[3] = colors0[0];
              }
           }
           else {
-            LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
-            cmpval00 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
-            cmpval01 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
-            cmpval10 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][0][0]);
-            cmpval11 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][1][0]);
-            /* inputs to interpolation are just masks so just add masked weights together */
-            cmpval00 = LLVMBuildBitCast(builder, cmpval00, coord_bld->vec_type, "");
-            cmpval01 = LLVMBuildBitCast(builder, cmpval01, coord_bld->vec_type, "");
-            cmpval10 = LLVMBuildBitCast(builder, cmpval10, coord_bld->vec_type, "");
-            cmpval11 = LLVMBuildBitCast(builder, cmpval11, coord_bld->vec_type, "");
-            colors0[0] = lp_build_and(coord_bld, w00, cmpval00);
-            tmp = lp_build_and(coord_bld, w01, cmpval01);
-            colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
-            tmp = lp_build_and(coord_bld, w10, cmpval10);
-            colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
-            tmp = lp_build_and(coord_bld, w11, cmpval11);
-            colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
-            colors0[1] = colors0[2] = colors0[3] = colors0[0];
+            /*
+             * We don't have any weights to adjust, so instead calculate
+             * the fourth texel as simply the average of the other 3.
+             * (This would work for non-gather too, however we'd have
+             * a boatload more of the select stuff due to there being
+             * 4 times as many colors as weights.)
+             */
+            LLVMValueRef col00, col01, col10, col11;
+            LLVMValueRef colc, colc0, colc1;
+            col10 = lp_build_swizzle_soa_channel(texel_bld,
+                                                 neighbors[1][0], chan_swiz);
+            col11 = lp_build_swizzle_soa_channel(texel_bld,
+                                                 neighbors[1][1], chan_swiz);
+            col01 = lp_build_swizzle_soa_channel(texel_bld,
+                                                 neighbors[0][1], chan_swiz);
+            col00 = lp_build_swizzle_soa_channel(texel_bld,
+                                                 neighbors[0][0], chan_swiz);
+
+            /*
+             * The spec says for comparison filtering, the comparison
+             * must happen before synthesizing the new value.
+             * This means all gathered values are always 0 or 1,
+             * except for the non-existing texel, which can be 0,1/3,2/3,1...
+             * Seems like we'd be allowed to just return 0 or 1 too, so we
+             * could simplify and pass down the compare mask values to the
+             * end (using int arithmetic/compare on the mask values to
+             * construct the fourth texel) and only there convert to floats
+             * but it's probably not worth it (it might be easier for the cpu
+             * but not for the code)...
+             */
+            if (bld->static_sampler_state->compare_mode !=
+                PIPE_TEX_COMPARE_NONE) {
+               LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
+               cmpval00 = lp_build_sample_comparefunc(bld, coords[4], col00);
+               cmpval01 = lp_build_sample_comparefunc(bld, coords[4], col01);
+               cmpval10 = lp_build_sample_comparefunc(bld, coords[4], col10);
+               cmpval11 = lp_build_sample_comparefunc(bld, coords[4], col11);
+               col00 = lp_build_select(texel_bld, cmpval00,
+                                       texel_bld->one, texel_bld->zero);
+               col01 = lp_build_select(texel_bld, cmpval01,
+                                       texel_bld->one, texel_bld->zero);
+               col10 = lp_build_select(texel_bld, cmpval10,
+                                       texel_bld->one, texel_bld->zero);
+               col11 = lp_build_select(texel_bld, cmpval11,
+                                       texel_bld->one, texel_bld->zero);
+            }
+
+            /*
+             * Null out corner color.
+             */
+            col00 = lp_build_andnot(coord_bld, col00, c00f);
+            col01 = lp_build_andnot(coord_bld, col01, c01f);
+            col10 = lp_build_andnot(coord_bld, col10, c10f);
+            col11 = lp_build_andnot(coord_bld, col11, c11f);
+
+            /*
+             * New corner texel color is all colors added / 3.
+             */
+            colc0 = lp_build_add(coord_bld, col00, col01);
+            colc1 = lp_build_add(coord_bld, col10, col11);
+            colc = lp_build_add(coord_bld, colc0, colc1);
+            colc = lp_build_mul(coord_bld, one_third, colc);
+
+            /*
+             * Replace the corner texel color with the new value.
+             */
+            col00 = lp_build_select(coord_bld, c00, colc, col00);
+            col01 = lp_build_select(coord_bld, c01, colc, col01);
+            col10 = lp_build_select(coord_bld, c10, colc, col10);
+            col11 = lp_build_select(coord_bld, c11, colc, col11);
+
+            colors0[0] = col10;
+            colors0[1] = col11;
+            colors0[2] = col01;
+            colors0[3] = col00;
           }
  
           LLVMBuildStore(builder, colors0[0], colorss[0]);
@@ -1320,7 +1603,6 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
               * end of sampling (much less values to swizzle), but this
               * obviously cannot work when using gather.
               */
-            unsigned chan_swiz = bld->static_texture_state->swizzle_r;
              colors0[0] = lp_build_swizzle_soa_channel(texel_bld,
                                                        neighbors[1][0],
                                                        chan_swiz);
@@ -1356,25 +1638,14 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
  
           if (is_gather) {
              /* more hacks for swizzling, should be X, ONE or ZERO... */
-            unsigned chan_swiz = bld->static_texture_state->swizzle_r;
-            if (chan_swiz <= PIPE_SWIZZLE_ALPHA) {
-               colors0[0] = lp_build_select(texel_bld, cmpval10,
-                                            texel_bld->one, texel_bld->zero);
-               colors0[1] = lp_build_select(texel_bld, cmpval11,
-                                            texel_bld->one, texel_bld->zero);
-               colors0[2] = lp_build_select(texel_bld, cmpval01,
-                                            texel_bld->one, texel_bld->zero);
-               colors0[3] = lp_build_select(texel_bld, cmpval00,
-                                            texel_bld->one, texel_bld->zero);
-            }
-            else if (chan_swiz == PIPE_SWIZZLE_ZERO) {
-               colors0[0] = colors0[1] = colors0[2] = colors0[3] =
-                            texel_bld->zero;
-            }
-            else {
-               colors0[0] = colors0[1] = colors0[2] = colors0[3] =
-                            texel_bld->one;
-            }
+            colors0[0] = lp_build_select(texel_bld, cmpval10,
+                                         texel_bld->one, texel_bld->zero);
+            colors0[1] = lp_build_select(texel_bld, cmpval11,
+                                         texel_bld->one, texel_bld->zero);
+            colors0[2] = lp_build_select(texel_bld, cmpval01,
+                                         texel_bld->one, texel_bld->zero);
+            colors0[3] = lp_build_select(texel_bld, cmpval00,
+                                         texel_bld->one, texel_bld->zero);
           }
           else {
              colors0[0] = lp_build_masklerp2d(texel_bld, s_fpart, t_fpart,
@@ -1467,6 +1738,26 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
           }
        }
     }
+   if (is_gather) {
+      /*
+       * For gather, we can't do our usual channel swizzling done later,
+       * so do it here. It only really matters for 0/1 swizzles in case
+       * of comparison filtering, since in this case the results would be
+       * wrong, without comparison it should all work out alright but it
+       * can't hurt to do that here, since it will instantly drop all
+       * calculations above, though it's a rather stupid idea to do
+       * gather on a channel which will always return 0 or 1 in any case...
+       */
+      if (chan_swiz == PIPE_SWIZZLE_1) {
+         for (chan = 0; chan < 4; chan++) {
+            colors_out[chan] = texel_bld->one;
+         }
+      } else if (chan_swiz == PIPE_SWIZZLE_0) {
+         for (chan = 0; chan < 4; chan++) {
+            colors_out[chan] = texel_bld->zero;
+         }
+      }
+   }
  }
  
  
@@ -1481,7 +1772,7 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
                         unsigned img_filter,
                         unsigned mip_filter,
                         boolean is_gather,
-                       LLVMValueRef *coords,
+                       const LLVMValueRef *coords,
                         const LLVMValueRef *offsets,
                         LLVMValueRef ilevel0,
                         LLVMValueRef ilevel1,
@@ -1554,6 +1845,7 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
                                        PIPE_FUNC_GREATER,
                                        lod_fpart, bld->lodf_bld.zero);
           need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, need_lerp);
+         lp_build_name(need_lerp, "need_lerp");
        }
  
        lp_build_if(&if_ctx, bld->gallivm, need_lerp);
@@ -1622,7 +1914,7 @@ static void
  lp_build_sample_mipmap_both(struct lp_build_sample_context *bld,
                              LLVMValueRef linear_mask,
                              unsigned mip_filter,
-                            LLVMValueRef *coords,
+                            const LLVMValueRef *coords,
                              const LLVMValueRef *offsets,
                              LLVMValueRef ilevel0,
                              LLVMValueRef ilevel1,
@@ -1679,6 +1971,7 @@ lp_build_sample_mipmap_both(struct lp_build_sample_context *bld,
         * should be able to merge the branches in this case.
         */
        need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, lod_positive);
+      lp_build_name(need_lerp, "need_lerp");
  
        lp_build_if(&if_ctx, bld->gallivm, need_lerp);
        {
@@ -1769,6 +2062,7 @@ lp_build_layer_coord(struct lp_build_sample_context *bld,
   */
  static void
  lp_build_sample_common(struct lp_build_sample_context *bld,
+                       boolean is_lodq,
                         unsigned texture_index,
                         unsigned sampler_index,
                         LLVMValueRef *coords,
@@ -1776,6 +2070,7 @@ lp_build_sample_common(struct lp_build_sample_context *bld,
                         LLVMValueRef lod_bias, /* optional */
                         LLVMValueRef explicit_lod, /* optional */
                         LLVMValueRef *lod_pos_or_zero,
+                       LLVMValueRef *lod,
                         LLVMValueRef *lod_fpart,
                         LLVMValueRef *ilevel0,
                         LLVMValueRef *ilevel1)
@@ -1835,7 +2130,7 @@ lp_build_sample_common(struct lp_build_sample_context *bld,
        const struct util_format_description *format_desc = bld->format_desc;
        unsigned chan_type;
        /* not entirely sure we couldn't end up with non-valid swizzle here */
-      chan_type = format_desc->swizzle[0] <= UTIL_FORMAT_SWIZZLE_W ?
+      chan_type = format_desc->swizzle[0] <= PIPE_SWIZZLE_W ?
                       format_desc->channel[format_desc->swizzle[0]].type :
                       UTIL_FORMAT_TYPE_FLOAT;
        if (chan_type != UTIL_FORMAT_TYPE_FLOAT) {
@@ -1848,15 +2143,44 @@ lp_build_sample_common(struct lp_build_sample_context *bld,
      * Compute the level of detail (float).
      */
     if (min_filter != mag_filter ||
-       mip_filter != PIPE_TEX_MIPFILTER_NONE) {
+       mip_filter != PIPE_TEX_MIPFILTER_NONE || is_lodq) {
        /* Need to compute lod either to choose mipmap levels or to
         * distinguish between minification/magnification with one mipmap level.
         */
-      lp_build_lod_selector(bld, texture_index, sampler_index,
+      lp_build_lod_selector(bld, is_lodq, texture_index, sampler_index,
                              coords[0], coords[1], coords[2], cube_rho,
                              derivs, lod_bias, explicit_lod,
-                            mip_filter,
+                            mip_filter, lod,
                              &lod_ipart, lod_fpart, lod_pos_or_zero);
+      if (is_lodq) {
+         LLVMValueRef last_level;
+         last_level = bld->dynamic_state->last_level(bld->dynamic_state,
+                                                     bld->gallivm,
+                                                     bld->context_ptr,
+                                                     texture_index);
+         first_level = bld->dynamic_state->first_level(bld->dynamic_state,
+                                                       bld->gallivm,
+                                                       bld->context_ptr,
+                                                       texture_index);
+         last_level = lp_build_sub(&bld->int_bld, last_level, first_level);
+         last_level = lp_build_int_to_float(&bld->float_bld, last_level);
+         last_level = lp_build_broadcast_scalar(&bld->lodf_bld, last_level);
+
+         switch (mip_filter) {
+         case PIPE_TEX_MIPFILTER_NONE:
+            *lod_fpart = bld->lodf_bld.zero;
+            break;
+         case PIPE_TEX_MIPFILTER_NEAREST:
+             *lod_fpart = lp_build_round(&bld->lodf_bld, *lod_fpart);
+             /* fallthrough */
+         case PIPE_TEX_MIPFILTER_LINEAR:
+            *lod_fpart = lp_build_clamp(&bld->lodf_bld, *lod_fpart,
+                                        bld->lodf_bld.zero, last_level);
+            break;
+         }
+         return;
+      }
+
     } else {
        lod_ipart = bld->lodi_bld.zero;
        *lod_pos_or_zero = bld->lodi_bld.zero;
@@ -1936,7 +2260,7 @@ lp_build_clamp_border_color(struct lp_build_sample_context *bld,
                                         LLVMPointerType(vec4_bld.vec_type, 0), "");
     border_color = LLVMBuildLoad(builder, border_color_ptr, "");
     /* we don't have aligned type in the dynamic state unfortunately */
-   lp_set_load_alignment(border_color, 4);
+   LLVMSetAlignment(border_color, 4);
  
     /*
      * Instead of having some incredibly complex logic which will try to figure out
@@ -1954,7 +2278,7 @@ lp_build_clamp_border_color(struct lp_build_sample_context *bld,
        else {
           chan = util_format_get_first_non_void_channel(format_desc->format);
        }
-      if (chan >= 0 && chan <= UTIL_FORMAT_SWIZZLE_W) {
+      if (chan >= 0 && chan <= PIPE_SWIZZLE_W) {
           unsigned chan_type = format_desc->channel[chan].type;
           unsigned chan_norm = format_desc->channel[chan].normalized;
           unsigned chan_pure = format_desc->channel[chan].pure_integer;
@@ -2041,13 +2365,16 @@ lp_build_clamp_border_color(struct lp_build_sample_context *bld,
           max_clamp = vec4_bld.one;
        }
        else if (format_desc->layout == UTIL_FORMAT_LAYOUT_RGTC ||
-               format_desc->layout == UTIL_FORMAT_LAYOUT_ETC) {
+               format_desc->layout == UTIL_FORMAT_LAYOUT_ETC ||
+               format_desc->layout == UTIL_FORMAT_LAYOUT_BPTC) {
           switch (format_desc->format) {
           case PIPE_FORMAT_RGTC1_UNORM:
           case PIPE_FORMAT_RGTC2_UNORM:
           case PIPE_FORMAT_LATC1_UNORM:
           case PIPE_FORMAT_LATC2_UNORM:
           case PIPE_FORMAT_ETC1_RGB8:
+         case PIPE_FORMAT_BPTC_RGBA_UNORM:
+         case PIPE_FORMAT_BPTC_SRGBA:
              min_clamp = vec4_bld.zero;
              max_clamp = vec4_bld.one;
              break;
@@ -2058,6 +2385,12 @@ lp_build_clamp_border_color(struct lp_build_sample_context *bld,
              min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
              max_clamp = vec4_bld.one;
              break;
+         case PIPE_FORMAT_BPTC_RGB_FLOAT:
+            /* not sure if we should clamp to max half float? */
+            break;
+         case PIPE_FORMAT_BPTC_RGB_UFLOAT:
+            min_clamp = vec4_bld.zero;
+            break;
           default:
              assert(0);
              break;
@@ -2125,7 +2458,7 @@ static void
  lp_build_sample_general(struct lp_build_sample_context *bld,
                          unsigned sampler_unit,
                          boolean is_gather,
-                        LLVMValueRef *coords,
+                        const LLVMValueRef *coords,
                          const LLVMValueRef *offsets,
                          LLVMValueRef lod_positive,
                          LLVMValueRef lod_fpart,
@@ -2186,7 +2519,8 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
           struct lp_build_if_state if_ctx;
  
           lod_positive = LLVMBuildTrunc(builder, lod_positive,
-                                       LLVMInt1TypeInContext(bld->gallivm->context), "");
+                                       LLVMInt1TypeInContext(bld->gallivm->context),
+                                       "lod_pos");
  
           lp_build_if(&if_ctx, bld->gallivm, lod_positive);
           {
@@ -2222,6 +2556,7 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
           }
           need_linear = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods,
                                                 linear_mask);
+         lp_build_name(need_linear, "need_linear");
  
           if (bld->num_lods != bld->coord_type.length) {
              linear_mask = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
@@ -2253,8 +2588,8 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
               * All pixels require just nearest filtering, which is way
               * cheaper than linear, hence do a separate path for that.
               */
-            lp_build_sample_mipmap(bld, PIPE_TEX_FILTER_NEAREST, FALSE,
-                                   mip_filter_for_nearest,
+            lp_build_sample_mipmap(bld, PIPE_TEX_FILTER_NEAREST,
+                                   mip_filter_for_nearest, FALSE,
                                     coords, offsets,
                                     ilevel0, ilevel1, lod_fpart,
                                     texels);
@@ -2281,6 +2616,7 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
  static void
  lp_build_fetch_texel(struct lp_build_sample_context *bld,
                       unsigned texture_unit,
+                     LLVMValueRef ms_index,
                       const LLVMValueRef *coords,
                       LLVMValueRef explicit_lod,
                       const LLVMValueRef *offsets,
@@ -2380,13 +2716,26 @@ lp_build_fetch_texel(struct lp_build_sample_context *bld,
                              lp_build_get_mip_offsets(bld, ilevel));
     }
  
+   if (bld->fetch_ms) {
+      LLVMValueRef num_samples;
+      num_samples = bld->dynamic_state->num_samples(bld->dynamic_state, bld->gallivm,
+                                                    bld->context_ptr, texture_unit);
+      out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, ms_index, int_coord_bld->zero);
+      out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
+      out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, ms_index, lp_build_broadcast_scalar(int_coord_bld, num_samples));
+      out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
+      offset = lp_build_add(int_coord_bld, offset,
+                            lp_build_mul(int_coord_bld, bld->sample_stride, ms_index));
+   }
+
     offset = lp_build_andnot(int_coord_bld, offset, out_of_bounds);
  
     lp_build_fetch_rgba_soa(bld->gallivm,
                             bld->format_desc,
-                           bld->texel_type,
+                           bld->texel_type, TRUE,
                             bld->base_ptr, offset,
                             i, j,
+                           bld->cache,
                             colors_out);
  
     if (out_of_bound_ret_zero) {
@@ -2440,10 +2789,12 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
                           unsigned texture_index,
                           unsigned sampler_index,
                           LLVMValueRef context_ptr,
+                         LLVMValueRef thread_data_ptr,
                           const LLVMValueRef *coords,
                           const LLVMValueRef *offsets,
                           const struct lp_derivatives *derivs, /* optional */
                           LLVMValueRef lod, /* optional */
+                         LLVMValueRef ms_index, /* optional */
                           LLVMValueRef texel_out[4])
  {
     unsigned target = static_texture_state->target;
@@ -2460,7 +2811,7 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
     enum lp_sampler_op_type op_type;
     LLVMValueRef lod_bias = NULL;
     LLVMValueRef explicit_lod = NULL;
-   boolean op_is_tex;
+   boolean op_is_tex, op_is_lodq, op_is_gather, fetch_ms;
  
     if (0) {
        enum pipe_format fmt = static_texture_state->format;
@@ -2473,8 +2824,11 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
                      LP_SAMPLER_LOD_CONTROL_SHIFT;
     op_type = (sample_key & LP_SAMPLER_OP_TYPE_MASK) >>
                   LP_SAMPLER_OP_TYPE_SHIFT;
+   fetch_ms = !!(sample_key & LP_SAMPLER_FETCH_MS);
  
     op_is_tex = op_type == LP_SAMPLER_OP_TEXTURE;
+   op_is_lodq = op_type == LP_SAMPLER_OP_LODQ;
+   op_is_gather = op_type == LP_SAMPLER_OP_GATHER;
  
     if (lod_control == LP_SAMPLER_LOD_BIAS) {
        lod_bias = lod;
@@ -2501,7 +2855,7 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
         * all zero as mandated by d3d10 in this case.
         */
        unsigned chan;
-      LLVMValueRef zero = lp_build_const_vec(gallivm, type, 0.0F);
+      LLVMValueRef zero = lp_build_zero(gallivm, type);
        for (chan = 0; chan < 4; chan++) {
           texel_out[chan] = zero;
        }
@@ -2520,6 +2874,16 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
     bld.format_desc = util_format_description(static_texture_state->format);
     bld.dims = dims;
  
+   if (gallivm_perf & GALLIVM_PERF_NO_QUAD_LOD || op_is_lodq) {
+      bld.no_quad_lod = TRUE;
+   }
+   if (gallivm_perf & GALLIVM_PERF_NO_RHO_APPROX || op_is_lodq) {
+      bld.no_rho_approx = TRUE;
+   }
+   if (gallivm_perf & GALLIVM_PERF_NO_BRILINEAR || op_is_lodq) {
+      bld.no_brilinear = TRUE;
+   }
+
     bld.vector_width = lp_type_width(type);
  
     bld.float_type = lp_type_float(32);
@@ -2546,15 +2910,16 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
     else if (util_format_has_stencil(bld.format_desc) &&
         !util_format_has_depth(bld.format_desc)) {
        /* for stencil only formats, sample stencil (uint) */
-      bld.texel_type = lp_type_int_vec(type.width, type.width * type.length);
+      bld.texel_type = lp_type_uint_vec(type.width, type.width * type.length);
     }
  
-   if (!static_texture_state->level_zero_only) {
+   if (!static_texture_state->level_zero_only ||
+       !static_sampler_state->max_lod_pos || op_is_lodq) {
        derived_sampler_state.min_mip_filter = static_sampler_state->min_mip_filter;
     } else {
        derived_sampler_state.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
     }
-   if (op_type == LP_SAMPLER_OP_GATHER) {
+   if (op_is_gather) {
        /*
         * gather4 is exactly like GL_LINEAR filtering but in the end skipping
         * the actual filtering. Using mostly the same paths, so cube face
@@ -2586,6 +2951,10 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
        derived_sampler_state.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
        derived_sampler_state.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
     }
+   /*
+    * We could force CLAMP to CLAMP_TO_EDGE here if min/mag filter is nearest,
+    * so AoS path could be used. Not sure it's worth the trouble...
+    */
  
     min_img_filter = derived_sampler_state.min_img_filter;
     mag_img_filter = derived_sampler_state.mag_img_filter;
@@ -2611,11 +2980,11 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
      */
     bld.num_mips = bld.num_lods = 1;
  
-   if ((gallivm_debug & GALLIVM_DEBUG_NO_QUAD_LOD) &&
-       (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) &&
-       (static_texture_state->target == PIPE_TEXTURE_CUBE ||
-        static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
-       (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
+   if (bld.no_quad_lod && bld.no_rho_approx &&
+       ((mip_filter != PIPE_TEX_MIPFILTER_NONE && op_is_tex &&
+         (static_texture_state->target == PIPE_TEXTURE_CUBE ||
+          static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY)) ||
+        op_is_lodq)) {
        /*
         * special case for using per-pixel lod even for implicit lod,
         * which is generally never required (ok by APIs) except to please
@@ -2623,6 +2992,8 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
         * can cause derivatives to be different for pixels outside the primitive
         * due to the major axis division even if pre-project derivatives are
         * looking normal).
+       * For lodq, we do it to simply avoid scalar pack / unpack (albeit for
+       * cube maps we do indeed get per-pixel lod values).
         */
        bld.num_mips = type.length;
        bld.num_lods = type.length;
@@ -2650,7 +3021,9 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
        bld.num_lods = num_quads;
     }
  
-
+   bld.fetch_ms = fetch_ms;
+   if (op_is_gather)
+      bld.gather_comp = (sample_key & LP_SAMPLER_GATHER_COMP_MASK) >> LP_SAMPLER_GATHER_COMP_SHIFT;
     bld.lodf_type = type;
     /* we want native vector size to be able to use our intrinsics */
     if (bld.num_lods != type.length) {
@@ -2699,8 +3072,17 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
                                            context_ptr, texture_index);
     bld.mip_offsets = dynamic_state->mip_offsets(dynamic_state, gallivm,
                                                  context_ptr, texture_index);
+
+   if (fetch_ms)
+      bld.sample_stride = lp_build_broadcast_scalar(&bld.int_coord_bld, dynamic_state->sample_stride(dynamic_state, gallivm,
+                                                                                                     context_ptr, texture_index));
     /* Note that mip_offsets is an array[level] of offsets to texture images */
  
+   if (dynamic_state->cache_ptr && thread_data_ptr) {
+      bld.cache = dynamic_state->cache_ptr(dynamic_state, gallivm,
+                                           thread_data_ptr, texture_index);
+   }
+
     /* width, height, depth as single int vector */
     if (dims <= 1) {
        bld.int_size = tex_width;
@@ -2731,6 +3113,32 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
        newcoords[i] = coords[i];
     }
  
+   if (util_format_is_pure_integer(static_texture_state->format) &&
+       !util_format_has_depth(bld.format_desc) && op_is_tex &&
+       (static_sampler_state->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR ||
+        static_sampler_state->min_img_filter == PIPE_TEX_FILTER_LINEAR ||
+        static_sampler_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR)) {
+      /*
+       * Bail if impossible filtering is specified (the awkard additional
+       * depth check is because it is legal in gallium to have things like S8Z24
+       * here which would say it's pure int despite such formats should sample
+       * the depth component).
+       * In GL such filters make the texture incomplete, this makes it robust
+       * against gallium frontends which set this up regardless (we'd crash in the
+       * lerp later otherwise).
+       * At least in some apis it may be legal to use such filters with lod
+       * queries and/or gather (at least for gather d3d10 says only the wrap
+       * bits are really used hence filter bits are likely simply ignored).
+       * For fetch, we don't get valid samplers either way here.
+       */
+      unsigned chan;
+      LLVMValueRef zero = lp_build_zero(gallivm, type);
+      for (chan = 0; chan < 4; chan++) {
+         texel_out[chan] = zero;
+      }
+      return;
+   }
+
     if (0) {
        /* For debug: no-op texture sampling */
        lp_build_sample_nop(gallivm,
@@ -2740,23 +3148,30 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
     }
  
     else if (op_type == LP_SAMPLER_OP_FETCH) {
-      lp_build_fetch_texel(&bld, texture_index, newcoords,
+      lp_build_fetch_texel(&bld, texture_index, ms_index, newcoords,
                             lod, offsets,
                             texel_out);
     }
  
     else {
        LLVMValueRef lod_fpart = NULL, lod_positive = NULL;
-      LLVMValueRef ilevel0 = NULL, ilevel1 = NULL;
-      boolean use_aos = util_format_fits_8unorm(bld.format_desc) &&
-                        op_is_tex &&
-                        /* not sure this is strictly needed or simply impossible */
-                        derived_sampler_state.compare_mode == PIPE_TEX_COMPARE_NONE &&
-                        lp_is_simple_wrap_mode(derived_sampler_state.wrap_s);
+      LLVMValueRef ilevel0 = NULL, ilevel1 = NULL, lod = NULL;
+      boolean use_aos;
+
+      use_aos = util_format_fits_8unorm(bld.format_desc) &&
+                op_is_tex &&
+                /* not sure this is strictly needed or simply impossible */
+                derived_sampler_state.compare_mode == PIPE_TEX_COMPARE_NONE &&
+                lp_is_simple_wrap_mode(derived_sampler_state.wrap_s);
  
        use_aos &= bld.num_lods <= num_quads ||
                   derived_sampler_state.min_img_filter ==
                      derived_sampler_state.mag_img_filter;
+
+      if(gallivm_perf & GALLIVM_PERF_NO_AOS_SAMPLING) {
+         use_aos = 0;
+      }
+
        if (dims > 1) {
           use_aos &= lp_is_simple_wrap_mode(derived_sampler_state.wrap_t);
           if (dims > 2) {
@@ -2788,12 +3203,19 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
                        derived_sampler_state.wrap_r);
        }
  
-      lp_build_sample_common(&bld, texture_index, sampler_index,
+      lp_build_sample_common(&bld, op_is_lodq, texture_index, sampler_index,
                               newcoords,
                               derivs, lod_bias, explicit_lod,
-                             &lod_positive, &lod_fpart,
+                             &lod_positive, &lod, &lod_fpart,
                               &ilevel0, &ilevel1);
  
+      if (op_is_lodq) {
+         texel_out[0] = lod_fpart;
+         texel_out[1] = lod;
+         texel_out[2] = texel_out[3] = bld.coord_bld.zero;
+         return;
+      }
+
        if (use_aos && static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
           /* The aos path doesn't do seamless filtering so simply add cube layer
            * to face now.
@@ -2802,12 +3224,13 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
        }
  
        /*
-       * we only try 8-wide sampling with soa as it appears to
-       * be a loss with aos with AVX (but it should work, except
-       * for conformance if min_filter != mag_filter if num_lods > 1).
-       * (It should be faster if we'd support avx2)
+       * we only try 8-wide sampling with soa or if we have AVX2
+       * as it appears to be a loss with just AVX)
         */
-      if (num_quads == 1 || !use_aos) {
+      if (num_quads == 1 || !use_aos ||
+          (util_cpu_caps.has_avx2 &&
+           (bld.num_lods == 1 ||
+            derived_sampler_state.min_img_filter == derived_sampler_state.mag_img_filter))) {
           if (use_aos) {
              /* do sampling/filtering with fixed pt arithmetic */
              lp_build_sample_aos(&bld, sampler_index,
@@ -2839,6 +3262,9 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
  
           /* Setup our build context */
           memset(&bld4, 0, sizeof bld4);
+         bld4.no_quad_lod = bld.no_quad_lod;
+         bld4.no_rho_approx = bld.no_rho_approx;
+         bld4.no_brilinear = bld.no_brilinear;
           bld4.gallivm = bld.gallivm;
           bld4.context_ptr = bld.context_ptr;
           bld4.static_texture_state = bld.static_texture_state;
@@ -2851,6 +3277,7 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
           bld4.base_ptr = bld.base_ptr;
           bld4.mip_offsets = bld.mip_offsets;
           bld4.int_size = bld.int_size;
+         bld4.cache = bld.cache;
  
           bld4.vector_width = lp_type_width(type4);
  
@@ -2865,8 +3292,7 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
           bld4.texel_type.length = 4;
  
           bld4.num_mips = bld4.num_lods = 1;
-         if ((gallivm_debug & GALLIVM_DEBUG_NO_QUAD_LOD) &&
-             (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) &&
+         if (bld4.no_quad_lod && bld4.no_rho_approx &&
               (static_texture_state->target == PIPE_TEXTURE_CUBE ||
                static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
               (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
@@ -3048,13 +3474,16 @@ lp_build_sample_gen_func(struct gallivm_state *gallivm,
     LLVMValueRef coords[5];
     LLVMValueRef offsets[3] = { NULL };
     LLVMValueRef lod = NULL;
+   LLVMValueRef ms_index = NULL;
     LLVMValueRef context_ptr;
+   LLVMValueRef thread_data_ptr = NULL;
     LLVMValueRef texel_out[4];
     struct lp_derivatives derivs;
     struct lp_derivatives *deriv_ptr = NULL;
     unsigned num_param = 0;
     unsigned i, num_coords, num_derivs, num_offsets, layer;
     enum lp_sampler_lod_control lod_control;
+   boolean need_cache = FALSE;
  
     lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
                      LP_SAMPLER_LOD_CONTROL_SHIFT;
@@ -3062,8 +3491,19 @@ lp_build_sample_gen_func(struct gallivm_state *gallivm,
     get_target_info(static_texture_state->target,
                     &num_coords, &num_derivs, &num_offsets, &layer);
  
+   if (dynamic_state->cache_ptr) {
+      const struct util_format_description *format_desc;
+      format_desc = util_format_description(static_texture_state->format);
+      if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
+         need_cache = TRUE;
+      }
+   }
+
     /* "unpack" arguments */
     context_ptr = LLVMGetParam(function, num_param++);
+   if (need_cache) {
+      thread_data_ptr = LLVMGetParam(function, num_param++);
+   }
     for (i = 0; i < num_coords; i++) {
        coords[i] = LLVMGetParam(function, num_param++);
     }
@@ -3077,6 +3517,9 @@ lp_build_sample_gen_func(struct gallivm_state *gallivm,
     if (sample_key & LP_SAMPLER_SHADOW) {
        coords[4] = LLVMGetParam(function, num_param++);
     }
+   if (sample_key & LP_SAMPLER_FETCH_MS) {
+      ms_index = LLVMGetParam(function, num_param++);
+   }
     if (sample_key & LP_SAMPLER_OFFSETS) {
        for (i = 0; i < num_offsets; i++) {
           offsets[i] = LLVMGetParam(function, num_param++);
@@ -3114,10 +3557,12 @@ lp_build_sample_gen_func(struct gallivm_state *gallivm,
                              texture_index,
                              sampler_index,
                              context_ptr,
+                            thread_data_ptr,
                              coords,
                              offsets,
                              deriv_ptr,
                              lod,
+                            ms_index,
                              texel_out);
  
     LLVMBuildAggregateRet(gallivm->builder, texel_out, 4);
@@ -3138,7 +3583,9 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm,
                           const struct lp_static_texture_state *static_texture_state,
                           const struct lp_static_sampler_state *static_sampler_state,
                           struct lp_sampler_dynamic_state *dynamic_state,
-                         const struct lp_sampler_params *params)
+                         const struct lp_sampler_params *params,
+                         int texture_index, int sampler_index,
+                         LLVMValueRef *tex_ret)
  {
     LLVMBuilderRef builder = gallivm->builder;
     LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(
@@ -3146,17 +3593,15 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm,
     LLVMValueRef function, inst;
     LLVMValueRef args[LP_MAX_TEX_FUNC_ARGS];
     LLVMBasicBlockRef bb;
-   LLVMValueRef tex_ret;
     unsigned num_args = 0;
     char func_name[64];
     unsigned i, num_coords, num_derivs, num_offsets, layer;
-   unsigned texture_index = params->texture_index;
-   unsigned sampler_index = params->sampler_index;
     unsigned sample_key = params->sample_key;
     const LLVMValueRef *coords = params->coords;
     const LLVMValueRef *offsets = params->offsets;
     const struct lp_derivatives *derivs = params->derivs;
     enum lp_sampler_lod_control lod_control;
+   boolean need_cache = FALSE;
  
     lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
                      LP_SAMPLER_LOD_CONTROL_SHIFT;
@@ -3164,6 +3609,13 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm,
     get_target_info(static_texture_state->target,
                     &num_coords, &num_derivs, &num_offsets, &layer);
  
+   if (dynamic_state->cache_ptr) {
+      const struct util_format_description *format_desc;
+      format_desc = util_format_description(static_texture_state->format);
+      if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
+         need_cache = TRUE;
+      }
+   }
     /*
      * texture function matches are found by name.
      * Thus the name has to include both the texture and sampler unit
@@ -3172,8 +3624,8 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm,
      * Additionally lod_property has to be included too.
      */
  
-   util_snprintf(func_name, sizeof(func_name), "texfunc_res_%d_sam_%d_%x",
-                 texture_index, sampler_index, sample_key);
+   snprintf(func_name, sizeof(func_name), "texfunc_res_%d_sam_%d_%x",
+            texture_index, sampler_index, sample_key);
  
     function = LLVMGetNamedFunction(module, func_name);
  
@@ -3189,6 +3641,9 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm,
         */
  
        arg_types[num_param++] = LLVMTypeOf(params->context_ptr);
+      if (need_cache) {
+         arg_types[num_param++] = LLVMTypeOf(params->thread_data_ptr);
+      }
        for (i = 0; i < num_coords; i++) {
           arg_types[num_param++] = LLVMTypeOf(coords[0]);
           assert(LLVMTypeOf(coords[0]) == LLVMTypeOf(coords[i]));
@@ -3200,6 +3655,9 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm,
        if (sample_key & LP_SAMPLER_SHADOW) {
           arg_types[num_param++] = LLVMTypeOf(coords[0]);
        }
+      if (sample_key & LP_SAMPLER_FETCH_MS) {
+         arg_types[num_param++] = LLVMTypeOf(params->ms_index);
+      }
        if (sample_key & LP_SAMPLER_OFFSETS) {
           for (i = 0; i < num_offsets; i++) {
              arg_types[num_param++] = LLVMTypeOf(offsets[0]);
@@ -3227,12 +3685,13 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm,
  
        for (i = 0; i < num_param; ++i) {
           if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind) {
-            LLVMAddAttribute(LLVMGetParam(function, i), LLVMNoAliasAttribute);
+
+            lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS);
           }
        }
  
        LLVMSetFunctionCallConv(function, LLVMFastCallConv);
-      LLVMSetLinkage(function, LLVMPrivateLinkage);
+      LLVMSetLinkage(function, LLVMInternalLinkage);
  
        lp_build_sample_gen_func(gallivm,
                                 static_texture_state,
@@ -3248,6 +3707,9 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm,
  
     num_args = 0;
     args[num_args++] = params->context_ptr;
+   if (need_cache) {
+      args[num_args++] = params->thread_data_ptr;
+   }
     for (i = 0; i < num_coords; i++) {
        args[num_args++] = coords[i];
     }
@@ -3257,6 +3719,9 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm,
     if (sample_key & LP_SAMPLER_SHADOW) {
        args[num_args++] = coords[4];
     }
+   if (sample_key & LP_SAMPLER_FETCH_MS) {
+      args[num_args++] = params->ms_index;
+   }
     if (sample_key & LP_SAMPLER_OFFSETS) {
        for (i = 0; i < num_offsets; i++) {
           args[num_args++] = offsets[i];
@@ -3275,14 +3740,11 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm,
  
     assert(num_args <= LP_MAX_TEX_FUNC_ARGS);
  
-   tex_ret = LLVMBuildCall(builder, function, args, num_args, "");
+   *tex_ret = LLVMBuildCall(builder, function, args, num_args, "");
     bb = LLVMGetInsertBlock(builder);
     inst = LLVMGetLastInstruction(bb);
     LLVMSetInstructionCallConv(inst, LLVMFastCallConv);
  
-   for (i = 0; i < 4; i++) {
-      params->texel[i] = LLVMBuildExtractValue(gallivm->builder, tex_ret, i, "");
-   }
  }
  
  
@@ -3330,19 +3792,22 @@ lp_build_sample_soa(const struct lp_static_texture_state *static_texture_state,
           op_type != LP_SAMPLER_OP_TEXTURE ||
             ((static_sampler_state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE ||
               static_texture_state->level_zero_only == TRUE) &&
-            static_sampler_state->min_img_filter == static_sampler_state->mag_img_filter &&
-            (static_sampler_state->min_img_filter == PIPE_TEX_FILTER_NEAREST ||
-             static_sampler_state->min_img_filter == PIPE_TEX_FILTER_NEAREST));
+            static_sampler_state->min_img_filter == static_sampler_state->mag_img_filter);
  
        use_tex_func = format_desc && !(simple_format && simple_tex);
     }
  
     if (use_tex_func) {
+      LLVMValueRef tex_ret;
        lp_build_sample_soa_func(gallivm,
                                 static_texture_state,
                                 static_sampler_state,
                                 dynamic_state,
-                               params);
+                               params, params->texture_index, params->sampler_index, &tex_ret);
+
+      for (unsigned i = 0; i < 4; i++) {
+         params->texel[i] = LLVMBuildExtractValue(gallivm->builder, tex_ret, i, "");
+      }
     }
     else {
        lp_build_sample_soa_code(gallivm,
@@ -3354,10 +3819,12 @@ lp_build_sample_soa(const struct lp_static_texture_state *static_texture_state,
                                 params->texture_index,
                                 params->sampler_index,
                                 params->context_ptr,
+                               params->thread_data_ptr,
                                 params->coords,
                                 params->offsets,
                                 params->derivs,
                                 params->lod,
+                               params->ms_index,
                                 params->texel);
     }
  }
@@ -3367,21 +3834,17 @@ void
  lp_build_size_query_soa(struct gallivm_state *gallivm,
                          const struct lp_static_texture_state *static_state,
                          struct lp_sampler_dynamic_state *dynamic_state,
-                        struct lp_type int_type,
-                        unsigned texture_unit,
-                        unsigned target,
-                        LLVMValueRef context_ptr,
-                        boolean is_sviewinfo,
-                        enum lp_sampler_lod_property lod_property,
-                        LLVMValueRef explicit_lod,
-                        LLVMValueRef *sizes_out)
+                        const struct lp_sampler_size_query_params *params)
  {
-   LLVMValueRef lod, level, size;
+   LLVMValueRef lod, level = 0, size;
     LLVMValueRef first_level = NULL;
     int dims, i;
     boolean has_array;
     unsigned num_lods = 1;
     struct lp_build_context bld_int_vec4;
+   LLVMValueRef context_ptr = params->context_ptr;
+   unsigned texture_unit = params->texture_unit;
+   unsigned target = params->target;
  
     if (static_state->format == PIPE_FORMAT_NONE) {
        /*
@@ -3389,9 +3852,9 @@ lp_build_size_query_soa(struct gallivm_state *gallivm,
         * all zero as mandated by d3d10 in this case.
         */
        unsigned chan;
-      LLVMValueRef zero = lp_build_const_vec(gallivm, int_type, 0.0F);
+      LLVMValueRef zero = lp_build_const_vec(gallivm, params->int_type, 0.0F);
        for (chan = 0; chan < 4; chan++) {
-         sizes_out[chan] = zero;
+         params->sizes_out[chan] = zero;
        }
        return;
     }
@@ -3436,13 +3899,19 @@ lp_build_size_query_soa(struct gallivm_state *gallivm,
        break;
     }
  
-   assert(!int_type.floating);
+   assert(!params->int_type.floating);
  
     lp_build_context_init(&bld_int_vec4, gallivm, lp_type_int_vec(32, 128));
  
-   if (explicit_lod) {
+   if (params->samples_only) {
+      params->sizes_out[0] = lp_build_broadcast(gallivm, lp_build_vec_type(gallivm, params->int_type),
+                                                dynamic_state->num_samples(dynamic_state, gallivm,
+                                                                           context_ptr, texture_unit));
+      return;
+   }
+   if (params->explicit_lod) {
        /* FIXME: this needs to honor per-element lod */
-      lod = LLVMBuildExtractElement(gallivm->builder, explicit_lod,
+      lod = LLVMBuildExtractElement(gallivm->builder, params->explicit_lod,
                                      lp_build_const_int32(gallivm, 0), "");
        first_level = dynamic_state->first_level(dynamic_state, gallivm,
                                                 context_ptr, texture_unit);
@@ -3496,7 +3965,7 @@ lp_build_size_query_soa(struct gallivm_state *gallivm,
      * if level is out of bounds (note this can't cover unbound texture
      * here, which also requires returning zero).
      */
-   if (explicit_lod && is_sviewinfo) {
+   if (params->explicit_lod && params->is_sviewinfo) {
        LLVMValueRef last_level, out, out1;
        struct lp_build_context leveli_bld;
  
@@ -3518,13 +3987,13 @@ lp_build_size_query_soa(struct gallivm_state *gallivm,
        size = lp_build_andnot(&bld_int_vec4, size, out);
     }
     for (i = 0; i < dims + (has_array ? 1 : 0); i++) {
-      sizes_out[i] = lp_build_extract_broadcast(gallivm, bld_int_vec4.type, int_type,
+      params->sizes_out[i] = lp_build_extract_broadcast(gallivm, bld_int_vec4.type, params->int_type,
                                                  size,
                                                  lp_build_const_int32(gallivm, i));
     }
-   if (is_sviewinfo) {
+   if (params->is_sviewinfo) {
        for (; i < 4; i++) {
-         sizes_out[i] = lp_build_const_vec(gallivm, int_type, 0.0);
+         params->sizes_out[i] = lp_build_const_vec(gallivm, params->int_type, 0.0);
        }
     }
  
@@ -3532,7 +4001,7 @@ lp_build_size_query_soa(struct gallivm_state *gallivm,
      * if there's no explicit_lod (buffers, rects) queries requiring nr of
      * mips would be illegal.
      */
-   if (is_sviewinfo && explicit_lod) {
+   if (params->is_sviewinfo && params->explicit_lod) {
        struct lp_build_context bld_int_scalar;
        LLVMValueRef num_levels;
        lp_build_context_init(&bld_int_scalar, gallivm, lp_type_int(32));
@@ -3548,7 +4017,214 @@ lp_build_size_query_soa(struct gallivm_state *gallivm,
           num_levels = lp_build_sub(&bld_int_scalar, last_level, first_level);
           num_levels = lp_build_add(&bld_int_scalar, num_levels, bld_int_scalar.one);
        }
-      sizes_out[3] = lp_build_broadcast(gallivm, lp_build_vec_type(gallivm, int_type),
+      params->sizes_out[3] = lp_build_broadcast(gallivm, lp_build_vec_type(gallivm, params->int_type),
                                          num_levels);
     }
  }
+
+static void
+lp_build_do_atomic_soa(struct gallivm_state *gallivm,
+                       const struct util_format_description *format_desc,
+                       struct lp_type type,
+                       LLVMValueRef exec_mask,
+                       LLVMValueRef base_ptr,
+                       LLVMValueRef offset,
+                       LLVMValueRef out_of_bounds,
+                       unsigned img_op,
+                       LLVMAtomicRMWBinOp op,
+                       const LLVMValueRef rgba_in[4],
+                       const LLVMValueRef rgba2_in[4],
+                       LLVMValueRef atomic_result[4])
+{
+   enum pipe_format format = format_desc->format;
+
+   if (format != PIPE_FORMAT_R32_UINT && format != PIPE_FORMAT_R32_SINT && format != PIPE_FORMAT_R32_FLOAT) {
+      atomic_result[0] = lp_build_zero(gallivm, type);
+      return;
+   }
+
+   LLVMValueRef atom_res = lp_build_alloca(gallivm,
+                                           LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), type.length), "");
+
+   offset = LLVMBuildGEP(gallivm->builder, base_ptr, &offset, 1, "");
+   struct lp_build_loop_state loop_state;
+   lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0));
+   struct lp_build_if_state ifthen;
+   LLVMValueRef cond;
+   LLVMValueRef packed = rgba_in[0], packed2 = rgba2_in[0];
+
+   LLVMValueRef should_store_mask = LLVMBuildAnd(gallivm->builder, exec_mask, LLVMBuildNot(gallivm->builder, out_of_bounds, ""), "store_mask");
+   assert(exec_mask);
+
+   cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, should_store_mask, lp_build_const_int_vec(gallivm, type, 0), "");
+   cond = LLVMBuildExtractElement(gallivm->builder, cond, loop_state.counter, "");
+   lp_build_if(&ifthen, gallivm, cond);
+
+   LLVMValueRef data = LLVMBuildExtractElement(gallivm->builder, packed, loop_state.counter, "");
+   LLVMValueRef cast_base_ptr = LLVMBuildExtractElement(gallivm->builder, offset, loop_state.counter, "");
+   cast_base_ptr = LLVMBuildBitCast(gallivm->builder, cast_base_ptr, LLVMPointerType(LLVMInt32TypeInContext(gallivm->context), 0), "");
+   data = LLVMBuildBitCast(gallivm->builder, data, LLVMInt32TypeInContext(gallivm->context), "");
+
+   if (img_op == LP_IMG_ATOMIC_CAS) {
+      LLVMValueRef cas_src_ptr = LLVMBuildExtractElement(gallivm->builder, packed2, loop_state.counter, "");
+      LLVMValueRef cas_src = LLVMBuildBitCast(gallivm->builder, cas_src_ptr, LLVMInt32TypeInContext(gallivm->context), "");
+      data = LLVMBuildAtomicCmpXchg(gallivm->builder, cast_base_ptr, data,
+                                    cas_src,
+                                    LLVMAtomicOrderingSequentiallyConsistent,
+                                    LLVMAtomicOrderingSequentiallyConsistent,
+                                    false);
+      data = LLVMBuildExtractValue(gallivm->builder, data, 0, "");
+   } else {
+      data = LLVMBuildAtomicRMW(gallivm->builder, op,
+                                cast_base_ptr, data,
+                                LLVMAtomicOrderingSequentiallyConsistent,
+                                false);
+   }
+
+   LLVMValueRef temp_res = LLVMBuildLoad(gallivm->builder, atom_res, "");
+   temp_res = LLVMBuildInsertElement(gallivm->builder, temp_res, data, loop_state.counter, "");
+   LLVMBuildStore(gallivm->builder, temp_res, atom_res);
+
+   lp_build_endif(&ifthen);
+   lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, type.length),
+                          NULL, LLVMIntUGE);
+   atomic_result[0] = LLVMBuildLoad(gallivm->builder, atom_res, "");
+}
+
+void
+lp_build_img_op_soa(const struct lp_static_texture_state *static_texture_state,
+                    struct lp_sampler_dynamic_state *dynamic_state,
+                    struct gallivm_state *gallivm,
+                    const struct lp_img_params *params)
+{
+   unsigned target = params->target;
+   unsigned dims = texture_dims(target);
+   /** regular scalar int type */
+   struct lp_type int_type, int_coord_type;
+   struct lp_build_context int_bld, int_coord_bld;
+   const struct util_format_description *format_desc = util_format_description(static_texture_state->format);
+   LLVMValueRef x = params->coords[0], y = params->coords[1], z = params->coords[2];
+   LLVMValueRef ms_index = params->ms_index;
+   LLVMValueRef row_stride_vec = NULL, img_stride_vec = NULL;
+   int_type = lp_type_int(32);
+   int_coord_type = lp_int_type(params->type);
+   lp_build_context_init(&int_bld, gallivm, int_type);
+   lp_build_context_init(&int_coord_bld, gallivm, int_coord_type);
+
+   LLVMValueRef offset, i, j;
+
+   LLVMValueRef row_stride = dynamic_state->row_stride(dynamic_state, gallivm,
+                                                       params->context_ptr, params->image_index);
+   LLVMValueRef img_stride = dynamic_state->img_stride(dynamic_state, gallivm,
+                                                       params->context_ptr, params->image_index);
+   LLVMValueRef base_ptr = dynamic_state->base_ptr(dynamic_state, gallivm,
+                                                   params->context_ptr, params->image_index);
+   LLVMValueRef width = dynamic_state->width(dynamic_state, gallivm,
+                                                params->context_ptr, params->image_index);
+   LLVMValueRef height = dynamic_state->height(dynamic_state, gallivm,
+                                               params->context_ptr, params->image_index);
+   LLVMValueRef depth = dynamic_state->depth(dynamic_state, gallivm,
+                                              params->context_ptr, params->image_index);
+   LLVMValueRef num_samples = NULL, sample_stride = NULL;
+   if (ms_index) {
+      num_samples = dynamic_state->num_samples(dynamic_state, gallivm,
+                                               params->context_ptr, params->image_index);
+      sample_stride = dynamic_state->sample_stride(dynamic_state, gallivm,
+                                                   params->context_ptr, params->image_index);
+   }
+
+   boolean layer_coord = has_layer_coord(target);
+
+   width = lp_build_broadcast_scalar(&int_coord_bld, width);
+   if (dims >= 2) {
+      height = lp_build_broadcast_scalar(&int_coord_bld, height);
+      row_stride_vec = lp_build_broadcast_scalar(&int_coord_bld, row_stride);
+   }
+   if (dims >= 3 || layer_coord) {
+      depth = lp_build_broadcast_scalar(&int_coord_bld, depth);
+      img_stride_vec = lp_build_broadcast_scalar(&int_coord_bld, img_stride);
+   }
+
+   LLVMValueRef out_of_bounds = int_coord_bld.zero;
+   LLVMValueRef out1;
+   out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
+   out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1);
+
+   if (dims >= 2) {
+      out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
+      out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1);
+   }
+   if (dims >= 3) {
+      out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
+      out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1);
+   }
+   lp_build_sample_offset(&int_coord_bld,
+                          format_desc,
+                          x, y, z, row_stride_vec, img_stride_vec,
+                          &offset, &i, &j);
+
+   if (ms_index) {
+      out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, ms_index, lp_build_broadcast_scalar(&int_coord_bld, num_samples));
+      out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1);
+
+      offset = lp_build_add(&int_coord_bld, offset,
+                            lp_build_mul(&int_coord_bld, lp_build_broadcast_scalar(&int_coord_bld, sample_stride),
+                                         ms_index));
+   }
+   if (params->img_op == LP_IMG_LOAD) {
+      struct lp_type texel_type = params->type;
+      if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
+          format_desc->channel[0].pure_integer) {
+         if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) {
+            texel_type = lp_type_int_vec(params->type.width, params->type.width * params->type.length);
+         } else if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) {
+            texel_type = lp_type_uint_vec(params->type.width, params->type.width * params->type.length);
+         }
+      }
+
+      if (static_texture_state->format == PIPE_FORMAT_NONE) {
+         /*
+          * If there's nothing bound, format is NONE, and we must return
+          * all zero as mandated by d3d10 in this case.
+          */
+         unsigned chan;
+         LLVMValueRef zero = lp_build_zero(gallivm, params->type);
+         for (chan = 0; chan < 4; chan++) {
+            params->outdata[chan] = zero;
+         }
+         return;
+      }
+
+      offset = lp_build_andnot(&int_coord_bld, offset, out_of_bounds);
+      struct lp_build_context texel_bld;
+      lp_build_context_init(&texel_bld, gallivm, texel_type);
+      lp_build_fetch_rgba_soa(gallivm,
+                              format_desc,
+                              texel_type, TRUE,
+                              base_ptr, offset,
+                              i, j,
+                              NULL,
+                              params->outdata);
+
+      for (unsigned chan = 0; chan < 4; chan++) {
+         params->outdata[chan] = lp_build_select(&texel_bld, out_of_bounds,
+                                                 texel_bld.zero, params->outdata[chan]);
+      }
+   } else if (params->img_op == LP_IMG_STORE) {
+      if (static_texture_state->format == PIPE_FORMAT_NONE)
+         return;
+      lp_build_store_rgba_soa(gallivm, format_desc, params->type, params->exec_mask, base_ptr, offset, out_of_bounds,
+                              params->indata);
+   } else {
+      if (static_texture_state->format == PIPE_FORMAT_NONE) {
+         /*
+         * For atomic operation just return 0 in the unbound case to avoid a crash.
+          */
+         LLVMValueRef zero = lp_build_zero(gallivm, params->type);
+         params->outdata[0] = zero;
+         return;
+      }
+      lp_build_do_atomic_soa(gallivm, format_desc, params->type, params->exec_mask, base_ptr, offset, out_of_bounds,
+                             params->img_op, params->op, params->indata, params->indata2, params->outdata);
+   }
+}