#include "util/u_dump.h"
#include "util/u_memory.h"
#include "util/u_math.h"
-#include "util/u_format.h"
+#include "util/format/u_format.h"
#include "util/u_cpu_detect.h"
#include "util/format_rgb9e5.h"
#include "lp_bld_debug.h"
#include "lp_bld_quad.h"
#include "lp_bld_pack.h"
#include "lp_bld_intr.h"
+#include "lp_bld_misc.h"
/**
for (chan = 0; chan < 4; chan++) {
unsigned chan_s;
/* reverse-map channel... */
- for (chan_s = 0; chan_s < 4; chan_s++) {
- if (chan_s == format_desc->swizzle[chan]) {
+ if (util_format_has_stencil(format_desc)) {
+ if (chan == 0)
+ chan_s = 0;
+ else
break;
+ }
+ else {
+ for (chan_s = 0; chan_s < 4; chan_s++) {
+ if (chan_s == format_desc->swizzle[chan]) {
+ break;
+ }
}
}
if (chan_s <= 3) {
/**
- * Helper to compute the mirror function for the PIPE_WRAP_MIRROR modes.
+ * Helper to compute the mirror function for the PIPE_WRAP_MIRROR_REPEAT mode.
+ * (Note that with pot sizes could do this much more easily post-scale
+ * with some bit arithmetic.)
*/
static LLVMValueRef
lp_build_coord_mirror(struct lp_build_sample_context *bld,
- LLVMValueRef coord)
+ LLVMValueRef coord, boolean posOnly)
{
struct lp_build_context *coord_bld = &bld->coord_bld;
- struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
- LLVMValueRef fract, flr, isOdd;
-
- lp_build_ifloor_fract(coord_bld, coord, &flr, &fract);
- /* kill off NaNs */
- /* XXX: not safe without arch rounding, fract can be anything. */
- fract = lp_build_max_ext(coord_bld, fract, coord_bld->zero,
- GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
-
- /* isOdd = flr & 1 */
- isOdd = LLVMBuildAnd(bld->gallivm->builder, flr, int_coord_bld->one, "");
+ LLVMValueRef fract;
+ LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
- /* make coord positive or negative depending on isOdd */
- /* XXX slight overkill masking out sign bit is unnecessary */
- coord = lp_build_set_sign(coord_bld, fract, isOdd);
+ /*
+ * We can just use 2*(x - round(0.5*x)) to do all the mirroring,
+ * it all works out. (The result is in range [-1, 1.0], negative if
+ * the coord is in the "odd" section, otherwise positive.)
+ */
- /* convert isOdd to float */
- isOdd = lp_build_int_to_float(coord_bld, isOdd);
+ coord = lp_build_mul(coord_bld, coord, half);
+ fract = lp_build_round(coord_bld, coord);
+ fract = lp_build_sub(coord_bld, coord, fract);
+ coord = lp_build_add(coord_bld, fract, fract);
- /* add isOdd to coord */
- coord = lp_build_add(coord_bld, coord, isOdd);
+ if (posOnly) {
+ /*
+ * Theoretically it's not quite 100% accurate because the spec says
+ * that ultimately a scaled coord of -x.0 should map to int coord
+ * -x + 1 with mirroring, not -x (this does not matter for bilinear
+ * filtering).
+ */
+ coord = lp_build_abs(coord_bld, coord);
+ /* kill off NaNs */
+ /* XXX: not safe without arch rounding, fract can be anything. */
+ coord = lp_build_max_ext(coord_bld, coord, coord_bld->zero,
+ GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
+ }
return coord;
}
*/
static void
lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
+ boolean is_gather,
LLVMValueRef coord,
LLVMValueRef length,
LLVMValueRef length_f,
coord = lp_build_add(coord_bld, coord, offset);
}
- /* clamp to [0, length] */
+ /*
+ * clamp to [0, length]
+ *
+ * Unlike some other wrap modes, this should be correct for gather
+ * too. GL_CLAMP explicitly does this clamp on the coord prior to
+ * actual wrapping (which is per sample).
+ */
coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, length_f);
coord = lp_build_sub(coord_bld, coord, half);
/* clamp to length max */
coord = lp_build_min_ext(coord_bld, coord, length_f,
GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
- /* subtract 0.5 */
- coord = lp_build_sub(coord_bld, coord, half);
- /* clamp to [0, length - 0.5] */
- coord = lp_build_max(coord_bld, coord, coord_bld->zero);
- /* convert to int, compute lerp weight */
- lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
- coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+ if (!is_gather) {
+ /* subtract 0.5 */
+ coord = lp_build_sub(coord_bld, coord, half);
+ /* clamp to [0, length - 0.5] */
+ coord = lp_build_max(coord_bld, coord, coord_bld->zero);
+ /* convert to int, compute lerp weight */
+ lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
+ coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+ } else {
+ /*
+ * The non-gather path will end up with coords 0, 1 if coord was
+ * smaller than 0.5 (with corresponding weight 0.0 so it doesn't
+ * really matter what the second coord is). But for gather, we
+ * really need to end up with coords 0, 0.
+ */
+ coord = lp_build_max(coord_bld, coord, coord_bld->zero);
+ coord0 = lp_build_sub(coord_bld, coord, half);
+ coord1 = lp_build_add(coord_bld, coord, half);
+ /* Values range ([-0.5, length_f - 0.5], [0.5, length_f + 0.5]) */
+ coord0 = lp_build_itrunc(coord_bld, coord0);
+ coord1 = lp_build_itrunc(coord_bld, coord1);
+ weight = coord_bld->undef;
+ }
/* coord1 = min(coord1, length-1) */
coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
break;
offset = lp_build_int_to_float(coord_bld, offset);
coord = lp_build_add(coord_bld, coord, offset);
}
- /* was: clamp to [-0.5, length + 0.5], then sub 0.5 */
- /* can skip clamp (though might not work for very large coord values) */
+ /*
+ * We don't need any clamp. Technically, for very large (pos or neg)
+ * (or infinite) values, clamp against [-length, length] would be
+ * correct, but we don't need to guarantee any specific
+ * result for such coords (the ifloor will be undefined, but for modes
+ * requiring border all resulting coords are safe).
+ */
coord = lp_build_sub(coord_bld, coord, half);
/* convert to int, compute lerp weight */
lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
offset = lp_build_div(coord_bld, offset, length_f);
coord = lp_build_add(coord_bld, coord, offset);
}
- /* compute mirror function */
- coord = lp_build_coord_mirror(bld, coord);
+ if (!is_gather) {
+ /* compute mirror function */
+ coord = lp_build_coord_mirror(bld, coord, TRUE);
- /* scale coord to length */
- coord = lp_build_mul(coord_bld, coord, length_f);
- coord = lp_build_sub(coord_bld, coord, half);
+ /* scale coord to length */
+ coord = lp_build_mul(coord_bld, coord, length_f);
+ coord = lp_build_sub(coord_bld, coord, half);
- /* convert to int, compute lerp weight */
- lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
- coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+ /* convert to int, compute lerp weight */
+ lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
+ coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+
+ /* coord0 = max(coord0, 0) */
+ coord0 = lp_build_max(int_coord_bld, coord0, int_coord_bld->zero);
+ /* coord1 = min(coord1, length-1) */
+ coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
+ } else {
+ /*
+ * This is pretty reasonable in the end, all that the tests care
+ * about is nasty edge cases (scaled coords x.5, so the individual
+ * coords are actually integers, which is REALLY tricky to get right
+ * due to this working differently both for negative numbers as well
+ * as for even/odd cases). But with enough magic it's not too complex
+ * after all.
+ * Maybe should try a bit arithmetic one though for POT textures...
+ */
+ LLVMValueRef isNeg;
+ /*
+ * Wrapping just once still works, even though it means we can
+ * get "wrong" sign due to performing mirror in the middle of the
+ * two coords (because this can only happen very near the odd/even
+ * edges, so both coords will actually end up as 0 or length - 1
+ * in the end).
+ * For GL4 gather with per-sample offsets we'd need to do the
+ * mirroring per coord too.
+ */
+ coord = lp_build_coord_mirror(bld, coord, FALSE);
+ coord = lp_build_mul(coord_bld, coord, length_f);
- /* coord0 = max(coord0, 0) */
- coord0 = lp_build_max(int_coord_bld, coord0, int_coord_bld->zero);
- /* coord1 = min(coord1, length-1) */
- coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
+ /*
+ * NaNs should be safe here, we'll do away with them with
+ * the ones' complement plus min.
+ */
+ coord0 = lp_build_sub(coord_bld, coord, half);
+ coord0 = lp_build_ifloor(coord_bld, coord0);
+ coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+ /* ones complement for neg numbers (mirror(negX) = X - 1) */
+ isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS,
+ coord0, int_coord_bld->zero);
+ coord0 = lp_build_xor(int_coord_bld, coord0, isNeg);
+ isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS,
+ coord1, int_coord_bld->zero);
+ coord1 = lp_build_xor(int_coord_bld, coord1, isNeg);
+ coord0 = lp_build_min(int_coord_bld, coord0, length_minus_one);
+ coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
+
+ weight = coord_bld->undef;
+ }
break;
case PIPE_TEX_WRAP_MIRROR_CLAMP:
offset = lp_build_int_to_float(coord_bld, offset);
coord = lp_build_add(coord_bld, coord, offset);
}
+ /*
+ * XXX: probably not correct for gather, albeit I'm not
+ * entirely sure as it's poorly specified. The wrapping looks
+ * correct according to the spec which is against gl 1.2.1,
+ * however negative values will be swapped - gl re-specified
+ * wrapping with newer versions (no more pre-clamp except with
+ * GL_CLAMP).
+ */
coord = lp_build_abs(coord_bld, coord);
/* clamp to [0, length] */
- coord = lp_build_min(coord_bld, coord, length_f);
+ coord = lp_build_min_ext(coord_bld, coord, length_f,
+ GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
coord = lp_build_sub(coord_bld, coord, half);
offset = lp_build_int_to_float(coord_bld, offset);
coord = lp_build_add(coord_bld, coord, offset);
}
- coord = lp_build_abs(coord_bld, coord);
-
- /* clamp to length max */
- coord = lp_build_min_ext(coord_bld, coord, length_f,
- GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
- /* subtract 0.5 */
- coord = lp_build_sub(coord_bld, coord, half);
- /* clamp to [0, length - 0.5] */
- coord = lp_build_max(coord_bld, coord, coord_bld->zero);
+ if (!is_gather) {
+ coord = lp_build_abs(coord_bld, coord);
+
+ /* clamp to length max */
+ coord = lp_build_min_ext(coord_bld, coord, length_f,
+ GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
+ /* subtract 0.5 */
+ coord = lp_build_sub(coord_bld, coord, half);
+ /* clamp to [0, length - 0.5] */
+ coord = lp_build_max(coord_bld, coord, coord_bld->zero);
+
+ /* convert to int, compute lerp weight */
+ lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
+ coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+ /* coord1 = min(coord1, length-1) */
+ coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
+ } else {
+ /*
+ * The non-gather path will swap coord0/1 if coord was negative,
+ * which is ok for filtering since the filter weight matches
+ * accordingly. Also, if coord is close to zero, coord0/1 will
+ * be 0 and 1, instead of 0 and 0 (again ok due to filter
+ * weight being 0.0). Both issues need to be fixed for gather.
+ */
+ LLVMValueRef isNeg;
- /* convert to int, compute lerp weight */
- lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
- coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
- /* coord1 = min(coord1, length-1) */
- coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
+ /*
+ * Actually wanted to cheat here and use:
+ * coord1 = lp_build_iround(coord_bld, coord);
+ * but it's not good enough for some tests (even piglit
+ * textureGather is set up in a way so the coords area always
+ * .5, that is right at the crossover points).
+ * So do ordinary sub/floor, then do ones' complement
+ * for negative numbers.
+ * (Note can't just do sub|add/abs/itrunc per coord either -
+ * because the spec demands that mirror(3.0) = 3 but
+ * mirror(-3.0) = 2.)
+ */
+ coord = lp_build_sub(coord_bld, coord, half);
+ coord0 = lp_build_ifloor(coord_bld, coord);
+ coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+ isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, coord0,
+ int_coord_bld->zero);
+ coord0 = lp_build_xor(int_coord_bld, isNeg, coord0);
+ coord0 = lp_build_min(int_coord_bld, coord0, length_minus_one);
+
+ isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, coord1,
+ int_coord_bld->zero);
+ coord1 = lp_build_xor(int_coord_bld, isNeg, coord1);
+ coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
+
+ weight = coord_bld->undef;
+ }
}
break;
offset = lp_build_int_to_float(coord_bld, offset);
coord = lp_build_add(coord_bld, coord, offset);
}
+ /*
+ * XXX: probably not correct for gather due to swapped
+ * order if coord is negative (same rationale as for
+ * MIRROR_CLAMP).
+ */
coord = lp_build_abs(coord_bld, coord);
- /* was: clamp to [-0.5, length + 0.5] then sub 0.5 */
- /* skip clamp - always positive, and other side
- only potentially matters for very large coords */
+ /*
+ * We don't need any clamp. Technically, for very large
+ * (or infinite) values, clamp against length would be
+ * correct, but we don't need to guarantee any specific
+ * result for such coords (the ifloor will be undefined, but
+ * for modes requiring border all resulting coords are safe).
+ */
coord = lp_build_sub(coord_bld, coord, half);
/* convert to int, compute lerp weight */
coord = lp_build_add(coord_bld, coord, offset);
}
/* compute mirror function */
- coord = lp_build_coord_mirror(bld, coord);
+ coord = lp_build_coord_mirror(bld, coord, TRUE);
/* scale coord to length */
assert(bld->static_sampler_state->normalized_coords);
LLVMValueRef img_stride_vec,
LLVMValueRef data_ptr,
LLVMValueRef mipoffsets,
- LLVMValueRef *coords,
+ const LLVMValueRef *coords,
const LLVMValueRef *offsets,
LLVMValueRef colors_out[4])
{
LLVMValueRef img_stride_vec,
LLVMValueRef data_ptr,
LLVMValueRef mipoffsets,
- LLVMValueRef *coords,
+ const LLVMValueRef *coords,
const LLVMValueRef *offsets,
LLVMValueRef colors_out[4])
{
LLVMValueRef neighbors[2][2][4];
int chan, texel_index;
boolean seamless_cube_filter, accurate_cube_corners;
+ unsigned chan_swiz = bld->static_texture_state->swizzle_r;
+
+ if (is_gather) {
+ switch (bld->gather_comp) {
+ case 0: chan_swiz = bld->static_texture_state->swizzle_r; break;
+ case 1: chan_swiz = bld->static_texture_state->swizzle_g; break;
+ case 2: chan_swiz = bld->static_texture_state->swizzle_b; break;
+ case 3: chan_swiz = bld->static_texture_state->swizzle_a; break;
+ default:
+ break;
+ }
+ }
seamless_cube_filter = (bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
bld->static_sampler_state->seamless_cube_map;
+
/*
- * XXX I don't know how this is really supposed to work with gather. From GL
- * spec wording (not gather specific) it sounds like the 4th missing texel
- * should be an average of the other 3, hence for gather could return this.
- * This is however NOT how the code here works, which just fixes up the
- * weights used for filtering instead. And of course for gather there is
- * no filter to tweak...
+ * Disable accurate cube corners for integer textures, which should only
+ * get here in the gather path.
*/
accurate_cube_corners = ACCURATE_CUBE_CORNERS && seamless_cube_filter &&
- !is_gather;
+ !util_format_is_pure_integer(bld->static_texture_state->format);
lp_build_extract_image_sizes(bld,
&bld->int_size_bld,
*/
if (!seamless_cube_filter) {
- lp_build_sample_wrap_linear(bld, coords[0], width_vec,
+ lp_build_sample_wrap_linear(bld, is_gather, coords[0], width_vec,
flt_width_vec, offsets[0],
bld->static_texture_state->pot_width,
bld->static_sampler_state->wrap_s,
x11 = x01;
if (dims >= 2) {
- lp_build_sample_wrap_linear(bld, coords[1], height_vec,
+ lp_build_sample_wrap_linear(bld, is_gather, coords[1], height_vec,
flt_height_vec, offsets[1],
bld->static_texture_state->pot_height,
bld->static_sampler_state->wrap_t,
y11 = y10;
if (dims == 3) {
- lp_build_sample_wrap_linear(bld, coords[2], depth_vec,
+ lp_build_sample_wrap_linear(bld, is_gather, coords[2], depth_vec,
flt_depth_vec, offsets[2],
bld->static_texture_state->pot_depth,
bld->static_sampler_state->wrap_r,
struct lp_build_if_state edge_if;
LLVMTypeRef int1t;
LLVMValueRef new_faces[4], new_xcoords[4][2], new_ycoords[4][2];
- LLVMValueRef coord, have_edge, have_corner;
+ LLVMValueRef coord0, coord1, have_edge, have_corner;
LLVMValueRef fall_off_ym_notxm, fall_off_ym_notxp, fall_off_x, fall_off_y;
LLVMValueRef fall_off_yp_notxm, fall_off_yp_notxp;
LLVMValueRef x0, x1, y0, y1, y0_clamped, y1_clamped;
*/
/* should always have normalized coords, and offsets are undefined */
assert(bld->static_sampler_state->normalized_coords);
- coord = lp_build_mul(coord_bld, coords[0], flt_width_vec);
+ /*
+ * The coords should all be between [0,1] however we can have NaNs,
+ * which will wreak havoc. In particular the y1_clamped value below
+ * can be -INT_MAX (on x86) and be propagated right through (probably
+ * other values might be bogus in the end too).
+ * So kill off the NaNs here.
+ */
+ coord0 = lp_build_max_ext(coord_bld, coords[0], coord_bld->zero,
+ GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
+ coord0 = lp_build_mul(coord_bld, coord0, flt_width_vec);
/* instead of clamp, build mask if overflowed */
- coord = lp_build_sub(coord_bld, coord, half);
+ coord0 = lp_build_sub(coord_bld, coord0, half);
/* convert to int, compute lerp weight */
/* not ideal with AVX (and no AVX2) */
- lp_build_ifloor_fract(coord_bld, coord, &x0, &s_fpart);
+ lp_build_ifloor_fract(coord_bld, coord0, &x0, &s_fpart);
x1 = lp_build_add(ivec_bld, x0, ivec_bld->one);
- coord = lp_build_mul(coord_bld, coords[1], flt_height_vec);
- coord = lp_build_sub(coord_bld, coord, half);
- lp_build_ifloor_fract(coord_bld, coord, &y0, &t_fpart);
+ coord1 = lp_build_max_ext(coord_bld, coords[1], coord_bld->zero,
+ GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
+ coord1 = lp_build_mul(coord_bld, coord1, flt_height_vec);
+ coord1 = lp_build_sub(coord_bld, coord1, half);
+ lp_build_ifloor_fract(coord_bld, coord1, &y0, &t_fpart);
y1 = lp_build_add(ivec_bld, y0, ivec_bld->one);
fall_off[0] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, x0, ivec_bld->zero);
* as well) here.
*/
if (accurate_cube_corners) {
- LLVMValueRef w00, w01, w10, w11, wx0, wy0;
- LLVMValueRef c_weight, c00, c01, c10, c11;
- LLVMValueRef have_corner, one_third, tmp;
+ LLVMValueRef c00, c01, c10, c11, c00f, c01f, c10f, c11f;
+ LLVMValueRef have_corner, one_third;
- colorss[0] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs");
- colorss[1] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs");
- colorss[2] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs");
- colorss[3] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs");
+ colorss[0] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs0");
+ colorss[1] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs1");
+ colorss[2] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs2");
+ colorss[3] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs3");
have_corner = LLVMBuildLoad(builder, have_corners, "");
lp_build_if(&corner_if, bld->gallivm, have_corner);
- /*
- * we can't use standard 2d lerp as we need per-element weight
- * in case of corners, so just calculate bilinear result as
- * w00*s00 + w01*s01 + w10*s10 + w11*s11.
- * (This is actually less work than using 2d lerp, 7 vs. 9 instructions,
- * however calculating the weights needs another 6, so actually probably
- * not slower than 2d lerp only for 4 channels as weights only need
- * to be calculated once - of course fixing the weights has additional cost.)
- */
- wx0 = lp_build_sub(coord_bld, coord_bld->one, s_fpart);
- wy0 = lp_build_sub(coord_bld, coord_bld->one, t_fpart);
- w00 = lp_build_mul(coord_bld, wx0, wy0);
- w01 = lp_build_mul(coord_bld, s_fpart, wy0);
- w10 = lp_build_mul(coord_bld, wx0, t_fpart);
- w11 = lp_build_mul(coord_bld, s_fpart, t_fpart);
-
- /* find corner weight */
+ one_third = lp_build_const_vec(bld->gallivm, coord_bld->type,
+ 1.0f/3.0f);
+
+ /* find corner */
c00 = lp_build_and(ivec_bld, fall_off[0], fall_off[2]);
- c_weight = lp_build_select(coord_bld, c00, w00, coord_bld->zero);
+ c00f = LLVMBuildBitCast(builder, c00, coord_bld->vec_type, "");
c01 = lp_build_and(ivec_bld, fall_off[1], fall_off[2]);
- c_weight = lp_build_select(coord_bld, c01, w01, c_weight);
+ c01f = LLVMBuildBitCast(builder, c01, coord_bld->vec_type, "");
c10 = lp_build_and(ivec_bld, fall_off[0], fall_off[3]);
- c_weight = lp_build_select(coord_bld, c10, w10, c_weight);
+ c10f = LLVMBuildBitCast(builder, c10, coord_bld->vec_type, "");
c11 = lp_build_and(ivec_bld, fall_off[1], fall_off[3]);
- c_weight = lp_build_select(coord_bld, c11, w11, c_weight);
+ c11f = LLVMBuildBitCast(builder, c11, coord_bld->vec_type, "");
- /*
- * add 1/3 of the corner weight to each of the 3 other samples
- * and null out corner weight
- */
- one_third = lp_build_const_vec(bld->gallivm, coord_bld->type, 1.0f/3.0f);
- c_weight = lp_build_mul(coord_bld, c_weight, one_third);
- w00 = lp_build_add(coord_bld, w00, c_weight);
- c00 = LLVMBuildBitCast(builder, c00, coord_bld->vec_type, "");
- w00 = lp_build_andnot(coord_bld, w00, c00);
- w01 = lp_build_add(coord_bld, w01, c_weight);
- c01 = LLVMBuildBitCast(builder, c01, coord_bld->vec_type, "");
- w01 = lp_build_andnot(coord_bld, w01, c01);
- w10 = lp_build_add(coord_bld, w10, c_weight);
- c10 = LLVMBuildBitCast(builder, c10, coord_bld->vec_type, "");
- w10 = lp_build_andnot(coord_bld, w10, c10);
- w11 = lp_build_add(coord_bld, w11, c_weight);
- c11 = LLVMBuildBitCast(builder, c11, coord_bld->vec_type, "");
- w11 = lp_build_andnot(coord_bld, w11, c11);
+ if (!is_gather) {
+ /*
+ * we can't use standard 2d lerp as we need per-element weight
+ * in case of corners, so just calculate bilinear result as
+ * w00*s00 + w01*s01 + w10*s10 + w11*s11.
+ * (This is actually less work than using 2d lerp, 7 vs. 9
+ * instructions, however calculating the weights needs another 6,
+ * so actually probably not slower than 2d lerp only for 4 channels
+ * as weights only need to be calculated once - of course fixing
+ * the weights has additional cost.)
+ */
+ LLVMValueRef w00, w01, w10, w11, wx0, wy0, c_weight, tmp;
+ wx0 = lp_build_sub(coord_bld, coord_bld->one, s_fpart);
+ wy0 = lp_build_sub(coord_bld, coord_bld->one, t_fpart);
+ w00 = lp_build_mul(coord_bld, wx0, wy0);
+ w01 = lp_build_mul(coord_bld, s_fpart, wy0);
+ w10 = lp_build_mul(coord_bld, wx0, t_fpart);
+ w11 = lp_build_mul(coord_bld, s_fpart, t_fpart);
+
+ /* find corner weight */
+ c_weight = lp_build_select(coord_bld, c00, w00, coord_bld->zero);
+ c_weight = lp_build_select(coord_bld, c01, w01, c_weight);
+ c_weight = lp_build_select(coord_bld, c10, w10, c_weight);
+ c_weight = lp_build_select(coord_bld, c11, w11, c_weight);
- if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
- for (chan = 0; chan < 4; chan++) {
- colors0[chan] = lp_build_mul(coord_bld, w00, neighbors[0][0][chan]);
- tmp = lp_build_mul(coord_bld, w01, neighbors[0][1][chan]);
- colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
- tmp = lp_build_mul(coord_bld, w10, neighbors[1][0][chan]);
- colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
- tmp = lp_build_mul(coord_bld, w11, neighbors[1][1][chan]);
- colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
+ /*
+ * add 1/3 of the corner weight to the weight of the 3 other
+ * samples and null out corner weight.
+ */
+ c_weight = lp_build_mul(coord_bld, c_weight, one_third);
+ w00 = lp_build_add(coord_bld, w00, c_weight);
+ w00 = lp_build_andnot(coord_bld, w00, c00f);
+ w01 = lp_build_add(coord_bld, w01, c_weight);
+ w01 = lp_build_andnot(coord_bld, w01, c01f);
+ w10 = lp_build_add(coord_bld, w10, c_weight);
+ w10 = lp_build_andnot(coord_bld, w10, c10f);
+ w11 = lp_build_add(coord_bld, w11, c_weight);
+ w11 = lp_build_andnot(coord_bld, w11, c11f);
+
+ if (bld->static_sampler_state->compare_mode ==
+ PIPE_TEX_COMPARE_NONE) {
+ for (chan = 0; chan < 4; chan++) {
+ colors0[chan] = lp_build_mul(coord_bld, w00,
+ neighbors[0][0][chan]);
+ tmp = lp_build_mul(coord_bld, w01, neighbors[0][1][chan]);
+ colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
+ tmp = lp_build_mul(coord_bld, w10, neighbors[1][0][chan]);
+ colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
+ tmp = lp_build_mul(coord_bld, w11, neighbors[1][1][chan]);
+ colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
+ }
+ }
+ else {
+ LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
+ cmpval00 = lp_build_sample_comparefunc(bld, coords[4],
+ neighbors[0][0][0]);
+ cmpval01 = lp_build_sample_comparefunc(bld, coords[4],
+ neighbors[0][1][0]);
+ cmpval10 = lp_build_sample_comparefunc(bld, coords[4],
+ neighbors[1][0][0]);
+ cmpval11 = lp_build_sample_comparefunc(bld, coords[4],
+ neighbors[1][1][0]);
+ /*
+ * inputs to interpolation are just masks so just add
+ * masked weights together
+ */
+ cmpval00 = LLVMBuildBitCast(builder, cmpval00,
+ coord_bld->vec_type, "");
+ cmpval01 = LLVMBuildBitCast(builder, cmpval01,
+ coord_bld->vec_type, "");
+ cmpval10 = LLVMBuildBitCast(builder, cmpval10,
+ coord_bld->vec_type, "");
+ cmpval11 = LLVMBuildBitCast(builder, cmpval11,
+ coord_bld->vec_type, "");
+ colors0[0] = lp_build_and(coord_bld, w00, cmpval00);
+ tmp = lp_build_and(coord_bld, w01, cmpval01);
+ colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
+ tmp = lp_build_and(coord_bld, w10, cmpval10);
+ colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
+ tmp = lp_build_and(coord_bld, w11, cmpval11);
+ colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
+ colors0[1] = colors0[2] = colors0[3] = colors0[0];
}
}
else {
- LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
- cmpval00 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
- cmpval01 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
- cmpval10 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][0][0]);
- cmpval11 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][1][0]);
- /* inputs to interpolation are just masks so just add masked weights together */
- cmpval00 = LLVMBuildBitCast(builder, cmpval00, coord_bld->vec_type, "");
- cmpval01 = LLVMBuildBitCast(builder, cmpval01, coord_bld->vec_type, "");
- cmpval10 = LLVMBuildBitCast(builder, cmpval10, coord_bld->vec_type, "");
- cmpval11 = LLVMBuildBitCast(builder, cmpval11, coord_bld->vec_type, "");
- colors0[0] = lp_build_and(coord_bld, w00, cmpval00);
- tmp = lp_build_and(coord_bld, w01, cmpval01);
- colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
- tmp = lp_build_and(coord_bld, w10, cmpval10);
- colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
- tmp = lp_build_and(coord_bld, w11, cmpval11);
- colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
- colors0[1] = colors0[2] = colors0[3] = colors0[0];
+ /*
+ * We don't have any weights to adjust, so instead calculate
+ * the fourth texel as simply the average of the other 3.
+ * (This would work for non-gather too, however we'd have
+ * a boatload more of the select stuff due to there being
+ * 4 times as many colors as weights.)
+ */
+ LLVMValueRef col00, col01, col10, col11;
+ LLVMValueRef colc, colc0, colc1;
+ col10 = lp_build_swizzle_soa_channel(texel_bld,
+ neighbors[1][0], chan_swiz);
+ col11 = lp_build_swizzle_soa_channel(texel_bld,
+ neighbors[1][1], chan_swiz);
+ col01 = lp_build_swizzle_soa_channel(texel_bld,
+ neighbors[0][1], chan_swiz);
+ col00 = lp_build_swizzle_soa_channel(texel_bld,
+ neighbors[0][0], chan_swiz);
+
+ /*
+ * The spec says for comparison filtering, the comparison
+ * must happen before synthesizing the new value.
+ * This means all gathered values are always 0 or 1,
+ * except for the non-existing texel, which can be 0,1/3,2/3,1...
+ * Seems like we'd be allowed to just return 0 or 1 too, so we
+ * could simplify and pass down the compare mask values to the
+ * end (using int arithmetic/compare on the mask values to
+ * construct the fourth texel) and only there convert to floats
+ * but it's probably not worth it (it might be easier for the cpu
+ * but not for the code)...
+ */
+ if (bld->static_sampler_state->compare_mode !=
+ PIPE_TEX_COMPARE_NONE) {
+ LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
+ cmpval00 = lp_build_sample_comparefunc(bld, coords[4], col00);
+ cmpval01 = lp_build_sample_comparefunc(bld, coords[4], col01);
+ cmpval10 = lp_build_sample_comparefunc(bld, coords[4], col10);
+ cmpval11 = lp_build_sample_comparefunc(bld, coords[4], col11);
+ col00 = lp_build_select(texel_bld, cmpval00,
+ texel_bld->one, texel_bld->zero);
+ col01 = lp_build_select(texel_bld, cmpval01,
+ texel_bld->one, texel_bld->zero);
+ col10 = lp_build_select(texel_bld, cmpval10,
+ texel_bld->one, texel_bld->zero);
+ col11 = lp_build_select(texel_bld, cmpval11,
+ texel_bld->one, texel_bld->zero);
+ }
+
+ /*
+ * Null out corner color.
+ */
+ col00 = lp_build_andnot(coord_bld, col00, c00f);
+ col01 = lp_build_andnot(coord_bld, col01, c01f);
+ col10 = lp_build_andnot(coord_bld, col10, c10f);
+ col11 = lp_build_andnot(coord_bld, col11, c11f);
+
+ /*
+ * New corner texel color is all colors added / 3.
+ */
+ colc0 = lp_build_add(coord_bld, col00, col01);
+ colc1 = lp_build_add(coord_bld, col10, col11);
+ colc = lp_build_add(coord_bld, colc0, colc1);
+ colc = lp_build_mul(coord_bld, one_third, colc);
+
+ /*
+ * Replace the corner texel color with the new value.
+ */
+ col00 = lp_build_select(coord_bld, c00, colc, col00);
+ col01 = lp_build_select(coord_bld, c01, colc, col01);
+ col10 = lp_build_select(coord_bld, c10, colc, col10);
+ col11 = lp_build_select(coord_bld, c11, colc, col11);
+
+ colors0[0] = col10;
+ colors0[1] = col11;
+ colors0[2] = col01;
+ colors0[3] = col00;
}
LLVMBuildStore(builder, colors0[0], colorss[0]);
* end of sampling (much less values to swizzle), but this
* obviously cannot work when using gather.
*/
- unsigned chan_swiz = bld->static_texture_state->swizzle_r;
colors0[0] = lp_build_swizzle_soa_channel(texel_bld,
neighbors[1][0],
chan_swiz);
if (is_gather) {
/* more hacks for swizzling, should be X, ONE or ZERO... */
- unsigned chan_swiz = bld->static_texture_state->swizzle_r;
- if (chan_swiz <= PIPE_SWIZZLE_W) {
- colors0[0] = lp_build_select(texel_bld, cmpval10,
- texel_bld->one, texel_bld->zero);
- colors0[1] = lp_build_select(texel_bld, cmpval11,
- texel_bld->one, texel_bld->zero);
- colors0[2] = lp_build_select(texel_bld, cmpval01,
- texel_bld->one, texel_bld->zero);
- colors0[3] = lp_build_select(texel_bld, cmpval00,
- texel_bld->one, texel_bld->zero);
- }
- else if (chan_swiz == PIPE_SWIZZLE_0) {
- colors0[0] = colors0[1] = colors0[2] = colors0[3] =
- texel_bld->zero;
- }
- else {
- colors0[0] = colors0[1] = colors0[2] = colors0[3] =
- texel_bld->one;
- }
+ colors0[0] = lp_build_select(texel_bld, cmpval10,
+ texel_bld->one, texel_bld->zero);
+ colors0[1] = lp_build_select(texel_bld, cmpval11,
+ texel_bld->one, texel_bld->zero);
+ colors0[2] = lp_build_select(texel_bld, cmpval01,
+ texel_bld->one, texel_bld->zero);
+ colors0[3] = lp_build_select(texel_bld, cmpval00,
+ texel_bld->one, texel_bld->zero);
}
else {
colors0[0] = lp_build_masklerp2d(texel_bld, s_fpart, t_fpart,
}
}
}
+ if (is_gather) {
+ /*
+ * For gather, we can't do our usual channel swizzling done later,
+ * so do it here. It only really matters for 0/1 swizzles in case
+ * of comparison filtering, since in this case the results would be
+ * wrong, without comparison it should all work out alright but it
+ * can't hurt to do that here, since it will instantly drop all
+ * calculations above, though it's a rather stupid idea to do
+ * gather on a channel which will always return 0 or 1 in any case...
+ */
+ if (chan_swiz == PIPE_SWIZZLE_1) {
+ for (chan = 0; chan < 4; chan++) {
+ colors_out[chan] = texel_bld->one;
+ }
+ } else if (chan_swiz == PIPE_SWIZZLE_0) {
+ for (chan = 0; chan < 4; chan++) {
+ colors_out[chan] = texel_bld->zero;
+ }
+ }
+ }
}
unsigned img_filter,
unsigned mip_filter,
boolean is_gather,
- LLVMValueRef *coords,
+ const LLVMValueRef *coords,
const LLVMValueRef *offsets,
LLVMValueRef ilevel0,
LLVMValueRef ilevel1,
PIPE_FUNC_GREATER,
lod_fpart, bld->lodf_bld.zero);
need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, need_lerp);
+ lp_build_name(need_lerp, "need_lerp");
}
lp_build_if(&if_ctx, bld->gallivm, need_lerp);
lp_build_sample_mipmap_both(struct lp_build_sample_context *bld,
LLVMValueRef linear_mask,
unsigned mip_filter,
- LLVMValueRef *coords,
+ const LLVMValueRef *coords,
const LLVMValueRef *offsets,
LLVMValueRef ilevel0,
LLVMValueRef ilevel1,
* should be able to merge the branches in this case.
*/
need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, lod_positive);
+ lp_build_name(need_lerp, "need_lerp");
lp_build_if(&if_ctx, bld->gallivm, need_lerp);
{
*/
static void
lp_build_sample_common(struct lp_build_sample_context *bld,
+ boolean is_lodq,
unsigned texture_index,
unsigned sampler_index,
LLVMValueRef *coords,
LLVMValueRef lod_bias, /* optional */
LLVMValueRef explicit_lod, /* optional */
LLVMValueRef *lod_pos_or_zero,
+ LLVMValueRef *lod,
LLVMValueRef *lod_fpart,
LLVMValueRef *ilevel0,
LLVMValueRef *ilevel1)
* Compute the level of detail (float).
*/
if (min_filter != mag_filter ||
- mip_filter != PIPE_TEX_MIPFILTER_NONE) {
+ mip_filter != PIPE_TEX_MIPFILTER_NONE || is_lodq) {
/* Need to compute lod either to choose mipmap levels or to
* distinguish between minification/magnification with one mipmap level.
*/
- lp_build_lod_selector(bld, texture_index, sampler_index,
+ lp_build_lod_selector(bld, is_lodq, texture_index, sampler_index,
coords[0], coords[1], coords[2], cube_rho,
derivs, lod_bias, explicit_lod,
- mip_filter,
+ mip_filter, lod,
&lod_ipart, lod_fpart, lod_pos_or_zero);
+ if (is_lodq) {
+ LLVMValueRef last_level;
+ last_level = bld->dynamic_state->last_level(bld->dynamic_state,
+ bld->gallivm,
+ bld->context_ptr,
+ texture_index);
+ first_level = bld->dynamic_state->first_level(bld->dynamic_state,
+ bld->gallivm,
+ bld->context_ptr,
+ texture_index);
+ last_level = lp_build_sub(&bld->int_bld, last_level, first_level);
+ last_level = lp_build_int_to_float(&bld->float_bld, last_level);
+ last_level = lp_build_broadcast_scalar(&bld->lodf_bld, last_level);
+
+ switch (mip_filter) {
+ case PIPE_TEX_MIPFILTER_NONE:
+ *lod_fpart = bld->lodf_bld.zero;
+ break;
+ case PIPE_TEX_MIPFILTER_NEAREST:
+ *lod_fpart = lp_build_round(&bld->lodf_bld, *lod_fpart);
+ /* fallthrough */
+ case PIPE_TEX_MIPFILTER_LINEAR:
+ *lod_fpart = lp_build_clamp(&bld->lodf_bld, *lod_fpart,
+ bld->lodf_bld.zero, last_level);
+ break;
+ }
+ return;
+ }
+
} else {
lod_ipart = bld->lodi_bld.zero;
*lod_pos_or_zero = bld->lodi_bld.zero;
max_clamp = vec4_bld.one;
}
else if (format_desc->layout == UTIL_FORMAT_LAYOUT_RGTC ||
- format_desc->layout == UTIL_FORMAT_LAYOUT_ETC) {
+ format_desc->layout == UTIL_FORMAT_LAYOUT_ETC ||
+ format_desc->layout == UTIL_FORMAT_LAYOUT_BPTC) {
switch (format_desc->format) {
case PIPE_FORMAT_RGTC1_UNORM:
case PIPE_FORMAT_RGTC2_UNORM:
case PIPE_FORMAT_LATC1_UNORM:
case PIPE_FORMAT_LATC2_UNORM:
case PIPE_FORMAT_ETC1_RGB8:
+ case PIPE_FORMAT_BPTC_RGBA_UNORM:
+ case PIPE_FORMAT_BPTC_SRGBA:
min_clamp = vec4_bld.zero;
max_clamp = vec4_bld.one;
break;
min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
max_clamp = vec4_bld.one;
break;
+ case PIPE_FORMAT_BPTC_RGB_FLOAT:
+ /* not sure if we should clamp to max half float? */
+ break;
+ case PIPE_FORMAT_BPTC_RGB_UFLOAT:
+ min_clamp = vec4_bld.zero;
+ break;
default:
assert(0);
break;
lp_build_sample_general(struct lp_build_sample_context *bld,
unsigned sampler_unit,
boolean is_gather,
- LLVMValueRef *coords,
+ const LLVMValueRef *coords,
const LLVMValueRef *offsets,
LLVMValueRef lod_positive,
LLVMValueRef lod_fpart,
struct lp_build_if_state if_ctx;
lod_positive = LLVMBuildTrunc(builder, lod_positive,
- LLVMInt1TypeInContext(bld->gallivm->context), "");
+ LLVMInt1TypeInContext(bld->gallivm->context),
+ "lod_pos");
lp_build_if(&if_ctx, bld->gallivm, lod_positive);
{
}
need_linear = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods,
linear_mask);
+ lp_build_name(need_linear, "need_linear");
if (bld->num_lods != bld->coord_type.length) {
linear_mask = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
static void
lp_build_fetch_texel(struct lp_build_sample_context *bld,
unsigned texture_unit,
+ LLVMValueRef ms_index,
const LLVMValueRef *coords,
LLVMValueRef explicit_lod,
const LLVMValueRef *offsets,
lp_build_get_mip_offsets(bld, ilevel));
}
+ if (bld->fetch_ms) {
+ LLVMValueRef num_samples;
+ num_samples = bld->dynamic_state->num_samples(bld->dynamic_state, bld->gallivm,
+ bld->context_ptr, texture_unit);
+ out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, ms_index, int_coord_bld->zero);
+ out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
+ out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, ms_index, lp_build_broadcast_scalar(int_coord_bld, num_samples));
+ out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
+ offset = lp_build_add(int_coord_bld, offset,
+ lp_build_mul(int_coord_bld, bld->sample_stride, ms_index));
+ }
+
offset = lp_build_andnot(int_coord_bld, offset, out_of_bounds);
lp_build_fetch_rgba_soa(bld->gallivm,
const LLVMValueRef *offsets,
const struct lp_derivatives *derivs, /* optional */
LLVMValueRef lod, /* optional */
+ LLVMValueRef ms_index, /* optional */
LLVMValueRef texel_out[4])
{
unsigned target = static_texture_state->target;
enum lp_sampler_op_type op_type;
LLVMValueRef lod_bias = NULL;
LLVMValueRef explicit_lod = NULL;
- boolean op_is_tex;
+ boolean op_is_tex, op_is_lodq, op_is_gather, fetch_ms;
if (0) {
enum pipe_format fmt = static_texture_state->format;
LP_SAMPLER_LOD_CONTROL_SHIFT;
op_type = (sample_key & LP_SAMPLER_OP_TYPE_MASK) >>
LP_SAMPLER_OP_TYPE_SHIFT;
+ fetch_ms = !!(sample_key & LP_SAMPLER_FETCH_MS);
op_is_tex = op_type == LP_SAMPLER_OP_TEXTURE;
+ op_is_lodq = op_type == LP_SAMPLER_OP_LODQ;
+ op_is_gather = op_type == LP_SAMPLER_OP_GATHER;
if (lod_control == LP_SAMPLER_LOD_BIAS) {
lod_bias = lod;
bld.format_desc = util_format_description(static_texture_state->format);
bld.dims = dims;
+ if (gallivm_perf & GALLIVM_PERF_NO_QUAD_LOD || op_is_lodq) {
+ bld.no_quad_lod = TRUE;
+ }
+ if (gallivm_perf & GALLIVM_PERF_NO_RHO_APPROX || op_is_lodq) {
+ bld.no_rho_approx = TRUE;
+ }
+ if (gallivm_perf & GALLIVM_PERF_NO_BRILINEAR || op_is_lodq) {
+ bld.no_brilinear = TRUE;
+ }
+
bld.vector_width = lp_type_width(type);
bld.float_type = lp_type_float(32);
else if (util_format_has_stencil(bld.format_desc) &&
!util_format_has_depth(bld.format_desc)) {
/* for stencil only formats, sample stencil (uint) */
- bld.texel_type = lp_type_int_vec(type.width, type.width * type.length);
+ bld.texel_type = lp_type_uint_vec(type.width, type.width * type.length);
}
- if (!static_texture_state->level_zero_only) {
+ if (!static_texture_state->level_zero_only ||
+ !static_sampler_state->max_lod_pos || op_is_lodq) {
derived_sampler_state.min_mip_filter = static_sampler_state->min_mip_filter;
} else {
derived_sampler_state.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
}
- if (op_type == LP_SAMPLER_OP_GATHER) {
+ if (op_is_gather) {
/*
* gather4 is exactly like GL_LINEAR filtering but in the end skipping
* the actual filtering. Using mostly the same paths, so cube face
*/
bld.num_mips = bld.num_lods = 1;
- if ((gallivm_debug & GALLIVM_DEBUG_NO_QUAD_LOD) &&
- (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) &&
- (static_texture_state->target == PIPE_TEXTURE_CUBE ||
- static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
- (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
+ if (bld.no_quad_lod && bld.no_rho_approx &&
+ ((mip_filter != PIPE_TEX_MIPFILTER_NONE && op_is_tex &&
+ (static_texture_state->target == PIPE_TEXTURE_CUBE ||
+ static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY)) ||
+ op_is_lodq)) {
/*
* special case for using per-pixel lod even for implicit lod,
* which is generally never required (ok by APIs) except to please
* can cause derivatives to be different for pixels outside the primitive
* due to the major axis division even if pre-project derivatives are
* looking normal).
+ * For lodq, we do it to simply avoid scalar pack / unpack (albeit for
+ * cube maps we do indeed get per-pixel lod values).
*/
bld.num_mips = type.length;
bld.num_lods = type.length;
bld.num_lods = num_quads;
}
-
+ bld.fetch_ms = fetch_ms;
+ if (op_is_gather)
+ bld.gather_comp = (sample_key & LP_SAMPLER_GATHER_COMP_MASK) >> LP_SAMPLER_GATHER_COMP_SHIFT;
bld.lodf_type = type;
/* we want native vector size to be able to use our intrinsics */
if (bld.num_lods != type.length) {
context_ptr, texture_index);
bld.mip_offsets = dynamic_state->mip_offsets(dynamic_state, gallivm,
context_ptr, texture_index);
+
+ if (fetch_ms)
+ bld.sample_stride = lp_build_broadcast_scalar(&bld.int_coord_bld, dynamic_state->sample_stride(dynamic_state, gallivm,
+ context_ptr, texture_index));
/* Note that mip_offsets is an array[level] of offsets to texture images */
if (dynamic_state->cache_ptr && thread_data_ptr) {
newcoords[i] = coords[i];
}
+ if (util_format_is_pure_integer(static_texture_state->format) &&
+ !util_format_has_depth(bld.format_desc) && op_is_tex &&
+ (static_sampler_state->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR ||
+ static_sampler_state->min_img_filter == PIPE_TEX_FILTER_LINEAR ||
+ static_sampler_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR)) {
+ /*
+ * Bail if impossible filtering is specified (the awkward additional
+ * depth check is because it is legal in gallium to have things like S8Z24
+ * here which would say it's pure int despite such formats should sample
+ * the depth component).
+ * In GL such filters make the texture incomplete, this makes it robust
+ * against gallium frontends which set this up regardless (we'd crash in the
+ * lerp later otherwise).
+ * At least in some apis it may be legal to use such filters with lod
+ * queries and/or gather (at least for gather d3d10 says only the wrap
+ * bits are really used hence filter bits are likely simply ignored).
+ * For fetch, we don't get valid samplers either way here.
+ */
+ unsigned chan;
+ LLVMValueRef zero = lp_build_zero(gallivm, type);
+ for (chan = 0; chan < 4; chan++) {
+ texel_out[chan] = zero;
+ }
+ return;
+ }
+
if (0) {
/* For debug: no-op texture sampling */
lp_build_sample_nop(gallivm,
}
else if (op_type == LP_SAMPLER_OP_FETCH) {
- lp_build_fetch_texel(&bld, texture_index, newcoords,
+ lp_build_fetch_texel(&bld, texture_index, ms_index, newcoords,
lod, offsets,
texel_out);
}
else {
LLVMValueRef lod_fpart = NULL, lod_positive = NULL;
- LLVMValueRef ilevel0 = NULL, ilevel1 = NULL;
+ LLVMValueRef ilevel0 = NULL, ilevel1 = NULL, lod = NULL;
boolean use_aos;
- if (util_format_is_pure_integer(static_texture_state->format) &&
- !util_format_has_depth(bld.format_desc) &&
- (static_sampler_state->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR ||
- static_sampler_state->min_img_filter == PIPE_TEX_FILTER_LINEAR ||
- static_sampler_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR)) {
- /*
- * Bail if impossible filtering is specified (the awkard additional
- * depth check is because it is legal in gallium to have things like S8Z24
- * here which would say it's pure int despite such formats should sample
- * the depth component).
- * In GL such filters make the texture incomplete, this makes it robust
- * against state trackers which set this up regardless (we'd crash in the
- * lerp later (except for gather)).
- * Must do this after fetch_texel code since with GL state tracker we'll
- * get some junk sampler for buffer textures.
- */
- unsigned chan;
- LLVMValueRef zero = lp_build_zero(gallivm, type);
- for (chan = 0; chan < 4; chan++) {
- texel_out[chan] = zero;
- }
- return;
- }
-
use_aos = util_format_fits_8unorm(bld.format_desc) &&
op_is_tex &&
/* not sure this is strictly needed or simply impossible */
use_aos &= bld.num_lods <= num_quads ||
derived_sampler_state.min_img_filter ==
derived_sampler_state.mag_img_filter;
+
+ if(gallivm_perf & GALLIVM_PERF_NO_AOS_SAMPLING) {
+ use_aos = 0;
+ }
+
if (dims > 1) {
use_aos &= lp_is_simple_wrap_mode(derived_sampler_state.wrap_t);
if (dims > 2) {
derived_sampler_state.wrap_r);
}
- lp_build_sample_common(&bld, texture_index, sampler_index,
+ lp_build_sample_common(&bld, op_is_lodq, texture_index, sampler_index,
newcoords,
derivs, lod_bias, explicit_lod,
- &lod_positive, &lod_fpart,
+ &lod_positive, &lod, &lod_fpart,
&ilevel0, &ilevel1);
+ if (op_is_lodq) {
+ texel_out[0] = lod_fpart;
+ texel_out[1] = lod;
+ texel_out[2] = texel_out[3] = bld.coord_bld.zero;
+ return;
+ }
+
if (use_aos && static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
/* The aos path doesn't do seamless filtering so simply add cube layer
* to face now.
/* Setup our build context */
memset(&bld4, 0, sizeof bld4);
+ bld4.no_quad_lod = bld.no_quad_lod;
+ bld4.no_rho_approx = bld.no_rho_approx;
+ bld4.no_brilinear = bld.no_brilinear;
bld4.gallivm = bld.gallivm;
bld4.context_ptr = bld.context_ptr;
bld4.static_texture_state = bld.static_texture_state;
bld4.texel_type.length = 4;
bld4.num_mips = bld4.num_lods = 1;
- if ((gallivm_debug & GALLIVM_DEBUG_NO_QUAD_LOD) &&
- (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) &&
+ if (bld4.no_quad_lod && bld4.no_rho_approx &&
(static_texture_state->target == PIPE_TEXTURE_CUBE ||
static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
(op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
LLVMValueRef coords[5];
LLVMValueRef offsets[3] = { NULL };
LLVMValueRef lod = NULL;
+ LLVMValueRef ms_index = NULL;
LLVMValueRef context_ptr;
LLVMValueRef thread_data_ptr = NULL;
LLVMValueRef texel_out[4];
if (sample_key & LP_SAMPLER_SHADOW) {
coords[4] = LLVMGetParam(function, num_param++);
}
+ if (sample_key & LP_SAMPLER_FETCH_MS) {
+ ms_index = LLVMGetParam(function, num_param++);
+ }
if (sample_key & LP_SAMPLER_OFFSETS) {
for (i = 0; i < num_offsets; i++) {
offsets[i] = LLVMGetParam(function, num_param++);
offsets,
deriv_ptr,
lod,
+ ms_index,
texel_out);
LLVMBuildAggregateRet(gallivm->builder, texel_out, 4);
const struct lp_static_texture_state *static_texture_state,
const struct lp_static_sampler_state *static_sampler_state,
struct lp_sampler_dynamic_state *dynamic_state,
- const struct lp_sampler_params *params)
+ const struct lp_sampler_params *params,
+ int texture_index, int sampler_index,
+ LLVMValueRef *tex_ret)
{
LLVMBuilderRef builder = gallivm->builder;
LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(
LLVMValueRef function, inst;
LLVMValueRef args[LP_MAX_TEX_FUNC_ARGS];
LLVMBasicBlockRef bb;
- LLVMValueRef tex_ret;
unsigned num_args = 0;
char func_name[64];
unsigned i, num_coords, num_derivs, num_offsets, layer;
- unsigned texture_index = params->texture_index;
- unsigned sampler_index = params->sampler_index;
unsigned sample_key = params->sample_key;
const LLVMValueRef *coords = params->coords;
const LLVMValueRef *offsets = params->offsets;
const struct util_format_description *format_desc;
format_desc = util_format_description(static_texture_state->format);
if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
- /*
- * This is not 100% correct, if we have cache but the
- * util_format_s3tc_prefer is true the cache won't get used
- * regardless (could hook up the block decode there...) */
need_cache = TRUE;
}
}
* Additionally lod_property has to be included too.
*/
- util_snprintf(func_name, sizeof(func_name), "texfunc_res_%d_sam_%d_%x",
- texture_index, sampler_index, sample_key);
+ snprintf(func_name, sizeof(func_name), "texfunc_res_%d_sam_%d_%x",
+ texture_index, sampler_index, sample_key);
function = LLVMGetNamedFunction(module, func_name);
if (sample_key & LP_SAMPLER_SHADOW) {
arg_types[num_param++] = LLVMTypeOf(coords[0]);
}
+ if (sample_key & LP_SAMPLER_FETCH_MS) {
+ arg_types[num_param++] = LLVMTypeOf(params->ms_index);
+ }
if (sample_key & LP_SAMPLER_OFFSETS) {
for (i = 0; i < num_offsets; i++) {
arg_types[num_param++] = LLVMTypeOf(offsets[0]);
if (sample_key & LP_SAMPLER_SHADOW) {
args[num_args++] = coords[4];
}
+ if (sample_key & LP_SAMPLER_FETCH_MS) {
+ args[num_args++] = params->ms_index;
+ }
if (sample_key & LP_SAMPLER_OFFSETS) {
for (i = 0; i < num_offsets; i++) {
args[num_args++] = offsets[i];
assert(num_args <= LP_MAX_TEX_FUNC_ARGS);
- tex_ret = LLVMBuildCall(builder, function, args, num_args, "");
+ *tex_ret = LLVMBuildCall(builder, function, args, num_args, "");
bb = LLVMGetInsertBlock(builder);
inst = LLVMGetLastInstruction(bb);
LLVMSetInstructionCallConv(inst, LLVMFastCallConv);
- for (i = 0; i < 4; i++) {
- params->texel[i] = LLVMBuildExtractValue(gallivm->builder, tex_ret, i, "");
- }
}
}
if (use_tex_func) {
+ LLVMValueRef tex_ret;
lp_build_sample_soa_func(gallivm,
static_texture_state,
static_sampler_state,
dynamic_state,
- params);
+ params, params->texture_index, params->sampler_index, &tex_ret);
+
+ for (unsigned i = 0; i < 4; i++) {
+ params->texel[i] = LLVMBuildExtractValue(gallivm->builder, tex_ret, i, "");
+ }
}
else {
lp_build_sample_soa_code(gallivm,
params->offsets,
params->derivs,
params->lod,
+ params->ms_index,
params->texel);
}
}
lp_build_context_init(&bld_int_vec4, gallivm, lp_type_int_vec(32, 128));
+ if (params->samples_only) {
+ params->sizes_out[0] = lp_build_broadcast(gallivm, lp_build_vec_type(gallivm, params->int_type),
+ dynamic_state->num_samples(dynamic_state, gallivm,
+ context_ptr, texture_unit));
+ return;
+ }
if (params->explicit_lod) {
/* FIXME: this needs to honor per-element lod */
lod = LLVMBuildExtractElement(gallivm->builder, params->explicit_lod,
num_levels);
}
}
+
+static void
+lp_build_do_atomic_soa(struct gallivm_state *gallivm,
+ const struct util_format_description *format_desc,
+ struct lp_type type,
+ LLVMValueRef exec_mask,
+ LLVMValueRef base_ptr,
+ LLVMValueRef offset,
+ LLVMValueRef out_of_bounds,
+ unsigned img_op,
+ LLVMAtomicRMWBinOp op,
+ const LLVMValueRef rgba_in[4],
+ const LLVMValueRef rgba2_in[4],
+ LLVMValueRef atomic_result[4])
+{
+ enum pipe_format format = format_desc->format;
+
+ if (format != PIPE_FORMAT_R32_UINT && format != PIPE_FORMAT_R32_SINT && format != PIPE_FORMAT_R32_FLOAT) {
+ atomic_result[0] = lp_build_zero(gallivm, type);
+ return;
+ }
+
+ LLVMValueRef atom_res = lp_build_alloca(gallivm,
+ LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), type.length), "");
+
+ offset = LLVMBuildGEP(gallivm->builder, base_ptr, &offset, 1, "");
+ struct lp_build_loop_state loop_state;
+ lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0));
+ struct lp_build_if_state ifthen;
+ LLVMValueRef cond;
+ LLVMValueRef packed = rgba_in[0], packed2 = rgba2_in[0];
+
+ LLVMValueRef should_store_mask = LLVMBuildAnd(gallivm->builder, exec_mask, LLVMBuildNot(gallivm->builder, out_of_bounds, ""), "store_mask");
+ assert(exec_mask);
+
+ cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, should_store_mask, lp_build_const_int_vec(gallivm, type, 0), "");
+ cond = LLVMBuildExtractElement(gallivm->builder, cond, loop_state.counter, "");
+ lp_build_if(&ifthen, gallivm, cond);
+
+ LLVMValueRef data = LLVMBuildExtractElement(gallivm->builder, packed, loop_state.counter, "");
+ LLVMValueRef cast_base_ptr = LLVMBuildExtractElement(gallivm->builder, offset, loop_state.counter, "");
+ cast_base_ptr = LLVMBuildBitCast(gallivm->builder, cast_base_ptr, LLVMPointerType(LLVMInt32TypeInContext(gallivm->context), 0), "");
+ data = LLVMBuildBitCast(gallivm->builder, data, LLVMInt32TypeInContext(gallivm->context), "");
+
+ if (img_op == LP_IMG_ATOMIC_CAS) {
+ LLVMValueRef cas_src_ptr = LLVMBuildExtractElement(gallivm->builder, packed2, loop_state.counter, "");
+ LLVMValueRef cas_src = LLVMBuildBitCast(gallivm->builder, cas_src_ptr, LLVMInt32TypeInContext(gallivm->context), "");
+ data = LLVMBuildAtomicCmpXchg(gallivm->builder, cast_base_ptr, data,
+ cas_src,
+ LLVMAtomicOrderingSequentiallyConsistent,
+ LLVMAtomicOrderingSequentiallyConsistent,
+ false);
+ data = LLVMBuildExtractValue(gallivm->builder, data, 0, "");
+ } else {
+ data = LLVMBuildAtomicRMW(gallivm->builder, op,
+ cast_base_ptr, data,
+ LLVMAtomicOrderingSequentiallyConsistent,
+ false);
+ }
+
+ LLVMValueRef temp_res = LLVMBuildLoad(gallivm->builder, atom_res, "");
+ temp_res = LLVMBuildInsertElement(gallivm->builder, temp_res, data, loop_state.counter, "");
+ LLVMBuildStore(gallivm->builder, temp_res, atom_res);
+
+ lp_build_endif(&ifthen);
+ lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, type.length),
+ NULL, LLVMIntUGE);
+ atomic_result[0] = LLVMBuildLoad(gallivm->builder, atom_res, "");
+}
+
+/**
+ * Generate code for a shader image operation (load, store or atomic) on
+ * an SoA vector of texel coordinates.
+ *
+ * Per-lane bounds checking is accumulated into a mask: out-of-bounds
+ * lanes (including an out-of-range sample index for multisampled images)
+ * read back zero on loads and are masked off for stores/atomics.
+ * An unbound image (PIPE_FORMAT_NONE) returns all zero for loads, a
+ * zero result for atomics, and is a no-op for stores.
+ */
+void
+lp_build_img_op_soa(const struct lp_static_texture_state *static_texture_state,
+ struct lp_sampler_dynamic_state *dynamic_state,
+ struct gallivm_state *gallivm,
+ const struct lp_img_params *params)
+{
+ unsigned target = params->target;
+ unsigned dims = texture_dims(target);
+ /** regular scalar int type */
+ struct lp_type int_type, int_coord_type;
+ struct lp_build_context int_bld, int_coord_bld;
+ const struct util_format_description *format_desc = util_format_description(static_texture_state->format);
+ LLVMValueRef x = params->coords[0], y = params->coords[1], z = params->coords[2];
+ LLVMValueRef ms_index = params->ms_index;
+ LLVMValueRef row_stride_vec = NULL, img_stride_vec = NULL;
+ int_type = lp_type_int(32);
+ int_coord_type = lp_int_type(params->type);
+ lp_build_context_init(&int_bld, gallivm, int_type);
+ lp_build_context_init(&int_coord_bld, gallivm, int_coord_type);
+
+ LLVMValueRef offset, i, j;
+
+ /* Fetch per-image dynamic state (dimensions, strides, base pointer). */
+ LLVMValueRef row_stride = dynamic_state->row_stride(dynamic_state, gallivm,
+ params->context_ptr, params->image_index);
+ LLVMValueRef img_stride = dynamic_state->img_stride(dynamic_state, gallivm,
+ params->context_ptr, params->image_index);
+ LLVMValueRef base_ptr = dynamic_state->base_ptr(dynamic_state, gallivm,
+ params->context_ptr, params->image_index);
+ LLVMValueRef width = dynamic_state->width(dynamic_state, gallivm,
+ params->context_ptr, params->image_index);
+ LLVMValueRef height = dynamic_state->height(dynamic_state, gallivm,
+ params->context_ptr, params->image_index);
+ LLVMValueRef depth = dynamic_state->depth(dynamic_state, gallivm,
+ params->context_ptr, params->image_index);
+ LLVMValueRef num_samples = NULL, sample_stride = NULL;
+ /* Multisampled access: also need the sample count and stride. */
+ if (ms_index) {
+ num_samples = dynamic_state->num_samples(dynamic_state, gallivm,
+ params->context_ptr, params->image_index);
+ sample_stride = dynamic_state->sample_stride(dynamic_state, gallivm,
+ params->context_ptr, params->image_index);
+ }
+
+ boolean layer_coord = has_layer_coord(target);
+
+ /* Broadcast scalar state to coord-vector width as dimensionality needs. */
+ width = lp_build_broadcast_scalar(&int_coord_bld, width);
+ if (dims >= 2) {
+ height = lp_build_broadcast_scalar(&int_coord_bld, height);
+ row_stride_vec = lp_build_broadcast_scalar(&int_coord_bld, row_stride);
+ }
+ if (dims >= 3 || layer_coord) {
+ depth = lp_build_broadcast_scalar(&int_coord_bld, depth);
+ img_stride_vec = lp_build_broadcast_scalar(&int_coord_bld, img_stride);
+ }
+
+ /*
+ * Accumulate per-lane out-of-bounds mask over each used coordinate.
+ * NOTE(review): only upper bounds are checked here — presumably the
+ * coords are unsigned (or negative values wrap past the GEQUAL
+ * compare); verify against callers.
+ */
+ LLVMValueRef out_of_bounds = int_coord_bld.zero;
+ LLVMValueRef out1;
+ out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
+ out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1);
+
+ if (dims >= 2) {
+ out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
+ out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1);
+ }
+ if (dims >= 3) {
+ out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
+ out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1);
+ }
+ /* Compute the per-lane byte offset (and block coords i,j) of the texel. */
+ lp_build_sample_offset(&int_coord_bld,
+ format_desc,
+ x, y, z, row_stride_vec, img_stride_vec,
+ &offset, &i, &j);
+
+ if (ms_index) {
+ /* Sample index out of range also counts as out of bounds. */
+ out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, ms_index, lp_build_broadcast_scalar(&int_coord_bld, num_samples));
+ out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1);
+
+ offset = lp_build_add(&int_coord_bld, offset,
+ lp_build_mul(&int_coord_bld, lp_build_broadcast_scalar(&int_coord_bld, sample_stride),
+ ms_index));
+ }
+ if (params->img_op == LP_IMG_LOAD) {
+ /*
+ * Pure-integer color formats fetch as int/uint vectors of matching
+ * width; everything else keeps the caller's type.
+ */
+ struct lp_type texel_type = params->type;
+ if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
+ format_desc->channel[0].pure_integer) {
+ if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) {
+ texel_type = lp_type_int_vec(params->type.width, params->type.width * params->type.length);
+ } else if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) {
+ texel_type = lp_type_uint_vec(params->type.width, params->type.width * params->type.length);
+ }
+ }
+
+ if (static_texture_state->format == PIPE_FORMAT_NONE) {
+ /*
+ * If there's nothing bound, format is NONE, and we must return
+ * all zero as mandated by d3d10 in this case.
+ */
+ unsigned chan;
+ LLVMValueRef zero = lp_build_zero(gallivm, params->type);
+ for (chan = 0; chan < 4; chan++) {
+ params->outdata[chan] = zero;
+ }
+ return;
+ }
+
+ /* Clamp OOB lanes' offsets to 0 so the fetch stays in the buffer. */
+ offset = lp_build_andnot(&int_coord_bld, offset, out_of_bounds);
+ struct lp_build_context texel_bld;
+ lp_build_context_init(&texel_bld, gallivm, texel_type);
+ lp_build_fetch_rgba_soa(gallivm,
+ format_desc,
+ texel_type, TRUE,
+ base_ptr, offset,
+ i, j,
+ NULL,
+ params->outdata);
+
+ /* Replace whatever was fetched for OOB lanes with zero. */
+ for (unsigned chan = 0; chan < 4; chan++) {
+ params->outdata[chan] = lp_build_select(&texel_bld, out_of_bounds,
+ texel_bld.zero, params->outdata[chan]);
+ }
+ } else if (params->img_op == LP_IMG_STORE) {
+ /* Stores to an unbound image are silently dropped. */
+ if (static_texture_state->format == PIPE_FORMAT_NONE)
+ return;
+ lp_build_store_rgba_soa(gallivm, format_desc, params->type, params->exec_mask, base_ptr, offset, out_of_bounds,
+ params->indata);
+ } else {
+ if (static_texture_state->format == PIPE_FORMAT_NONE) {
+ /*
+ * For atomic operation just return 0 in the unbound case to avoid a crash.
+ */
+ LLVMValueRef zero = lp_build_zero(gallivm, params->type);
+ params->outdata[0] = zero;
+ return;
+ }
+ /* Atomic ops: masking of inactive/OOB lanes happens inside. */
+ lp_build_do_atomic_soa(gallivm, format_desc, params->type, params->exec_mask, base_ptr, offset, out_of_bounds,
+ params->img_op, params->op, params->indata, params->indata2, params->outdata);
+ }
+}