gallivm: nr_channels is only valid for formats with plain layout.
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_sample_soa.c
index d1592c22f4399baea1d9ed53f4f3407cddd66d9d..baf0402f56a4be3faecd8a33727a4c84fe991aec 100644 (file)
@@ -30,6 +30,7 @@
  * Texture sampling -- SoA.
  *
  * @author Jose Fonseca <jfonseca@vmware.com>
+ * @author Brian Paul <brianp@vmware.com>
  */
 
 #include "pipe/p_defines.h"
@@ -39,7 +40,6 @@
 #include "util/u_memory.h"
 #include "util/u_math.h"
 #include "util/u_format.h"
-#include "util/u_cpu_detect.h"
 #include "lp_bld_debug.h"
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
 #include "lp_bld_swizzle.h"
 #include "lp_bld_pack.h"
 #include "lp_bld_flow.h"
+#include "lp_bld_gather.h"
 #include "lp_bld_format.h"
 #include "lp_bld_sample.h"
+#include "lp_bld_quad.h"
 
 
 /**
@@ -174,6 +176,7 @@ texture_dims(enum pipe_texture_target tex)
    case PIPE_TEXTURE_1D:
       return 1;
    case PIPE_TEXTURE_2D:
+   case PIPE_TEXTURE_RECT:
    case PIPE_TEXTURE_CUBE:
       return 2;
    case PIPE_TEXTURE_3D:
@@ -185,50 +188,18 @@ texture_dims(enum pipe_texture_target tex)
 }
 
 
-static LLVMValueRef
-lp_build_swizzle_chan_soa(struct lp_type type,
-                          const LLVMValueRef *unswizzled,
-                          enum util_format_swizzle swizzle)
-{
-   switch (swizzle) {
-   case PIPE_SWIZZLE_RED:
-   case PIPE_SWIZZLE_GREEN:
-   case PIPE_SWIZZLE_BLUE:
-   case PIPE_SWIZZLE_ALPHA:
-      return unswizzled[swizzle];
-   case PIPE_SWIZZLE_ZERO:
-      return lp_build_zero(type);
-   case PIPE_SWIZZLE_ONE:
-      return lp_build_one(type);
-   default:
-      assert(0);
-      return lp_build_undef(type);
-   }
-}
-
-
 static void
-lp_build_swizzle_soa(struct lp_build_sample_context *bld,
-                     LLVMValueRef *texel)
+apply_sampler_swizzle(struct lp_build_sample_context *bld,
+                      LLVMValueRef *texel)
 {
-   LLVMValueRef unswizzled[4];
    unsigned char swizzles[4];
-   unsigned chan;
-
-   for (chan = 0; chan < 4; ++chan) {
-      unswizzled[chan] = texel[chan];
-   }
 
    swizzles[0] = bld->static_state->swizzle_r;
    swizzles[1] = bld->static_state->swizzle_g;
    swizzles[2] = bld->static_state->swizzle_b;
    swizzles[3] = bld->static_state->swizzle_a;
 
-   for (chan = 0; chan < 4; ++chan) {
-      unsigned swizzle = swizzles[chan];
-      texel[chan] = lp_build_swizzle_chan_soa(bld->texel_type,
-                                              unswizzled, swizzle);
-   }
+   lp_build_swizzle_soa_inplace(&bld->texel_bld, texel, swizzles);
 }
 
 
@@ -253,7 +224,7 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
                           LLVMValueRef y_stride,
                           LLVMValueRef z_stride,
                           LLVMValueRef data_ptr,
-                          LLVMValueRef *texel)
+                          LLVMValueRef texel_out[4])
 {
    const int dims = texture_dims(bld->static_state->target);
    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
@@ -295,35 +266,11 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
       }
    }
 
-   /*
-    * Describe the coordinates in terms of pixel blocks.
-    *
-    * TODO: pixel blocks are power of two. LLVM should convert rem/div to
-    * bit arithmetic. Verify this.
-    */
-
-   if (bld->format_desc->block.width == 1) {
-      i = bld->uint_coord_bld.zero;
-   }
-   else {
-      LLVMValueRef block_width = lp_build_const_int_vec(bld->uint_coord_bld.type, bld->format_desc->block.width);
-      i = LLVMBuildURem(bld->builder, x, block_width, "");
-      x = LLVMBuildUDiv(bld->builder, x, block_width, "");
-   }
-
-   if (bld->format_desc->block.height == 1) {
-      j = bld->uint_coord_bld.zero;
-   }
-   else {
-      LLVMValueRef block_height = lp_build_const_int_vec(bld->uint_coord_bld.type, bld->format_desc->block.height);
-      j = LLVMBuildURem(bld->builder, y, block_height, "");
-      y = LLVMBuildUDiv(bld->builder, y, block_height, "");
-   }
-
    /* convert x,y,z coords to linear offset from start of texture, in bytes */
-   offset = lp_build_sample_offset(&bld->uint_coord_bld,
-                                   bld->format_desc,
-                                   x, y, z, y_stride, z_stride);
+   lp_build_sample_offset(&bld->uint_coord_bld,
+                          bld->format_desc,
+                          x, y, z, y_stride, z_stride,
+                          &offset, &i, &j);
 
    if (use_border) {
       /* If we can sample the border color, it means that texcoords may
@@ -342,9 +289,9 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
                            bld->texel_type,
                            data_ptr, offset,
                            i, j,
-                           texel);
+                           texel_out);
 
-   lp_build_swizzle_soa(bld, texel);
+   apply_sampler_swizzle(bld, texel_out);
 
    /*
     * Note: if we find an app which frequently samples the texture border
@@ -368,42 +315,13 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
          LLVMValueRef border_chan =
             lp_build_const_vec(bld->texel_type,
                                   bld->static_state->border_color[chan]);
-         texel[chan] = lp_build_select(&bld->texel_bld, use_border,
-                                       border_chan, texel[chan]);
+         texel_out[chan] = lp_build_select(&bld->texel_bld, use_border,
+                                           border_chan, texel_out[chan]);
       }
    }
 }
 
 
-static LLVMValueRef
-lp_build_sample_packed(struct lp_build_sample_context *bld,
-                       LLVMValueRef x,
-                       LLVMValueRef y,
-                       LLVMValueRef y_stride,
-                       LLVMValueRef data_array)
-{
-   LLVMValueRef offset;
-   LLVMValueRef data_ptr;
-
-   offset = lp_build_sample_offset(&bld->uint_coord_bld,
-                                   bld->format_desc,
-                                   x, y, NULL, y_stride, NULL);
-
-   assert(bld->format_desc->block.width == 1);
-   assert(bld->format_desc->block.height == 1);
-   assert(bld->format_desc->block.bits <= bld->texel_type.width);
-
-   /* get pointer to mipmap level 0 data */
-   data_ptr = lp_build_get_const_mipmap_level(bld, data_array, 0);
-
-   return lp_build_gather(bld->builder,
-                          bld->texel_type.length,
-                          bld->format_desc->block.bits,
-                          bld->texel_type.width,
-                          data_ptr, offset);
-}
-
-
 /**
  * Helper to compute the mirror function for the PIPE_WRAP_MIRROR modes.
  */
@@ -438,7 +356,7 @@ lp_build_coord_mirror(struct lp_build_sample_context *bld,
 
 
 /**
- * We only support a few wrap modes in lp_build_sample_wrap_int() at this time.
+ * We only support a few wrap modes in lp_build_sample_wrap_linear_int() at this time.
  * Return whether the given mode is supported by that function.
  */
 static boolean
@@ -446,10 +364,8 @@ is_simple_wrap_mode(unsigned mode)
 {
    switch (mode) {
    case PIPE_TEX_WRAP_REPEAT:
-   case PIPE_TEX_WRAP_CLAMP:
    case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
       return TRUE;
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
    default:
       return FALSE;
    }
@@ -462,13 +378,18 @@ is_simple_wrap_mode(unsigned mode)
  * \param length  the texture size along one dimension
  * \param is_pot  if TRUE, length is a power of two
  * \param wrap_mode  one of PIPE_TEX_WRAP_x
+ * \param i0  resulting sub-block pixel coordinate for coord0
  */
-static LLVMValueRef
-lp_build_sample_wrap_int(struct lp_build_sample_context *bld,
-                         LLVMValueRef coord,
-                         LLVMValueRef length,
-                         boolean is_pot,
-                         unsigned wrap_mode)
+static void
+lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld,
+                                 unsigned block_length,
+                                 LLVMValueRef coord,
+                                 LLVMValueRef length,
+                                 LLVMValueRef stride,
+                                 boolean is_pot,
+                                 unsigned wrap_mode,
+                                 LLVMValueRef *out_offset,
+                                 LLVMValueRef *out_i)
 {
    struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
@@ -486,29 +407,149 @@ lp_build_sample_wrap_int(struct lp_build_sample_context *bld,
          coord = LLVMBuildURem(bld->builder, coord, length, "");
       break;
 
-   case PIPE_TEX_WRAP_CLAMP:
    case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
       coord = lp_build_max(int_coord_bld, coord, int_coord_bld->zero);
       coord = lp_build_min(int_coord_bld, coord, length_minus_one);
       break;
 
+   case PIPE_TEX_WRAP_CLAMP:
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
    case PIPE_TEX_WRAP_MIRROR_REPEAT:
    case PIPE_TEX_WRAP_MIRROR_CLAMP:
    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
-      /* FIXME */
-      _debug_printf("llvmpipe: failed to translate texture wrap mode %s\n",
-                    util_dump_tex_wrap(wrap_mode, TRUE));
-      coord = lp_build_max(uint_coord_bld, coord, uint_coord_bld->zero);
-      coord = lp_build_min(uint_coord_bld, coord, length_minus_one);
+   default:
+      assert(0);
+   }
+
+   lp_build_sample_partial_offset(uint_coord_bld, block_length, coord, stride,
+                                  out_offset, out_i);
+}
+
+
+/**
+ * Build LLVM code for texture wrap mode, for scaled integer texcoords.
+ * \param coord0  the incoming texcoord (s,t,r or q) scaled to the texture size
+ * \param length  the texture size along one dimension
+ * \param stride  pixel stride along the coordinate axis
+ * \param block_length  is the length of the pixel block along the
+ *                      coordinate axis
+ * \param is_pot  if TRUE, length is a power of two
+ * \param wrap_mode  one of PIPE_TEX_WRAP_x
+ * \param offset0  resulting relative offset for coord0
+ * \param offset1  resulting relative offset for coord0 + 1
+ * \param i0  resulting sub-block pixel coordinate for coord0
+ * \param i1  resulting sub-block pixel coordinate for coord0 + 1
+ */
+static void
+lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
+                                unsigned block_length,
+                                LLVMValueRef coord0,
+                                LLVMValueRef length,
+                                LLVMValueRef stride,
+                                boolean is_pot,
+                                unsigned wrap_mode,
+                                LLVMValueRef *offset0,
+                                LLVMValueRef *offset1,
+                                LLVMValueRef *i0,
+                                LLVMValueRef *i1)
+{
+   struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
+   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
+   LLVMValueRef length_minus_one;
+   LLVMValueRef lmask, umask, mask;
+
+   if (block_length != 1) {
+      /*
+       * If the pixel block covers more than one pixel then there is no easy
+       * way to calculate offset1 relative to offset0. Instead, compute them
+       * independently.
+       */
+
+      LLVMValueRef coord1;
+
+      lp_build_sample_wrap_nearest_int(bld,
+                                       block_length,
+                                       coord0,
+                                       length,
+                                       stride,
+                                       is_pot,
+                                       wrap_mode,
+                                       offset0, i0);
+
+      coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+
+      lp_build_sample_wrap_nearest_int(bld,
+                                       block_length,
+                                       coord1,
+                                       length,
+                                       stride,
+                                       is_pot,
+                                       wrap_mode,
+                                       offset1, i1);
+
+      return;
+   }
+
+   /*
+    * Scalar pixels -- try to compute offset0 and offset1 with a single stride
+    * multiplication.
+    */
+
+   *i0 = uint_coord_bld->zero;
+   *i1 = uint_coord_bld->zero;
+
+   length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
+
+   switch(wrap_mode) {
+   case PIPE_TEX_WRAP_REPEAT:
+      if (is_pot) {
+         coord0 = LLVMBuildAnd(bld->builder, coord0, length_minus_one, "");
+      }
+      else {
+         /* Signed remainder won't give the right results for negative
+          * dividends but unsigned remainder does.*/
+         coord0 = LLVMBuildURem(bld->builder, coord0, length, "");
+      }
+
+      mask = lp_build_compare(bld->builder, int_coord_bld->type,
+                              PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
+
+      *offset0 = lp_build_mul(uint_coord_bld, coord0, stride);
+      *offset1 = LLVMBuildAnd(bld->builder,
+                              lp_build_add(uint_coord_bld, *offset0, stride),
+                              mask, "");
+      break;
+
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+      lmask = lp_build_compare(int_coord_bld->builder, int_coord_bld->type,
+                               PIPE_FUNC_GEQUAL, coord0, int_coord_bld->zero);
+      umask = lp_build_compare(int_coord_bld->builder, int_coord_bld->type,
+                               PIPE_FUNC_LESS, coord0, length_minus_one);
+
+      coord0 = lp_build_select(int_coord_bld, lmask, coord0, int_coord_bld->zero);
+      coord0 = lp_build_select(int_coord_bld, umask, coord0, length_minus_one);
+
+      mask = LLVMBuildAnd(bld->builder, lmask, umask, "");
+
+      *offset0 = lp_build_mul(uint_coord_bld, coord0, stride);
+      *offset1 = lp_build_add(uint_coord_bld,
+                              *offset0,
+                              LLVMBuildAnd(bld->builder, stride, mask, ""));
       break;
 
+   case PIPE_TEX_WRAP_CLAMP:
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+   case PIPE_TEX_WRAP_MIRROR_REPEAT:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
    default:
       assert(0);
+      *offset0 = uint_coord_bld->zero;
+      *offset1 = uint_coord_bld->zero;
+      break;
    }
-
-   return coord;
 }
 
 
@@ -531,11 +572,9 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
    struct lp_build_context *coord_bld = &bld->coord_bld;
    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
    struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
-   LLVMValueRef two = lp_build_const_vec(coord_bld->type, 2.0);
    LLVMValueRef half = lp_build_const_vec(coord_bld->type, 0.5);
    LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length);
    LLVMValueRef length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one);
-   LLVMValueRef length_f_minus_one = lp_build_sub(coord_bld, length_f, coord_bld->one);
    LLVMValueRef coord0, coord1, weight;
 
    switch(wrap_mode) {
@@ -563,16 +602,18 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
 
    case PIPE_TEX_WRAP_CLAMP:
       if (bld->static_state->normalized_coords) {
+         /* scale coord to length */
          coord = lp_build_mul(coord_bld, coord, length_f);
       }
+
+      /* clamp to [0, length] */
+      coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, length_f);
+
+      coord = lp_build_sub(coord_bld, coord, half);
+
       weight = lp_build_fract(coord_bld, coord);
-      coord0 = lp_build_clamp(coord_bld, coord, coord_bld->zero,
-                              length_f_minus_one);
-      coord1 = lp_build_add(coord_bld, coord, coord_bld->one);
-      coord1 = lp_build_clamp(coord_bld, coord1, coord_bld->zero,
-                              length_f_minus_one);
-      coord0 = lp_build_ifloor(coord_bld, coord0);
-      coord1 = lp_build_ifloor(coord_bld, coord1);
+      coord0 = lp_build_ifloor(coord_bld, coord);
+      coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
       break;
 
    case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
@@ -586,7 +627,7 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
       else {
          LLVMValueRef min, max;
          /* clamp to [0.5, length - 0.5] */
-         min = lp_build_const_vec(coord_bld->type, 0.5F);
+         min = half;
          max = lp_build_sub(coord_bld, length_f, min);
          coord = lp_build_clamp(coord_bld, coord, min, max);
       }
@@ -605,25 +646,14 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
       {
          LLVMValueRef min, max;
          if (bld->static_state->normalized_coords) {
-            /* min = -1.0 / (2 * length) = -0.5 / length */
-            min = lp_build_mul(coord_bld,
-                               lp_build_const_vec(coord_bld->type, -0.5F),
-                               lp_build_rcp(coord_bld, length_f));
-            /* max = 1.0 - min */
-            max = lp_build_sub(coord_bld, coord_bld->one, min);
-            /* coord = clamp(coord, min, max) */
-            coord = lp_build_clamp(coord_bld, coord, min, max);
-            /* scale coord to length (and sub 0.5?) */
+            /* scale coord to length */
             coord = lp_build_mul(coord_bld, coord, length_f);
-            coord = lp_build_sub(coord_bld, coord, half);
-         }
-         else {
-            /* clamp to [-0.5, length + 0.5] */
-            min = lp_build_const_vec(coord_bld->type, -0.5F);
-            max = lp_build_sub(coord_bld, length_f, min);
-            coord = lp_build_clamp(coord_bld, coord, min, max);
-            coord = lp_build_sub(coord_bld, coord, half);
          }
+         /* clamp to [-0.5, length + 0.5] */
+         min = lp_build_const_vec(coord_bld->type, -0.5F);
+         max = lp_build_sub(coord_bld, length_f, min);
+         coord = lp_build_clamp(coord_bld, coord, min, max);
+         coord = lp_build_sub(coord_bld, coord, half);
          /* compute lerp weight */
          weight = lp_build_fract(coord_bld, coord);
          /* convert to int */
@@ -654,35 +684,41 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
       break;
 
    case PIPE_TEX_WRAP_MIRROR_CLAMP:
-      {
-         LLVMValueRef min, max;
-         /* min = 1.0 / (2 * length) */
-         min = lp_build_rcp(coord_bld, lp_build_mul(coord_bld, two, length_f));
-         /* max = 1.0 - min */
-         max = lp_build_sub(coord_bld, coord_bld->one, min);
+      coord = lp_build_abs(coord_bld, coord);
 
-         coord = lp_build_abs(coord_bld, coord);
-         coord = lp_build_clamp(coord_bld, coord, min, max);
+      if (bld->static_state->normalized_coords) {
+         /* scale coord to length */
          coord = lp_build_mul(coord_bld, coord, length_f);
-         if(0)coord = lp_build_sub(coord_bld, coord, half);
-         weight = lp_build_fract(coord_bld, coord);
-         coord0 = lp_build_ifloor(coord_bld, coord);
-         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
       }
+
+      /* clamp to [0, length] */
+      coord = lp_build_min(coord_bld, coord, length_f);
+
+      coord = lp_build_sub(coord_bld, coord, half);
+
+      weight = lp_build_fract(coord_bld, coord);
+      coord0 = lp_build_ifloor(coord_bld, coord);
+      coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
       break;
 
    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
       {
          LLVMValueRef min, max;
-         /* min = 1.0 / (2 * length) */
-         min = lp_build_rcp(coord_bld, lp_build_mul(coord_bld, two, length_f));
-         /* max = 1.0 - min */
-         max = lp_build_sub(coord_bld, coord_bld->one, min);
 
          coord = lp_build_abs(coord_bld, coord);
+
+         if (bld->static_state->normalized_coords) {
+            /* scale coord to length */
+            coord = lp_build_mul(coord_bld, coord, length_f);
+         }
+
+         /* clamp to [0.5, length - 0.5] */
+         min = half;
+         max = lp_build_sub(coord_bld, length_f, min);
          coord = lp_build_clamp(coord_bld, coord, min, max);
-         coord = lp_build_mul(coord_bld, coord, length_f);
+
          coord = lp_build_sub(coord_bld, coord, half);
+
          weight = lp_build_fract(coord_bld, coord);
          coord0 = lp_build_ifloor(coord_bld, coord);
          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
@@ -692,17 +728,21 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
       {
          LLVMValueRef min, max;
-         /* min = -1.0 / (2 * length) = -0.5 / length */
-         min = lp_build_mul(coord_bld,
-                            lp_build_const_vec(coord_bld->type, -0.5F),
-                            lp_build_rcp(coord_bld, length_f));
-         /* max = 1.0 - min */
-         max = lp_build_sub(coord_bld, coord_bld->one, min);
 
          coord = lp_build_abs(coord_bld, coord);
+
+         if (bld->static_state->normalized_coords) {
+            /* scale coord to length */
+            coord = lp_build_mul(coord_bld, coord, length_f);
+         }
+
+         /* clamp to [-0.5, length + 0.5] */
+         min = lp_build_negate(coord_bld, half);
+         max = lp_build_sub(coord_bld, length_f, min);
          coord = lp_build_clamp(coord_bld, coord, min, max);
-         coord = lp_build_mul(coord_bld, coord, length_f);
+
          coord = lp_build_sub(coord_bld, coord, half);
+
          weight = lp_build_fract(coord_bld, coord);
          coord0 = lp_build_ifloor(coord_bld, coord);
          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
@@ -739,10 +779,8 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
    struct lp_build_context *coord_bld = &bld->coord_bld;
    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
    struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
-   LLVMValueRef two = lp_build_const_vec(coord_bld->type, 2.0);
    LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length);
    LLVMValueRef length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one);
-   LLVMValueRef length_f_minus_one = lp_build_sub(coord_bld, length_f, coord_bld->one);
    LLVMValueRef icoord;
    
    switch(wrap_mode) {
@@ -758,120 +796,80 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
       break;
 
    case PIPE_TEX_WRAP_CLAMP:
-      /* mul by size */
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
       if (bld->static_state->normalized_coords) {
+         /* scale coord to length */
          coord = lp_build_mul(coord_bld, coord, length_f);
       }
+
       /* floor */
       icoord = lp_build_ifloor(coord_bld, coord);
-      /* clamp to [0, size-1].  Note: int coord builder type */
+
+      /* clamp to [0, length - 1]. */
       icoord = lp_build_clamp(int_coord_bld, icoord, int_coord_bld->zero,
                               length_minus_one);
       break;
 
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-      {
-         LLVMValueRef min, max;
-         if (bld->static_state->normalized_coords) {
-            /* min = 1.0 / (2 * length) */
-            min = lp_build_rcp(coord_bld, lp_build_mul(coord_bld, two, length_f));
-            /* max = length - min */
-            max = lp_build_sub(coord_bld, length_f, min);
-            /* scale coord to length */
-            coord = lp_build_mul(coord_bld, coord, length_f);
-         }
-         else {
-            /* clamp to [0.5, length - 0.5] */
-            min = lp_build_const_vec(coord_bld->type, 0.5F);
-            max = lp_build_sub(coord_bld, length_f, min);
-         }
-         /* coord = clamp(coord, min, max) */
-         coord = lp_build_clamp(coord_bld, coord, min, max);
-         icoord = lp_build_ifloor(coord_bld, coord);
-      }
-      break;
-
    case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
       /* Note: this is the same as CLAMP_TO_EDGE, except min = -min */
       {
          LLVMValueRef min, max;
+
          if (bld->static_state->normalized_coords) {
-            /* min = -1.0 / (2 * length) = -0.5 / length */
-            min = lp_build_mul(coord_bld,
-                               lp_build_const_vec(coord_bld->type, -0.5F),
-                               lp_build_rcp(coord_bld, length_f));
-            /* max = length - min */
-            max = lp_build_sub(coord_bld, length_f, min);
             /* scale coord to length */
             coord = lp_build_mul(coord_bld, coord, length_f);
          }
-         else {
-            /* clamp to [-0.5, length + 0.5] */
-            min = lp_build_const_vec(coord_bld->type, -0.5F);
-            max = lp_build_sub(coord_bld, length_f, min);
-         }
-         /* coord = clamp(coord, min, max) */
-         coord = lp_build_clamp(coord_bld, coord, min, max);
+
          icoord = lp_build_ifloor(coord_bld, coord);
+
+         /* clamp to [-1, length] */
+         min = lp_build_negate(int_coord_bld, int_coord_bld->one);
+         max = length;
+         icoord = lp_build_clamp(int_coord_bld, icoord, min, max);
       }
       break;
 
    case PIPE_TEX_WRAP_MIRROR_REPEAT:
-      {
-         LLVMValueRef min, max;
-         /* min = 1.0 / (2 * length) */
-         min = lp_build_rcp(coord_bld, lp_build_mul(coord_bld, two, length_f));
-         /* max = length - min */
-         max = lp_build_sub(coord_bld, length_f, min);
+      /* compute mirror function */
+      coord = lp_build_coord_mirror(bld, coord);
 
-         /* compute mirror function */
-         coord = lp_build_coord_mirror(bld, coord);
+      /* scale coord to length */
+      assert(bld->static_state->normalized_coords);
+      coord = lp_build_mul(coord_bld, coord, length_f);
 
-         /* scale coord to length */
-         coord = lp_build_mul(coord_bld, coord, length_f);
+      icoord = lp_build_ifloor(coord_bld, coord);
 
-         /* coord = clamp(coord, min, max) */
-         coord = lp_build_clamp(coord_bld, coord, min, max);
-         icoord = lp_build_ifloor(coord_bld, coord);
-      }
+      /* clamp to [0, length - 1] */
+      icoord = lp_build_min(int_coord_bld, icoord, length_minus_one);
       break;
 
    case PIPE_TEX_WRAP_MIRROR_CLAMP:
-      coord = lp_build_abs(coord_bld, coord);
-      coord = lp_build_mul(coord_bld, coord, length_f);
-      coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, length_f_minus_one);
-      icoord = lp_build_ifloor(coord_bld, coord);
-      break;
-
    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
-      {
-         LLVMValueRef min, max;
-         /* min = 1.0 / (2 * length) */
-         min = lp_build_rcp(coord_bld, lp_build_mul(coord_bld, two, length_f));
-         /* max = length - min */
-         max = lp_build_sub(coord_bld, length_f, min);
+      coord = lp_build_abs(coord_bld, coord);
 
-         coord = lp_build_abs(coord_bld, coord);
+      if (bld->static_state->normalized_coords) {
+         /* scale coord to length */
          coord = lp_build_mul(coord_bld, coord, length_f);
-         coord = lp_build_clamp(coord_bld, coord, min, max);
-         icoord = lp_build_ifloor(coord_bld, coord);
       }
+
+      icoord = lp_build_ifloor(coord_bld, coord);
+
+      /* clamp to [0, length - 1] */
+      icoord = lp_build_min(int_coord_bld, icoord, length_minus_one);
       break;
 
    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
-      {
-         LLVMValueRef min, max;
-         /* min = 1.0 / (2 * length) */
-         min = lp_build_rcp(coord_bld, lp_build_mul(coord_bld, two, length_f));
-         min = lp_build_negate(coord_bld, min);
-         /* max = length - min */
-         max = lp_build_sub(coord_bld, length_f, min);
+      coord = lp_build_abs(coord_bld, coord);
 
-         coord = lp_build_abs(coord_bld, coord);
+      if (bld->static_state->normalized_coords) {
+         /* scale coord to length */
          coord = lp_build_mul(coord_bld, coord, length_f);
-         coord = lp_build_clamp(coord_bld, coord, min, max);
-         icoord = lp_build_ifloor(coord_bld, coord);
       }
+
+      icoord = lp_build_ifloor(coord_bld, coord);
+
+      /* clamp to [0, length] */
+      icoord = lp_build_min(int_coord_bld, icoord, length);
       break;
 
    default:
@@ -892,7 +890,7 @@ lp_build_minify(struct lp_build_sample_context *bld,
                 LLVMValueRef base_size,
                 LLVMValueRef level)
 {
-   LLVMValueRef size = LLVMBuildAShr(bld->builder, base_size, level, "minify");
+   LLVMValueRef size = LLVMBuildLShr(bld->builder, base_size, level, "minify");
    size = lp_build_max(&bld->int_coord_bld, size, bld->int_coord_bld.one);
    return size;
 }
@@ -900,22 +898,23 @@ lp_build_minify(struct lp_build_sample_context *bld,
 
 /**
  * Generate code to compute texture level of detail (lambda).
- * \param s  vector of texcoord s values
- * \param t  vector of texcoord t values
- * \param r  vector of texcoord r values
- * \param shader_lod_bias  vector float with the shader lod bias,
+ * \param ddx  partial derivatives of (s, t, r, q) with respect to X
+ * \param ddy  partial derivatives of (s, t, r, q) with respect to Y
+ * \param lod_bias  optional float vector with the shader lod bias
+ * \param explicit_lod  optional float vector with the explicit lod
  * \param width  scalar int texture width
  * \param height  scalar int texture height
  * \param depth  scalar int texture depth
+ *
+ * XXX: The resulting lod is scalar, so ignore all but the first element of
+ * derivatives, lod_bias, etc that are passed by the shader.
  */
 static LLVMValueRef
 lp_build_lod_selector(struct lp_build_sample_context *bld,
-                      LLVMValueRef s,
-                      LLVMValueRef t,
-                      LLVMValueRef r,
-                      const LLVMValueRef *ddx,
-                      const LLVMValueRef *ddy,
-                      LLVMValueRef shader_lod_bias,
+                      const LLVMValueRef ddx[4],
+                      const LLVMValueRef ddy[4],
+                      LLVMValueRef lod_bias, /* optional */
+                      LLVMValueRef explicit_lod, /* optional */
                       LLVMValueRef width,
                       LLVMValueRef height,
                       LLVMValueRef depth)
@@ -928,7 +927,6 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
       return LLVMConstReal(LLVMFloatType(), bld->static_state->min_lod);
    }
    else {
-      const int dims = texture_dims(bld->static_state->target);
       struct lp_build_context *float_bld = &bld->float_bld;
       LLVMValueRef sampler_lod_bias = LLVMConstReal(LLVMFloatType(),
                                                     bld->static_state->lod_bias);
@@ -936,67 +934,69 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
                                            bld->static_state->min_lod);
       LLVMValueRef max_lod = LLVMConstReal(LLVMFloatType(),
                                            bld->static_state->max_lod);
-
       LLVMValueRef index0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
-      LLVMValueRef dsdx, dsdy, dtdx, dtdy, drdx, drdy;
-      LLVMValueRef rho, lod;
+      LLVMValueRef lod;
 
-      /*
-       * dsdx = abs(s[1] - s[0]);
-       * dsdy = abs(s[2] - s[0]);
-       * dtdx = abs(t[1] - t[0]);
-       * dtdy = abs(t[2] - t[0]);
-       * drdx = abs(r[1] - r[0]);
-       * drdy = abs(r[2] - r[0]);
-       */
-      dsdx = LLVMBuildExtractElement(bld->builder, ddx[0], index0, "dsdx");
-      dsdx = lp_build_abs(float_bld, dsdx);
-      dsdy = LLVMBuildExtractElement(bld->builder, ddy[0], index0, "dsdy");
-      dsdy = lp_build_abs(float_bld, dsdy);
-      if (dims > 1) {
-         dtdx = LLVMBuildExtractElement(bld->builder, ddx[1], index0, "dtdx");
-         dtdx = lp_build_abs(float_bld, dtdx);
-         dtdy = LLVMBuildExtractElement(bld->builder, ddy[1], index0, "dtdy");
-         dtdy = lp_build_abs(float_bld, dtdy);
-         if (dims > 2) {
-            drdx = LLVMBuildExtractElement(bld->builder, ddx[2], index0, "drdx");
-            drdx = lp_build_abs(float_bld, drdx);
-            drdy = LLVMBuildExtractElement(bld->builder, ddy[2], index0, "drdy");
-            drdy = lp_build_abs(float_bld, drdy);
-         }
+      if (explicit_lod) {
+         lod = LLVMBuildExtractElement(bld->builder, explicit_lod,
+                                       index0, "");
       }
+      else {
+         const int dims = texture_dims(bld->static_state->target);
+         LLVMValueRef dsdx, dsdy;
+         LLVMValueRef dtdx = NULL, dtdy = NULL, drdx = NULL, drdy = NULL;
+         LLVMValueRef rho;
+
+         dsdx = LLVMBuildExtractElement(bld->builder, ddx[0], index0, "dsdx");
+         dsdx = lp_build_abs(float_bld, dsdx);
+         dsdy = LLVMBuildExtractElement(bld->builder, ddy[0], index0, "dsdy");
+         dsdy = lp_build_abs(float_bld, dsdy);
+         if (dims > 1) {
+            dtdx = LLVMBuildExtractElement(bld->builder, ddx[1], index0, "dtdx");
+            dtdx = lp_build_abs(float_bld, dtdx);
+            dtdy = LLVMBuildExtractElement(bld->builder, ddy[1], index0, "dtdy");
+            dtdy = lp_build_abs(float_bld, dtdy);
+            if (dims > 2) {
+               drdx = LLVMBuildExtractElement(bld->builder, ddx[2], index0, "drdx");
+               drdx = lp_build_abs(float_bld, drdx);
+               drdy = LLVMBuildExtractElement(bld->builder, ddy[2], index0, "drdy");
+               drdy = lp_build_abs(float_bld, drdy);
+            }
+         }
 
-      /* Compute rho = max of all partial derivatives scaled by texture size.
-       * XXX this could be vectorized somewhat
-       */
-      rho = LLVMBuildMul(bld->builder,
-                         lp_build_max(float_bld, dsdx, dsdy),
-                         lp_build_int_to_float(float_bld, width), "");
-      if (dims > 1) {
-         LLVMValueRef max;
-         max = LLVMBuildMul(bld->builder,
-                            lp_build_max(float_bld, dtdx, dtdy),
-                            lp_build_int_to_float(float_bld, height), "");
-         rho = lp_build_max(float_bld, rho, max);
-         if (dims > 2) {
-            max = LLVMBuildMul(bld->builder,
-                               lp_build_max(float_bld, drdx, drdy),
-                               lp_build_int_to_float(float_bld, depth), "");
+         /* Compute rho = max of all partial derivatives scaled by texture size.
+          * XXX this could be vectorized somewhat
+          */
+         rho = LLVMBuildFMul(bld->builder,
+                            lp_build_max(float_bld, dsdx, dsdy),
+                            lp_build_int_to_float(float_bld, width), "");
+         if (dims > 1) {
+            LLVMValueRef max;
+            max = LLVMBuildFMul(bld->builder,
+                               lp_build_max(float_bld, dtdx, dtdy),
+                               lp_build_int_to_float(float_bld, height), "");
             rho = lp_build_max(float_bld, rho, max);
+            if (dims > 2) {
+               max = LLVMBuildFMul(bld->builder,
+                                  lp_build_max(float_bld, drdx, drdy),
+                                  lp_build_int_to_float(float_bld, depth), "");
+               rho = lp_build_max(float_bld, rho, max);
+            }
          }
-      }
 
-      /* compute lod = log2(rho) */
-      lod = lp_build_log2(float_bld, rho);
+         /* compute lod = log2(rho) */
+         lod = lp_build_log2(float_bld, rho);
 
-      /* add sampler lod bias */
-      lod = LLVMBuildAdd(bld->builder, lod, sampler_lod_bias, "sampler LOD bias");
+         /* add shader lod bias */
+         if (lod_bias) {
+            lod_bias = LLVMBuildExtractElement(bld->builder, lod_bias,
+                                               index0, "");
+            lod = LLVMBuildFAdd(bld->builder, lod, lod_bias, "shader_lod_bias");
+         }
+      }
 
-      /* add shader lod bias */
-      /* XXX for now we take only the first element since our lod is scalar */
-      shader_lod_bias = LLVMBuildExtractElement(bld->builder, shader_lod_bias,
-                                                LLVMConstInt(LLVMInt32Type(), 0, 0), "");
-      lod = LLVMBuildAdd(bld->builder, lod, shader_lod_bias, "shader LOD bias");
+      /* add sampler lod bias */
+      lod = LLVMBuildFAdd(bld->builder, lod, sampler_lod_bias, "sampler_lod_bias");
 
       /* clamp lod */
       lod = lp_build_clamp(float_bld, lod, min_lod, max_lod);
@@ -1064,8 +1064,10 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
                                 int_bld->zero,
                                 last_level);
    /* compute level 1 and clamp to legal range of levels */
-   *level1_out = lp_build_add(int_bld, *level0_out, int_bld->one);
-   *level1_out = lp_build_min(int_bld, *level1_out, last_level);
+   level = lp_build_add(int_bld, level, int_bld->one);
+   *level1_out = lp_build_clamp(int_bld, level,
+                                int_bld->zero,
+                                last_level);
 
    *weight_out = lp_build_fract(float_bld, lod);
 }
@@ -1296,8 +1298,7 @@ lp_build_cube_ima(struct lp_build_context *coord_bld, LLVMValueRef coord)
    /* ima = -0.5 / abs(coord); */
    LLVMValueRef negHalf = lp_build_const_vec(coord_bld->type, -0.5);
    LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
-   LLVMValueRef ima = lp_build_mul(coord_bld, negHalf,
-                                   lp_build_rcp(coord_bld, absCoord));
+   LLVMValueRef ima = lp_build_div(coord_bld, negHalf, absCoord);
    return ima;
 }
 
@@ -1354,7 +1355,7 @@ lp_build_cube_face(struct lp_build_sample_context *bld,
 
 
 /**
- * Generate code to do cube face selection and per-face texcoords.
+ * Generate code to do cube face selection and compute per-face texcoords.
  */
 static void
 lp_build_cube_lookup(struct lp_build_sample_context *bld,
@@ -1478,7 +1479,6 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
          lp_build_endif(&if_ctx2);
          lp_build_flow_scope_end(flow_ctx2);
          lp_build_flow_destroy(flow_ctx2);
-
          *face_s = face_s2;
          *face_t = face_t2;
          *face = face2;
@@ -1524,13 +1524,14 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
    int chan;
 
    if (img_filter == PIPE_TEX_FILTER_NEAREST) {
+      /* sample the first mipmap level */
       lp_build_sample_image_nearest(bld,
                                     width0_vec, height0_vec, depth0_vec,
                                     row_stride0_vec, img_stride0_vec,
                                     data_ptr0, s, t, r, colors0);
 
       if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
-         /* sample the second mipmap level, and interp */
+         /* sample the second mipmap level */
          lp_build_sample_image_nearest(bld,
                                        width1_vec, height1_vec, depth1_vec,
                                        row_stride1_vec, img_stride1_vec,
@@ -1540,13 +1541,14 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
    else {
       assert(img_filter == PIPE_TEX_FILTER_LINEAR);
 
+      /* sample the first mipmap level */
       lp_build_sample_image_linear(bld,
                                    width0_vec, height0_vec, depth0_vec,
                                    row_stride0_vec, img_stride0_vec,
                                    data_ptr0, s, t, r, colors0);
 
       if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
-         /* sample the second mipmap level, and interp */
+         /* sample the second mipmap level */
          lp_build_sample_image_linear(bld,
                                       width1_vec, height1_vec, depth1_vec,
                                       row_stride1_vec, img_stride1_vec,
@@ -1584,7 +1586,8 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
                         LLVMValueRef r,
                         const LLVMValueRef *ddx,
                         const LLVMValueRef *ddy,
-                        LLVMValueRef lodbias,
+                        LLVMValueRef lod_bias, /* optional */
+                        LLVMValueRef explicit_lod, /* optional */
                         LLVMValueRef width,
                         LLVMValueRef height,
                         LLVMValueRef depth,
@@ -1608,12 +1611,37 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
    LLVMValueRef row_stride0_vec = NULL, row_stride1_vec = NULL;
    LLVMValueRef img_stride0_vec = NULL, img_stride1_vec = NULL;
    LLVMValueRef data_ptr0, data_ptr1 = NULL;
+   LLVMValueRef face_ddx[4], face_ddy[4];
 
    /*
    printf("%s mip %d  min %d  mag %d\n", __FUNCTION__,
           mip_filter, min_filter, mag_filter);
    */
 
+   /*
+    * Choose cube face, recompute texcoords and derivatives for the chosen face.
+    */
+   if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
+      LLVMValueRef face, face_s, face_t;
+      lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t);
+      s = face_s; /* vec */
+      t = face_t; /* vec */
+      /* use 'r' to indicate cube face */
+      r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */
+
+      /* recompute ddx, ddy using the new (s,t) face texcoords */
+      face_ddx[0] = lp_build_ddx(&bld->coord_bld, s);
+      face_ddx[1] = lp_build_ddx(&bld->coord_bld, t);
+      face_ddx[2] = NULL;
+      face_ddx[3] = NULL;
+      face_ddy[0] = lp_build_ddy(&bld->coord_bld, s);
+      face_ddy[1] = lp_build_ddy(&bld->coord_bld, t);
+      face_ddy[2] = NULL;
+      face_ddy[3] = NULL;
+      ddx = face_ddx;
+      ddy = face_ddy;
+   }
+
    /*
     * Compute the level of detail (float).
     */
@@ -1622,7 +1650,8 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
       /* Need to compute lod either to choose mipmap levels or to
        * distinguish between minification/magnification with one mipmap level.
        */
-      lod = lp_build_lod_selector(bld, s, t, r, ddx, ddy, lodbias,
+      lod = lp_build_lod_selector(bld, ddx, ddy,
+                                  lod_bias, explicit_lod,
                                   width, height, depth);
    }
 
@@ -1631,9 +1660,20 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
     */
    if (mip_filter == PIPE_TEX_MIPFILTER_NONE) {
       /* always use mip level 0 */
-      ilevel0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
+      if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
+         /* XXX this is a work-around for an apparent bug in LLVM 2.7.
+          * We should be able to set ilevel0 = const(0) but that causes
+          * bad x86 code to be emitted.
+          */
+         lod = lp_build_const_elem(bld->coord_bld.type, 0.0);
+         lp_build_nearest_mip_level(bld, unit, lod, &ilevel0);
+      }
+      else {
+         ilevel0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
+      }
    }
    else {
+      assert(lod);
       if (mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
          lp_build_nearest_mip_level(bld, unit, lod, &ilevel0);
       }
@@ -1687,18 +1727,6 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
       }
    }
 
-   /*
-    * Choose cube face, recompute per-face texcoords.
-    */
-   if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
-      LLVMValueRef face, face_s, face_t;
-      lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t);
-      s = face_s; /* vec */
-      t = face_t; /* vec */
-      /* use 'r' to indicate cube face */
-      r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */
-   }
-
    /*
     * Get pointer(s) to image data for mipmap level(s).
     */
@@ -1776,36 +1804,6 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
 
 
 
-static void
-lp_build_rgba8_to_f32_soa(LLVMBuilderRef builder,
-                          struct lp_type dst_type,
-                          LLVMValueRef packed,
-                          LLVMValueRef *rgba)
-{
-   LLVMValueRef mask = lp_build_const_int_vec(dst_type, 0xff);
-   unsigned chan;
-
-   /* Decode the input vector components */
-   for (chan = 0; chan < 4; ++chan) {
-      unsigned start = chan*8;
-      unsigned stop = start + 8;
-      LLVMValueRef input;
-
-      input = packed;
-
-      if(start)
-         input = LLVMBuildLShr(builder, input, lp_build_const_int_vec(dst_type, start), "");
-
-      if(stop < 32)
-         input = LLVMBuildAnd(builder, input, mask, "");
-
-      input = lp_build_unsigned_norm_to_float(builder, 8, dst_type, input);
-
-      rgba[chan] = input;
-   }
-}
-
-
 static void
 lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
                               LLVMValueRef s,
@@ -1814,7 +1812,7 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
                               LLVMValueRef height,
                               LLVMValueRef stride_array,
                               LLVMValueRef data_array,
-                              LLVMValueRef *texel)
+                              LLVMValueRef texel_out[4])
 {
    LLVMBuilderRef builder = bld->builder;
    struct lp_build_context i32, h16, u8n;
@@ -1822,16 +1820,21 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
    LLVMValueRef i32_c8, i32_c128, i32_c255;
    LLVMValueRef s_ipart, s_fpart, s_fpart_lo, s_fpart_hi;
    LLVMValueRef t_ipart, t_fpart, t_fpart_lo, t_fpart_hi;
-   LLVMValueRef x0, x1;
-   LLVMValueRef y0, y1;
-   LLVMValueRef neighbors[2][2];
+   LLVMValueRef data_ptr;
+   LLVMValueRef x_stride, y_stride;
+   LLVMValueRef x_offset0, x_offset1;
+   LLVMValueRef y_offset0, y_offset1;
+   LLVMValueRef offset[2][2];
+   LLVMValueRef x_subcoord[2], y_subcoord[2];
    LLVMValueRef neighbors_lo[2][2];
    LLVMValueRef neighbors_hi[2][2];
    LLVMValueRef packed, packed_lo, packed_hi;
    LLVMValueRef unswizzled[4];
-   LLVMValueRef stride;
+   const unsigned level = 0;
+   unsigned i, j;
 
-   assert(bld->static_state->target == PIPE_TEXTURE_2D);
+   assert(bld->static_state->target == PIPE_TEXTURE_2D
+         || bld->static_state->target == PIPE_TEXTURE_RECT);
    assert(bld->static_state->min_img_filter == PIPE_TEX_FILTER_LINEAR);
    assert(bld->static_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR);
    assert(bld->static_state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE);
@@ -1875,21 +1878,30 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
    s_fpart = LLVMBuildAnd(builder, s, i32_c255, "");
    t_fpart = LLVMBuildAnd(builder, t, i32_c255, "");
 
-   x0 = s_ipart;
-   y0 = t_ipart;
-
-   x1 = lp_build_add(&bld->int_coord_bld, x0, bld->int_coord_bld.one);
-   y1 = lp_build_add(&bld->int_coord_bld, y0, bld->int_coord_bld.one);
-
-   x0 = lp_build_sample_wrap_int(bld, x0, width,  bld->static_state->pot_width,
-                                 bld->static_state->wrap_s);
-   y0 = lp_build_sample_wrap_int(bld, y0, height, bld->static_state->pot_height,
-                                 bld->static_state->wrap_t);
-
-   x1 = lp_build_sample_wrap_int(bld, x1, width,  bld->static_state->pot_width,
-                                 bld->static_state->wrap_s);
-   y1 = lp_build_sample_wrap_int(bld, y1, height, bld->static_state->pot_height,
-                                 bld->static_state->wrap_t);
+   x_stride = lp_build_const_vec(bld->uint_coord_bld.type,
+                                 bld->format_desc->block.bits/8);
+
+   y_stride = lp_build_get_const_level_stride_vec(bld, stride_array, level);
+
+   lp_build_sample_wrap_linear_int(bld,
+                                   bld->format_desc->block.width,
+                                   s_ipart, width, x_stride,
+                                   bld->static_state->pot_width,
+                                   bld->static_state->wrap_s,
+                                   &x_offset0, &x_offset1,
+                                   &x_subcoord[0], &x_subcoord[1]);
+   lp_build_sample_wrap_linear_int(bld,
+                                   bld->format_desc->block.height,
+                                   t_ipart, height, y_stride,
+                                   bld->static_state->pot_height,
+                                   bld->static_state->wrap_t,
+                                   &y_offset0, &y_offset1,
+                                   &y_subcoord[0], &y_subcoord[1]);
+
+   offset[0][0] = lp_build_add(&bld->uint_coord_bld, x_offset0, y_offset0);
+   offset[0][1] = lp_build_add(&bld->uint_coord_bld, x_offset1, y_offset0);
+   offset[1][0] = lp_build_add(&bld->uint_coord_bld, x_offset0, y_offset1);
+   offset[1][1] = lp_build_add(&bld->uint_coord_bld, x_offset1, y_offset1);
 
    /*
     * Transform 4 x i32 in
@@ -1918,10 +1930,13 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
       LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH];
       LLVMValueRef shuffle_lo;
       LLVMValueRef shuffle_hi;
-      unsigned i, j;
 
       for(j = 0; j < h16.type.length; j += 4) {
-         unsigned subindex = util_cpu_caps.little_endian ? 0 : 1;
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+         unsigned subindex = 0;
+#else
+         unsigned subindex = 1;
+#endif
          LLVMValueRef index;
 
          index = LLVMConstInt(elem_type, j/2 + subindex, 0);
@@ -1942,7 +1957,10 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
       t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, shuffle_hi, "");
    }
 
-   stride = lp_build_get_const_level_stride_vec(bld, stride_array, 0);
+   /*
+    * get pointer to mipmap level 0 data
+    */
+   data_ptr = lp_build_get_const_mipmap_level(bld, data_array, level);
 
    /*
     * Fetch the pixels as 4 x 32bit (rgba order might differ):
@@ -1961,20 +1979,38 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
     * The higher 8 bits of the resulting elements will be zero.
     */
 
-   neighbors[0][0] = lp_build_sample_packed(bld, x0, y0, stride, data_array);
-   neighbors[0][1] = lp_build_sample_packed(bld, x1, y0, stride, data_array);
-   neighbors[1][0] = lp_build_sample_packed(bld, x0, y1, stride, data_array);
-   neighbors[1][1] = lp_build_sample_packed(bld, x1, y1, stride, data_array);
+   for (j = 0; j < 2; ++j) {
+      for (i = 0; i < 2; ++i) {
+         LLVMValueRef rgba8;
 
-   neighbors[0][0] = LLVMBuildBitCast(builder, neighbors[0][0], u8n_vec_type, "");
-   neighbors[0][1] = LLVMBuildBitCast(builder, neighbors[0][1], u8n_vec_type, "");
-   neighbors[1][0] = LLVMBuildBitCast(builder, neighbors[1][0], u8n_vec_type, "");
-   neighbors[1][1] = LLVMBuildBitCast(builder, neighbors[1][1], u8n_vec_type, "");
+         if (util_format_is_rgba8_variant(bld->format_desc)) {
+            /*
+             * Given the format is a rgba8, just read the pixels as is,
+             * without any swizzling. Swizzling will be done later.
+             */
+            rgba8 = lp_build_gather(bld->builder,
+                                    bld->texel_type.length,
+                                    bld->format_desc->block.bits,
+                                    bld->texel_type.width,
+                                    data_ptr, offset[j][i]);
 
-   lp_build_unpack2(builder, u8n.type, h16.type, neighbors[0][0], &neighbors_lo[0][0], &neighbors_hi[0][0]);
-   lp_build_unpack2(builder, u8n.type, h16.type, neighbors[0][1], &neighbors_lo[0][1], &neighbors_hi[0][1]);
-   lp_build_unpack2(builder, u8n.type, h16.type, neighbors[1][0], &neighbors_lo[1][0], &neighbors_hi[1][0]);
-   lp_build_unpack2(builder, u8n.type, h16.type, neighbors[1][1], &neighbors_lo[1][1], &neighbors_hi[1][1]);
+            rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
+
+         }
+         else {
+            rgba8 = lp_build_fetch_rgba_aos(bld->builder,
+                                            bld->format_desc,
+                                            u8n.type,
+                                            data_ptr, offset[j][i],
+                                            x_subcoord[i],
+                                            y_subcoord[j]);
+         }
+
+         lp_build_unpack2(builder, u8n.type, h16.type,
+                          rgba8,
+                          &neighbors_lo[j][i], &neighbors_hi[j][i]);
+      }
+   }
 
    /*
     * Linear interpolate with 8.8 fixed point.
@@ -2000,24 +2036,29 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
     * Convert to SoA and swizzle.
     */
 
-   packed = LLVMBuildBitCast(builder, packed, i32_vec_type, "");
-
    lp_build_rgba8_to_f32_soa(bld->builder,
                              bld->texel_type,
                              packed, unswizzled);
 
-   lp_build_format_swizzle_soa(bld->format_desc,
-                               bld->texel_type, unswizzled,
-                               texel);
+   if (util_format_is_rgba8_variant(bld->format_desc)) {
+      lp_build_format_swizzle_soa(bld->format_desc,
+                                  &bld->texel_bld,
+                                  unswizzled, texel_out);
+   } else {
+      texel_out[0] = unswizzled[0];
+      texel_out[1] = unswizzled[1];
+      texel_out[2] = unswizzled[2];
+      texel_out[3] = unswizzled[3];
+   }
 
-   lp_build_swizzle_soa(bld, texel);
+   apply_sampler_swizzle(bld, texel_out);
 }
 
 
 static void
 lp_build_sample_compare(struct lp_build_sample_context *bld,
                         LLVMValueRef p,
-                        LLVMValueRef *texel)
+                        LLVMValueRef texel[4])
 {
    struct lp_build_context *texel_bld = &bld->texel_bld;
    LLVMValueRef res;
@@ -2055,14 +2096,14 @@ lp_build_sample_compare(struct lp_build_sample_context *bld,
  */
 static void
 lp_build_sample_nop(struct lp_build_sample_context *bld,
-                    LLVMValueRef *texel)
+                    LLVMValueRef texel_out[4])
 {
    struct lp_build_context *texel_bld = &bld->texel_bld;
    unsigned chan;
 
    for (chan = 0; chan < 4; chan++) {
       /*lp_bld_mov(texel_bld, texel, texel_bld->one);*/
-      texel[chan] = texel_bld->one;
+      texel_out[chan] = texel_bld->one;
    }  
 }
 
@@ -2072,6 +2113,8 @@ lp_build_sample_nop(struct lp_build_sample_context *bld,
  * 'texel' will return a vector of four LLVMValueRefs corresponding to
  * R, G, B, A.
  * \param type  vector float type to use for coords, etc.
+ * \param ddx  partial derivatives of (s,t,r,q) with respect to x
+ * \param ddy  partial derivatives of (s,t,r,q) with respect to y
  */
 void
 lp_build_sample_soa(LLVMBuilderRef builder,
@@ -2081,10 +2124,11 @@ lp_build_sample_soa(LLVMBuilderRef builder,
                     unsigned unit,
                     unsigned num_coords,
                     const LLVMValueRef *coords,
-                    const LLVMValueRef *ddx,
-                    const LLVMValueRef *ddy,
-                    LLVMValueRef lodbias,
-                    LLVMValueRef *texel)
+                    const LLVMValueRef ddx[4],
+                    const LLVMValueRef ddy[4],
+                    LLVMValueRef lod_bias, /* optional */
+                    LLVMValueRef explicit_lod, /* optional */
+                    LLVMValueRef texel_out[4])
 {
    struct lp_build_sample_context bld;
    LLVMValueRef width, width_vec;
@@ -2096,6 +2140,13 @@ lp_build_sample_soa(LLVMBuilderRef builder,
    LLVMValueRef t;
    LLVMValueRef r;
 
+   if (0) {
+      enum pipe_format fmt = static_state->format;
+      debug_printf("Sample from %s\n", util_format_name(fmt));
+   }
+
+   assert(type.floating);
+
    /* Setup our build context */
    memset(&bld, 0, sizeof bld);
    bld.builder = builder;
@@ -2136,10 +2187,11 @@ lp_build_sample_soa(LLVMBuilderRef builder,
 
    if (0) {
       /* For debug: no-op texture sampling */
-      lp_build_sample_nop(&bld, texel);
+      lp_build_sample_nop(&bld, texel_out);
    }
-   else if (util_format_is_rgba8_variant(bld.format_desc) &&
-            static_state->target == PIPE_TEXTURE_2D &&
+   else if (util_format_fits_8unorm(bld.format_desc) &&
+            (static_state->target == PIPE_TEXTURE_2D ||
+             static_state->target == PIPE_TEXTURE_RECT) &&
             static_state->min_img_filter == PIPE_TEX_FILTER_LINEAR &&
             static_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR &&
             static_state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE &&
@@ -2147,16 +2199,26 @@ lp_build_sample_soa(LLVMBuilderRef builder,
             is_simple_wrap_mode(static_state->wrap_t)) {
       /* special case */
       lp_build_sample_2d_linear_aos(&bld, s, t, width_vec, height_vec,
-                                    row_stride_array, data_array, texel);
+                                    row_stride_array, data_array, texel_out);
    }
    else {
-      lp_build_sample_general(&bld, unit, s, t, r, ddx, ddy, lodbias,
+      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
+          (static_state->min_img_filter != PIPE_TEX_FILTER_NEAREST ||
+           static_state->mag_img_filter != PIPE_TEX_FILTER_NEAREST ||
+           static_state->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) &&
+          util_format_fits_8unorm(bld.format_desc)) {
+         debug_printf("%s: using floating point linear filtering for %s\n",
+                      __FUNCTION__, bld.format_desc->short_name);
+      }
+
+      lp_build_sample_general(&bld, unit, s, t, r, ddx, ddy,
+                              lod_bias, explicit_lod,
                               width, height, depth,
                               width_vec, height_vec, depth_vec,
                               row_stride_array, img_stride_array,
                               data_array,
-                              texel);
+                              texel_out);
    }
 
-   lp_build_sample_compare(&bld, r, texel);
+   lp_build_sample_compare(&bld, r, texel_out);
 }