From 0b6554ba6f2aa8a771852566340c24205e406d02 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Tue, 27 Nov 2012 03:26:49 +0100
Subject: [PATCH] gallivm,llvmpipe: handle TXF (texelFetch) instruction,
 including offsets
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

This also adds some code to handle per-quad lods for more than 4-wide fetches,
because otherwise I'd have to integrate the texelFetch function into
the splitting stuff... (but it is not used yet outside texelFetch).
Passes piglit fs-texelFetch-2D; fails fs-texelFetchOffset-2D due to what I
believe is a test error (results are undefined for out-of-bounds fetches; we
return whatever is at offset 0, whereas the test expects [0,0,0,1]).
Texel offsets are only handled by texelFetch for now, though the interface
can handle them for everything.

Reviewed-by: José Fonseca <jfonseca@vmware.com>
---
 src/gallium/auxiliary/draw/draw_llvm_sample.c |   9 +-
 src/gallium/auxiliary/gallivm/lp_bld_sample.c | 268 +++++++++++++++---
 src/gallium/auxiliary/gallivm/lp_bld_sample.h |  25 +-
 .../auxiliary/gallivm/lp_bld_sample_aos.c     |   8 +-
 .../auxiliary/gallivm/lp_bld_sample_soa.c     | 167 ++++++++++-
 .../auxiliary/gallivm/lp_bld_swizzle.c        |   9 +-
 .../auxiliary/gallivm/lp_bld_swizzle.h        |   3 +-
 src/gallium/auxiliary/gallivm/lp_bld_tgsi.c   |  60 ++++
 src/gallium/auxiliary/gallivm/lp_bld_tgsi.h   |  11 +-
 .../auxiliary/gallivm/lp_bld_tgsi_soa.c       | 115 +++++++-
 src/gallium/auxiliary/tgsi/tgsi_info.c        |   1 +
 src/gallium/drivers/llvmpipe/lp_screen.c      |   4 +-
 src/gallium/drivers/llvmpipe/lp_tex_sample.c  |   9 +-
 13 files changed, 618 insertions(+), 71 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_llvm_sample.c b/src/gallium/auxiliary/draw/draw_llvm_sample.c
index 0892d16bd6d..67d4e9339d6 100644
--- a/src/gallium/auxiliary/draw/draw_llvm_sample.c
+++ b/src/gallium/auxiliary/draw/draw_llvm_sample.c
@@ -171,9 +171,10 @@ static void
 draw_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
                                        struct gallivm_state *gallivm,
                                        struct lp_type type,
+                                       boolean is_fetch,
                                        unsigned unit,
-                                       unsigned num_coords,
                                        const LLVMValueRef *coords,
+                                       const LLVMValueRef *offsets,
                                        const struct lp_derivatives *derivs,
                                        LLVMValueRef lod_bias, /* optional */
                                        LLVMValueRef explicit_lod, /* optional */
@@ -187,8 +188,10 @@ draw_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
                        &sampler->dynamic_state.static_state[unit],
                        &sampler->dynamic_state.base,
                        type,
+                       is_fetch,
                        unit,
-                       num_coords, coords,
+                       coords,
+                       offsets,
                        derivs,
                        lod_bias, explicit_lod,
                        texel);
@@ -213,7 +216,7 @@ draw_llvm_sampler_soa_emit_size_query(const struct lp_build_sampler_soa *base,
    lp_build_size_query_soa(gallivm,
                            &sampler->dynamic_state.static_state[unit],
                            &sampler->dynamic_state.base,
-			   type,
+                           type,
                            unit,
                            explicit_lod,
                            sizes_out);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index 0727fd2b91a..ea7dd95b78b 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -186,8 +186,8 @@ lp_build_rho(struct lp_build_sample_context *bld,
              const struct lp_derivatives *derivs)
 {
    struct gallivm_state *gallivm = bld->gallivm;
-   struct lp_build_context *int_size_bld = &bld->int_size_bld;
-   struct lp_build_context *float_size_bld = &bld->float_size_bld;
+   struct lp_build_context *int_size_bld = &bld->int_size_in_bld;
+   struct lp_build_context *float_size_bld = &bld->float_size_in_bld;
    struct lp_build_context *float_bld = &bld->float_bld;
    struct lp_build_context *coord_bld = &bld->coord_bld;
    struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
@@ -316,7 +316,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
          }
       }
       rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
-                                      perquadf_bld->type, rho);
+                                      perquadf_bld->type, rho, 0);
    }
    else {
       if (dims <= 1) {
@@ -517,7 +517,7 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
    else {
       if (explicit_lod) {
          lod = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
-                                         perquadf_bld->type, explicit_lod);
+                                         perquadf_bld->type, explicit_lod, 0);
       }
       else {
          LLVMValueRef rho;
@@ -562,7 +562,7 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
          /* add shader lod bias */
          if (lod_bias) {
             lod_bias = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
-                  perquadf_bld->type, lod_bias);
+                  perquadf_bld->type, lod_bias, 0);
             lod = LLVMBuildFAdd(builder, lod, lod_bias, "shader_lod_bias");
          }
       }
@@ -725,7 +725,6 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
 
 /**
  * Return pointer to a single mipmap level.
- * \param data_array  array of pointers to mipmap levels
  * \param level  integer mipmap level
  */
 LLVMValueRef
@@ -743,6 +742,55 @@ lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
    return data_ptr;
 }
 
+/**
+ * Return a per-pixel vector of mip level offsets read from bld->mip_offsets.
+ * \param level  integer mipmap level (scalar, or one entry per quad/pixel)
+ */
+LLVMValueRef
+lp_build_get_mip_offsets(struct lp_build_sample_context *bld,
+                         LLVMValueRef level)
+{
+   LLVMBuilderRef builder = bld->gallivm->builder;
+   LLVMValueRef indexes[2], offsets, offset1;
+
+   indexes[0] = lp_build_const_int32(bld->gallivm, 0);
+   if (bld->num_lods == 1) {
+      indexes[1] = level;
+      offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
+      offset1 = LLVMBuildLoad(builder, offset1, "");
+      offsets = lp_build_broadcast_scalar(&bld->int_coord_bld, offset1);
+   }
+   else if (bld->num_lods == bld->coord_bld.type.length / 4) {
+      unsigned i;
+
+      offsets = bld->int_coord_bld.undef;
+      for (i = 0; i < bld->num_lods; i++) {
+         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
+         LLVMValueRef indexo = lp_build_const_int32(bld->gallivm, 4 * i);
+         indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
+         offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
+         offset1 = LLVMBuildLoad(builder, offset1, "");
+         offsets = LLVMBuildInsertElement(builder, offsets, offset1, indexo, "");
+      }
+      offsets = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, offsets, 0);
+   }
+   else {
+      unsigned i;
+
+      assert (bld->num_lods == bld->coord_bld.type.length);
+
+      offsets = bld->int_coord_bld.undef;
+      for (i = 0; i < bld->num_lods; i++) {
+         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
+         indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
+         offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
+         offset1 = LLVMBuildLoad(builder, offset1, "");
+         offsets = LLVMBuildInsertElement(builder, offsets, offset1, indexi, "");
+      }
+   }
+   return offsets;
+}
+
 
 /**
  * Codegen equivalent for u_minify().
@@ -780,12 +828,44 @@ lp_build_get_level_stride_vec(struct lp_build_sample_context *bld,
                               LLVMValueRef stride_array, LLVMValueRef level)
 {
    LLVMBuilderRef builder = bld->gallivm->builder;
-   LLVMValueRef indexes[2], stride;
+   LLVMValueRef indexes[2], stride, stride1;
    indexes[0] = lp_build_const_int32(bld->gallivm, 0);
-   indexes[1] = level;
-   stride = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
-   stride = LLVMBuildLoad(builder, stride, "");
-   stride = lp_build_broadcast_scalar(&bld->int_coord_bld, stride);
+   if (bld->num_lods == 1) {
+      indexes[1] = level;
+      stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
+      stride1 = LLVMBuildLoad(builder, stride1, "");
+      stride = lp_build_broadcast_scalar(&bld->int_coord_bld, stride1);
+   }
+   else if (bld->num_lods == bld->coord_bld.type.length / 4) {
+      LLVMValueRef stride1;
+      unsigned i;
+      /* fill lane 4*i (first lane of quad i), as lp_build_get_mip_offsets does */
+      stride = bld->int_coord_bld.undef;
+      for (i = 0; i < bld->num_lods; i++) {
+         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
+         LLVMValueRef indexo = lp_build_const_int32(bld->gallivm, 4 * i);
+         indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
+         stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
+         stride1 = LLVMBuildLoad(builder, stride1, "");
+         stride = LLVMBuildInsertElement(builder, stride, stride1, indexo, "");
+      }
+      stride = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, stride, 0);
+   }
+   else {
+      LLVMValueRef stride1;
+      unsigned i;
+
+      assert (bld->num_lods == bld->coord_bld.type.length);
+
+      stride = bld->int_coord_bld.undef;
+      for (i = 0; i < bld->coord_bld.type.length; i++) {
+         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
+         indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
+         stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
+         stride1 = LLVMBuildLoad(builder, stride1, "");
+         stride = LLVMBuildInsertElement(builder, stride, stride1, indexi, "");
+      }
+   }
    return stride;
 }
 
@@ -805,12 +885,102 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
    const unsigned dims = bld->dims;
    LLVMValueRef ilevel_vec;
 
-   ilevel_vec = lp_build_broadcast_scalar(&bld->int_size_bld, ilevel);
-
    /*
     * Compute width, height, depth at mipmap level 'ilevel'
     */
-   *out_size = lp_build_minify(&bld->int_size_bld, bld->int_size, ilevel_vec);
+   if (bld->num_lods == 1) {
+      ilevel_vec = lp_build_broadcast_scalar(&bld->int_size_bld, ilevel);
+      *out_size = lp_build_minify(&bld->int_size_bld, bld->int_size, ilevel_vec);
+   }
+   else {
+      LLVMValueRef int_size_vec;
+      LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
+      unsigned num_quads = bld->coord_bld.type.length / 4;
+      unsigned i;
+
+      if (bld->num_lods == num_quads) {
+         /*
+          * XXX: this should be #ifndef SANE_INSTRUCTION_SET.
+          * intel "forgot" the variable shift count instruction until avx2.
+          * A harmless 8x32 shift gets translated into 32 instructions
+          * (16 extracts, 8 scalar shifts, 8 inserts), llvm is apparently
+          * unable to recognize if there are really just 2 different shift
+          * count values. So do the shift 4-wide before expansion.
+          */
+         struct lp_build_context bld4;
+         struct lp_type type4;
+
+         type4 = bld->int_coord_bld.type;
+         type4.length = 4;
+
+         lp_build_context_init(&bld4, bld->gallivm, type4);
+
+         if (bld->dims == 1) {
+            assert(bld->int_size_in_bld.type.length == 1);
+            int_size_vec = lp_build_broadcast_scalar(&bld4,
+                                                     bld->int_size);
+         }
+         else {
+            assert(bld->int_size_in_bld.type.length == 4);
+            int_size_vec = bld->int_size;
+         }
+
+         for (i = 0; i < num_quads; i++) {
+            LLVMValueRef ileveli;
+            LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
+
+            ileveli = lp_build_extract_broadcast(bld->gallivm,
+                                                 bld->perquadi_bld.type,
+                                                 bld4.type,
+                                                 ilevel,
+                                                 indexi);
+            tmp[i] = lp_build_minify(&bld4, int_size_vec, ileveli);
+         }
+         /*
+          * out_size is [w0, h0, d0, _, w1, h1, d1, _, ...] vector for dims > 1,
+          * [w0, w0, w0, w0, w1, w1, w1, w1, ...] otherwise.
+          */
+         *out_size = lp_build_concat(bld->gallivm,
+                                     tmp,
+                                     bld4.type,
+                                     num_quads);
+      }
+      else {
+        /* FIXME: this is terrible and results in _huge_ vector
+         * (for the dims > 1 case).
+         * Should refactor this (together with extract_image_sizes) and do
+         * something more useful. Could for instance if we have width,height
+         * with 4-wide vector pack all elements into a 8xi16 vector
+         * (on which we can still do useful math) instead of using a 16xi32
+         * vector.
+         * FIXME: some callers can't handle this yet.
+         * For dims == 1 this will create [w0, w1, w2, w3, ...] vector.
+         * For dims > 1 this will create [w0, h0, d0, _, w1, h1, d1, _, ...] vector.
+         */
+         assert(bld->num_lods == bld->coord_bld.type.length);
+         if (bld->dims == 1) {
+            assert(bld->int_size_in_bld.type.length == 1);
+            int_size_vec = lp_build_broadcast_scalar(&bld->int_coord_bld,
+                                                     bld->int_size);
+            /* vector shift with variable shift count alert... */
+            *out_size = lp_build_minify(&bld->int_coord_bld, int_size_vec, ilevel);
+         }
+         else {
+            LLVMValueRef ilevel1;
+            for (i = 0; i < bld->num_lods; i++) {
+               LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
+               ilevel1 = lp_build_extract_broadcast(bld->gallivm, bld->int_coord_type,
+                                                    bld->int_size_in_bld.type, ilevel, indexi);
+               tmp[i] = bld->int_size;
+               tmp[i] = lp_build_minify(&bld->int_size_in_bld, tmp[i], ilevel1);
+            }
+            *out_size = lp_build_concat(bld->gallivm,
+                                        tmp,
+                                        bld->int_size_in_bld.type,
+                                        bld->num_lods);
+         }
+      }
+   }
 
    if (dims >= 2) {
       *row_stride_vec = lp_build_get_level_stride_vec(bld,
@@ -836,7 +1006,7 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
  */
 void
 lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
-                             struct lp_type size_type,
+                             struct lp_build_context *size_bld,
                              struct lp_type coord_type,
                              LLVMValueRef size,
                              LLVMValueRef *out_width,
@@ -845,24 +1015,56 @@ lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
 {
    const unsigned dims = bld->dims;
    LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
+   struct lp_type size_type = size_bld->type;
+
+   if (bld->num_lods == 1) {
+      *out_width = lp_build_extract_broadcast(bld->gallivm,
+                                              size_type,
+                                              coord_type,
+                                              size,
+                                              LLVMConstInt(i32t, 0, 0));
+      if (dims >= 2) {
+         *out_height = lp_build_extract_broadcast(bld->gallivm,
+                                                  size_type,
+                                                  coord_type,
+                                                  size,
+                                                  LLVMConstInt(i32t, 1, 0));
+         if (dims == 3) {
+            *out_depth = lp_build_extract_broadcast(bld->gallivm,
+                                                    size_type,
+                                                    coord_type,
+                                                    size,
+                                                    LLVMConstInt(i32t, 2, 0));
+         }
+      }
+   }
+   else {
+      unsigned num_quads = bld->coord_bld.type.length / 4;
 
-   *out_width = lp_build_extract_broadcast(bld->gallivm,
-                                           size_type,
-                                           coord_type,
-                                           size,
-                                           LLVMConstInt(i32t, 0, 0));
-   if (dims >= 2) {
-      *out_height = lp_build_extract_broadcast(bld->gallivm,
-                                               size_type,
-                                               coord_type,
-                                               size,
-                                               LLVMConstInt(i32t, 1, 0));
-      if (dims == 3) {
-         *out_depth = lp_build_extract_broadcast(bld->gallivm,
-                                                 size_type,
-                                                 coord_type,
-                                                 size,
-                                                 LLVMConstInt(i32t, 2, 0));
+      if (dims == 1) {
+         *out_width = size;
+      }
+      else if (bld->num_lods == num_quads) {
+         *out_width = lp_build_swizzle_scalar_aos(size_bld, size, 0);
+         if (dims >= 2) {
+            *out_height = lp_build_swizzle_scalar_aos(size_bld, size, 1);
+            if (dims == 3) {
+               *out_depth = lp_build_swizzle_scalar_aos(size_bld, size, 2);
+            }
+         }
+      }
+      else { /* one lod per pixel: pick channel 0/1/2 for width/height/depth */
+         assert(bld->num_lods == bld->coord_type.length);
+         *out_width = lp_build_pack_aos_scalars(bld->gallivm, size_type,
+                                                coord_type, size, 0);
+         if (dims >= 2) {
+            *out_height = lp_build_pack_aos_scalars(bld->gallivm, size_type,
+                                                    coord_type, size, 1);
+            if (dims == 3) {
+               *out_depth = lp_build_pack_aos_scalars(bld->gallivm, size_type,
+                                                      coord_type, size, 2);
+            }
+         }
       }
    }
 }
@@ -886,7 +1088,7 @@ lp_build_unnormalized_coords(struct lp_build_sample_context *bld,
    LLVMValueRef depth;
 
    lp_build_extract_image_sizes(bld,
-                                bld->float_size_type,
+                                &bld->float_size_bld,
                                 bld->coord_type,
                                 flt_size,
                                 &width,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index d8a068d5497..7fc432cb4c2 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -210,6 +210,9 @@ struct lp_build_sample_context
    /** SIMD vector width */
    unsigned vector_width;
 
+   /** number of lod values (valid are 1, length/4, length) */
+   unsigned num_lods;
+
    /** regular scalar float type */
    struct lp_type float_type;
    struct lp_build_context float_bld;
@@ -230,10 +233,18 @@ struct lp_build_sample_context
    struct lp_build_context int_coord_bld;
 
    /** Unsigned integer texture size */
+   struct lp_type int_size_in_type;
+   struct lp_build_context int_size_in_bld;
+
+   /** Float incoming texture size */
+   struct lp_type float_size_in_type;
+   struct lp_build_context float_size_in_bld;
+
+   /** Unsigned integer texture size (might be per quad) */
    struct lp_type int_size_type;
    struct lp_build_context int_size_bld;
 
-   /** Unsigned integer texture size */
+   /** Float texture size (might be per quad) */
    struct lp_type float_size_type;
    struct lp_build_context float_size_bld;
 
@@ -298,6 +309,7 @@ texture_dims(enum pipe_texture_target tex)
 {
    switch (tex) {
    case PIPE_TEXTURE_1D:
+   case PIPE_BUFFER:
       return 1;
    case PIPE_TEXTURE_2D:
    case PIPE_TEXTURE_RECT:
@@ -355,6 +367,11 @@ lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
                           LLVMValueRef level);
 
 
+LLVMValueRef
+lp_build_get_mip_offsets(struct lp_build_sample_context *bld,
+                         LLVMValueRef level);
+
+
 void
 lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
                             LLVMValueRef ilevel,
@@ -365,7 +382,7 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
 
 void
 lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
-                             struct lp_type size_type,
+                             struct lp_build_context *size_bld,
                              struct lp_type coord_type,
                              LLVMValueRef size,
                              LLVMValueRef *out_width,
@@ -418,9 +435,10 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
                     const struct lp_sampler_static_state *static_state,
                     struct lp_sampler_dynamic_state *dynamic_state,
                     struct lp_type fp_type,
+                    boolean is_fetch,
                     unsigned unit,
-                    unsigned num_coords,
                     const LLVMValueRef *coords,
+                    const LLVMValueRef *offsets,
                     const struct lp_derivatives *derivs,
                     LLVMValueRef lod_bias,
                     LLVMValueRef explicit_lod,
@@ -448,7 +466,6 @@ lp_build_size_query_soa(struct gallivm_state *gallivm,
 void
 lp_build_sample_nop(struct gallivm_state *gallivm, 
                     struct lp_type type,
-                    unsigned num_coords,
                     const LLVMValueRef *coords,
                     LLVMValueRef texel_out[4]);
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
index d81033f83a0..236b68bb0ce 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
@@ -539,7 +539,7 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
    i32_vec_type = lp_build_vec_type(bld->gallivm, i32.type);
 
    lp_build_extract_image_sizes(bld,
-                                bld->int_size_type,
+                                &bld->int_size_bld,
                                 bld->int_coord_type,
                                 int_size,
                                 &width_vec,
@@ -661,7 +661,7 @@ lp_build_sample_image_nearest_afloat(struct lp_build_sample_context *bld,
    flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size);
 
    lp_build_extract_image_sizes(bld,
-                                bld->float_size_type,
+                                &bld->float_size_bld,
                                 bld->coord_type,
                                 flt_size,
                                 &width_vec,
@@ -994,7 +994,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
    i32_vec_type = lp_build_vec_type(bld->gallivm, i32.type);
 
    lp_build_extract_image_sizes(bld,
-                                bld->int_size_type,
+                                &bld->int_size_bld,
                                 bld->int_coord_type,
                                 int_size,
                                 &width_vec,
@@ -1175,7 +1175,7 @@ lp_build_sample_image_linear_afloat(struct lp_build_sample_context *bld,
    flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size);
 
    lp_build_extract_image_sizes(bld,
-                                bld->float_size_type,
+                                &bld->float_size_bld,
                                 bld->coord_type,
                                 flt_size,
                                 &width_vec,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 00a5b187bcb..daa49506ca5 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -610,7 +610,7 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
    LLVMValueRef x, y, z;
 
    lp_build_extract_image_sizes(bld,
-                                bld->int_size_type,
+                                &bld->int_size_bld,
                                 bld->int_coord_type,
                                 size,
                                 &width_vec, &height_vec, &depth_vec);
@@ -618,7 +618,7 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
    flt_size = lp_build_int_to_float(&bld->float_size_bld, size);
 
    lp_build_extract_image_sizes(bld,
-                                bld->float_size_type,
+                                &bld->float_size_bld,
                                 bld->coord_type,
                                 flt_size,
                                 &flt_width_vec, &flt_height_vec, &flt_depth_vec);
@@ -695,7 +695,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
    int chan;
 
    lp_build_extract_image_sizes(bld,
-                                bld->int_size_type,
+                                &bld->int_size_bld,
                                 bld->int_coord_type,
                                 size,
                                 &width_vec, &height_vec, &depth_vec);
@@ -703,7 +703,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
    flt_size = lp_build_int_to_float(&bld->float_size_bld, size);
 
    lp_build_extract_image_sizes(bld,
-                                bld->float_size_type,
+                                &bld->float_size_bld,
                                 bld->coord_type,
                                 flt_size,
                                 &flt_width_vec, &flt_height_vec, &flt_depth_vec);
@@ -1157,6 +1157,120 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
 }
 
 
+/**
+ * Texel fetch function.
+ * In contrast to general sampling there is no filtering, no coord minification,
+ * lod (if any) is always explicit uint, coords are uints (in terms of texel units)
+ * directly to be applied to the selected mip level (after adding texel offsets).
+ * This function handles texel fetch for all targets where texel fetch is supported
+ * (no cube maps, but 1d, 2d, 3d are supported, arrays and buffers should be too).
+ */
+static void
+lp_build_fetch_texel(struct lp_build_sample_context *bld,
+                     unsigned unit,
+                     const LLVMValueRef *coords,
+                     LLVMValueRef explicit_lod,
+                     const LLVMValueRef *offsets,
+                     LLVMValueRef *colors_out)
+{
+   struct lp_build_context *perquadi_bld = &bld->perquadi_bld;
+   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
+   unsigned dims = bld->dims, chan;
+   LLVMValueRef size, ilevel;
+   LLVMValueRef row_stride_vec = NULL, img_stride_vec = NULL;
+   LLVMValueRef x = coords[0], y = coords[1], z = coords[2];
+   LLVMValueRef width, height, depth, i, j;
+   LLVMValueRef offset, out_of_bounds, out1;
+
+   /* XXX just like ordinary sampling, we don't handle per-pixel lod (yet). */
+   if (explicit_lod && bld->static_state->target != PIPE_BUFFER) {
+      /* could also avoid this if there are no mipmaps */
+      /* XXX temporary hack until ordinary sampling handles per-quad lod the same */
+      bld->num_lods = bld->coord_type.length / 4;
+      bld->float_size_type = bld->float_size_in_type;
+      bld->float_size_type.length = bld->num_lods > 1 ? bld->coord_type.length :
+                                      bld->float_size_in_type.length;
+      bld->int_size_type = lp_int_type(bld->float_size_type);
+      lp_build_context_init(&bld->int_size_bld, bld->gallivm, bld->int_size_type);
+      lp_build_context_init(&bld->float_size_bld, bld->gallivm, bld->float_size_type);
+
+      ilevel = lp_build_pack_aos_scalars(bld->gallivm, int_coord_bld->type,
+                                         perquadi_bld->type, explicit_lod, 0);
+      lp_build_nearest_mip_level(bld, unit, ilevel, &ilevel);
+   }
+   else {
+      bld->num_lods = 1;
+      ilevel = lp_build_const_int32(bld->gallivm, 0);
+   }
+   lp_build_mipmap_level_sizes(bld, ilevel,
+                               &size,
+                               &row_stride_vec, &img_stride_vec);
+   lp_build_extract_image_sizes(bld, &bld->int_size_bld, int_coord_bld->type,
+                                size, &width, &height, &depth);
+
+   /* This is a lot like border sampling */
+   if (offsets[0]) {
+      /* XXX coords are really unsigned, offsets are signed */
+      x = lp_build_add(int_coord_bld, x, offsets[0]);
+   }
+   out_of_bounds = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, x, int_coord_bld->zero);
+   out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
+   out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
+
+   if (dims >= 2) {
+      if (offsets[1]) {
+         y = lp_build_add(int_coord_bld, y, offsets[1]);
+      }
+      out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero);
+      out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
+      out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
+      out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
+
+      if (dims >= 3) {
+         if (offsets[2]) {
+            z = lp_build_add(int_coord_bld, z, offsets[2]);
+         }
+         out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, z, int_coord_bld->zero);
+         out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
+         out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
+         out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
+      }
+   }
+
+   lp_build_sample_offset(int_coord_bld,
+                          bld->format_desc,
+                          x, y, z, row_stride_vec, img_stride_vec,
+                          &offset, &i, &j);
+
+   if (bld->static_state->target != PIPE_BUFFER) {
+      offset = lp_build_add(int_coord_bld, offset,
+                            lp_build_get_mip_offsets(bld, ilevel));
+   }
+
+   offset = lp_build_andnot(int_coord_bld, offset, out_of_bounds);
+
+   lp_build_fetch_rgba_soa(bld->gallivm,
+                           bld->format_desc,
+                           bld->texel_type,
+                           bld->base_ptr, offset,
+                           i, j,
+                           colors_out);
+
+   if (0) {
+      /*
+       * Not needed except for ARB_robust_buffer_access_behavior.
+       * Could use min/max above instead of out-of-bounds comparisons
+       * (in fact cast to unsigned and min only is sufficient)
+       * if we don't care about the result returned for out-of-bounds.
+       */
+      for (chan = 0; chan < 4; chan++) {
+         colors_out[chan] = lp_build_select(&bld->texel_bld, out_of_bounds,
+                                            bld->texel_bld.zero, colors_out[chan]);
+      }
+   }
+}
+
+
 /**
  * Do shadow test/comparison.
  * \param p  the texcoord Z (aka R, aka P) component
@@ -1209,7 +1323,6 @@ lp_build_sample_compare(struct lp_build_sample_context *bld,
 void
 lp_build_sample_nop(struct gallivm_state *gallivm,
                     struct lp_type type,
-                    unsigned num_coords,
                     const LLVMValueRef *coords,
                     LLVMValueRef texel_out[4])
 {
@@ -1227,6 +1340,7 @@ lp_build_sample_nop(struct gallivm_state *gallivm,
  * 'texel' will return a vector of four LLVMValueRefs corresponding to
  * R, G, B, A.
  * \param type  vector float type to use for coords, etc.
+ * \param is_fetch  TRUE if this is a texel fetch (TXF) instruction.
  * \param derivs  partial derivatives of (s,t,r,q) with respect to x and y
  */
 void
@@ -1234,9 +1348,10 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
                     const struct lp_sampler_static_state *static_state,
                     struct lp_sampler_dynamic_state *dynamic_state,
                     struct lp_type type,
+                    boolean is_fetch,
                     unsigned unit,
-                    unsigned num_coords,
                     const LLVMValueRef *coords,
+                    const LLVMValueRef *offsets,
                     const struct lp_derivatives *derivs,
                     LLVMValueRef lod_bias, /* optional */
                     LLVMValueRef explicit_lod, /* optional */
@@ -1272,20 +1387,28 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
    bld.int_type = lp_type_int(32);
    bld.coord_type = type;
    bld.int_coord_type = lp_int_type(type);
-   bld.float_size_type = lp_type_float(32);
-   bld.float_size_type.length = dims > 1 ? 4 : 1;
-   bld.int_size_type = lp_int_type(bld.float_size_type);
+   bld.float_size_in_type = lp_type_float(32);
+   bld.float_size_in_type.length = dims > 1 ? 4 : 1;
+   bld.int_size_in_type = lp_int_type(bld.float_size_in_type);
    bld.texel_type = type;
    bld.perquadf_type = type;
    /* we want native vector size to be able to use our intrinsics */
    bld.perquadf_type.length = type.length > 4 ? ((type.length + 15) / 16) * 4 : 1;
    bld.perquadi_type = lp_int_type(bld.perquadf_type);
 
+   bld.num_lods = 1;
+   bld.float_size_type = bld.float_size_in_type;
+   bld.float_size_type.length = bld.num_lods > 1 ? type.length :
+                                   bld.float_size_in_type.length;
+   bld.int_size_type = lp_int_type(bld.float_size_type);
+
    lp_build_context_init(&bld.float_bld, gallivm, bld.float_type);
    lp_build_context_init(&bld.float_vec_bld, gallivm, type);
    lp_build_context_init(&bld.int_bld, gallivm, bld.int_type);
    lp_build_context_init(&bld.coord_bld, gallivm, bld.coord_type);
    lp_build_context_init(&bld.int_coord_bld, gallivm, bld.int_coord_type);
+   lp_build_context_init(&bld.int_size_in_bld, gallivm, bld.int_size_in_type);
+   lp_build_context_init(&bld.float_size_in_bld, gallivm, bld.float_size_in_type);
    lp_build_context_init(&bld.int_size_bld, gallivm, bld.int_size_type);
    lp_build_context_init(&bld.float_size_bld, gallivm, bld.float_size_type);
    lp_build_context_init(&bld.texel_bld, gallivm, bld.texel_type);
@@ -1311,7 +1434,7 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
       bld.int_size = tex_width;
    }
    else {
-      bld.int_size = LLVMBuildInsertElement(builder, bld.int_size_bld.undef,
+      bld.int_size = LLVMBuildInsertElement(builder, bld.int_size_in_bld.undef,
                                             tex_width, LLVMConstInt(i32t, 0, 0), "");
       if (dims >= 2) {
          bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
@@ -1327,7 +1450,6 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
       /* For debug: no-op texture sampling */
       lp_build_sample_nop(gallivm,
                           bld.texel_type,
-                          num_coords,
                           coords,
                           texel_out);
    }
@@ -1352,6 +1474,18 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
                       static_state->wrap_t);
       }
 
+      if (is_fetch) {
+         lp_build_fetch_texel(&bld, unit, coords,
+                              explicit_lod, offsets,
+                              texel_out);
+
+         if (static_state->target != PIPE_BUFFER) {
+            apply_sampler_swizzle(&bld, texel_out);
+         }
+
+         return;
+      }
+
       lp_build_sample_common(&bld, unit,
                              &s, &t, &r,
                              derivs, lod_bias, explicit_lod,
@@ -1450,20 +1584,25 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
             bld4.int_type = lp_type_int(32);
             bld4.coord_type = type4;
             bld4.int_coord_type = lp_int_type(type4);
-            bld4.float_size_type = lp_type_float(32);
-            bld4.float_size_type.length = dims > 1 ? 4 : 1;
-            bld4.int_size_type = lp_int_type(bld4.float_size_type);
+            bld4.float_size_in_type = lp_type_float(32);
+            bld4.float_size_in_type.length = dims > 1 ? 4 : 1;
+            bld4.int_size_in_type = lp_int_type(bld4.float_size_in_type);
+            bld4.float_size_type = bld4.float_size_in_type;
+            bld4.int_size_type =  bld4.int_size_in_type;
             bld4.texel_type = type4;
             bld4.perquadf_type = type4;
             /* we want native vector size to be able to use our intrinsics */
             bld4.perquadf_type.length = 1;
             bld4.perquadi_type = lp_int_type(bld4.perquadf_type);
+            bld4.num_lods = 1;
 
             lp_build_context_init(&bld4.float_bld, gallivm, bld4.float_type);
             lp_build_context_init(&bld4.float_vec_bld, gallivm, type4);
             lp_build_context_init(&bld4.int_bld, gallivm, bld4.int_type);
             lp_build_context_init(&bld4.coord_bld, gallivm, bld4.coord_type);
             lp_build_context_init(&bld4.int_coord_bld, gallivm, bld4.int_coord_type);
+            lp_build_context_init(&bld4.int_size_in_bld, gallivm, bld4.int_size_in_type);
+            lp_build_context_init(&bld4.float_size_in_bld, gallivm, bld4.float_size_in_type);
             lp_build_context_init(&bld4.int_size_bld, gallivm, bld4.int_size_type);
             lp_build_context_init(&bld4.float_size_bld, gallivm, bld4.float_size_type);
             lp_build_context_init(&bld4.texel_bld, gallivm, bld4.texel_type);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
index 201a3487588..3d70252e75a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
@@ -554,15 +554,16 @@ lp_build_transpose_aos(struct gallivm_state *gallivm,
 
 
 /**
- * Pack first element of aos values,
+ * Pack the given channel of each aos value,
  * pad out to destination size.
- * i.e. x1 _ _ _ x2 _ _ _ will become x1 x2 _ _
+ * e.g. for channel 0, x1 y1 _ _ x2 y2 _ _ will become x1 x2 _ _
  */
 LLVMValueRef
 lp_build_pack_aos_scalars(struct gallivm_state *gallivm,
                           struct lp_type src_type,
                           struct lp_type dst_type,
-                          const LLVMValueRef src)
+                          const LLVMValueRef src,
+                          unsigned channel)
 {
    LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
    LLVMValueRef undef = LLVMGetUndef(i32t);
@@ -574,7 +575,7 @@ lp_build_pack_aos_scalars(struct gallivm_state *gallivm,
    assert(num_src <= num_dst);
 
    for (i = 0; i < num_src; i++) {
-      shuffles[i] = LLVMConstInt(i32t, i * 4, 0);
+      shuffles[i] = LLVMConstInt(i32t, i * 4 + channel, 0);
    }
    for (i = num_src; i < num_dst; i++) {
       shuffles[i] = undef;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
index 0bf4ce988a2..c49d9167231 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
@@ -117,7 +117,8 @@ LLVMValueRef
 lp_build_pack_aos_scalars(struct gallivm_state *gallivm,
                           struct lp_type src_type,
                           struct lp_type dst_type,
-                          const LLVMValueRef src);
+                          const LLVMValueRef src,
+                          unsigned channel);
 
 
 LLVMValueRef
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
index 680c85f843c..a4fea7d2961 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
@@ -334,6 +334,66 @@ lp_build_emit_fetch(
 
 }
 
+
+LLVMValueRef
+lp_build_emit_fetch_texoffset(
+   struct lp_build_tgsi_context *bld_base,
+   const struct tgsi_full_instruction *inst,
+   unsigned tex_off_op,
+   const unsigned chan_index)
+{
+   const struct tgsi_texture_offset *off = &inst->TexOffsets[tex_off_op];
+   struct tgsi_full_src_register reg;
+   unsigned swizzle;
+   LLVMValueRef res;
+   enum tgsi_opcode_type stype = TGSI_TYPE_SIGNED;
+
+   /* convert offset "register" to an ordinary register so we can use the normal emit funcs */
+   memset(&reg, 0, sizeof(reg));
+   reg.Register.File = off->File;
+   reg.Register.Index = off->Index;
+   reg.Register.SwizzleX = off->SwizzleX;
+   reg.Register.SwizzleY = off->SwizzleY;
+   reg.Register.SwizzleZ = off->SwizzleZ;
+
+   if (chan_index == LP_CHAN_ALL) {
+      swizzle = ~0;
+   } else {
+      swizzle = tgsi_util_get_src_register_swizzle(&reg.Register, chan_index);
+      if (swizzle > 2) {
+         assert(0 && "invalid swizzle in emit_fetch_texoffset()");
+         return bld_base->base.undef;
+      }
+   }
+
+   assert(off->Index <= bld_base->info->file_max[off->File]);
+
+   if (bld_base->emit_fetch_funcs[off->File]) {
+      res = bld_base->emit_fetch_funcs[off->File](bld_base, &reg, stype,
+                                                           swizzle);
+   } else {
+      assert(0 && "invalid src register in emit_fetch_texoffset()");
+      return bld_base->base.undef;
+   }
+
+   /*
+    * Swizzle the argument
+    */
+
+   if (swizzle == ~0) {
+      res = bld_base->emit_swizzle(bld_base, res,
+                                   off->SwizzleX,
+                                   off->SwizzleY,
+                                   off->SwizzleZ,
+                                   /* there's no 4th channel */
+                                   off->SwizzleX);
+   }
+
+   return res;
+
+}
+
+
 boolean
 lp_build_tgsi_llvm(
    struct lp_build_tgsi_context * bld_base,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
index e292420a61a..16d2ed9e6f7 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
@@ -172,9 +172,10 @@ struct lp_build_sampler_soa
    (*emit_fetch_texel)( const struct lp_build_sampler_soa *sampler,
                         struct gallivm_state *gallivm,
                         struct lp_type type,
+                        boolean is_fetch,
                         unsigned unit,
-                        unsigned num_coords,
                         const LLVMValueRef *coords,
+                        const LLVMValueRef *offsets,
                         const struct lp_derivatives *derivs,
                         LLVMValueRef lod_bias, /* optional */
                         LLVMValueRef explicit_lod, /* optional */
@@ -555,6 +556,14 @@ lp_build_emit_fetch(
    unsigned src_op,
    const unsigned chan_index);
 
+
+LLVMValueRef
+lp_build_emit_fetch_texoffset(
+   struct lp_build_tgsi_context *bld_base,
+   const struct tgsi_full_instruction *inst,
+   unsigned tex_off_op,
+   const unsigned chan_index);
+
 boolean
 lp_build_tgsi_llvm(
    struct lp_build_tgsi_context * bld_base,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index 85a4401b534..2afdd3027e0 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -1146,7 +1146,8 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
    unsigned unit;
    LLVMValueRef lod_bias, explicit_lod;
    LLVMValueRef oow = NULL;
-   LLVMValueRef coords[3];
+   LLVMValueRef coords[4];
+   LLVMValueRef offsets[3] = { NULL };
    struct lp_derivatives derivs;
    unsigned num_coords;
    unsigned dims;
@@ -1225,7 +1226,7 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
       if (modifier == LP_BLD_TEX_MODIFIER_PROJECTED)
          coords[i] = lp_build_mul(&bld->bld_base.base, coords[i], oow);
    }
-   for (i = num_coords; i < 3; i++) {
+   for (i = num_coords; i < 4; i++) {
       coords[i] = bld->bld_base.base.undef;
    }
 
@@ -1285,15 +1286,111 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
       unit = inst->Src[1].Register.Index;
    }
 
+   /* some advanced gather instructions (txgo) would require 4 offsets */
+   if (inst->Texture.NumOffsets == 1) {
+      unsigned dim;
+      for (dim = 0; dim < dims; dim++) {
+         offsets[dim] = lp_build_emit_fetch_texoffset(&bld->bld_base, inst, 0, dim );
+      }
+   }
+
    bld->sampler->emit_fetch_texel(bld->sampler,
                                   bld->bld_base.base.gallivm,
                                   bld->bld_base.base.type,
-                                  unit, num_coords, coords,
+                                  FALSE,
+                                  unit, coords,
+                                  offsets,
                                   &derivs,
                                   lod_bias, explicit_lod,
                                   texel);
 }
 
+static void
+emit_txf( struct lp_build_tgsi_soa_context *bld,
+          const struct tgsi_full_instruction *inst,
+          LLVMValueRef *texel)
+{
+   unsigned unit;
+   LLVMValueRef coord_undef = LLVMGetUndef(bld->bld_base.base.int_vec_type);
+   LLVMValueRef explicit_lod = NULL;
+   LLVMValueRef coords[3];
+   LLVMValueRef offsets[3] = { NULL };
+   struct lp_derivatives derivs;
+   unsigned num_coords;
+   unsigned dims;
+   unsigned i;
+
+   if (!bld->sampler) {
+      _debug_printf("warning: found texture instruction but no sampler generator supplied\n");
+      for (i = 0; i < 4; i++) {
+         texel[i] = coord_undef;
+      }
+      return;
+   }
+
+   derivs.ddx_ddy[0] = coord_undef;
+   derivs.ddx_ddy[1] = coord_undef;
+
+   switch (inst->Texture.Texture) {
+   case TGSI_TEXTURE_1D:
+   case TGSI_TEXTURE_BUFFER:
+      num_coords = 1;
+      dims = 1;
+      break;
+   case TGSI_TEXTURE_1D_ARRAY:
+      num_coords = 2;
+      dims = 1;
+      break;
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_RECT:
+      num_coords = 2;
+      dims = 2;
+      break;
+   case TGSI_TEXTURE_2D_ARRAY:
+      num_coords = 3;
+      dims = 2;
+      break;
+   case TGSI_TEXTURE_3D:
+      num_coords = 3;
+      dims = 3;
+      break;
+   default:
+      assert(0);
+      return;
+   }
+
+   /* lod is always taken from channel 3 of src 0, except for buffers? */
+   if (inst->Texture.Texture != TGSI_TEXTURE_BUFFER) {
+      explicit_lod = lp_build_emit_fetch( &bld->bld_base, inst, 0, 3 );
+   }
+
+   for (i = 0; i < num_coords; i++) {
+      coords[i] = lp_build_emit_fetch( &bld->bld_base, inst, 0, i );
+   }
+   for (i = num_coords; i < 3; i++) {
+      coords[i] = coord_undef;
+   }
+
+   unit = inst->Src[1].Register.Index;
+
+   if (inst->Texture.NumOffsets == 1) {
+      unsigned dim;
+      for (dim = 0; dim < dims; dim++) {
+         offsets[dim] = lp_build_emit_fetch_texoffset(&bld->bld_base, inst, 0, dim );
+      }
+   }
+
+   bld->sampler->emit_fetch_texel(bld->sampler,
+                                  bld->bld_base.base.gallivm,
+                                  bld->bld_base.base.type,
+                                  TRUE,
+                                  unit, coords,
+                                  offsets,
+                                  &derivs,
+                                  NULL, explicit_lod,
+                                  texel);
+}
+
 static void
 emit_txq( struct lp_build_tgsi_soa_context *bld,
           const struct tgsi_full_instruction *inst,
@@ -1755,6 +1852,17 @@ txq_emit(
    emit_txq(bld, emit_data->inst, emit_data->output);
 }
 
+static void
+txf_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   emit_txf(bld, emit_data->inst, emit_data->output);
+}
+
 static void
 cal_emit(
    const struct lp_build_tgsi_action * action,
@@ -2126,6 +2234,7 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
    bld.bld_base.op_actions[TGSI_OPCODE_TXL].emit = txl_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_TXP].emit = txp_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_TXQ].emit = txq_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_TXF].emit = txf_emit;
 
    lp_exec_mask_init(&bld.exec_mask, &bld.bld_base.base);
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.c b/src/gallium/auxiliary/tgsi/tgsi_info.c
index 51ca373b6ba..458bc69d169 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_info.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_info.c
@@ -293,6 +293,7 @@ tgsi_opcode_infer_src_type( uint opcode )
    case TGSI_OPCODE_USHR:
    case TGSI_OPCODE_SHL:
    case TGSI_OPCODE_TXQ:
+   case TGSI_OPCODE_TXF:
       return TGSI_TYPE_UNSIGNED;
    case TGSI_OPCODE_MOD:
    case TGSI_OPCODE_I2F:
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index e81c44151bb..5ff8024a223 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -175,9 +175,11 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
       return 0;
    case PIPE_CAP_SCALED_RESOLVE:
       return 0;
+   /* this is a lie: we could support arbitrarily large offsets */
    case PIPE_CAP_MIN_TEXEL_OFFSET:
+      return -8;
    case PIPE_CAP_MAX_TEXEL_OFFSET:
-      return 0;
+      return 7;
    case PIPE_CAP_CONDITIONAL_RENDER:
       return 1;
    case PIPE_CAP_TEXTURE_BARRIER:
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample.c b/src/gallium/drivers/llvmpipe/lp_tex_sample.c
index 1c5c009b556..0bd5c4aa050 100644
--- a/src/gallium/drivers/llvmpipe/lp_tex_sample.c
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample.c
@@ -176,9 +176,10 @@ static void
 lp_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
                                      struct gallivm_state *gallivm,
                                      struct lp_type type,
+                                     boolean is_fetch,
                                      unsigned unit,
-                                     unsigned num_coords,
                                      const LLVMValueRef *coords,
+                                     const LLVMValueRef *offsets,
                                      const struct lp_derivatives *derivs,
                                      LLVMValueRef lod_bias, /* optional */
                                      LLVMValueRef explicit_lod, /* optional */
@@ -189,7 +190,7 @@ lp_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
    assert(unit < PIPE_MAX_SAMPLERS);
    
    if (LP_PERF & PERF_NO_TEX) {
-      lp_build_sample_nop(gallivm, type, num_coords, coords, texel);
+      lp_build_sample_nop(gallivm, type, coords, texel);
       return;
    }
 
@@ -197,8 +198,10 @@ lp_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
                        &sampler->dynamic_state.static_state[unit],
                        &sampler->dynamic_state.base,
                        type,
+                       is_fetch,
                        unit,
-                       num_coords, coords,
+                       coords,
+                       offsets,
                        derivs,
                        lod_bias, explicit_lod,
                        texel);
-- 
2.30.2