From f3bbf65929e395360e5565d08d015977dd5b79fa Mon Sep 17 00:00:00 2001
From: Roland Scheidegger
Date: Thu, 4 Jul 2013 19:40:11 +0200
Subject: [PATCH] gallivm: do per-pixel lod calculations for explicit lod

d3d10 requires per-pixel lod calculations for explicit lod, lod bias and
explicit derivatives, and we should probably do it for OpenGL too - at least
if they are used from vertex or geometry shaders (so this doesn't apply to
lod bias), where the lod doesn't just affect neighboring pixels.
Some code was already there to handle this, so fix it up and enable it.
There will no doubt be a performance hit, unfortunately; we could do better
if we knew we had a real vector shift instruction (with variable shift
count), but that requires AVX2 on x86 (or an AMD Bulldozer family cpu).
Don't do anything for lod bias and explicit derivatives yet, though no
special magic should be needed for them either.
Likewise, the size query is still broken just the same.

v2: Use information about whether the lod is a (broadcast) scalar or not.
The idea would be to base this on the actual value; for now just pretend
it's a scalar in fs and not a scalar otherwise (so per-pixel lod is only
used in gs/vs, but the same code is generated for fs as before).

Reviewed-by: Jose Fonseca
---
 src/gallium/auxiliary/draw/draw_llvm_sample.c |   3 +-
 src/gallium/auxiliary/gallivm/lp_bld_sample.c | 110 ++++++-------
 src/gallium/auxiliary/gallivm/lp_bld_sample.h |  13 +-
 .../auxiliary/gallivm/lp_bld_sample_aos.c     |  26 ++--
 .../auxiliary/gallivm/lp_bld_sample_soa.c     | 144 ++++++++++++------
 src/gallium/auxiliary/gallivm/lp_bld_tgsi.h   |   1 +
 .../auxiliary/gallivm/lp_bld_tgsi_soa.c       |  18 ++-
 src/gallium/auxiliary/tgsi/tgsi_scan.c        |   1 +
 src/gallium/auxiliary/tgsi/tgsi_scan.h        |   2 +
 src/gallium/drivers/llvmpipe/lp_tex_sample.c  |   3 +-
 10 files changed, 195 insertions(+), 126 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_llvm_sample.c b/src/gallium/auxiliary/draw/draw_llvm_sample.c
index e51e01101f6..0cb5c210351 100644
--- a/src/gallium/auxiliary/draw/draw_llvm_sample.c
+++ b/src/gallium/auxiliary/draw/draw_llvm_sample.c
@@ -238,6 +238,7 @@ draw_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
                                        const struct lp_derivatives *derivs,
                                        LLVMValueRef lod_bias, /* optional */
                                        LLVMValueRef explicit_lod, /* optional */
+                                       boolean scalar_lod,
                                        LLVMValueRef *texel)
 {
    struct draw_llvm_sampler_soa *sampler = (struct draw_llvm_sampler_soa *)base;
@@ -256,7 +257,7 @@ draw_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
                        coords, offsets,
                        derivs,
-                       lod_bias, explicit_lod,
+                       lod_bias, explicit_lod, scalar_lod,
                        texel);
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index d689c7b3858..c2efec9f4cd 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -215,7 +215,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
    struct lp_build_context *float_size_bld = &bld->float_size_in_bld;
    struct lp_build_context *float_bld = &bld->float_bld;
    struct lp_build_context *coord_bld = &bld->coord_bld;
-   struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
+   struct lp_build_context *levelf_bld = &bld->levelf_bld;
    const unsigned dims = bld->dims;
    LLVMValueRef ddx_ddy[2];
    LLVMBuilderRef builder = bld->gallivm->builder;
@@ -235,6 +235,8 @@ lp_build_rho(struct lp_build_sample_context *bld,
    /* Note that all simplified calculations will only work for isotropic filtering */
 
+   assert(bld->num_lods != length);
+
    first_level = bld->dynamic_state->first_level(bld->dynamic_state,
                                                  bld->gallivm, texture_unit);
    first_level_vec = lp_build_broadcast_scalar(int_size_bld, first_level);
@@ -248,14 +250,14 @@ lp_build_rho(struct lp_build_sample_context *bld,
        * Cube map code did already everything except size mul and per-quad extraction.
        */
       rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
-                                      perquadf_bld->type, cube_rho, 0);
+                                      levelf_bld->type, cube_rho, 0);
       if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
-         rho = lp_build_sqrt(perquadf_bld, rho);
+         rho = lp_build_sqrt(levelf_bld, rho);
       }
       /* Could optimize this for single quad just skip the broadcast */
       cubesize = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
-                                            perquadf_bld->type, float_size, index0);
-      rho = lp_build_mul(perquadf_bld, cubesize, rho);
+                                            levelf_bld->type, float_size, index0);
+      rho = lp_build_mul(levelf_bld, cubesize, rho);
    }
    else if (derivs && !(bld->static_texture_state->target == PIPE_TEXTURE_CUBE)) {
       LLVMValueRef ddmax[3], ddx[3], ddy[3];
@@ -289,12 +291,12 @@ lp_build_rho(struct lp_build_sample_context *bld,
          }
          rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);
          rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
-                                         perquadf_bld->type, rho_vec, 0);
+                                         levelf_bld->type, rho_vec, 0);
          /*
          * note that as long as we don't care about per-pixel lod could reduce math
          * more (at some shuffle cost), but for now only do sqrt after packing.
          */
-         rho = lp_build_sqrt(perquadf_bld, rho);
+         rho = lp_build_sqrt(levelf_bld, rho);
       }
       else {
         rho_vec = ddmax[0];
@@ -309,7 +311,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
          * since we can't handle per-pixel rho/lod from now on (TODO).
          */
         rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
-                                        perquadf_bld->type, rho_vec, 0);
+                                        levelf_bld->type, rho_vec, 0);
      }
    }
    else {
@@ -381,8 +383,8 @@ lp_build_rho(struct lp_build_sample_context *bld,
         rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);
 
         rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
-                                        perquadf_bld->type, rho_vec, 0);
-         rho = lp_build_sqrt(perquadf_bld, rho);
+                                        levelf_bld->type, rho_vec, 0);
+         rho = lp_build_sqrt(levelf_bld, rho);
      }
      else {
         ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
@@ -462,7 +464,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
            }
         }
         rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
-                                        perquadf_bld->type, rho, 0);
+                                        levelf_bld->type, rho, 0);
      }
      else {
         if (dims <= 1) {
@@ -652,11 +654,11 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
 {
    LLVMBuilderRef builder = bld->gallivm->builder;
-   struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
+   struct lp_build_context *levelf_bld = &bld->levelf_bld;
    LLVMValueRef lod;
 
-   *out_lod_ipart = bld->perquadi_bld.zero;
-   *out_lod_fpart = perquadf_bld->zero;
+   *out_lod_ipart = bld->leveli_bld.zero;
+   *out_lod_fpart = levelf_bld->zero;
 
    if (bld->static_sampler_state->min_max_lod_equal) {
       /* User is forcing sampling from a particular mipmap level.
@@ -666,12 +668,15 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
          bld->dynamic_state->min_lod(bld->dynamic_state,
                                      bld->gallivm, sampler_unit);
 
-      lod = lp_build_broadcast_scalar(perquadf_bld, min_lod);
+      lod = lp_build_broadcast_scalar(levelf_bld, min_lod);
    }
    else {
       if (explicit_lod) {
-         lod = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
-                                         perquadf_bld->type, explicit_lod, 0);
+         if (bld->num_lods != bld->coord_type.length)
+            lod = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
+                                            levelf_bld->type, explicit_lod, 0);
+         else
+            lod = explicit_lod;
       }
       else {
          LLVMValueRef rho;
@@ -694,29 +699,29 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
 
          if (mip_filter == PIPE_TEX_MIPFILTER_NONE ||
             mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
-            *out_lod_ipart = lp_build_ilog2(perquadf_bld, rho);
-            *out_lod_fpart = perquadf_bld->zero;
+            *out_lod_ipart = lp_build_ilog2(levelf_bld, rho);
+            *out_lod_fpart = levelf_bld->zero;
             return;
          }
          if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR &&
             !(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
-            lp_build_brilinear_rho(perquadf_bld, rho, BRILINEAR_FACTOR,
+            lp_build_brilinear_rho(levelf_bld, rho, BRILINEAR_FACTOR,
                                    out_lod_ipart, out_lod_fpart);
             return;
          }
       }
 
       if (0) {
-         lod = lp_build_log2(perquadf_bld, rho);
+         lod = lp_build_log2(levelf_bld, rho);
       }
       else {
-         lod = lp_build_fast_log2(perquadf_bld, rho);
+         lod = lp_build_fast_log2(levelf_bld, rho);
       }
 
       /* add shader lod bias */
       if (lod_bias) {
          lod_bias = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
-                                              perquadf_bld->type, lod_bias, 0);
+                                              levelf_bld->type, lod_bias, 0);
          lod = LLVMBuildFAdd(builder, lod, lod_bias, "shader_lod_bias");
       }
    }
@@ -726,7 +731,7 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
         LLVMValueRef sampler_lod_bias =
            bld->dynamic_state->lod_bias(bld->dynamic_state,
                                         bld->gallivm, sampler_unit);
-         sampler_lod_bias = lp_build_broadcast_scalar(perquadf_bld,
+         sampler_lod_bias = lp_build_broadcast_scalar(levelf_bld,
                                                       sampler_lod_bias);
         lod = LLVMBuildFAdd(builder, lod, sampler_lod_bias, "sampler_lod_bias");
      }
@@ -736,33 +741,33 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
         LLVMValueRef max_lod =
            bld->dynamic_state->max_lod(bld->dynamic_state,
                                        bld->gallivm, sampler_unit);
-         max_lod = lp_build_broadcast_scalar(perquadf_bld, max_lod);
+         max_lod = lp_build_broadcast_scalar(levelf_bld, max_lod);
 
-         lod = lp_build_min(perquadf_bld, lod, max_lod);
+         lod = lp_build_min(levelf_bld, lod, max_lod);
      }
      if (bld->static_sampler_state->apply_min_lod) {
         LLVMValueRef min_lod =
            bld->dynamic_state->min_lod(bld->dynamic_state,
                                        bld->gallivm, sampler_unit);
-         min_lod = lp_build_broadcast_scalar(perquadf_bld, min_lod);
+         min_lod = lp_build_broadcast_scalar(levelf_bld, min_lod);
 
-         lod = lp_build_max(perquadf_bld, lod, min_lod);
+         lod = lp_build_max(levelf_bld, lod, min_lod);
      }
   }
 
   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
      if (!(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
-         lp_build_brilinear_lod(perquadf_bld, lod, BRILINEAR_FACTOR,
+         lp_build_brilinear_lod(levelf_bld, lod, BRILINEAR_FACTOR,
                                 out_lod_ipart, out_lod_fpart);
      }
      else {
-         lp_build_ifloor_fract(perquadf_bld, lod, out_lod_ipart, out_lod_fpart);
+         lp_build_ifloor_fract(levelf_bld, lod, out_lod_ipart, out_lod_fpart);
      }
      lp_build_name(*out_lod_fpart, "lod_fpart");
   }
   else {
-      *out_lod_ipart = lp_build_iround(perquadf_bld, lod);
+      *out_lod_ipart = lp_build_iround(levelf_bld, lod);
   }
 
   lp_build_name(*out_lod_ipart, "lod_ipart");
@@ -784,20 +789,20 @@ lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
                            LLVMValueRef lod_ipart,
                            LLVMValueRef *level_out)
 {
-   struct lp_build_context *perquadi_bld = &bld->perquadi_bld;
+   struct lp_build_context *leveli_bld = &bld->leveli_bld;
    LLVMValueRef first_level, last_level, level;
 
    first_level = bld->dynamic_state->first_level(bld->dynamic_state,
                                                  bld->gallivm, texture_unit);
    last_level = bld->dynamic_state->last_level(bld->dynamic_state,
                                                bld->gallivm, texture_unit);
-   first_level = lp_build_broadcast_scalar(perquadi_bld, first_level);
-   last_level = lp_build_broadcast_scalar(perquadi_bld, last_level);
+   first_level = lp_build_broadcast_scalar(leveli_bld, first_level);
+   last_level = lp_build_broadcast_scalar(leveli_bld, last_level);
 
-   level = lp_build_add(perquadi_bld, lod_ipart, first_level);
+   level = lp_build_add(leveli_bld, lod_ipart, first_level);
 
    /* clamp level to legal range of levels */
-   *level_out = lp_build_clamp(perquadi_bld, level, first_level, last_level);
+   *level_out = lp_build_clamp(leveli_bld, level, first_level, last_level);
 }
@@ -815,8 +820,8 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
                            LLVMValueRef *level1_out)
 {
    LLVMBuilderRef builder = bld->gallivm->builder;
-   struct lp_build_context *perquadi_bld = &bld->perquadi_bld;
-   struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
+   struct lp_build_context *leveli_bld = &bld->leveli_bld;
+   struct lp_build_context *levelf_bld = &bld->levelf_bld;
    LLVMValueRef first_level, last_level;
    LLVMValueRef clamp_min;
    LLVMValueRef clamp_max;
@@ -825,11 +830,11 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
                                                  bld->gallivm, texture_unit);
    last_level = bld->dynamic_state->last_level(bld->dynamic_state,
                                                bld->gallivm, texture_unit);
-   first_level = lp_build_broadcast_scalar(perquadi_bld, first_level);
-   last_level = lp_build_broadcast_scalar(perquadi_bld, last_level);
+   first_level = lp_build_broadcast_scalar(leveli_bld, first_level);
+   last_level = lp_build_broadcast_scalar(leveli_bld, last_level);
 
-   *level0_out = lp_build_add(perquadi_bld, lod_ipart, first_level);
-   *level1_out = lp_build_add(perquadi_bld, *level0_out, perquadi_bld->one);
+   *level0_out = lp_build_add(leveli_bld, lod_ipart, first_level);
+   *level1_out = lp_build_add(leveli_bld, *level0_out, leveli_bld->one);
 
    /*
    * Clamp both *level0_out and *level1_out to [first_level, last_level], with
@@ -843,7 +848,7 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
    * converting to our lp_bld_logic helpers.
    */
 #if HAVE_LLVM < 0x0301
-   assert(perquadi_bld->type.length == 1);
+   assert(leveli_bld->type.length == 1);
 #endif
 
    /* *level0_out < first_level */
@@ -858,7 +863,7 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
                                  first_level, *level1_out, "");
 
    *lod_fpart_inout = LLVMBuildSelect(builder, clamp_min,
-                                      perquadf_bld->zero, *lod_fpart_inout, "");
+                                      levelf_bld->zero, *lod_fpart_inout, "");
 
    /* *level0_out >= last_level */
    clamp_max = LLVMBuildICmp(builder, LLVMIntSGE,
@@ -872,7 +877,7 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
                                  last_level, *level1_out, "");
 
    *lod_fpart_inout = LLVMBuildSelect(builder, clamp_max,
-                                      perquadf_bld->zero, *lod_fpart_inout, "");
+                                      levelf_bld->zero, *lod_fpart_inout, "");
 
    lp_build_name(*level0_out, "texture%u_miplevel0", texture_unit);
    lp_build_name(*level1_out, "texture%u_miplevel1", texture_unit);
@@ -1087,7 +1092,7 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
 
         ileveli = lp_build_extract_broadcast(bld->gallivm,
-                                             bld->perquadi_bld.type,
+                                             bld->leveli_bld.type,
                                              bld4.type,
                                              ilevel,
                                              indexi);
@@ -1131,10 +1136,9 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
            tmp[i] = bld->int_size;
            tmp[i] = lp_build_minify(&bld->int_size_in_bld, tmp[i], ilevel1);
         }
-         int_size_vec = lp_build_concat(bld->gallivm,
-                                        tmp,
-                                        bld->int_size_in_bld.type,
-                                        bld->num_lods);
+         *out_size = lp_build_concat(bld->gallivm, tmp,
+                                     bld->int_size_in_bld.type,
+                                     bld->num_lods);
      }
   }
 }
@@ -1218,10 +1222,10 @@ lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
      *out_width = lp_build_pack_aos_scalars(bld->gallivm, size_type,
                                             coord_type, size, 0);
      if (dims >= 2) {
-         *out_width = lp_build_pack_aos_scalars(bld->gallivm, size_type,
-                                                coord_type, size, 1);
+         *out_height = lp_build_pack_aos_scalars(bld->gallivm, size_type,
+                                                 coord_type, size, 1);
        if (dims == 3) {
-            *out_width = lp_build_pack_aos_scalars(bld->gallivm, size_type,
+            *out_depth = lp_build_pack_aos_scalars(bld->gallivm, size_type,
                                                    coord_type, size, 2);
        }
      }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index cde8ce96180..a3ecc05877c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -268,13 +268,13 @@ struct lp_build_sample_context
    struct lp_type texel_type;
    struct lp_build_context texel_bld;
 
-   /** Float per-quad type */
-   struct lp_type perquadf_type;
-   struct lp_build_context perquadf_bld;
+   /** Float level type */
+   struct lp_type levelf_type;
+   struct lp_build_context levelf_bld;
 
-   /** Int per-quad type */
-   struct lp_type perquadi_type;
-   struct lp_build_context perquadi_bld;
+   /** Int level type */
+   struct lp_type leveli_type;
+   struct lp_build_context leveli_bld;
 
    /* Common dynamic state values */
    LLVMValueRef row_stride_array;
@@ -477,6 +477,7 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
                     const struct lp_derivatives *derivs,
                     LLVMValueRef lod_bias,
                     LLVMValueRef explicit_lod,
+                    boolean scalar_lod,
                     LLVMValueRef texel_out[4]);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
index 104c24d83b9..da416aab87e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
@@ -1422,8 +1422,8 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
    if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
       LLVMValueRef h16vec_scale = lp_build_const_vec(bld->gallivm,
-                                                     bld->perquadf_bld.type, 256.0);
-      LLVMTypeRef i32vec_type = lp_build_vec_type(bld->gallivm, bld->perquadi_bld.type);
+                                                     bld->levelf_bld.type, 256.0);
+      LLVMTypeRef i32vec_type = bld->leveli_bld.vec_type;
       struct lp_build_if_state if_ctx;
       LLVMValueRef need_lerp;
       unsigned num_quads = bld->coord_bld.type.length / 4;
@@ -1433,9 +1433,9 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
       lod_fpart = LLVMBuildFPToSI(builder, lod_fpart,
                                   i32vec_type, "lod_fpart.fixed16");
 
       /* need_lerp = lod_fpart > 0 */
-      if (num_quads == 1) {
+      if (bld->num_lods == 1) {
         need_lerp = LLVMBuildICmp(builder, LLVMIntSGT,
-                                  lod_fpart, bld->perquadi_bld.zero,
+                                  lod_fpart, bld->leveli_bld.zero,
                                   "need_lerp");
      }
      else {
@@ -1450,9 +1450,9 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
          * lod_fpart values have same sign.
          * We can however then skip the greater than comparison.
          */
-         lod_fpart = lp_build_max(&bld->perquadi_bld, lod_fpart,
-                                  bld->perquadi_bld.zero);
-         need_lerp = lp_build_any_true_range(&bld->perquadi_bld, num_quads, lod_fpart);
+         lod_fpart = lp_build_max(&bld->leveli_bld, lod_fpart,
+                                  bld->leveli_bld.zero);
+         need_lerp = lp_build_any_true_range(&bld->leveli_bld, bld->num_lods, lod_fpart);
      }
 
      lp_build_if(&if_ctx, bld->gallivm, need_lerp);
@@ -1462,9 +1462,6 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
         lp_build_context_init(&u8n_bld, bld->gallivm, lp_type_unorm(8, bld->vector_width));
 
         /* sample the second mipmap level */
-         lp_build_mipmap_level_sizes(bld, ilevel1,
-                                     &size1,
-                                     &row_stride1_vec, &img_stride1_vec);
         lp_build_mipmap_level_sizes(bld, ilevel1,
                                     &size1,
                                     &row_stride1_vec, &img_stride1_vec);
@@ -1511,7 +1508,7 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
         /* interpolate samples from the two mipmap levels */
 
-         if (num_quads == 1) {
+         if (num_quads == 1 && bld->num_lods == 1) {
           lod_fpart = LLVMBuildTrunc(builder, lod_fpart, u8n_bld.elem_type, "");
           lod_fpart = lp_build_broadcast_scalar(&u8n_bld, lod_fpart);
 
@@ -1526,17 +1523,16 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
 #endif
        }
        else {
-            const unsigned num_chans_per_quad = 4 * 4;
-            LLVMTypeRef tmp_vec_type = LLVMVectorType(u8n_bld.elem_type, bld->perquadi_bld.type.length);
+            unsigned num_chans_per_lod = 4 * bld->coord_type.length / bld->num_lods;
+            LLVMTypeRef tmp_vec_type = LLVMVectorType(u8n_bld.elem_type, bld->leveli_bld.type.length);
           LLVMValueRef shuffle[LP_MAX_VECTOR_LENGTH];
 
           /* Take the LSB of lod_fpart */
           lod_fpart = LLVMBuildTrunc(builder, lod_fpart, tmp_vec_type, "");
 
           /* Broadcast each lod weight into their respective channels */
-            assert(u8n_bld.type.length == num_quads * num_chans_per_quad);
           for (i = 0; i < u8n_bld.type.length; ++i) {
-               shuffle[i] = lp_build_const_int32(bld->gallivm, i / num_chans_per_quad);
+               shuffle[i] = lp_build_const_int32(bld->gallivm, i / num_chans_per_lod);
           }
           lod_fpart = LLVMBuildShuffleVector(builder, lod_fpart, LLVMGetUndef(tmp_vec_type),
                                              LLVMConstVector(shuffle, u8n_bld.type.length), "");
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index e0a59d06795..53e3628f56e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -979,17 +979,17 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
    if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
       struct lp_build_if_state if_ctx;
       LLVMValueRef need_lerp;
-      unsigned num_quads = bld->coord_bld.type.length / 4;
 
       /* need_lerp = lod_fpart > 0 */
-      if (num_quads == 1) {
+      if (bld->num_lods == 1) {
         need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT,
-                                  lod_fpart, bld->perquadf_bld.zero,
+                                  lod_fpart, bld->levelf_bld.zero,
                                   "need_lerp");
      }
      else {
         /*
-          * We'll do mip filtering if any of the quads need it.
+          * We'll do mip filtering if any of the quads (or individual
+          * pixel in case of per-pixel lod) need it.
          * It might be better to split the vectors here and only fetch/filter
          * quads which need it.
          */
@@ -998,13 +998,13 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
          * negative values which would screw up filtering if not all
          * lod_fpart values have same sign.
          */
-         lod_fpart = lp_build_max(&bld->perquadf_bld, lod_fpart,
-                                  bld->perquadf_bld.zero);
-         need_lerp = lp_build_compare(bld->gallivm, bld->perquadf_bld.type,
+         lod_fpart = lp_build_max(&bld->levelf_bld, lod_fpart,
+                                  bld->levelf_bld.zero);
+         need_lerp = lp_build_compare(bld->gallivm, bld->levelf_bld.type,
                                       PIPE_FUNC_GREATER,
-                                      lod_fpart, bld->perquadf_bld.zero);
-         need_lerp = lp_build_any_true_range(&bld->perquadi_bld, num_quads, need_lerp);
-      }
+                                      lod_fpart, bld->levelf_bld.zero);
+         need_lerp = lp_build_any_true_range(&bld->leveli_bld, bld->num_lods, need_lerp);
+      }
 
      lp_build_if(&if_ctx, bld->gallivm, need_lerp);
      {
@@ -1036,10 +1036,11 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
         /* interpolate samples from the two mipmap levels */
 
-         lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
-                                                           bld->perquadf_bld.type,
-                                                           bld->texel_bld.type,
-                                                           lod_fpart);
+         if (bld->num_lods != bld->coord_type.length)
+            lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
+                                                              bld->levelf_bld.type,
+                                                              bld->texel_bld.type,
+                                                              lod_fpart);
 
         for (chan = 0; chan < 4; chan++) {
            colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
@@ -1143,7 +1144,7 @@ lp_build_sample_common(struct lp_build_sample_context *bld,
                             mip_filter,
                             lod_ipart, lod_fpart);
    }
    else {
-      *lod_ipart = bld->perquadi_bld.zero;
+      *lod_ipart = bld->leveli_bld.zero;
    }
 
    /*
@@ -1166,7 +1167,7 @@ lp_build_sample_common(struct lp_build_sample_context *bld,
      else {
         first_level = bld->dynamic_state->first_level(bld->dynamic_state,
                                                       bld->gallivm, texture_index);
-         first_level = lp_build_broadcast_scalar(&bld->perquadi_bld, first_level);
+         first_level = lp_build_broadcast_scalar(&bld->leveli_bld, first_level);
         *ilevel0 = first_level;
      }
      break;
@@ -1295,7 +1296,7 @@ lp_build_fetch_texel(struct lp_build_sample_context *bld,
                      const LLVMValueRef *offsets,
                      LLVMValueRef *colors_out)
 {
-   struct lp_build_context *perquadi_bld = &bld->perquadi_bld;
+   struct lp_build_context *perquadi_bld = &bld->leveli_bld;
    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
    unsigned dims = bld->dims, chan;
    unsigned target = bld->static_texture_state->target;
@@ -1307,8 +1308,13 @@ lp_build_fetch_texel(struct lp_build_sample_context *bld,
 
    /* XXX just like ordinary sampling, we don't handle per-pixel lod (yet).
    */
    if (explicit_lod && bld->static_texture_state->target != PIPE_BUFFER) {
-      ilevel = lp_build_pack_aos_scalars(bld->gallivm, int_coord_bld->type,
-                                         perquadi_bld->type, explicit_lod, 0);
+      if (bld->num_lods != int_coord_bld->type.length) {
+         ilevel = lp_build_pack_aos_scalars(bld->gallivm, int_coord_bld->type,
+                                            perquadi_bld->type, explicit_lod, 0);
+      }
+      else {
+         ilevel = explicit_lod;
+      }
       lp_build_nearest_mip_level(bld, texture_unit, ilevel, &ilevel);
    }
    else {
@@ -1489,6 +1495,7 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
                     const struct lp_derivatives *derivs, /* optional */
                     LLVMValueRef lod_bias, /* optional */
                     LLVMValueRef explicit_lod, /* optional */
+                    boolean scalar_lod,
                     LLVMValueRef texel_out[4])
 {
    unsigned dims = texture_dims(static_texture_state->target);
@@ -1529,10 +1536,6 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
    bld.float_size_in_type.length = dims > 1 ? 4 : 1;
    bld.int_size_in_type = lp_int_type(bld.float_size_in_type);
    bld.texel_type = type;
-   bld.perquadf_type = type;
-   /* we want native vector size to be able to use our intrinsics */
-   bld.perquadf_type.length = type.length > 4 ? ((type.length + 15) / 16) * 4 : 1;
-   bld.perquadi_type = lp_int_type(bld.perquadf_type);
 
    /* always using the first channel hopefully should be safe,
    * if not things WILL break in other places anyway.
@@ -1563,21 +1566,50 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
      debug_printf("  .min_mip_filter = %u\n", derived_sampler_state.min_mip_filter);
   }
 
+   /*
+    * This is all a bit complicated different paths are chosen for performance
+    * reasons.
+    * Essentially, there can be 1 lod per element, 1 lod per quad or 1 lod for
+    * everything (the last two options are equivalent for 4-wide case).
+    * If there's per-quad lod but we split to 4-wide so we can use AoS, per-quad
+    * lod is calculated then the lod value extracted afterwards so making this
+    * case basically the same as far as lod handling is concerned for the
+    * further sample/filter code as the 1 lod for everything case.
+    * Different lod handling mostly shows up when building mipmap sizes
+    * (lp_build_mipmap_level_sizes() and friends) and also in filtering
+    * (getting the fractional part of the lod to the right texels).
+    */
+
    /*
    * There are other situations where at least the multiple int lods could be
   * avoided like min and max lod being equal.
    */
-   if ((is_fetch && explicit_lod && bld.static_texture_state->target != PIPE_BUFFER) ||
-       (!is_fetch && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
+   if (explicit_lod && !scalar_lod &&
+       ((is_fetch && bld.static_texture_state->target != PIPE_BUFFER) ||
+        (!is_fetch && mip_filter != PIPE_TEX_MIPFILTER_NONE)))
+      bld.num_lods = type.length;
+   /* TODO: for true scalar_lod should only use 1 lod value */
+   else if (!is_fetch && mip_filter != PIPE_TEX_MIPFILTER_NONE) {
      bld.num_lods = num_quads;
   }
   else {
      bld.num_lods = 1;
   }
+   bld.levelf_type = type;
+   /* we want native vector size to be able to use our intrinsics */
+   if (bld.num_lods != type.length) {
+      bld.levelf_type.length = type.length > 4 ? ((type.length + 15) / 16) * 4 : 1;
+   }
+   bld.leveli_type = lp_int_type(bld.levelf_type);
    bld.float_size_type = bld.float_size_in_type;
-   bld.float_size_type.length = bld.num_lods > 1 ? type.length :
-                                bld.float_size_in_type.length;
+   /* Note: size vectors may not be native. They contain minified w/h/d/_ values,
+    * with per-element lod that is w0/h0/d0/_/w1/h1/d1_/... so up to 8x4f32 */
+   if (bld.num_lods > 1) {
+      bld.float_size_type.length = bld.num_lods == type.length ?
+         bld.num_lods * bld.float_size_in_type.length :
+         type.length;
+   }
    bld.int_size_type = lp_int_type(bld.float_size_type);
 
    lp_build_context_init(&bld.float_bld, gallivm, bld.float_type);
@@ -1590,8 +1622,8 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
    lp_build_context_init(&bld.int_size_bld, gallivm, bld.int_size_type);
    lp_build_context_init(&bld.float_size_bld, gallivm, bld.float_size_type);
    lp_build_context_init(&bld.texel_bld, gallivm, bld.texel_type);
-   lp_build_context_init(&bld.perquadf_bld, gallivm, bld.perquadf_type);
-   lp_build_context_init(&bld.perquadi_bld, gallivm, bld.perquadi_type);
+   lp_build_context_init(&bld.levelf_bld, gallivm, bld.levelf_type);
+   lp_build_context_init(&bld.leveli_bld, gallivm, bld.leveli_type);
 
    /* Get the dynamic state */
    tex_width = dynamic_state->width(dynamic_state, gallivm, texture_index);
@@ -1735,14 +1767,31 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
      bld4.int_size_in_type = lp_int_type(bld4.float_size_in_type);
      bld4.texel_type = bld.texel_type;
      bld4.texel_type.length = 4;
-      bld4.perquadf_type = type4;
+      bld4.levelf_type = type4;
      /* we want native vector size to be able to use our intrinsics */
-      bld4.perquadf_type.length = 1;
-      bld4.perquadi_type = lp_int_type(bld4.perquadf_type);
+      bld4.levelf_type.length = 1;
+      bld4.leveli_type = lp_int_type(bld4.levelf_type);
+
+      if (explicit_lod && !scalar_lod &&
+          ((is_fetch && bld.static_texture_state->target != PIPE_BUFFER) ||
+           (!is_fetch && mip_filter != PIPE_TEX_MIPFILTER_NONE)))
+         bld4.num_lods = type4.length;
+      else
+         bld4.num_lods = 1;
 
-      bld4.num_lods = 1;
-      bld4.int_size_type = bld4.int_size_in_type;
+      bld4.levelf_type = type4;
+      /* we want native vector size to be able to use our intrinsics */
+      if (bld4.num_lods != type4.length) {
+         bld4.levelf_type.length = 1;
+      }
+      bld4.leveli_type = lp_int_type(bld4.levelf_type);
      bld4.float_size_type = bld4.float_size_in_type;
+      if (bld4.num_lods > 1) {
+         bld4.float_size_type.length = bld4.num_lods == type4.length ?
+            bld4.num_lods * bld4.float_size_in_type.length :
+            type4.length;
+      }
+      bld4.int_size_type = lp_int_type(bld4.float_size_type);
 
      lp_build_context_init(&bld4.float_bld, gallivm, bld4.float_type);
      lp_build_context_init(&bld4.float_vec_bld, gallivm, type4);
@@ -1754,15 +1803,15 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
      lp_build_context_init(&bld4.int_size_bld, gallivm, bld4.int_size_type);
      lp_build_context_init(&bld4.float_size_bld, gallivm, bld4.float_size_type);
      lp_build_context_init(&bld4.texel_bld, gallivm, bld4.texel_type);
-      lp_build_context_init(&bld4.perquadf_bld, gallivm, bld4.perquadf_type);
-      lp_build_context_init(&bld4.perquadi_bld, gallivm, bld4.perquadi_type);
+      lp_build_context_init(&bld4.levelf_bld, gallivm, bld4.levelf_type);
+      lp_build_context_init(&bld4.leveli_bld, gallivm, bld4.leveli_type);
 
      for (i = 0; i < num_quads; i++) {
         LLVMValueRef s4, t4, r4;
-         LLVMValueRef lod_iparts, lod_fparts = NULL;
-         LLVMValueRef ilevel0s, ilevel1s = NULL;
-         LLVMValueRef indexi = lp_build_const_int32(gallivm, i);
+         LLVMValueRef lod_ipart4, lod_fpart4 = NULL;
+         LLVMValueRef ilevel04, ilevel14 = NULL;
         LLVMValueRef offsets4[4] = { NULL };
+         unsigned num_lods = bld4.num_lods;
 
         s4 = lp_build_extract_range(gallivm, s, 4*i, 4);
         t4 = lp_build_extract_range(gallivm, t, 4*i, 4);
@@ -1777,27 +1826,27 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
              }
           }
        }
-        lod_iparts = LLVMBuildExtractElement(builder, lod_ipart, indexi, "");
-        ilevel0s = LLVMBuildExtractElement(builder, ilevel0, indexi, "");
+        lod_ipart4 = lp_build_extract_range(gallivm, lod_ipart, num_lods * i, num_lods);
+        ilevel04 = lp_build_extract_range(gallivm, ilevel0, num_lods * i, num_lods);
        if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
-           ilevel1s = LLVMBuildExtractElement(builder, ilevel1, indexi, "");
-           lod_fparts = LLVMBuildExtractElement(builder, lod_fpart, indexi, "");
+           ilevel14 = lp_build_extract_range(gallivm, ilevel1, num_lods * i, num_lods);
+           lod_fpart4 = lp_build_extract_range(gallivm, lod_fpart, num_lods * i, num_lods);
        }
 
        if (use_aos) {
           /* do sampling/filtering with fixed pt arithmetic */
           lp_build_sample_aos(&bld4, sampler_index,
                               s4, t4, r4, offsets4,
-                              lod_iparts, lod_fparts,
-                              ilevel0s, ilevel1s,
+                              lod_ipart4, lod_fpart4,
+                              ilevel04, ilevel14,
                               texelout4);
        }
        else {
           lp_build_sample_general(&bld4, sampler_index,
                                   s4, t4, r4, offsets4,
-                                  lod_iparts, lod_fparts,
-                                  ilevel0s, ilevel1s,
+                                  lod_ipart4, lod_fpart4,
+                                  ilevel04, ilevel14,
                                   texelout4);
        }
        for (j = 0; j < 4; j++) {
@@ -1864,6 +1913,7 @@ lp_build_size_query_soa(struct gallivm_state *gallivm,
    lp_build_context_init(&bld_int_vec, gallivm, lp_type_int_vec(32, 128));
 
    if (explicit_lod) {
+      /* FIXME: this needs to honor per-element lod */
      lod = LLVMBuildExtractElement(gallivm->builder, explicit_lod,
                                    lp_build_const_int32(gallivm, 0), "");
      first_level = dynamic_state->first_level(dynamic_state, gallivm, texture_unit);
      lod = lp_build_broadcast_scalar(&bld_int_vec,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
index fd566b1ff67..0b4845064a0 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
@@ -184,6 +184,7 @@ struct lp_build_sampler_soa
                              const struct lp_derivatives *derivs,
                              LLVMValueRef lod_bias, /* optional */
                              LLVMValueRef explicit_lod, /* optional */
+                             boolean scalar_lod,
                              LLVMValueRef *texel);
 
    void
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index f6418591036..43724e77a13 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -1576,6 +1576,7 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
    LLVMValueRef offsets[3] = { NULL };
    struct lp_derivatives derivs;
    struct lp_derivatives *deriv_ptr = NULL;
+   boolean scalar_lod;
    unsigned num_coords, num_derivs, num_offsets;
    unsigned i;
 
@@ -1693,6 +1694,9 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
      }
    }
 
+   /* TODO: use scalar lod if explicit_lod, lod_bias or derivs are broadcasted scalars */
+   scalar_lod = bld->bld_base.info->processor == TGSI_PROCESSOR_FRAGMENT;
+
    bld->sampler->emit_fetch_texel(bld->sampler,
                                   bld->bld_base.base.gallivm,
                                   bld->bld_base.base.type,
@@ -1701,7 +1705,7 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
                                   coords,
                                   offsets,
                                   deriv_ptr,
-                                  lod_bias, explicit_lod,
+                                  lod_bias, explicit_lod, scalar_lod,
                                   texel);
 }
 
@@ -1719,6 +1723,7 @@ emit_sample(struct lp_build_tgsi_soa_context *bld,
    LLVMValueRef offsets[3] = { NULL };
    struct lp_derivatives derivs;
    struct lp_derivatives *deriv_ptr = NULL;
+   boolean scalar_lod;
    unsigned num_coords, num_offsets, num_derivs;
    unsigned i;
 
@@ -1836,6 +1841,9 @@ emit_sample(struct lp_build_tgsi_soa_context *bld,
      }
    }
 
+   /* TODO: use scalar lod if explicit_lod, lod_bias or derivs are broadcasted scalars */
+   scalar_lod = bld->bld_base.info->processor == TGSI_PROCESSOR_FRAGMENT;
+
    bld->sampler->emit_fetch_texel(bld->sampler,
                                   bld->bld_base.base.gallivm,
                                   bld->bld_base.base.type,
@@ -1844,7 +1852,7 @@ emit_sample(struct lp_build_tgsi_soa_context *bld,
                                   coords,
                                   offsets,
                                   deriv_ptr,
-                                  lod_bias, explicit_lod,
+                                  lod_bias, explicit_lod, scalar_lod,
                                   texel);
 }
 
@@ -1859,6 +1867,7 @@ emit_fetch_texels( struct lp_build_tgsi_soa_context *bld,
    LLVMValueRef explicit_lod = NULL;
    LLVMValueRef coords[3];
    LLVMValueRef offsets[3] = { NULL };
+   boolean scalar_lod;
    unsigned num_coords;
    unsigned dims;
    unsigned i;
@@ -1927,6 +1936,9 @@ emit_fetch_texels( struct lp_build_tgsi_soa_context *bld,
      }
    }
 
+   /* TODO: use scalar lod if explicit_lod is broadcasted scalar */
+   scalar_lod = bld->bld_base.info->processor == TGSI_PROCESSOR_FRAGMENT;
+
    bld->sampler->emit_fetch_texel(bld->sampler,
                                   bld->bld_base.base.gallivm,
                                   bld->bld_base.base.type,
@@ -1935,7 +1947,7 @@ emit_fetch_texels( struct lp_build_tgsi_soa_context *bld,
                                   coords,
                                   offsets,
                                   NULL,
-                                  NULL, explicit_lod,
+                                  NULL, explicit_lod, scalar_lod,
                                   texel);
 }
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index 2a37b38d3c7..a4737822566 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -72,6 +72,7 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
           procType == TGSI_PROCESSOR_VERTEX ||
           procType == TGSI_PROCESSOR_GEOMETRY ||
           procType == TGSI_PROCESSOR_COMPUTE);
+   info->processor = procType;
 
    /**
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h
index 91eef67ca0b..b62c462356a 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h
@@ -54,6 +54,8 @@ struct tgsi_shader_info
    ubyte num_system_values;
    ubyte system_value_semantic_name[PIPE_MAX_SHADER_INPUTS];
 
+   ubyte processor;
+
    uint file_mask[TGSI_FILE_COUNT];  /**< bitmask of declared registers */
    uint file_count[TGSI_FILE_COUNT];  /**< number of declared registers */
    int file_max[TGSI_FILE_COUNT];  /**< highest index of declared registers */
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample.c b/src/gallium/drivers/llvmpipe/lp_tex_sample.c
index df2a610027c..2fb6f5b3c27 100644
--- a/src/gallium/drivers/llvmpipe/lp_tex_sample.c
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample.c
@@ -244,6 +244,7 @@ lp_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
                                      const struct lp_derivatives *derivs,
                                      LLVMValueRef lod_bias, /* optional */
                                      LLVMValueRef explicit_lod, /* optional */
+                                     boolean scalar_lod,
                                      LLVMValueRef *texel)
 {
    struct lp_llvm_sampler_soa *sampler = (struct lp_llvm_sampler_soa *)base;
@@ -267,7 +268,7 @@ lp_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
                          coords, offsets,
                          derivs,
-                         lod_bias, explicit_lod,
+                         lod_bias, explicit_lod, scalar_lod,
                          texel);
 }
-- 
2.30.2
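
Editor's note, not part of the patch above: the change is essentially about how many
lod values accompany a coordinate vector (bld.num_lods = 1, one per quad, or one per
element). The minimal C sketch below, with made-up names, only illustrates how each
element of an n-wide vector maps to its lod value, mirroring the i / num_chans_per_lod
shuffle used in the AoS path; it is not gallivm code and makes no claim about the
actual vector layouts beyond what the commit message describes.

#include <assert.h>

/* Map each of the n elements to one of num_lods lod values.
 * num_lods == 1:      one lod shared by everything.
 * num_lods == n / 4:  one lod per 2x2 quad.
 * num_lods == n:      true per-element (per-pixel) lod.
 */
void assign_lods(const float *lod, int num_lods, int n, float *lod_per_elem)
{
   int elems_per_lod;
   int i;

   assert(num_lods > 0 && n % num_lods == 0);
   elems_per_lod = n / num_lods;   /* e.g. 8 / 2 = 4 for per-quad lod, 8-wide */
   for (i = 0; i < n; i++)
      lod_per_elem[i] = lod[i / elems_per_lod];
}

With per-quad lod (the old behavior for explicit lod) neighboring elements of the same
quad always share a value; with num_lods == n they do not, which is what d3d10 requires
for explicit lod when sampling from vertex or geometry shaders.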