llvmpipe: pass interp location into interpolation code.

[mesa.git] / src / gallium / drivers / llvmpipe / lp_state_fs.c
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c

index b6f4c2a36c91b09de5b52f2e6e1247275d3ad9cc..93da53340e66e312cb7efc51462fa0c92f705f5e 100644 (file)
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -62,12 +62,12 @@
  #include "util/u_inlines.h"
  #include "util/u_memory.h"
  #include "util/u_pointer.h"
-#include "util/u_format.h"
+#include "util/format/u_format.h"
  #include "util/u_dump.h"
  #include "util/u_string.h"
  #include "util/simple_list.h"
  #include "util/u_dual_blend.h"
-#include "os/os_time.h"
+#include "util/os_time.h"
  #include "pipe/p_shader_tokens.h"
  #include "draw/draw_context.h"
  #include "tgsi/tgsi_dump.h"
@@ -80,10 +80,12 @@
  #include "gallivm/lp_bld_intr.h"
  #include "gallivm/lp_bld_logic.h"
  #include "gallivm/lp_bld_tgsi.h"
+#include "gallivm/lp_bld_nir.h"
  #include "gallivm/lp_bld_swizzle.h"
  #include "gallivm/lp_bld_flow.h"
  #include "gallivm/lp_bld_debug.h"
  #include "gallivm/lp_bld_arit.h"
+#include "gallivm/lp_bld_bitarit.h"
  #include "gallivm/lp_bld_pack.h"
  #include "gallivm/lp_bld_format.h"
  #include "gallivm/lp_bld_quad.h"
@@ -101,7 +103,7 @@
  #include "lp_flush.h"
  #include "lp_state_fs.h"
  #include "lp_rast.h"
-
+#include "nir/nir_to_tgsi_info.h"
  
  /** Fragment shader number (for debugging) */
  static unsigned fs_no = 0;
@@ -121,7 +123,8 @@ static LLVMValueRef
  generate_quad_mask(struct gallivm_state *gallivm,
                     struct lp_type fs_type,
                     unsigned first_quad,
-                   LLVMValueRef mask_input) /* int32 */
+                   unsigned sample,
+                   LLVMValueRef mask_input) /* int64 */
  {
     LLVMBuilderRef builder = gallivm->builder;
     struct lp_type mask_type;
@@ -160,6 +163,11 @@ generate_quad_mask(struct gallivm_state *gallivm,
        shift = 0;
     }
  
+   mask_input = LLVMBuildLShr(builder, mask_input, lp_build_const_int64(gallivm, 16 * sample), "");
+   mask_input = LLVMBuildTrunc(builder, mask_input,
+                               i32t, "");
+   mask_input = LLVMBuildAnd(builder, mask_input, lp_build_const_int32(gallivm, 0xffff), "");
+
     mask_input = LLVMBuildLShr(builder,
                                mask_input,
                                LLVMConstInt(i32t, shift, 0),
@@ -298,7 +306,8 @@ generate_fs_loop(struct gallivm_state *gallivm,
                   LLVMValueRef context_ptr,
                   LLVMValueRef num_loop,
                   struct lp_build_interp_soa_context *interp,
-                 struct lp_build_sampler_soa *sampler,
+                 const struct lp_build_sampler_soa *sampler,
+                 const struct lp_build_image_soa *image,
                   LLVMValueRef mask_store,
                   LLVMValueRef (*out_color)[4],
                   LLVMValueRef depth_ptr,
@@ -312,6 +321,7 @@ generate_fs_loop(struct gallivm_state *gallivm,
     LLVMTypeRef vec_type, int_vec_type;
     LLVMValueRef mask_ptr, mask_val;
     LLVMValueRef consts_ptr, num_consts_ptr;
+   LLVMValueRef ssbo_ptr, num_ssbo_ptr;
     LLVMValueRef z;
     LLVMValueRef z_value, s_value;
     LLVMValueRef z_fb, s_fb;
@@ -338,16 +348,25 @@ generate_fs_loop(struct gallivm_state *gallivm,
  
     memset(&system_values, 0, sizeof(system_values));
  
+   /* truncate then sign extend. */
+   system_values.front_facing = LLVMBuildTrunc(gallivm->builder, facing, LLVMInt1TypeInContext(gallivm->context), "");
+   system_values.front_facing = LLVMBuildSExt(gallivm->builder, system_values.front_facing, LLVMInt32TypeInContext(gallivm->context), "");
+
     if (key->depth.enabled ||
         key->stencil[0].enabled) {
  
        zs_format_desc = util_format_description(key->zsbuf_format);
        assert(zs_format_desc);
  
-      if (!shader->info.base.writes_z && !shader->info.base.writes_stencil) {
-         if (key->alpha.enabled ||
+      if (shader->info.base.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL])
+         depth_mode = EARLY_DEPTH_TEST | EARLY_DEPTH_WRITE;
+      else if (!shader->info.base.writes_z && !shader->info.base.writes_stencil) {
+         if (shader->info.base.writes_memory)
+            depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE;
+         else if (key->alpha.enabled ||
               key->blend.alpha_to_coverage ||
-             shader->info.base.uses_kill) {
+             shader->info.base.uses_kill ||
+             shader->info.base.writes_samplemask) {
              /* With alpha test and kill, can do the depth test early
               * and hopefully eliminate some quads.  But need to do a
               * special deferred depth write once the final mask value
@@ -390,6 +409,9 @@ generate_fs_loop(struct gallivm_state *gallivm,
     consts_ptr = lp_jit_context_constants(gallivm, context_ptr);
     num_consts_ptr = lp_jit_context_num_constants(gallivm, context_ptr);
  
+   ssbo_ptr = lp_jit_context_ssbos(gallivm, context_ptr);
+   num_ssbo_ptr = lp_jit_context_num_ssbos(gallivm, context_ptr);
+
     lp_build_for_loop_begin(&loop_state, gallivm,
                             lp_build_const_int32(gallivm, 0),
                             LLVMIntULT,
@@ -472,12 +494,30 @@ generate_fs_loop(struct gallivm_state *gallivm,
  
     lp_build_interp_soa_update_inputs_dyn(interp, gallivm, loop_state.counter);
  
+   struct lp_build_tgsi_params params;
+   memset(&params, 0, sizeof(params));
+
+   params.type = type;
+   params.mask = &mask;
+   params.consts_ptr = consts_ptr;
+   params.const_sizes_ptr = num_consts_ptr;
+   params.system_values = &system_values;
+   params.inputs = interp->inputs;
+   params.context_ptr = context_ptr;
+   params.thread_data_ptr = thread_data_ptr;
+   params.sampler = sampler;
+   params.info = &shader->info.base;
+   params.ssbo_ptr = ssbo_ptr;
+   params.ssbo_sizes_ptr = num_ssbo_ptr;
+   params.image = image;
+
     /* Build the actual shader */
-   lp_build_tgsi_soa(gallivm, tokens, type, &mask,
-                     consts_ptr, num_consts_ptr, &system_values,
-                     interp->inputs,
-                     outputs, context_ptr, thread_data_ptr,
-                     sampler, &shader->info.base, NULL);
+   if (shader->base.type == PIPE_SHADER_IR_TGSI)
+      lp_build_tgsi_soa(gallivm, tokens, &params,
+                        outputs);
+   else
+      lp_build_nir_soa(gallivm, shader->base.ir.nir, &params,
+                       outputs);
  
     /* Alpha test */
     if (key->alpha.enabled) {
@@ -516,6 +556,25 @@ generate_fs_loop(struct gallivm_state *gallivm,
        }
     }
  
+   if (shader->info.base.writes_samplemask) {
+      int smaski = find_output_by_semantic(&shader->info.base,
+                                           TGSI_SEMANTIC_SAMPLEMASK,
+                                           0);
+      LLVMValueRef smask;
+      struct lp_build_context smask_bld;
+      lp_build_context_init(&smask_bld, gallivm, int_type);
+
+      assert(smaski >= 0);
+      smask = LLVMBuildLoad(builder, outputs[smaski][0], "smask");
+      /*
+       * Pixel is alive according to the first sample in the mask.
+       */
+      smask = LLVMBuildBitCast(builder, smask, smask_bld.vec_type, "");
+      smask = lp_build_and(&smask_bld, smask, smask_bld.one);
+      smask = lp_build_cmp(&smask_bld, PIPE_FUNC_NOTEQUAL, smask, smask_bld.zero);
+      lp_build_mask_update(&mask, smask);
+   }
+
     /* Late Z test */
     if (depth_mode & LATE_DEPTH_TEST) {
        int pos0 = find_output_by_semantic(&shader->info.base,
@@ -734,6 +793,10 @@ generate_fs_twiddle(struct gallivm_state *gallivm,
        }
     } else if (twiddle) {
        /* Twiddle pixels across elements of array */
+      /*
+       * XXX: we should avoid this in some cases, but would need to tell
+       * lp_build_conv to reorder (or deal with it ourselves).
+       */
        lp_bld_quad_twiddle(gallivm, type, src, src_count, dst);
     } else {
        /* Do nothing */
@@ -764,6 +827,94 @@ generate_fs_twiddle(struct gallivm_state *gallivm,
  }
  
  
+/*
+ * Untwiddle and transpose packed vectors, much like generate_fs_twiddle(),
+ * but after conversion. src/dst hold src_count vectors; asserts only allow
+ * 16 x 8bit vectors, i.e. 4x16i8 rgba / 2x16i8 rg / 1x16i8 r data, which
+ * look like r0r1r4r5r2r3r6r7r8r9r12... (channels may be swizzled here).
+ * r / rg data gets a 0,2,1,3 shuffle of 16bit / 32bit elements after the
+ * transpose call, rgba a 64bit interleave. Extending to 16bit should be
+ * trivial. Should also be extended to handle twice wide vectors with AVX2...
+ */
+static void
+fs_twiddle_transpose(struct gallivm_state *gallivm,
+                     struct lp_type type,
+                     LLVMValueRef *src,
+                     unsigned src_count,
+                     LLVMValueRef *dst)
+{
+   unsigned i, j;
+   struct lp_type type64, type16, type32;
+   LLVMTypeRef type64_t, type8_t, type16_t, type32_t;
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef tmp[4], shuf[8];
+   for (j = 0; j < 2; j++) {
+      shuf[j*4 + 0] = lp_build_const_int32(gallivm, j*4 + 0);
+      shuf[j*4 + 1] = lp_build_const_int32(gallivm, j*4 + 2);
+      shuf[j*4 + 2] = lp_build_const_int32(gallivm, j*4 + 1);
+      shuf[j*4 + 3] = lp_build_const_int32(gallivm, j*4 + 3);
+   }
+
+   assert(src_count == 4 || src_count == 2 || src_count == 1);
+   assert(type.width == 8);
+   assert(type.length == 16);
+
+   type8_t = lp_build_vec_type(gallivm, type);
+
+   type64 = type;
+   type64.length /= 8;
+   type64.width *= 8;
+   type64_t = lp_build_vec_type(gallivm, type64);
+
+   type16 = type;
+   type16.length /= 2;
+   type16.width *= 2;
+   type16_t = lp_build_vec_type(gallivm, type16);
+
+   type32 = type;
+   type32.length /= 4;
+   type32.width *= 4;
+   type32_t = lp_build_vec_type(gallivm, type32);
+
+   lp_build_transpose_aos_n(gallivm, type, src, src_count, tmp);
+
+   if (src_count == 1) {
+      /* transpose was no-op, just untwiddle src[0] as 8 x 16bit elements */
+      LLVMValueRef shuf_vec;
+      shuf_vec = LLVMConstVector(shuf, 8);
+      tmp[0] = LLVMBuildBitCast(builder, src[0], type16_t, "");
+      tmp[0] = LLVMBuildShuffleVector(builder, tmp[0], tmp[0], shuf_vec, "");
+      dst[0] = LLVMBuildBitCast(builder, tmp[0], type8_t, "");
+   } else if (src_count == 2) {
+      LLVMValueRef shuf_vec;
+      shuf_vec = LLVMConstVector(shuf, 4);
+
+      for (i = 0; i < 2; i++) {
+         tmp[i] = LLVMBuildBitCast(builder, tmp[i], type32_t, "");
+         tmp[i] = LLVMBuildShuffleVector(builder, tmp[i], tmp[i], shuf_vec, "");
+         dst[i] = LLVMBuildBitCast(builder, tmp[i], type8_t, "");
+      }
+   } else {
+      for (j = 0; j < 2; j++) {
+         LLVMValueRef lo, hi, lo2, hi2;
+          /*
+          * Note that if we only really have 3 valid channels (rgb)
+          * and we don't need alpha we could substitute an undef here
+          * for the respective channel (causing llvm to drop conversion
+          * for alpha).
+          */
+         /* we now have rgba0rgba1rgba4rgba5 etc, untwiddle */
+         lo2 = LLVMBuildBitCast(builder, tmp[j*2], type64_t, "");
+         hi2 = LLVMBuildBitCast(builder, tmp[j*2 + 1], type64_t, "");
+         lo = lp_build_interleave2(gallivm, type64, lo2, hi2, 0);
+         hi = lp_build_interleave2(gallivm, type64, lo2, hi2, 1);
+         dst[j*2] = LLVMBuildBitCast(builder, lo, type8_t, "");
+         dst[j*2 + 1] = LLVMBuildBitCast(builder, hi, type8_t, "");
+      }
+   }
+}
+
+
  /**
   * Load an unswizzled block of pixels from memory
   */
@@ -1263,7 +1414,7 @@ convert_to_blend_type(struct gallivm_state *gallivm,
        for (j = 0; j < src_fmt->nr_channels; ++j) {
           unsigned mask = 0;
           unsigned sa = src_fmt->channel[j].shift;
-#ifdef PIPE_ARCH_LITTLE_ENDIAN
+#if UTIL_ARCH_LITTLE_ENDIAN
           unsigned from_lsb = j;
  #else
           unsigned from_lsb = src_fmt->nr_channels - j - 1;
@@ -1445,7 +1596,8 @@ convert_from_blend_type(struct gallivm_state *gallivm,
        for (j = 0; j < src_fmt->nr_channels; ++j) {
           unsigned mask = 0;
           unsigned sa = src_fmt->channel[j].shift;
-#ifdef PIPE_ARCH_LITTLE_ENDIAN
+         unsigned sz_a = src_fmt->channel[j].size;
+#if UTIL_ARCH_LITTLE_ENDIAN
           unsigned from_lsb = j;
  #else
           unsigned from_lsb = src_fmt->nr_channels - j - 1;
@@ -1460,7 +1612,8 @@ convert_from_blend_type(struct gallivm_state *gallivm,
           /* Extract bits */
           chans[j] = LLVMBuildLShr(builder,
                                    dst[i],
-                                  lp_build_const_int_vec(gallivm, src_type, from_lsb * blend_type.width),
+                                  lp_build_const_int_vec(gallivm, src_type,
+                                                         from_lsb * blend_type.width),
                                    "");
  
           chans[j] = LLVMBuildAnd(builder,
@@ -1472,6 +1625,10 @@ convert_from_blend_type(struct gallivm_state *gallivm,
           if (src_type.norm) {
              chans[j] = scale_bits(gallivm, blend_type.width,
                                    src_fmt->channel[j].size, chans[j], src_type);
+         } else if (!src_type.floating && sz_a < blend_type.width) {
+            LLVMValueRef mask_val = lp_build_const_int_vec(gallivm, src_type, (1UL << sz_a) - 1);
+            LLVMValueRef mask = LLVMBuildICmp(builder, LLVMIntUGT, chans[j], mask_val, "");
+            chans[j] = LLVMBuildSelect(builder, mask, mask_val, chans[j], "");
           }
  
           /* Insert bits */
@@ -1548,7 +1705,8 @@ convert_alpha(struct gallivm_state *gallivm,
        /* If there is a src for each pixel broadcast the alpha across whole row */
        if (src_count == block_size) {
           for (i = 0; i < src_count; ++i) {
-            src_alpha[i] = lp_build_broadcast(gallivm, lp_build_vec_type(gallivm, row_type), src_alpha[i]);
+            src_alpha[i] = lp_build_broadcast(gallivm,
+                              lp_build_vec_type(gallivm, row_type), src_alpha[i]);
           }
        } else {
           unsigned pixels = block_size / src_count;
@@ -1654,6 +1812,7 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
                                       util_blend_state_is_dual(&variant->key.blend, 0);
  
     const boolean is_1d = variant->key.resource_1d;
+   boolean twiddle_after_convert = FALSE;
     unsigned num_fullblock_fs = is_1d ? 2 * num_fs : num_fs;
     LLVMValueRef fpstate = 0;
  
@@ -1749,13 +1908,23 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
     }
  
     /* If 3 channels then pad to include alpha for 4 element transpose */
-   if (dst_channels == 3 && !has_alpha) {
+   if (dst_channels == 3) {
+      assert (!has_alpha);
        for (i = 0; i < TGSI_NUM_CHANNELS; i++) {
           if (swizzle[i] > TGSI_NUM_CHANNELS)
              swizzle[i] = 3;
        }
        if (out_format_desc->nr_channels == 4) {
           dst_channels = 4;
+         /*
+          * We use alpha from the color conversion, not separate one.
+          * We had to include it for transpose, hence it will get converted
+          * too (albeit when doing transpose after conversion, that would
+          * no longer be the case necessarily).
+          * (It works only with 4 channel dsts, e.g. rgbx formats, because
+          * otherwise we really have padding, not alpha, included.)
+          */
+         has_alpha = true;
        }
     }
  
@@ -1787,6 +1956,7 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
        /*
         * XXX If we include that here maybe could actually use it instead of
         * separate alpha for blending?
+       * (Difficult though we actually convert pad channels, not alpha.)
         */
        if (dst_channels == 3 && !has_alpha) {
           fs_src[i][3] = alpha;
@@ -1794,11 +1964,14 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
  
        /* We split the row_mask and row_alpha as we want 128bit interleave */
        if (fs_type.length == 8) {
-         src_mask[i*2 + 0]  = lp_build_extract_range(gallivm, fs_mask[i], 0, src_channels);
-         src_mask[i*2 + 1]  = lp_build_extract_range(gallivm, fs_mask[i], src_channels, src_channels);
+         src_mask[i*2 + 0]  = lp_build_extract_range(gallivm, fs_mask[i],
+                                                     0, src_channels);
+         src_mask[i*2 + 1]  = lp_build_extract_range(gallivm, fs_mask[i],
+                                                     src_channels, src_channels);
  
           src_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha, 0, src_channels);
-         src_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha, src_channels, src_channels);
+         src_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha,
+                                                     src_channels, src_channels);
        } else {
           src_mask[i] = fs_mask[i];
           src_alpha[i] = alpha;
@@ -1829,7 +2002,8 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
           }
           if (fs_type.length == 8) {
              src1_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha, 0, src_channels);
-            src1_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha, src_channels, src_channels);
+            src1_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha,
+                                                         src_channels, src_channels);
           } else {
              src1_alpha[i] = alpha;
           }
@@ -1855,14 +2029,45 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
        }
     }
  
+   /*
+    * We actually should generally do conversion first (for non-1d cases)
+    * when the blend format is 8 or 16 bits. The reason is obvious,
+    * there's 2 or 4 times less vectors to deal with for the interleave...
+    * Albeit for the AVX (not AVX2) case there's no benefit with 16 bit
+    * vectors (as it can do 32bit unpack with 256bit vectors, but 8/16bit
+    * unpack only with 128bit vectors).
+    * Note: for 16bit sizes really need matching pack conversion code
+    */
+   if (!is_1d && dst_channels != 3 && dst_type.width == 8) {
+      twiddle_after_convert = TRUE;
+   }
+
     /*
      * Pixel twiddle from fragment shader order to memory order
      */
-   src_count = generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs,
-                                   dst_channels, fs_src, src, pad_inline);
-   if (dual_source_blend) {
-      generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, dst_channels,
-                          fs_src1, src1, pad_inline);
+   if (!twiddle_after_convert) {
+      src_count = generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs,
+                                      dst_channels, fs_src, src, pad_inline);
+      if (dual_source_blend) {
+         generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, dst_channels,
+                             fs_src1, src1, pad_inline);
+      }
+   } else {
+      src_count = num_fullblock_fs * dst_channels;
+      /*
+       * We reorder things a bit here, so the cases for 4-wide and 8-wide
+       * (AVX) turn out the same later when untwiddling/transpose (albeit
+       * for true AVX2 path untwiddle needs to be different).
+       * For now just order by colors first (so we can use unpack later).
+       */
+      for (j = 0; j < num_fullblock_fs; j++) {
+         for (i = 0; i < dst_channels; i++) {
+            src[i*num_fullblock_fs + j] = fs_src[j][i];
+            if (dual_source_blend) {
+               src1[i*num_fullblock_fs + j] = fs_src1[j][i];
+            }
+         }
+      }
     }
  
     src_channels = dst_channels < 3 ? dst_channels : 4;
@@ -1906,13 +2111,21 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
        assert(bits == 128 || bits == 256);
     }
  
+   if (twiddle_after_convert) {
+      fs_twiddle_transpose(gallivm, row_type, src, src_count, src);
+      if (dual_source_blend) {
+         fs_twiddle_transpose(gallivm, row_type, src1, src_count, src1);
+      }
+   }
  
     /*
      * Blend Colour conversion
      */
     blend_color = lp_jit_context_f_blend_color(gallivm, context_ptr);
-   blend_color = LLVMBuildPointerCast(builder, blend_color, LLVMPointerType(lp_build_vec_type(gallivm, fs_type), 0), "");
-   blend_color = LLVMBuildLoad(builder, LLVMBuildGEP(builder, blend_color, &i32_zero, 1, ""), "");
+   blend_color = LLVMBuildPointerCast(builder, blend_color,
+                    LLVMPointerType(lp_build_vec_type(gallivm, fs_type), 0), "");
+   blend_color = LLVMBuildLoad(builder, LLVMBuildGEP(builder, blend_color,
+                               &i32_zero, 1, ""), "");
  
     /* Convert */
     lp_build_conv(gallivm, fs_type, blend_type, &blend_color, 1, &blend_color, 1);
@@ -1989,13 +2202,19 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
           mask_type.length = pixels;
           mask_type.width = row_type.width * dst_channels;
  
-         src_mask[i] = LLVMBuildIntCast(builder, src_mask[i], lp_build_int_vec_type(gallivm, mask_type), "");
+         /*
+          * If mask_type width is smaller than 32bit, this doesn't quite
+          * generate the most efficient code (could use some pack).
+          */
+         src_mask[i] = LLVMBuildIntCast(builder, src_mask[i],
+                                        lp_build_int_vec_type(gallivm, mask_type), "");
  
           mask_type.length *= dst_channels;
           mask_type.width /= dst_channels;
        }
  
-      src_mask[i] = LLVMBuildBitCast(builder, src_mask[i], lp_build_int_vec_type(gallivm, mask_type), "");
+      src_mask[i] = LLVMBuildBitCast(builder, src_mask[i],
+                                     lp_build_int_vec_type(gallivm, mask_type), "");
        src_mask[i] = lp_build_pad_vector(gallivm, src_mask[i], row_type.length);
     }
  
@@ -2064,7 +2283,7 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
  
     if (dst_count > src_count) {
        if ((dst_type.width == 8 || dst_type.width == 16) &&
-          util_is_power_of_two(dst_type.length) &&
+          util_is_power_of_two_or_zero(dst_type.length) &&
            dst_type.length * dst_type.width < 128) {
           /*
            * Never try to load values as 4xi8 which we will then
@@ -2141,7 +2360,8 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
      * It seems some cleanup could be done here (like skipping conversion/blend
      * when not needed).
      */
-   convert_to_blend_type(gallivm, block_size, out_format_desc, dst_type, row_type, dst, src_count);
+   convert_to_blend_type(gallivm, block_size, out_format_desc, dst_type,
+                         row_type, dst, src_count);
  
     /*
      * FIXME: Really should get logic ops / masks out of generic blend / row
@@ -2167,7 +2387,8 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
                                    pad_inline ? 4 : dst_channels);
     }
  
-   convert_from_blend_type(gallivm, block_size, out_format_desc, row_type, dst_type, dst, src_count);
+   convert_from_blend_type(gallivm, block_size, out_format_desc,
+                           row_type, dst_type, dst, src_count);
  
     /* Split the blend rows back to memory rows */
     if (dst_count > src_count) {
@@ -2225,14 +2446,14 @@ generate_fragment(struct llvmpipe_context *lp,
                    unsigned partial_mask)
  {
     struct gallivm_state *gallivm = variant->gallivm;
-   const struct lp_fragment_shader_variant_key *key = &variant->key;
+   struct lp_fragment_shader_variant_key *key = &variant->key;
     struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS];
     char func_name[64];
     struct lp_type fs_type;
     struct lp_type blend_type;
     LLVMTypeRef fs_elem_type;
     LLVMTypeRef blend_vec_type;
-   LLVMTypeRef arg_types[13];
+   LLVMTypeRef arg_types[15];
     LLVMTypeRef func_type;
     LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context);
     LLVMTypeRef int8_type = LLVMInt8TypeInContext(gallivm->context);
@@ -2244,13 +2465,16 @@ generate_fragment(struct llvmpipe_context *lp,
     LLVMValueRef dady_ptr;
     LLVMValueRef color_ptr_ptr;
     LLVMValueRef stride_ptr;
+   LLVMValueRef color_sample_stride_ptr;
     LLVMValueRef depth_ptr;
     LLVMValueRef depth_stride;
+   LLVMValueRef depth_sample_stride;
     LLVMValueRef mask_input;
     LLVMValueRef thread_data_ptr;
     LLVMBasicBlockRef block;
     LLVMBuilderRef builder;
     struct lp_build_sampler_soa *sampler;
+   struct lp_build_image_soa *image;
     struct lp_build_interp_soa_context interp;
     LLVMValueRef fs_mask[16 / 4];
     LLVMValueRef fs_out_color[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS][16 / 4];
@@ -2308,8 +2532,8 @@ generate_fragment(struct llvmpipe_context *lp,
  
     blend_vec_type = lp_build_vec_type(gallivm, blend_type);
  
-   util_snprintf(func_name, sizeof(func_name), "fs%u_variant%u_%s",
-                 shader->no, variant->no, partial_mask ? "partial" : "whole");
+   snprintf(func_name, sizeof(func_name), "fs%u_variant%u_%s",
+            shader->no, variant->no, partial_mask ? "partial" : "whole");
  
     arg_types[0] = variant->jit_context_ptr_type;       /* context */
     arg_types[1] = int32_type;                          /* x */
@@ -2320,10 +2544,12 @@ generate_fragment(struct llvmpipe_context *lp,
     arg_types[6] = LLVMPointerType(fs_elem_type, 0);    /* dady */
     arg_types[7] = LLVMPointerType(LLVMPointerType(blend_vec_type, 0), 0);  /* color */
     arg_types[8] = LLVMPointerType(int8_type, 0);       /* depth */
-   arg_types[9] = int32_type;                          /* mask_input */
+   arg_types[9] = LLVMInt64TypeInContext(gallivm->context);  /* mask_input */
     arg_types[10] = variant->jit_thread_data_ptr_type;  /* per thread data */
     arg_types[11] = LLVMPointerType(int32_type, 0);     /* stride */
     arg_types[12] = int32_type;                         /* depth_stride */
+   arg_types[13] = LLVMPointerType(int32_type, 0);     /* color sample strides */
+   arg_types[14] = int32_type;                         /* depth sample stride */
  
     func_type = LLVMFunctionType(LLVMVoidTypeInContext(gallivm->context),
                                  arg_types, ARRAY_SIZE(arg_types), 0);
@@ -2353,6 +2579,8 @@ generate_fragment(struct llvmpipe_context *lp,
     thread_data_ptr  = LLVMGetParam(function, 10);
     stride_ptr   = LLVMGetParam(function, 11);
     depth_stride = LLVMGetParam(function, 12);
+   color_sample_stride_ptr = LLVMGetParam(function, 13);
+   depth_sample_stride = LLVMGetParam(function, 14);
  
     lp_build_name(context_ptr, "context");
     lp_build_name(x, "x");
@@ -2366,6 +2594,8 @@ generate_fragment(struct llvmpipe_context *lp,
     lp_build_name(thread_data_ptr, "thread_data");
     lp_build_name(stride_ptr, "stride_ptr");
     lp_build_name(depth_stride, "depth_stride");
+   lp_build_name(color_sample_stride_ptr, "color_sample_stride_ptr");
+   lp_build_name(depth_sample_stride, "depth_sample_stride");
  
     /*
      * Function body
@@ -2376,8 +2606,28 @@ generate_fragment(struct llvmpipe_context *lp,
     assert(builder);
     LLVMPositionBuilderAtEnd(builder, block);
  
+   /*
+    * Must not count ps invocations if there's a null shader.
+    * (It would be ok to count with null shader if there's d/s tests,
+    * but only if there's d/s buffers too, which is different
+    * to implicit rasterization disable which must not depend
+    * on the d/s buffers.)
+    * Could use popcount on mask, but pixel accuracy is not required.
+    * Could disable if there's no stats query, but maybe not worth it.
+    */
+   if (shader->info.base.num_instructions > 1) {
+      LLVMValueRef invocs, val;
+      invocs = lp_jit_thread_data_invocations(gallivm, thread_data_ptr);
+      val = LLVMBuildLoad(builder, invocs, "");
+      val = LLVMBuildAdd(builder, val,
+                         LLVMConstInt(LLVMInt64TypeInContext(gallivm->context), 1, 0),
+                         "invoc_count");
+      LLVMBuildStore(builder, val, invocs);
+   }
+
     /* code generated texture sampling */
-   sampler = lp_llvm_sampler_soa_create(key->state);
+   sampler = lp_llvm_sampler_soa_create(key->samplers);
+   image = lp_llvm_image_soa_create(lp_fs_variant_key_images(key));
  
     num_fs = 16 / fs_type.length; /* number of loops per 4x4 stamp */
     /* for 1d resources only run "upper half" of stamp */
@@ -2416,7 +2666,7 @@ generate_fragment(struct llvmpipe_context *lp,
  
           if (partial_mask) {
              mask = generate_quad_mask(gallivm, fs_type,
-                                      i*fs_type.length/4, mask_input);
+                                      i*fs_type.length/4, 0, mask_input);
           }
           else {
              mask = lp_build_const_int_vec(gallivm, fs_type, ~0);
@@ -2432,6 +2682,7 @@ generate_fragment(struct llvmpipe_context *lp,
                         num_loop,
                         &interp,
                         sampler,
+                       image,
                         mask_store, /* output */
                         color_store,
                         depth_ptr,
@@ -2466,7 +2717,7 @@ generate_fragment(struct llvmpipe_context *lp,
     }
  
     sampler->destroy(sampler);
-
+   image->destroy(image);
     /* Loop over color outputs / color buffers to do blending.
      */
     for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
@@ -2506,7 +2757,7 @@ generate_fragment(struct llvmpipe_context *lp,
  
  
  static void
-dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key)
+dump_fs_variant_key(struct lp_fragment_shader_variant_key *key)
  {
     unsigned i;
  
@@ -2515,30 +2766,36 @@ dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key)
     if (key->flatshade) {
        debug_printf("flatshade = 1\n");
     }
+   if (key->multisample) {
+      debug_printf("multisample = 1\n");
+      debug_printf("coverage samples = %d\n", key->coverage_samples);
+   }
     for (i = 0; i < key->nr_cbufs; ++i) {
        debug_printf("cbuf_format[%u] = %s\n", i, util_format_name(key->cbuf_format[i]));
+      debug_printf("cbuf nr_samples[%u] = %d\n", i, key->cbuf_nr_samples[i]);
     }
     if (key->depth.enabled || key->stencil[0].enabled) {
        debug_printf("depth.format = %s\n", util_format_name(key->zsbuf_format));
+      debug_printf("depth nr_samples = %d\n", key->zsbuf_nr_samples);
     }
     if (key->depth.enabled) {
-      debug_printf("depth.func = %s\n", util_dump_func(key->depth.func, TRUE));
+      debug_printf("depth.func = %s\n", util_str_func(key->depth.func, TRUE));
        debug_printf("depth.writemask = %u\n", key->depth.writemask);
     }
  
     for (i = 0; i < 2; ++i) {
        if (key->stencil[i].enabled) {
-         debug_printf("stencil[%u].func = %s\n", i, util_dump_func(key->stencil[i].func, TRUE));
-         debug_printf("stencil[%u].fail_op = %s\n", i, util_dump_stencil_op(key->stencil[i].fail_op, TRUE));
-         debug_printf("stencil[%u].zpass_op = %s\n", i, util_dump_stencil_op(key->stencil[i].zpass_op, TRUE));
-         debug_printf("stencil[%u].zfail_op = %s\n", i, util_dump_stencil_op(key->stencil[i].zfail_op, TRUE));
+         debug_printf("stencil[%u].func = %s\n", i, util_str_func(key->stencil[i].func, TRUE));
+         debug_printf("stencil[%u].fail_op = %s\n", i, util_str_stencil_op(key->stencil[i].fail_op, TRUE));
+         debug_printf("stencil[%u].zpass_op = %s\n", i, util_str_stencil_op(key->stencil[i].zpass_op, TRUE));
+         debug_printf("stencil[%u].zfail_op = %s\n", i, util_str_stencil_op(key->stencil[i].zfail_op, TRUE));
           debug_printf("stencil[%u].valuemask = 0x%x\n", i, key->stencil[i].valuemask);
           debug_printf("stencil[%u].writemask = 0x%x\n", i, key->stencil[i].writemask);
        }
     }
  
     if (key->alpha.enabled) {
-      debug_printf("alpha.func = %s\n", util_dump_func(key->alpha.func, TRUE));
+      debug_printf("alpha.func = %s\n", util_str_func(key->alpha.func, TRUE));
     }
  
     if (key->occlusion_count) {
@@ -2546,35 +2803,35 @@ dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key)
     }
  
     if (key->blend.logicop_enable) {
-      debug_printf("blend.logicop_func = %s\n", util_dump_logicop(key->blend.logicop_func, TRUE));
+      debug_printf("blend.logicop_func = %s\n", util_str_logicop(key->blend.logicop_func, TRUE));
     }
     else if (key->blend.rt[0].blend_enable) {
-      debug_printf("blend.rgb_func = %s\n",   util_dump_blend_func  (key->blend.rt[0].rgb_func, TRUE));
-      debug_printf("blend.rgb_src_factor = %s\n",   util_dump_blend_factor(key->blend.rt[0].rgb_src_factor, TRUE));
-      debug_printf("blend.rgb_dst_factor = %s\n",   util_dump_blend_factor(key->blend.rt[0].rgb_dst_factor, TRUE));
-      debug_printf("blend.alpha_func = %s\n",       util_dump_blend_func  (key->blend.rt[0].alpha_func, TRUE));
-      debug_printf("blend.alpha_src_factor = %s\n", util_dump_blend_factor(key->blend.rt[0].alpha_src_factor, TRUE));
-      debug_printf("blend.alpha_dst_factor = %s\n", util_dump_blend_factor(key->blend.rt[0].alpha_dst_factor, TRUE));
+      debug_printf("blend.rgb_func = %s\n",   util_str_blend_func  (key->blend.rt[0].rgb_func, TRUE));
+      debug_printf("blend.rgb_src_factor = %s\n",   util_str_blend_factor(key->blend.rt[0].rgb_src_factor, TRUE));
+      debug_printf("blend.rgb_dst_factor = %s\n",   util_str_blend_factor(key->blend.rt[0].rgb_dst_factor, TRUE));
+      debug_printf("blend.alpha_func = %s\n",       util_str_blend_func  (key->blend.rt[0].alpha_func, TRUE));
+      debug_printf("blend.alpha_src_factor = %s\n", util_str_blend_factor(key->blend.rt[0].alpha_src_factor, TRUE));
+      debug_printf("blend.alpha_dst_factor = %s\n", util_str_blend_factor(key->blend.rt[0].alpha_dst_factor, TRUE));
     }
     debug_printf("blend.colormask = 0x%x\n", key->blend.rt[0].colormask);
     if (key->blend.alpha_to_coverage) {
        debug_printf("blend.alpha_to_coverage is enabled\n");
     }
     for (i = 0; i < key->nr_samplers; ++i) {
-      const struct lp_static_sampler_state *sampler = &key->state[i].sampler_state;
+      const struct lp_static_sampler_state *sampler = &key->samplers[i].sampler_state;
        debug_printf("sampler[%u] = \n", i);
        debug_printf("  .wrap = %s %s %s\n",
-                   util_dump_tex_wrap(sampler->wrap_s, TRUE),
-                   util_dump_tex_wrap(sampler->wrap_t, TRUE),
-                   util_dump_tex_wrap(sampler->wrap_r, TRUE));
+                   util_str_tex_wrap(sampler->wrap_s, TRUE),
+                   util_str_tex_wrap(sampler->wrap_t, TRUE),
+                   util_str_tex_wrap(sampler->wrap_r, TRUE));
        debug_printf("  .min_img_filter = %s\n",
-                   util_dump_tex_filter(sampler->min_img_filter, TRUE));
+                   util_str_tex_filter(sampler->min_img_filter, TRUE));
        debug_printf("  .min_mip_filter = %s\n",
-                   util_dump_tex_mipfilter(sampler->min_mip_filter, TRUE));
+                   util_str_tex_mipfilter(sampler->min_mip_filter, TRUE));
        debug_printf("  .mag_img_filter = %s\n",
-                   util_dump_tex_filter(sampler->mag_img_filter, TRUE));
+                   util_str_tex_filter(sampler->mag_img_filter, TRUE));
        if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE)
-         debug_printf("  .compare_func = %s\n", util_dump_func(sampler->compare_func, TRUE));
+         debug_printf("  .compare_func = %s\n", util_str_func(sampler->compare_func, TRUE));
        debug_printf("  .normalized_coords = %u\n", sampler->normalized_coords);
        debug_printf("  .min_max_lod_equal = %u\n", sampler->min_max_lod_equal);
        debug_printf("  .lod_bias_non_zero = %u\n", sampler->lod_bias_non_zero);
@@ -2582,12 +2839,12 @@ dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key)
        debug_printf("  .apply_max_lod = %u\n", sampler->apply_max_lod);
     }
     for (i = 0; i < key->nr_sampler_views; ++i) {
-      const struct lp_static_texture_state *texture = &key->state[i].texture_state;
+      const struct lp_static_texture_state *texture = &key->samplers[i].texture_state;
        debug_printf("texture[%u] = \n", i);
        debug_printf("  .format = %s\n",
                     util_format_name(texture->format));
        debug_printf("  .target = %s\n",
-                   util_dump_tex_target(texture->target, TRUE));
+                   util_str_tex_target(texture->target, TRUE));
        debug_printf("  .level_zero_only = %u\n",
                     texture->level_zero_only);
        debug_printf("  .pot = %u %u %u\n",
@@ -2595,15 +2852,33 @@ dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key)
                     texture->pot_height,
                     texture->pot_depth);
     }
+   struct lp_image_static_state *images = lp_fs_variant_key_images(key);
+   for (i = 0; i < key->nr_images; ++i) {
+      const struct lp_static_texture_state *image = &images[i].image_state;
+      debug_printf("image[%u] = \n", i);
+      debug_printf("  .format = %s\n",
+                   util_format_name(image->format));
+      debug_printf("  .target = %s\n",
+                   util_str_tex_target(image->target, TRUE));
+      debug_printf("  .level_zero_only = %u\n",
+                   image->level_zero_only);
+      debug_printf("  .pot = %u %u %u\n",
+                   image->pot_width,
+                   image->pot_height,
+                   image->pot_depth);
+   }
  }
  
  
  void
-lp_debug_fs_variant(const struct lp_fragment_shader_variant *variant)
+lp_debug_fs_variant(struct lp_fragment_shader_variant *variant)
  {
-   debug_printf("llvmpipe: Fragment shader #%u variant #%u:\n", 
+   debug_printf("llvmpipe: Fragment shader #%u variant #%u:\n",
                  variant->shader->no, variant->no);
-   tgsi_dump(variant->shader->base.tokens, 0);
+   if (variant->shader->base.type == PIPE_SHADER_IR_TGSI)
+      tgsi_dump(variant->shader->base.tokens, 0);
+   else
+      nir_print_shader(variant->shader->base.ir.nir, stderr);
     dump_fs_variant_key(&variant->key);
     debug_printf("variant->opaque = %u\n", variant->opaque);
     debug_printf("\n");
@@ -2620,16 +2895,17 @@ generate_variant(struct llvmpipe_context *lp,
                   const struct lp_fragment_shader_variant_key *key)
  {
     struct lp_fragment_shader_variant *variant;
-   const struct util_format_description *cbuf0_format_desc;
+   const struct util_format_description *cbuf0_format_desc = NULL;
     boolean fullcolormask;
     char module_name[64];
  
-   variant = CALLOC_STRUCT(lp_fragment_shader_variant);
+   variant = MALLOC(sizeof *variant + shader->variant_key_size - sizeof variant->key);
     if (!variant)
        return NULL;
  
-   util_snprintf(module_name, sizeof(module_name), "fs%u_variant%u",
-                 shader->no, shader->variants_created);
+   memset(variant, 0, sizeof(*variant));
+   snprintf(module_name, sizeof(module_name), "fs%u_variant%u",
+            shader->no, shader->variants_created);
  
     variant->gallivm = gallivm_create(module_name, lp->context);
     if (!variant->gallivm) {
@@ -2661,16 +2937,10 @@ generate_variant(struct llvmpipe_context *lp,
           !key->alpha.enabled &&
           !key->blend.alpha_to_coverage &&
           !key->depth.enabled &&
-         !shader->info.base.uses_kill
+         !shader->info.base.uses_kill &&
+         !shader->info.base.writes_samplemask
        ? TRUE : FALSE;
  
-   if ((shader->info.base.num_tokens <= 1) &&
-       !key->depth.enabled && !key->stencil[0].enabled) {
-      variant->ps_inv_multiplier = 0;
-   } else {
-      variant->ps_inv_multiplier = 1;
-   }
-
     if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) {
        lp_debug_fs_variant(variant);
     }
@@ -2723,6 +2993,7 @@ llvmpipe_create_fs_state(struct pipe_context *pipe,
     struct lp_fragment_shader *shader;
     int nr_samplers;
     int nr_sampler_views;
+   int nr_images;
     int i;
  
     shader = CALLOC_STRUCT(lp_fragment_shader);
@@ -2732,11 +3003,17 @@ llvmpipe_create_fs_state(struct pipe_context *pipe,
     shader->no = fs_no++;
     make_empty_list(&shader->variants);
  
-   /* get/save the summary info for this shader */
-   lp_build_tgsi_info(templ->tokens, &shader->info);
+   shader->base.type = templ->type;
+   if (templ->type == PIPE_SHADER_IR_TGSI) {
+      /* get/save the summary info for this shader */
+      lp_build_tgsi_info(templ->tokens, &shader->info);
  
-   /* we need to keep a local copy of the tokens */
-   shader->base.tokens = tgsi_dup_tokens(templ->tokens);
+      /* we need to keep a local copy of the tokens */
+      shader->base.tokens = tgsi_dup_tokens(templ->tokens);
+   } else {
+      shader->base.ir.nir = templ->ir.nir;
+      nir_tgsi_scan_shader(templ->ir.nir, &shader->info.base, true);
+   }
  
     shader->draw_data = draw_create_fragment_shader(llvmpipe->draw, templ);
     if (shader->draw_data == NULL) {
@@ -2747,13 +3024,13 @@ llvmpipe_create_fs_state(struct pipe_context *pipe,
  
     nr_samplers = shader->info.base.file_max[TGSI_FILE_SAMPLER] + 1;
     nr_sampler_views = shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1;
-
-   shader->variant_key_size = Offset(struct lp_fragment_shader_variant_key,
-                                     state[MAX2(nr_samplers, nr_sampler_views)]);
+   nr_images = shader->info.base.file_max[TGSI_FILE_IMAGE] + 1;
+   shader->variant_key_size = lp_fs_variant_key_size(MAX2(nr_samplers, nr_sampler_views), nr_images);
  
     for (i = 0; i < shader->info.base.num_inputs; i++) {
        shader->inputs[i].usage_mask = shader->info.base.input_usage_mask[i];
        shader->inputs[i].cyl_wrap = shader->info.base.input_cylindrical_wrap[i];
+      shader->inputs[i].location = shader->info.base.input_interpolate_loc[i];
  
        switch (shader->info.base.input_interpolate[i]) {
        case TGSI_INTERPOLATE_CONSTANT:
@@ -2815,14 +3092,14 @@ static void
  llvmpipe_bind_fs_state(struct pipe_context *pipe, void *fs)
  {
     struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
-
-   if (llvmpipe->fs == fs)
+   struct lp_fragment_shader *lp_fs = (struct lp_fragment_shader *)fs;
+   if (llvmpipe->fs == lp_fs)
        return;
  
-   llvmpipe->fs = (struct lp_fragment_shader *) fs;
-
     draw_bind_fragment_shader(llvmpipe->draw,
-                             (llvmpipe->fs ? llvmpipe->fs->draw_data : NULL));
+                             (lp_fs ? lp_fs->draw_data : NULL));
+
+   llvmpipe->fs = lp_fs;
  
     llvmpipe->dirty |= LP_NEW_FS;
  }
@@ -2832,18 +3109,17 @@ llvmpipe_bind_fs_state(struct pipe_context *pipe, void *fs)
   * Remove shader variant from two lists: the shader's variant list
   * and the context's variant list.
   */
-void
+static void
  llvmpipe_remove_shader_variant(struct llvmpipe_context *lp,
                                 struct lp_fragment_shader_variant *variant)
  {
-   if (gallivm_debug & GALLIVM_DEBUG_IR) {
-      debug_printf("llvmpipe: del fs #%u var #%u v created #%u v cached"
-                   " #%u v total cached #%u\n",
-                   variant->shader->no,
-                   variant->no,
+   if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) {
+      debug_printf("llvmpipe: del fs #%u var %u v created %u v cached %u "
+                   "v total cached %u inst %u total inst %u\n",
+                   variant->shader->no, variant->no,
                     variant->shader->variants_created,
                     variant->shader->variants_cached,
-                   lp->nr_fs_variants);
+                   lp->nr_fs_variants, variant->nr_instrs, lp->nr_fs_instrs);
     }
  
     gallivm_destroy(variant->gallivm);
@@ -2888,6 +3164,8 @@ llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
     /* Delete draw module's data */
     draw_delete_fragment_shader(llvmpipe->draw, shader->draw_data);
  
+   if (shader->base.ir.nir)
+      ralloc_free(shader->base.ir.nir);
     assert(shader->variants_cached == 0);
     FREE((void *) shader->base.tokens);
     FREE(shader);
@@ -2897,7 +3175,7 @@ llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
  
  static void
  llvmpipe_set_constant_buffer(struct pipe_context *pipe,
-                             uint shader, uint index,
+                             enum pipe_shader_type shader, uint index,
                               const struct pipe_constant_buffer *cb)
  {
     struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
@@ -2917,7 +3195,9 @@ llvmpipe_set_constant_buffer(struct pipe_context *pipe,
     }
  
     if (shader == PIPE_SHADER_VERTEX ||
-       shader == PIPE_SHADER_GEOMETRY) {
+       shader == PIPE_SHADER_GEOMETRY ||
+       shader == PIPE_SHADER_TESS_CTRL ||
+       shader == PIPE_SHADER_TESS_EVAL) {
        /* Pass the constants to the 'draw' module */
        const unsigned size = cb ? cb->buffer_size : 0;
        const ubyte *data;
@@ -2938,15 +3218,78 @@ llvmpipe_set_constant_buffer(struct pipe_context *pipe,
        draw_set_mapped_constant_buffer(llvmpipe->draw, shader,
                                        index, data, size);
     }
-   else {
+   else if (shader == PIPE_SHADER_COMPUTE)
+      llvmpipe->cs_dirty |= LP_CSNEW_CONSTANTS;
+   else
        llvmpipe->dirty |= LP_NEW_FS_CONSTANTS;
-   }
  
     if (cb && cb->user_buffer) {
        pipe_resource_reference(&constants, NULL);
     }
  }
  
+/**
+ * Bind shader storage buffers to slots [start_slot, start_slot + count)
+ * of the given shader stage.
+ *
+ * Each slot of llvmpipe->ssbos[shader] in the range is updated with
+ * util_copy_shader_buffer(); when buffers is NULL, NULL is passed for
+ * every slot (presumably unbinding it -- confirm against
+ * util_copy_shader_buffer).
+ *
+ * Vertex, geometry and tessellation stages additionally hand the buffer's
+ * data pointer and size to the draw module.  Compute and fragment stages
+ * only raise their SSBO dirty flag.
+ *
+ * writable_bitmask is not read by this function.
+ */
+static void
+llvmpipe_set_shader_buffers(struct pipe_context *pipe,
+                            enum pipe_shader_type shader, unsigned start_slot,
+                            unsigned count, const struct pipe_shader_buffer *buffers,
+                            unsigned writable_bitmask)
+{
+   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+   unsigned i, idx;
+   /* i walks the destination slots, idx the entries of buffers[]. */
+   for (i = start_slot, idx = 0; i < start_slot + count; i++, idx++) {
+      const struct pipe_shader_buffer *buffer = buffers ? &buffers[idx] : NULL;
+
+      util_copy_shader_buffer(&llvmpipe->ssbos[shader][i], buffer);
+
+      if (shader == PIPE_SHADER_VERTEX ||
+          shader == PIPE_SHADER_GEOMETRY ||
+          shader == PIPE_SHADER_TESS_CTRL ||
+          shader == PIPE_SHADER_TESS_EVAL) {
+         /* Pass the buffer to the 'draw' module.  size is 0 only when the
+          * binding itself is missing; data stays NULL when there is no
+          * binding, no resource, or no resource data.
+          */
+         const unsigned size = buffer ? buffer->buffer_size : 0;
+         const ubyte *data = NULL;
+         if (buffer && buffer->buffer)
+            data = (ubyte *) llvmpipe_resource_data(buffer->buffer);
+         /* Only offset a non-NULL pointer. */
+         if (data)
+            data += buffer->buffer_offset;
+         draw_set_mapped_shader_buffer(llvmpipe->draw, shader,
+                                       i, data, size);
+      } else if (shader == PIPE_SHADER_COMPUTE) {
+        llvmpipe->cs_dirty |= LP_CSNEW_SSBOS;
+      } else if (shader == PIPE_SHADER_FRAGMENT) {
+         llvmpipe->dirty |= LP_NEW_FS_SSBOS;
+      }
+   }
+}
+
+/**
+ * Bind shader image views to slots [start_slot, start_slot + count) of
+ * the given shader stage.
+ *
+ * The draw module is flushed first, then each slot of
+ * llvmpipe->images[shader] in the range is updated with
+ * util_copy_image_view(); when images is NULL, NULL is passed for every
+ * slot (presumably unbinding it -- confirm against util_copy_image_view).
+ *
+ * Vertex, geometry and tessellation stages forward the whole image array
+ * to the draw module.  Compute raises LP_CSNEW_IMAGES in cs_dirty; every
+ * remaining stage raises LP_NEW_FS_IMAGES in dirty.
+ */
+static void
+llvmpipe_set_shader_images(struct pipe_context *pipe,
+                            enum pipe_shader_type shader, unsigned start_slot,
+                           unsigned count, const struct pipe_image_view *images)
+{
+   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+   unsigned i, idx;
+
+   /* Flush the draw module before the bindings change. */
+   draw_flush(llvmpipe->draw);
+   /* i walks the destination slots, idx the entries of images[]. */
+   for (i = start_slot, idx = 0; i < start_slot + count; i++, idx++) {
+      const struct pipe_image_view *image = images ? &images[idx] : NULL;
+
+      util_copy_image_view(&llvmpipe->images[shader][i], image);
+   }
+
+   /* Unconditionally overwritten: the count becomes the end of this range,
+    * regardless of what was recorded by earlier calls.
+    */
+   llvmpipe->num_images[shader] = start_slot + count;
+   if (shader == PIPE_SHADER_VERTEX ||
+       shader == PIPE_SHADER_GEOMETRY ||
+       shader == PIPE_SHADER_TESS_CTRL ||
+       shader == PIPE_SHADER_TESS_EVAL) {
+      draw_set_images(llvmpipe->draw,
+                      shader,
+                      llvmpipe->images[shader],
+                      start_slot + count);
+   } else if (shader == PIPE_SHADER_COMPUTE)
+      llvmpipe->cs_dirty |= LP_CSNEW_IMAGES;
+   else
+      llvmpipe->dirty |= LP_NEW_FS_IMAGES;
+}
  
  /**
   * Return the blend factor equivalent to a destination alpha of one.
@@ -2977,14 +3320,17 @@ force_dst_alpha_one(unsigned factor, boolean clamped_zero)
   * TODO: there is actually no reason to tie this to context state -- the
   * generated code could be cached globally in the screen.
   */
-static void
+static struct lp_fragment_shader_variant_key *
  make_variant_key(struct llvmpipe_context *lp,
                   struct lp_fragment_shader *shader,
-                 struct lp_fragment_shader_variant_key *key)
+                 char *store)
  {
     unsigned i;
+   struct lp_fragment_shader_variant_key *key;
  
-   memset(key, 0, shader->variant_key_size);
+   key = (struct lp_fragment_shader_variant_key *)store;
+
+   memset(key, 0, offsetof(struct lp_fragment_shader_variant_key, samplers[1]));
  
     if (lp->framebuffer.zsbuf) {
        enum pipe_format zsbuf_format = lp->framebuffer.zsbuf->format;
@@ -3004,6 +3350,7 @@ make_variant_key(struct llvmpipe_context *lp,
        if (llvmpipe_resource_is_1d(lp->framebuffer.zsbuf->texture)) {
           key->resource_1d = TRUE;
        }
+      key->zsbuf_nr_samples = util_res_sample_count(lp->framebuffer.zsbuf->texture);
     }
  
     /*
@@ -3022,7 +3369,7 @@ make_variant_key(struct llvmpipe_context *lp,
     if (lp->rasterizer->clip_halfz) {
        key->depth_clamp = 1;
     } else {
-      key->depth_clamp = (lp->rasterizer->depth_clip == 0) ? 1 : 0;
+      key->depth_clamp = (lp->rasterizer->depth_clip_near == 0) ? 1 : 0;
     }
  
     /* alpha test only applies if render buffer 0 is non-integer (or does not exist) */
@@ -3036,7 +3383,8 @@ make_variant_key(struct llvmpipe_context *lp,
     /* alpha.ref_value is passed in jit_context */
  
     key->flatshade = lp->rasterizer->flatshade;
-   if (lp->active_occlusion_queries) {
+   key->multisample = lp->rasterizer->multisample;
+   if (lp->active_occlusion_queries && !lp->queries_disabled) {
        key->occlusion_count = TRUE;
     }
  
@@ -3044,6 +3392,9 @@ make_variant_key(struct llvmpipe_context *lp,
        memcpy(&key->blend, lp->blend, sizeof key->blend);
     }
  
+   key->coverage_samples = 1;
+   if (key->multisample)
+      key->coverage_samples = util_framebuffer_get_num_samples(&lp->framebuffer);
     key->nr_cbufs = lp->framebuffer.nr_cbufs;
  
     if (!key->blend.independent_blend_enable) {
@@ -3062,6 +3413,7 @@ make_variant_key(struct llvmpipe_context *lp,
           const struct util_format_description *format_desc;
  
           key->cbuf_format[i] = format;
+         key->cbuf_nr_samples[i] = util_res_sample_count(lp->framebuffer.cbufs[i]->texture);
  
           /*
            * Figure out if this is a 1d resource. Note that OpenGL allows crazy
@@ -3121,6 +3473,7 @@ make_variant_key(struct llvmpipe_context *lp,
        else {
           /* no color buffer for this fragment output */
           key->cbuf_format[i] = PIPE_FORMAT_NONE;
+         key->cbuf_nr_samples[i] = 0;
           blend_rt->colormask = 0x0;
           blend_rt->blend_enable = 0;
        }
@@ -3130,9 +3483,15 @@ make_variant_key(struct llvmpipe_context *lp,
      */
     key->nr_samplers = shader->info.base.file_max[TGSI_FILE_SAMPLER] + 1;
  
+   struct lp_sampler_static_state *fs_sampler;
+
+   fs_sampler = key->samplers;
+
+   memset(fs_sampler, 0, MAX2(key->nr_samplers, key->nr_sampler_views) * sizeof *fs_sampler);
+
     for(i = 0; i < key->nr_samplers; ++i) {
        if(shader->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) {
-         lp_sampler_static_sampler_state(&key->state[i].sampler_state,
+         lp_sampler_static_sampler_state(&fs_sampler[i].sampler_state,
                                           lp->samplers[PIPE_SHADER_FRAGMENT][i]);
        }
     }
@@ -3145,8 +3504,13 @@ make_variant_key(struct llvmpipe_context *lp,
     if (shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] != -1) {
        key->nr_sampler_views = shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1;
        for(i = 0; i < key->nr_sampler_views; ++i) {
-         if(shader->info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1 << i)) {
-            lp_sampler_static_texture_state(&key->state[i].texture_state,
+         /*
+          * Note sview may exceed what's representable by file_mask.
+          * This will still work, the only downside is that not actually
+          * used views may be included in the shader key.
+          */
+         if(shader->info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1u << (i & 31))) {
+            lp_sampler_static_texture_state(&fs_sampler[i].texture_state,
                                              lp->sampler_views[PIPE_SHADER_FRAGMENT][i]);
           }
        }
@@ -3155,11 +3519,22 @@ make_variant_key(struct llvmpipe_context *lp,
        key->nr_sampler_views = key->nr_samplers;
        for(i = 0; i < key->nr_sampler_views; ++i) {
           if(shader->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) {
-            lp_sampler_static_texture_state(&key->state[i].texture_state,
+            lp_sampler_static_texture_state(&fs_sampler[i].texture_state,
                                              lp->sampler_views[PIPE_SHADER_FRAGMENT][i]);
           }
        }
     }
+
+   struct lp_image_static_state *lp_image;
+   lp_image = lp_fs_variant_key_images(key);
+   key->nr_images = shader->info.base.file_max[TGSI_FILE_IMAGE] + 1;
+   for (i = 0; i < key->nr_images; ++i) {
+      if (shader->info.base.file_mask[TGSI_FILE_IMAGE] & (1 << i)) {
+         lp_sampler_static_texture_state_image(&lp_image[i].image_state,
+                                               &lp->images[PIPE_SHADER_FRAGMENT][i]);
+      }
+   }
+   return key;
  }
  
  
@@ -3172,16 +3547,17 @@ void
  llvmpipe_update_fs(struct llvmpipe_context *lp)
  {
     struct lp_fragment_shader *shader = lp->fs;
-   struct lp_fragment_shader_variant_key key;
+   struct lp_fragment_shader_variant_key *key;
     struct lp_fragment_shader_variant *variant = NULL;
     struct lp_fs_variant_list_item *li;
+   char store[LP_FS_MAX_VARIANT_KEY_SIZE];
  
-   make_variant_key(lp, shader, &key);
+   key = make_variant_key(lp, shader, store);
  
     /* Search the variants for one which matches the key */
     li = first_elem(&shader->variants);
     while(!at_end(&shader->variants, li)) {
-      if(memcmp(&li->base->key, &key, shader->variant_key_size) == 0) {
+      if(memcmp(&li->base->key, key, shader->variant_key_size) == 0) {
           variant = li->base;
           break;
        }
@@ -3200,7 +3576,7 @@ llvmpipe_update_fs(struct llvmpipe_context *lp)
        unsigned i;
        unsigned variants_to_cull;
  
-      if (0) {
+      if (LP_DEBUG & DEBUG_FS) {
           debug_printf("%u variants,\t%u instrs,\t%u instrs/variant\n",
                        lp->nr_fs_variants,
                        lp->nr_fs_instrs,
@@ -3208,14 +3584,22 @@ llvmpipe_update_fs(struct llvmpipe_context *lp)
        }
  
        /* First, check if we've exceeded the max number of shader variants.
-       * If so, free 25% of them (the least recently used ones).
+       * If so, free 6.25% of them (the least recently used ones).
         */
-      variants_to_cull = lp->nr_fs_variants >= LP_MAX_SHADER_VARIANTS ? LP_MAX_SHADER_VARIANTS / 4 : 0;
+      variants_to_cull = lp->nr_fs_variants >= LP_MAX_SHADER_VARIANTS ? LP_MAX_SHADER_VARIANTS / 16 : 0;
  
        if (variants_to_cull ||
            lp->nr_fs_instrs >= LP_MAX_SHADER_INSTRUCTIONS) {
           struct pipe_context *pipe = &lp->pipe;
  
+         if (gallivm_debug & GALLIVM_DEBUG_PERF) {
+            debug_printf("Evicting FS: %u fs variants,\t%u total variants,"
+                         "\t%u instrs,\t%u instrs/variant\n",
+                         shader->variants_cached,
+                         lp->nr_fs_variants, lp->nr_fs_instrs,
+                         lp->nr_fs_instrs / lp->nr_fs_variants);
+         }
+
           /*
            * XXX: we need to flush the context until we have some sort of
            * reference counting in fragment shaders as they may still be binned
@@ -3245,7 +3629,7 @@ llvmpipe_update_fs(struct llvmpipe_context *lp)
         * Generate the new variant.
         */
        t0 = os_time_get();
-      variant = generate_variant(lp, shader, &key);
+      variant = generate_variant(lp, shader, key);
        t1 = os_time_get();
        dt = t1 - t0;
        LP_COUNT_ADD(llvm_compile_time, dt);
@@ -3277,19 +3661,9 @@ llvmpipe_init_fs_funcs(struct llvmpipe_context *llvmpipe)
     llvmpipe->pipe.delete_fs_state = llvmpipe_delete_fs_state;
  
     llvmpipe->pipe.set_constant_buffer = llvmpipe_set_constant_buffer;
+
+   llvmpipe->pipe.set_shader_buffers = llvmpipe_set_shader_buffers;
+   llvmpipe->pipe.set_shader_images = llvmpipe_set_shader_images;
  }
  
-/*
- * Rasterization is disabled if there is no pixel shader and
- * both depth and stencil testing are disabled:
- * http://msdn.microsoft.com/en-us/library/windows/desktop/bb205125
- */
-boolean
-llvmpipe_rasterization_disabled(struct llvmpipe_context *lp)
-{
-   boolean null_fs = !lp->fs || lp->fs->info.base.num_tokens <= 1;
  
-   return (null_fs &&
-           !lp->depth_stencil->depth.enabled &&
-           !lp->depth_stencil->stencil[0].enabled);
-}