llvmpipe: blend has effects even if no colorbuffers.

[mesa.git] / src / gallium / drivers / llvmpipe / lp_state_fs.c
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c

index 8ecb62ed0ff4d0909a5a6c8cfe511ef7c15b3cb4..804ed9214c628936721f98c8012440bbcb5bf0d9 100644 (file)
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -89,6 +89,7 @@
  #include "gallivm/lp_bld_pack.h"
  #include "gallivm/lp_bld_format.h"
  #include "gallivm/lp_bld_quad.h"
+#include "gallivm/lp_bld_gather.h"
  
  #include "lp_bld_alpha.h"
  #include "lp_bld_blend.h"
@@ -105,9 +106,106 @@
  #include "lp_rast.h"
  #include "nir/nir_to_tgsi_info.h"
  
+#include "lp_screen.h"
+#include "compiler/nir/nir_serialize.h"
+#include "util/mesa-sha1.h"
  /** Fragment shader number (for debugging) */
  static unsigned fs_no = 0;
  
+static void /* forward declaration; fs_fb_fetch() below calls this before its definition later in the file */
+load_unswizzled_block(struct gallivm_state *gallivm,
+                      LLVMValueRef base_ptr,
+                      LLVMValueRef stride,
+                      unsigned block_width,
+                      unsigned block_height,
+                      LLVMValueRef* dst,
+                      struct lp_type dst_type,
+                      unsigned dst_count,
+                      unsigned dst_alignment,
+                      LLVMValueRef x_offset,  /* may be NULL; otherwise added to each element's x index before scaling to bytes */
+                      LLVMValueRef y_offset,  /* may be NULL; otherwise added to each element's row index before multiplying by stride */
+                      bool fb_fetch_twiddle); /* when true and block_height == 2 and dst_count == 8, element
+                                               * slots are remapped to the fragment shader execution order */
+/**
+ * Checks if a format description is an arithmetic format
+ *
+ * A format which has irregular channel sizes such as R3_G3_B2 or R5_G6_B5.
+ *
+ * Concretely, the format is reported as arithmetic when at least one of its
+ * nr_channels channels either differs in size from channel 0 or has a size
+ * that is not a multiple of 8 bits.
+ *
+ * \param format_desc  format description to inspect
+ * \return true if any channel is irregular as described above; false
+ *         otherwise (including when nr_channels is 0, as the loop never runs)
+ */
+static inline boolean
+is_arithmetic_format(const struct util_format_description *format_desc)
+{
+   boolean arith = false;
+   unsigned i;
+
+   for (i = 0; i < format_desc->nr_channels; ++i) {
+      arith |= format_desc->channel[i].size != format_desc->channel[0].size; /* mixed channel widths */
+      arith |= (format_desc->channel[i].size % 8) != 0;                      /* width is not a whole number of bytes */
+   }
+
+   return arith;
+}
+
+/**
+ * Checks if this format requires special handling due to required expansion
+ * to floats for blending, and furthermore has "natural" packed AoS -> unpacked
+ * SoA conversion.
+ *
+ * \param format_desc  format description to inspect
+ * \return true for PIPE_FORMAT_R11G11B10_FLOAT and for every format whose
+ *         colorspace is UTIL_FORMAT_COLORSPACE_SRGB; false for anything else
+ */
+static inline boolean
+format_expands_to_float_soa(const struct util_format_description *format_desc)
+{
+   if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT ||
+       format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
+      return true;
+   }
+   return false;
+}
+
+
+/**
+ * Retrieves the type representing the memory layout for a format
+ *
+ * e.g. RGBA16F = 4x half-float and R3G3B2 = 1x byte
+ *
+ * Three cases are distinguished:
+ *  - formats for which format_expands_to_float_soa() is true are described
+ *    as a single unsigned integer as wide as the whole block;
+ *  - arithmetic formats (see is_arithmetic_format()) become a single element
+ *    whose width is the sum of all channel sizes;
+ *  - everything else becomes nr_channels elements, each as wide as the
+ *    first non-void channel.
+ *
+ * \param format_desc  format description to translate
+ * \param type         output; receives the memory-layout type
+ */
+static inline void
+lp_mem_type_from_format_desc(const struct util_format_description *format_desc,
+                             struct lp_type* type)
+{
+   unsigned i;
+   unsigned chan;
+
+   if (format_expands_to_float_soa(format_desc)) {
+      /* just make this a uint with width of block */
+      type->floating = false; /* note: *type is not zeroed on this path (unlike the memset below); */
+      type->fixed = false;    /* only the six fields assigned here are written */
+      type->sign = false;
+      type->norm = false;
+      type->width = format_desc->block.bits;
+      type->length = 1;
+      return;
+   }
+
+   for (i = 0; i < 4; i++) /* locate the first channel that is not VOID */
+      if (format_desc->channel[i].type != UTIL_FORMAT_TYPE_VOID)
+         break;
+   chan = i; /* NOTE(review): would be 4 if all four channels were VOID - presumably never the case; confirm */
+
+   memset(type, 0, sizeof(struct lp_type));
+   type->floating = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FLOAT;
+   type->fixed    = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FIXED;
+   type->sign     = format_desc->channel[chan].type != UTIL_FORMAT_TYPE_UNSIGNED; /* set for every type other than UNSIGNED */
+   type->norm     = format_desc->channel[chan].normalized;
+
+   if (is_arithmetic_format(format_desc)) {
+      type->width = 0;
+      type->length = 1;
+
+      for (i = 0; i < format_desc->nr_channels; ++i) {
+         type->width += format_desc->channel[i].size; /* total bits across all channels */
+      }
+   } else {
+      type->width = format_desc->channel[chan].size;
+      type->length = format_desc->nr_channels;
+   }
+}
  
  /**
   * Expand the relevant bits of mask_input to a n*4-dword mask for the
@@ -325,6 +423,11 @@ struct lp_build_fs_llvm_iface {
     struct lp_build_interp_soa_context *interp;
     struct lp_build_for_loop_state *loop_state;
     LLVMValueRef mask_store;
+   LLVMValueRef sample_id;
+   LLVMValueRef color_ptr_ptr;
+   LLVMValueRef color_stride_ptr;
+   LLVMValueRef color_sample_stride_ptr;
+   const struct lp_fragment_shader_variant_key *key;
  };
  
  static LLVMValueRef fs_interp(const struct lp_build_fs_iface *iface,
@@ -347,6 +450,105 @@ static LLVMValueRef fs_interp(const struct lp_build_fs_iface *iface,
                                attrib, chan, loc, attrib_indir, offsets);
  }
  
+static void fs_fb_fetch(const struct lp_build_fs_iface *iface,
+                                struct lp_build_context *bld,
+                                unsigned cbuf,
+                                LLVMValueRef result[4])
+{  /*
+    * Framebuffer-fetch callback (installed as .base.fb_fetch in
+    * generate_fs_loop()).
+    *
+    * Emits IR that loads the pixels of colour buffer 'cbuf' covered by the
+    * current fragment shader loop iteration, gathers them into one vector
+    * and unpacks that vector into per-channel SoA values.
+    *
+    *   iface   really a struct lp_build_fs_llvm_iface; supplies the colour
+    *           pointer/stride arrays, loop state, sample id and variant key
+    *   bld     build context; bld->type.length is the number of pixels
+    *           handled per iteration
+    *   cbuf    index of the colour buffer to read
+    *   result  output; filled by lp_build_unpack_rgba_soa()
+    */
+   struct lp_build_fs_llvm_iface *fs_iface = (struct lp_build_fs_llvm_iface *)iface;
+   struct gallivm_state *gallivm = bld->gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   const struct lp_fragment_shader_variant_key *key = fs_iface->key;
+   LLVMValueRef index = lp_build_const_int32(gallivm, cbuf); /* constant i32 holding cbuf; indexes the
+    * per-colour-buffer pointer, stride and sample-stride arrays below */
+   LLVMValueRef color_ptr = LLVMBuildLoad(builder, LLVMBuildGEP(builder, fs_iface->color_ptr_ptr, &index, 1, ""), "");
+   LLVMValueRef stride = LLVMBuildLoad(builder, LLVMBuildGEP(builder, fs_iface->color_stride_ptr, &index, 1, ""), "");
+
+   LLVMValueRef dst[4 * 4]; /* room for 16 loaded values; only block_size entries are used */
+   enum pipe_format cbuf_format = key->cbuf_format[cbuf];
+   const struct util_format_description* out_format_desc = util_format_description(cbuf_format);
+   struct lp_type dst_type;
+   unsigned block_size = bld->type.length;
+   unsigned block_height = key->resource_1d ? 1 : 2; /* a single row for 1D resources, two rows otherwise */
+   unsigned block_width = block_size / block_height; /* pixels per row */
+
+   lp_mem_type_from_format_desc(out_format_desc, &dst_type); /* in-memory layout of one pixel of this format */
+
+   struct lp_type blend_type; /* only used to build the pointer type that color_ptr is cast to below */
+   memset(&blend_type, 0, sizeof blend_type);
+   blend_type.floating = FALSE; /* values are integers */
+   blend_type.sign = FALSE;     /* values are unsigned */
+   blend_type.norm = TRUE;      /* values are in [0,1] or [-1,1] */
+   blend_type.width = 8;        /* 8-bit ubyte values */
+   blend_type.length = 16;      /* 16 elements per vector */
+
+   uint32_t dst_alignment;
+   /*
+    * Compute the alignment of the destination pointer in bytes
+    * We fetch 1-4 pixels, if the format has pot alignment then those fetches
+    * are always aligned by MIN2(16, fetch_width) except for buffers (not
+    * 1d tex but can't distinguish here) so need to stick with per-pixel
+    * alignment in this case.
+    */
+   if (key->resource_1d) {
+      dst_alignment = (out_format_desc->block.bits + 7)/(out_format_desc->block.width * 8);
+   }
+   else {
+      dst_alignment = dst_type.length * dst_type.width / 8;
+   }
+   /* Force power-of-two alignment by extracting only the least-significant-bit */
+   dst_alignment = 1 << (ffs(dst_alignment) - 1); /* relies on dst_alignment being non-zero here */
+   /*
+    * Resource base and stride pointers are aligned to 16 bytes, so that's
+    * the maximum alignment we can guarantee
+    */
+   dst_alignment = MIN2(16, dst_alignment);
+
+   LLVMTypeRef blend_vec_type = lp_build_vec_type(gallivm, blend_type);
+   color_ptr = LLVMBuildBitCast(builder, color_ptr, LLVMPointerType(blend_vec_type, 0), "");
+
+   if (key->multisample) {
+      LLVMValueRef sample_stride = LLVMBuildLoad(builder,
+                                                 LLVMBuildGEP(builder, fs_iface->color_sample_stride_ptr,
+                                                              &index, 1, ""), "");
+      LLVMValueRef sample_offset = LLVMBuildMul(builder, sample_stride, fs_iface->sample_id, ""); /* offset of the current sample's plane */
+      color_ptr = LLVMBuildGEP(builder, color_ptr, &sample_offset, 1, ""); /* NOTE(review): color_ptr is already a
+       * pointer to blend_vec_type here, so this GEP index counts whole vectors rather than bytes -
+       * confirm this matches the units of sample_stride */
+   }
+   /* The fragment shader executes on 4x4 blocks; depending on the vector
+    * width it runs 2 (8-wide) or 4 (4-wide) loop iterations per block.
+    * Only step down to the next pair of rows once the current pair has been
+    * completed: after 1 iteration when 8-wide, after 2 iterations when 4-wide. */
+   LLVMValueRef x_offset = NULL, y_offset = NULL;
+   if (!key->resource_1d) {
+      LLVMValueRef counter = fs_iface->loop_state->counter;
+
+      if (block_size == 4) {
+         x_offset = LLVMBuildShl(builder, /* (counter & 1) << 1: odd iterations start two pixels to the right */
+                                 LLVMBuildAnd(builder, fs_iface->loop_state->counter, lp_build_const_int32(gallivm, 1), ""),
+                                 lp_build_const_int32(gallivm, 1), "");
+         counter = LLVMBuildLShr(builder, fs_iface->loop_state->counter, lp_build_const_int32(gallivm, 1), ""); /* two iterations share a row pair */
+      }
+      y_offset = LLVMBuildMul(builder, counter, lp_build_const_int32(gallivm, 2), ""); /* each step covers two rows */
+   } /* for 1D resources both offsets stay NULL, so load_unswizzled_block() adds no dynamic offset */
+   load_unswizzled_block(gallivm, color_ptr, stride, block_width, block_height, dst, dst_type, block_size, dst_alignment, x_offset, y_offset, true);
+
+   for (unsigned i = 0; i < block_size; i++) { /* NOTE(review): presumably every loaded element is 32 bits wide here - confirm */
+      dst[i] = LLVMBuildBitCast(builder, dst[i], LLVMInt32TypeInContext(gallivm->context), "");
+   }
+   LLVMValueRef packed = lp_build_gather_values(gallivm, dst, block_size); /* one vector with a packed pixel per lane */
+
+   struct lp_type texel_type = bld->type; /* default; replaced below for pure-integer RGB formats */
+   if (out_format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
+       out_format_desc->channel[0].pure_integer) {
+      if (out_format_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) {
+         texel_type = lp_type_int_vec(bld->type.width, bld->type.width * bld->type.length);
+      }
+      else if (out_format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) {
+         texel_type = lp_type_uint_vec(bld->type.width, bld->type.width * bld->type.length);
+      }
+   }
+   lp_build_unpack_rgba_soa(gallivm, out_format_desc,
+                            texel_type,
+                            packed, result);
+}
+
  /**
   * Generate the fragment shader, depth/stencil test, and alpha tests.
   */
@@ -367,6 +569,9 @@ generate_fs_loop(struct gallivm_state *gallivm,
                   LLVMValueRef depth_base_ptr,
                   LLVMValueRef depth_stride,
                   LLVMValueRef depth_sample_stride,
+                 LLVMValueRef color_ptr_ptr,
+                 LLVMValueRef color_stride_ptr,
+                 LLVMValueRef color_sample_stride_ptr,
                   LLVMValueRef facing,
                   LLVMValueRef thread_data_ptr)
  {
@@ -396,6 +601,7 @@ generate_fs_loop(struct gallivm_state *gallivm,
                              shader->info.base.num_instructions < 8) && 0;
     const boolean dual_source_blend = key->blend.rt[0].blend_enable &&
                                       util_blend_state_is_dual(&key->blend, 0);
+   const bool post_depth_coverage = shader->info.base.properties[TGSI_PROPERTY_FS_POST_DEPTH_COVERAGE];
     unsigned attrib;
     unsigned chan;
     unsigned cbuf;
@@ -500,7 +706,9 @@ generate_fs_loop(struct gallivm_state *gallivm,
                             num_loop,
                             lp_build_const_int32(gallivm, 1));
  
+   LLVMValueRef sample_mask_in;
     if (key->multisample) {
+      sample_mask_in = lp_build_const_int_vec(gallivm, type, 0);
        /* create shader execution mask by combining all sample masks. */
        for (unsigned s = 0; s < key->coverage_samples; s++) {
           LLVMValueRef s_mask_idx = LLVMBuildMul(builder, num_loop, lp_build_const_int32(gallivm, s), "");
@@ -510,11 +718,18 @@ generate_fs_loop(struct gallivm_state *gallivm,
              mask_val = s_mask;
           else
              mask_val = LLVMBuildOr(builder, s_mask, mask_val, "");
+
+         LLVMValueRef mask_in = LLVMBuildAnd(builder, s_mask, lp_build_const_int_vec(gallivm, type, (1 << s)), "");
+         sample_mask_in = LLVMBuildOr(builder, sample_mask_in, mask_in, "");
        }
     } else {
+      sample_mask_in = lp_build_const_int_vec(gallivm, type, 1);
        mask_ptr = LLVMBuildGEP(builder, mask_store,
                                &loop_state.counter, 1, "mask_ptr");
        mask_val = LLVMBuildLoad(builder, mask_ptr, "");
+
+      LLVMValueRef mask_in = LLVMBuildAnd(builder, mask_val, lp_build_const_int_vec(gallivm, type, 1), "");
+      sample_mask_in = LLVMBuildOr(builder, sample_mask_in, mask_in, "");
     }
  
     /* 'mask' will control execution based on quad's pixel alive/killed state */
@@ -527,6 +742,11 @@ generate_fs_loop(struct gallivm_state *gallivm,
     LLVMValueRef s_mask_or = lp_build_alloca(gallivm, lp_build_int_vec_type(gallivm, type), "cov_mask_early_depth");
     LLVMBuildStore(builder, LLVMConstNull(lp_build_int_vec_type(gallivm, type)), s_mask_or);
  
+   /* Create storage for post depth sample mask */
+   LLVMValueRef post_depth_sample_mask_in = NULL;
+   if (post_depth_coverage)
+      post_depth_sample_mask_in = lp_build_alloca(gallivm, int_vec_type, "post_depth_sample_mask_in");
+
     LLVMValueRef s_mask = NULL, s_mask_ptr = NULL;
     LLVMValueRef z_sample_value_store = NULL, s_sample_value_store = NULL;
     LLVMValueRef z_fb_store = NULL, s_fb_store = NULL;
@@ -541,6 +761,9 @@ generate_fs_loop(struct gallivm_state *gallivm,
           struct lp_type s_type = zs_type;
           if (zs_format_desc->block.bits < type.width)
              z_type.width = type.width;
+         if (zs_format_desc->block.bits == 8)
+            s_type.width = type.width;
+
           else if (zs_format_desc->block.bits > 32) {
              z_type.width = z_type.width / 2;
              s_type.width = s_type.width / 2;
@@ -639,6 +862,14 @@ generate_fs_loop(struct gallivm_state *gallivm,
        tmp_s_mask_or = LLVMBuildOr(builder, tmp_s_mask_or, s_mask, "");
        LLVMBuildStore(builder, tmp_s_mask_or, s_mask_or);
  
+      if (post_depth_coverage) {
+         LLVMValueRef mask_bit_idx = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), sample_loop_state.counter, "");
+         LLVMValueRef post_depth_mask_in = LLVMBuildLoad(builder, post_depth_sample_mask_in, "");
+         mask_bit_idx = LLVMBuildAnd(builder, s_mask, lp_build_broadcast(gallivm, int_vec_type, mask_bit_idx), "");
+         post_depth_mask_in = LLVMBuildOr(builder, post_depth_mask_in, mask_bit_idx, "");
+         LLVMBuildStore(builder, post_depth_mask_in, post_depth_sample_mask_in);
+      }
+
        LLVMBuildStore(builder, s_mask, s_mask_ptr);
  
        lp_build_for_loop_end(&sample_loop_state);
@@ -652,6 +883,11 @@ generate_fs_loop(struct gallivm_state *gallivm,
           lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter, NULL);
           lp_build_mask_update(&mask, tmp_s_mask_or);
        }
+   } else {
+      if (post_depth_coverage) {
+         LLVMValueRef post_depth_mask_in = LLVMBuildAnd(builder, lp_build_mask_value(&mask), lp_build_const_int_vec(gallivm, type, 1), "");
+         LLVMBuildStore(builder, post_depth_mask_in, post_depth_sample_mask_in);
+      }
     }
  
     LLVMValueRef out_sample_mask_storage = NULL;
@@ -661,6 +897,11 @@ generate_fs_loop(struct gallivm_state *gallivm,
           LLVMBuildStore(builder, LLVMConstNull(int_vec_type), out_sample_mask_storage);
     }
  
+   if (post_depth_coverage) {
+      system_values.sample_mask_in = LLVMBuildLoad(builder, post_depth_sample_mask_in, "");
+   }
+   else
+      system_values.sample_mask_in = sample_mask_in;
     if (key->multisample && key->min_samples > 1) {
        lp_build_for_loop_begin(&sample_loop_state, gallivm,
                                lp_build_const_int32(gallivm, 0),
@@ -675,18 +916,28 @@ generate_fs_loop(struct gallivm_state *gallivm,
        lp_build_mask_force(&mask, s_mask);
        lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter, sample_loop_state.counter);
        system_values.sample_id = sample_loop_state.counter;
-   } else
+      system_values.sample_mask_in = LLVMBuildAnd(builder, system_values.sample_mask_in,
+                                                  lp_build_broadcast(gallivm, int_vec_type,
+                                                                     LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), sample_loop_state.counter, "")), "");
+   } else {
        system_values.sample_id = lp_build_const_int32(gallivm, 0);
  
+   }
     system_values.sample_pos = sample_pos_array;
  
     lp_build_interp_soa_update_inputs_dyn(interp, gallivm, loop_state.counter, mask_store, sample_loop_state.counter);
  
     struct lp_build_fs_llvm_iface fs_iface = {
       .base.interp_fn = fs_interp,
+     .base.fb_fetch = fs_fb_fetch,
       .interp = interp,
       .loop_state = &loop_state,
+     .sample_id = system_values.sample_id,
       .mask_store = mask_store,
+     .color_ptr_ptr = color_ptr_ptr,
+     .color_stride_ptr = color_stride_ptr,
+     .color_sample_stride_ptr = color_sample_stride_ptr,
+     .key = key,
     };
  
     struct lp_build_tgsi_params params;
@@ -1201,7 +1452,10 @@ load_unswizzled_block(struct gallivm_state *gallivm,
                        LLVMValueRef* dst,
                        struct lp_type dst_type,
                        unsigned dst_count,
-                      unsigned dst_alignment)
+                      unsigned dst_alignment,
+                      LLVMValueRef x_offset,
+                      LLVMValueRef y_offset,
+                      bool fb_fetch_twiddle)
  {
     LLVMBuilderRef builder = gallivm->builder;
     unsigned row_size = dst_count / block_height;
@@ -1214,8 +1468,28 @@ load_unswizzled_block(struct gallivm_state *gallivm,
        unsigned x = i % row_size;
        unsigned y = i / row_size;
  
-      LLVMValueRef bx = lp_build_const_int32(gallivm, x * (dst_type.width / 8) * dst_type.length);
-      LLVMValueRef by = LLVMBuildMul(builder, lp_build_const_int32(gallivm, y), stride, "");
+      if (block_height == 2 && dst_count == 8 && fb_fetch_twiddle) {
+         /* remap the raw slots into the fragment shader execution mode. */
+         /* this math took me way too long to work out, I'm sure it's overkill. */
+         x = (i & 1) + ((i >> 2) << 1);
+         y = (i & 2) >> 1;
+      }
+
+      LLVMValueRef x_val;
+      if (x_offset) {
+         x_val = lp_build_const_int32(gallivm, x);
+         if (x_offset)
+            x_val = LLVMBuildAdd(builder, x_val, x_offset, "");
+         x_val = LLVMBuildMul(builder, x_val, lp_build_const_int32(gallivm, (dst_type.width / 8) * dst_type.length), "");
+      } else
+         x_val = lp_build_const_int32(gallivm, x * (dst_type.width / 8) * dst_type.length);
+
+      LLVMValueRef bx = x_val;
+
+      LLVMValueRef y_val = lp_build_const_int32(gallivm, y);
+      if (y_offset)
+         y_val = LLVMBuildAdd(builder, y_val, y_offset, "");
+      LLVMValueRef by = LLVMBuildMul(builder, y_val, stride, "");
  
        LLVMValueRef gep[2];
        LLVMValueRef dst_ptr;
@@ -1279,89 +1553,6 @@ store_unswizzled_block(struct gallivm_state *gallivm,
  }
  
  
-/**
- * Checks if a format description is an arithmetic format
- *
- * A format which has irregular channel sizes such as R3_G3_B2 or R5_G6_B5.
- */
-static inline boolean
-is_arithmetic_format(const struct util_format_description *format_desc)
-{
-   boolean arith = false;
-   unsigned i;
-
-   for (i = 0; i < format_desc->nr_channels; ++i) {
-      arith |= format_desc->channel[i].size != format_desc->channel[0].size;
-      arith |= (format_desc->channel[i].size % 8) != 0;
-   }
-
-   return arith;
-}
-
-
-/**
- * Checks if this format requires special handling due to required expansion
- * to floats for blending, and furthermore has "natural" packed AoS -> unpacked
- * SoA conversion.
- */
-static inline boolean
-format_expands_to_float_soa(const struct util_format_description *format_desc)
-{
-   if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT ||
-       format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
-      return true;
-   }
-   return false;
-}
-
-
-/**
- * Retrieves the type representing the memory layout for a format
- *
- * e.g. RGBA16F = 4x half-float and R3G3B2 = 1x byte
- */
-static inline void
-lp_mem_type_from_format_desc(const struct util_format_description *format_desc,
-                             struct lp_type* type)
-{
-   unsigned i;
-   unsigned chan;
-
-   if (format_expands_to_float_soa(format_desc)) {
-      /* just make this a uint with width of block */
-      type->floating = false;
-      type->fixed = false;
-      type->sign = false;
-      type->norm = false;
-      type->width = format_desc->block.bits;
-      type->length = 1;
-      return;
-   }
-
-   for (i = 0; i < 4; i++)
-      if (format_desc->channel[i].type != UTIL_FORMAT_TYPE_VOID)
-         break;
-   chan = i;
-
-   memset(type, 0, sizeof(struct lp_type));
-   type->floating = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FLOAT;
-   type->fixed    = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FIXED;
-   type->sign     = format_desc->channel[chan].type != UTIL_FORMAT_TYPE_UNSIGNED;
-   type->norm     = format_desc->channel[chan].normalized;
-
-   if (is_arithmetic_format(format_desc)) {
-      type->width = 0;
-      type->length = 1;
-
-      for (i = 0; i < format_desc->nr_channels; ++i) {
-         type->width += format_desc->channel[i].size;
-      }
-   } else {
-      type->width = format_desc->channel[chan].size;
-      type->length = format_desc->nr_channels;
-   }
-}
-
  
  /**
   * Retrieves the type for a format which is usable in the blending code.
@@ -2579,7 +2770,7 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
  
     if (is_1d) {
        load_unswizzled_block(gallivm, color_ptr, stride, block_width, 1,
-                            dst, ls_type, dst_count / 4, dst_alignment);
+                            dst, ls_type, dst_count / 4, dst_alignment, NULL, NULL, false);
        for (i = dst_count / 4; i < dst_count; i++) {
           dst[i] = lp_build_undef(gallivm, ls_type);
        }
@@ -2587,7 +2778,7 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
     }
     else {
        load_unswizzled_block(gallivm, color_ptr, stride, block_width, block_height,
-                            dst, ls_type, dst_count, dst_alignment);
+                            dst, ls_type, dst_count, dst_alignment, NULL, NULL, false);
     }
  
  
@@ -2806,8 +2997,8 @@ generate_fragment(struct llvmpipe_context *lp,
  
     blend_vec_type = lp_build_vec_type(gallivm, blend_type);
  
-   snprintf(func_name, sizeof(func_name), "fs%u_variant%u_%s",
-            shader->no, variant->no, partial_mask ? "partial" : "whole");
+   snprintf(func_name, sizeof(func_name), "fs_variant_%s",
+            partial_mask ? "partial" : "whole");
  
     arg_types[0] = variant->jit_context_ptr_type;       /* context */
     arg_types[1] = int32_type;                          /* x */
@@ -2840,6 +3031,9 @@ generate_fragment(struct llvmpipe_context *lp,
        if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind)
           lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS);
  
+   if (variant->gallivm->cache->data_size)
+      return;
+
     context_ptr  = LLVMGetParam(function, 0);
     x            = LLVMGetParam(function, 1);
     y            = LLVMGetParam(function, 2);
@@ -2900,8 +3094,8 @@ generate_fragment(struct llvmpipe_context *lp,
     }
  
     /* code generated texture sampling */
-   sampler = lp_llvm_sampler_soa_create(key->samplers);
-   image = lp_llvm_image_soa_create(lp_fs_variant_key_images(key));
+   sampler = lp_llvm_sampler_soa_create(key->samplers, key->nr_samplers);
+   image = lp_llvm_image_soa_create(lp_fs_variant_key_images(key), key->nr_images);
  
     num_fs = 16 / fs_type.length; /* number of loops per 4x4 stamp */
     /* for 1d resources only run "upper half" of stamp */
@@ -3012,6 +3206,9 @@ generate_fragment(struct llvmpipe_context *lp,
                         depth_ptr,
                         depth_stride,
                         depth_sample_stride,
+                       color_ptr_ptr,
+                       stride_ptr,
+                       color_sample_stride_ptr,
                         facing,
                         thread_data_ptr);
  
@@ -3237,6 +3434,27 @@ lp_debug_fs_variant(struct lp_fragment_shader_variant *variant)
     debug_printf("\n");
  }
  
+static void
+lp_fs_get_ir_cache_key(struct lp_fragment_shader_variant *variant,
+                            unsigned char ir_sha1_cache_key[20])
+{  /*
+    * Computes the SHA-1 key used to look up / store this variant in the
+    * shader disk cache (see the lp_disk_cache_* calls in generate_variant()).
+    *
+    * The digest covers, in this order, the variant key bytes
+    * (variant->shader->variant_key_size of them) and the serialized NIR of
+    * the variant's shader.
+    *
+    *   variant            variant whose key and shader IR are hashed;
+    *                      variant->shader->base.ir.nir is used without a
+    *                      check here (the caller checks it beforehand)
+    *   ir_sha1_cache_key  output; receives the 20-byte digest
+    */
+   struct blob blob = { 0 };
+   unsigned ir_size;
+   void *ir_binary;
+
+   blob_init(&blob);
+   nir_serialize(&blob, variant->shader->base.ir.nir, true); /* third argument presumably requests stripping - see nir_serialize.h */
+   ir_binary = blob.data;
+   ir_size = blob.size;
+
+   struct mesa_sha1 ctx;
+   _mesa_sha1_init(&ctx);
+   _mesa_sha1_update(&ctx, &variant->key, variant->shader->variant_key_size); /* variant state first */
+   _mesa_sha1_update(&ctx, ir_binary, ir_size);                               /* then the serialized IR */
+   _mesa_sha1_final(&ctx, ir_sha1_cache_key);
+
+   blob_finish(&blob); /* release the serialization buffer */
+}
  
  /**
   * Generate a new fragment shader variant from the shader code and
@@ -3247,11 +3465,14 @@ generate_variant(struct llvmpipe_context *lp,
                   struct lp_fragment_shader *shader,
                   const struct lp_fragment_shader_variant_key *key)
  {
+   struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen);
     struct lp_fragment_shader_variant *variant;
     const struct util_format_description *cbuf0_format_desc = NULL;
     boolean fullcolormask;
     char module_name[64];
-
+   unsigned char ir_sha1_cache_key[20];
+   struct lp_cached_code cached = { 0 };
+   bool needs_caching = false;
     variant = MALLOC(sizeof *variant + shader->variant_key_size - sizeof variant->key);
     if (!variant)
        return NULL;
@@ -3260,18 +3481,29 @@ generate_variant(struct llvmpipe_context *lp,
     snprintf(module_name, sizeof(module_name), "fs%u_variant%u",
              shader->no, shader->variants_created);
  
-   variant->gallivm = gallivm_create(module_name, lp->context);
+   pipe_reference_init(&variant->reference, 1);
+   lp_fs_reference(lp, &variant->shader, shader);
+
+   memcpy(&variant->key, key, shader->variant_key_size);
+
+   if (shader->base.ir.nir) {
+      lp_fs_get_ir_cache_key(variant, ir_sha1_cache_key);
+
+      lp_disk_cache_find_shader(screen, &cached, ir_sha1_cache_key);
+      if (!cached.data_size)
+         needs_caching = true;
+   }
+   variant->gallivm = gallivm_create(module_name, lp->context, &cached);
     if (!variant->gallivm) {
        FREE(variant);
        return NULL;
     }
  
-   variant->shader = shader;
     variant->list_item_global.base = variant;
     variant->list_item_local.base = variant;
     variant->no = shader->variants_created++;
  
-   memcpy(&variant->key, key, shader->variant_key_size);
+
  
     /*
      * Determine whether we are touching all channels in the color buffer.
@@ -3333,6 +3565,10 @@ generate_variant(struct llvmpipe_context *lp,
        variant->jit_function[RAST_WHOLE] = variant->jit_function[RAST_EDGE_TEST];
     }
  
+   if (needs_caching) {
+      lp_disk_cache_insert_shader(screen, &cached, ir_sha1_cache_key);
+   }
+
     gallivm_free_ir(variant->gallivm);
  
     return variant;
@@ -3354,6 +3590,7 @@ llvmpipe_create_fs_state(struct pipe_context *pipe,
     if (!shader)
        return NULL;
  
+   pipe_reference_init(&shader->reference, 1);
     shader->no = fs_no++;
     make_empty_list(&shader->variants);
  
@@ -3453,8 +3690,10 @@ llvmpipe_bind_fs_state(struct pipe_context *pipe, void *fs)
     draw_bind_fragment_shader(llvmpipe->draw,
                               (lp_fs ? lp_fs->draw_data : NULL));
  
-   llvmpipe->fs = lp_fs;
+   lp_fs_reference(llvmpipe, &llvmpipe->fs, lp_fs);
  
+   /* invalidate the setup link, NEW_FS will make it update */
+   lp_setup_set_fs_variant(llvmpipe->setup, NULL);
     llvmpipe->dirty |= LP_NEW_FS;
  }
  
@@ -3463,9 +3702,10 @@ llvmpipe_bind_fs_state(struct pipe_context *pipe, void *fs)
   * Remove shader variant from two lists: the shader's variant list
   * and the context's variant list.
   */
-static void
-llvmpipe_remove_shader_variant(struct llvmpipe_context *lp,
-                               struct lp_fragment_shader_variant *variant)
+
+static
+void llvmpipe_remove_shader_variant(struct llvmpipe_context *lp,
+                                    struct lp_fragment_shader_variant *variant)
  {
     if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) {
        debug_printf("llvmpipe: del fs #%u var %u v created %u v cached %u "
@@ -3476,8 +3716,6 @@ llvmpipe_remove_shader_variant(struct llvmpipe_context *lp,
                     lp->nr_fs_variants, variant->nr_instrs, lp->nr_fs_instrs);
     }
  
-   gallivm_destroy(variant->gallivm);
-
     /* remove from shader's list */
     remove_from_list(&variant->list_item_local);
     variant->shader->variants_cached--;
@@ -3486,10 +3724,32 @@ llvmpipe_remove_shader_variant(struct llvmpipe_context *lp,
     remove_from_list(&variant->list_item_global);
     lp->nr_fs_variants--;
     lp->nr_fs_instrs -= variant->nr_instrs;
+}
+
+void
+llvmpipe_destroy_shader_variant(struct llvmpipe_context *lp,
+                               struct lp_fragment_shader_variant *variant)
+{  /*
+    * Frees a fragment shader variant: destroys its gallivm state, resets its
+    * shader pointer through lp_fs_reference() and frees the variant itself.
+    * It does not unlink the variant from the shader/context lists; that is
+    * done separately by llvmpipe_remove_shader_variant().
+    */
+   gallivm_destroy(variant->gallivm);
+
+   lp_fs_reference(lp, &variant->shader, NULL); /* presumably drops the reference taken in generate_variant() - helper not visible here */
  
     FREE(variant);
  }
  
+void
+llvmpipe_destroy_fs(struct llvmpipe_context *llvmpipe,
+                    struct lp_fragment_shader *shader)
+{  /*
+    * Frees a fragment shader object: its draw-module data, its NIR (when
+    * present), its token array and the shader struct itself. All variants
+    * are expected to have been removed already (asserted below).
+    */
+   /* Delete draw module's data */
+   draw_delete_fragment_shader(llvmpipe->draw, shader->draw_data);
+
+   if (shader->base.ir.nir)
+      ralloc_free(shader->base.ir.nir);
+   assert(shader->variants_cached == 0); /* no variant may still be listed for this shader */
+   FREE((void *) shader->base.tokens);   /* cast removes the const qualifier from the token pointer */
+   FREE(shader);
+}
  
  static void
  llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
@@ -3498,35 +3758,20 @@ llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
     struct lp_fragment_shader *shader = fs;
     struct lp_fs_variant_list_item *li;
  
-   assert(fs != llvmpipe->fs);
-
-   /*
-    * XXX: we need to flush the context until we have some sort of reference
-    * counting in fragment shaders as they may still be binned
-    * Flushing alone might not sufficient we need to wait on it too.
-    */
-   llvmpipe_finish(pipe, __FUNCTION__);
-
     /* Delete all the variants */
     li = first_elem(&shader->variants);
     while(!at_end(&shader->variants, li)) {
        struct lp_fs_variant_list_item *next = next_elem(li);
+      struct lp_fragment_shader_variant *variant;
+      variant = li->base;
        llvmpipe_remove_shader_variant(llvmpipe, li->base);
+      lp_fs_variant_reference(llvmpipe, &variant, NULL);
        li = next;
     }
  
-   /* Delete draw module's data */
-   draw_delete_fragment_shader(llvmpipe->draw, shader->draw_data);
-
-   if (shader->base.ir.nir)
-      ralloc_free(shader->base.ir.nir);
-   assert(shader->variants_cached == 0);
-   FREE((void *) shader->base.tokens);
-   FREE(shader);
+   lp_fs_reference(llvmpipe, &shader, NULL);
  }
  
-
-
  static void
  llvmpipe_set_constant_buffer(struct pipe_context *pipe,
                               enum pipe_shader_type shader, uint index,
@@ -3742,9 +3987,7 @@ make_variant_key(struct llvmpipe_context *lp,
        key->occlusion_count = TRUE;
     }
  
-   if (lp->framebuffer.nr_cbufs) {
-      memcpy(&key->blend, lp->blend, sizeof key->blend);
-   }
+   memcpy(&key->blend, lp->blend, sizeof key->blend);
  
     key->coverage_samples = 1;
     key->min_samples = 1;
@@ -3947,8 +4190,6 @@ llvmpipe_update_fs(struct llvmpipe_context *lp)
  
        if (variants_to_cull ||
            lp->nr_fs_instrs >= LP_MAX_SHADER_INSTRUCTIONS) {
-         struct pipe_context *pipe = &lp->pipe;
-
           if (gallivm_debug & GALLIVM_DEBUG_PERF) {
              debug_printf("Evicting FS: %u fs variants,\t%u total variants,"
                           "\t%u instrs,\t%u instrs/variant\n",
@@ -3957,13 +4198,6 @@ llvmpipe_update_fs(struct llvmpipe_context *lp)
                           lp->nr_fs_instrs / lp->nr_fs_variants);
           }
  
-         /*
-          * XXX: we need to flush the context until we have some sort of
-          * reference counting in fragment shaders as they may still be binned
-          * Flushing alone might not be sufficient we need to wait on it too.
-          */
-         llvmpipe_finish(pipe, __FUNCTION__);
-
           /*
            * We need to re-check lp->nr_fs_variants because an arbitrarliy large
            * number of shader variants (potentially all of them) could be
@@ -3979,6 +4213,7 @@ llvmpipe_update_fs(struct llvmpipe_context *lp)
              assert(item);
              assert(item->base);
              llvmpipe_remove_shader_variant(lp, item->base);
+            lp_fs_variant_reference(lp, &item->base, NULL);
           }
        }