gallium: fix windows build from params change.

[mesa.git] / src / gallium / drivers / llvmpipe / lp_state_fs.c
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c

index 431f31efaf70b3d4a927dd4283088876b597f9a7..101877ded3471446fc0d7b4a77b8caadae7e7b6c 100644 (file)
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -65,9 +65,9 @@
  #include "util/u_format.h"
  #include "util/u_dump.h"
  #include "util/u_string.h"
-#include "util/u_simple_list.h"
+#include "util/simple_list.h"
  #include "util/u_dual_blend.h"
-#include "os/os_time.h"
+#include "util/os_time.h"
  #include "pipe/p_shader_tokens.h"
  #include "draw/draw_context.h"
  #include "tgsi/tgsi_dump.h"
@@ -84,6 +84,7 @@
  #include "gallivm/lp_bld_flow.h"
  #include "gallivm/lp_bld_debug.h"
  #include "gallivm/lp_bld_arit.h"
+#include "gallivm/lp_bld_bitarit.h"
  #include "gallivm/lp_bld_pack.h"
  #include "gallivm/lp_bld_format.h"
  #include "gallivm/lp_bld_quad.h"
@@ -127,14 +128,14 @@ generate_quad_mask(struct gallivm_state *gallivm,
     struct lp_type mask_type;
     LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
     LLVMValueRef bits[16];
-   LLVMValueRef mask;
+   LLVMValueRef mask, bits_vec;
     int shift, i;
  
     /*
      * XXX: We'll need a different path for 16 x u8
      */
     assert(fs_type.width == 32);
-   assert(fs_type.length <= Elements(bits));
+   assert(fs_type.length <= ARRAY_SIZE(bits));
     mask_type = lp_int_type(fs_type);
  
     /*
@@ -174,20 +175,20 @@ generate_quad_mask(struct gallivm_state *gallivm,
  
     for (i = 0; i < fs_type.length / 4; i++) {
        unsigned j = 2 * (i % 2) + (i / 2) * 8;
-      bits[4*i + 0] = LLVMConstInt(i32t, 1 << (j + 0), 0);
-      bits[4*i + 1] = LLVMConstInt(i32t, 1 << (j + 1), 0);
-      bits[4*i + 2] = LLVMConstInt(i32t, 1 << (j + 4), 0);
-      bits[4*i + 3] = LLVMConstInt(i32t, 1 << (j + 5), 0);
+      bits[4*i + 0] = LLVMConstInt(i32t, 1ULL << (j + 0), 0);
+      bits[4*i + 1] = LLVMConstInt(i32t, 1ULL << (j + 1), 0);
+      bits[4*i + 2] = LLVMConstInt(i32t, 1ULL << (j + 4), 0);
+      bits[4*i + 3] = LLVMConstInt(i32t, 1ULL << (j + 5), 0);
     }
-   mask = LLVMBuildAnd(builder, mask, LLVMConstVector(bits, fs_type.length), "");
+   bits_vec = LLVMConstVector(bits, fs_type.length);
+   mask = LLVMBuildAnd(builder, mask, bits_vec, "");
  
     /*
-    * mask = mask != 0 ? ~0 : 0
+    * mask = mask == bits ? ~0 : 0
      */
     mask = lp_build_compare(gallivm,
-                           mask_type, PIPE_FUNC_NOTEQUAL,
-                           mask,
-                           lp_build_const_int_vec(gallivm, mask_type, 0));
+                           mask_type, PIPE_FUNC_EQUAL,
+                           mask, bits_vec);
  
     return mask;
  }
@@ -238,6 +239,54 @@ lp_llvm_viewport(LLVMValueRef context_ptr,
  }
  
  
+static LLVMValueRef
+lp_build_depth_clamp(struct gallivm_state *gallivm,
+                     LLVMBuilderRef builder,
+                     struct lp_type type,
+                     LLVMValueRef context_ptr,
+                     LLVMValueRef thread_data_ptr,
+                     LLVMValueRef z)
+{
+   LLVMValueRef viewport, min_depth, max_depth;
+   LLVMValueRef viewport_index;
+   struct lp_build_context f32_bld;
+
+   assert(type.floating);
+   lp_build_context_init(&f32_bld, gallivm, type);
+
+   /*
+    * Assumes clamping of the viewport index will occur in setup/gs. Value
+    * is passed through the rasterization stage via lp_rast_shader_inputs.
+    *
+    * See: draw_clamp_viewport_idx and lp_clamp_viewport_idx for clamping
+    *      semantics.
+    */
+   viewport_index = lp_jit_thread_data_raster_state_viewport_index(gallivm,
+                       thread_data_ptr);
+
+   /*
+    * Load the min and max depth from the lp_jit_context.viewports
+    * array of lp_jit_viewport structures.
+    */
+   viewport = lp_llvm_viewport(context_ptr, gallivm, viewport_index);
+
+   /* viewports[viewport_index].min_depth */
+   min_depth = LLVMBuildExtractElement(builder, viewport,
+                  lp_build_const_int32(gallivm, LP_JIT_VIEWPORT_MIN_DEPTH), "");
+   min_depth = lp_build_broadcast_scalar(&f32_bld, min_depth);
+
+   /* viewports[viewport_index].max_depth */
+   max_depth = LLVMBuildExtractElement(builder, viewport,
+                  lp_build_const_int32(gallivm, LP_JIT_VIEWPORT_MAX_DEPTH), "");
+   max_depth = lp_build_broadcast_scalar(&f32_bld, max_depth);
+
+   /*
+    * Clamp to the min and max depth values for the given viewport.
+    */
+   return lp_build_clamp(&f32_bld, z, min_depth, max_depth);
+}
+
+
  /**
   * Generate the fragment shader, depth/stencil test, and alpha tests.
   */
@@ -250,7 +299,7 @@ generate_fs_loop(struct gallivm_state *gallivm,
                   LLVMValueRef context_ptr,
                   LLVMValueRef num_loop,
                   struct lp_build_interp_soa_context *interp,
-                 struct lp_build_sampler_soa *sampler,
+                 const struct lp_build_sampler_soa *sampler,
                   LLVMValueRef mask_store,
                   LLVMValueRef (*out_color)[4],
                   LLVMValueRef depth_ptr,
@@ -260,9 +309,11 @@ generate_fs_loop(struct gallivm_state *gallivm,
  {
     const struct util_format_description *zs_format_desc = NULL;
     const struct tgsi_token *tokens = shader->base.tokens;
-   LLVMTypeRef vec_type;
+   struct lp_type int_type = lp_int_type(type);
+   LLVMTypeRef vec_type, int_vec_type;
     LLVMValueRef mask_ptr, mask_val;
     LLVMValueRef consts_ptr, num_consts_ptr;
+   LLVMValueRef ssbo_ptr, num_ssbo_ptr;
     LLVMValueRef z;
     LLVMValueRef z_value, s_value;
     LLVMValueRef z_fb, s_fb;
@@ -295,10 +346,11 @@ generate_fs_loop(struct gallivm_state *gallivm,
        zs_format_desc = util_format_description(key->zsbuf_format);
        assert(zs_format_desc);
  
-      if (!shader->info.base.writes_z) {
+      if (!shader->info.base.writes_z && !shader->info.base.writes_stencil) {
           if (key->alpha.enabled ||
               key->blend.alpha_to_coverage ||
-             shader->info.base.uses_kill) {
+             shader->info.base.uses_kill ||
+             shader->info.base.writes_samplemask) {
              /* With alpha test and kill, can do the depth test early
               * and hopefully eliminate some quads.  But need to do a
               * special deferred depth write once the final mask value
@@ -329,15 +381,21 @@ generate_fs_loop(struct gallivm_state *gallivm,
        depth_mode = 0;
     }
  
+   vec_type = lp_build_vec_type(gallivm, type);
+   int_vec_type = lp_build_vec_type(gallivm, int_type);
  
     stencil_refs[0] = lp_jit_context_stencil_ref_front_value(gallivm, context_ptr);
     stencil_refs[1] = lp_jit_context_stencil_ref_back_value(gallivm, context_ptr);
-
-   vec_type = lp_build_vec_type(gallivm, type);
+   /* convert scalar stencil refs into vectors */
+   stencil_refs[0] = lp_build_broadcast(gallivm, int_vec_type, stencil_refs[0]);
+   stencil_refs[1] = lp_build_broadcast(gallivm, int_vec_type, stencil_refs[1]);
  
     consts_ptr = lp_jit_context_constants(gallivm, context_ptr);
     num_consts_ptr = lp_jit_context_num_constants(gallivm, context_ptr);
  
+   ssbo_ptr = lp_jit_context_ssbos(gallivm, context_ptr);
+   num_ssbo_ptr = lp_jit_context_num_ssbos(gallivm, context_ptr);
+
     lp_build_for_loop_begin(&loop_state, gallivm,
                             lp_build_const_int32(gallivm, 0),
                             LLVMIntULT,
@@ -379,6 +437,13 @@ generate_fs_loop(struct gallivm_state *gallivm,
     z = interp->pos[2];
  
     if (depth_mode & EARLY_DEPTH_TEST) {
+      /*
+       * Clamp according to ARB_depth_clamp semantics.
+       */
+      if (key->depth_clamp) {
+         z = lp_build_depth_clamp(gallivm, builder, type, context_ptr,
+                                  thread_data_ptr, z);
+      }
        lp_build_depth_stencil_load_swizzled(gallivm, type,
                                             zs_format_desc, key->resource_1d,
                                             depth_ptr, depth_stride,
@@ -413,11 +478,25 @@ generate_fs_loop(struct gallivm_state *gallivm,
  
     lp_build_interp_soa_update_inputs_dyn(interp, gallivm, loop_state.counter);
  
+   struct lp_build_tgsi_params params;
+   memset(&params, 0, sizeof(params));
+
+   params.type = type;
+   params.mask = &mask;
+   params.consts_ptr = consts_ptr;
+   params.const_sizes_ptr = num_consts_ptr;
+   params.system_values = &system_values;
+   params.inputs = interp->inputs;
+   params.context_ptr = context_ptr;
+   params.thread_data_ptr = thread_data_ptr;
+   params.sampler = sampler;
+   params.info = &shader->info.base;
+   params.ssbo_ptr = ssbo_ptr;
+   params.ssbo_sizes_ptr = num_ssbo_ptr;
+
     /* Build the actual shader */
-   lp_build_tgsi_soa(gallivm, tokens, type, &mask,
-                     consts_ptr, num_consts_ptr, &system_values,
-                     interp->inputs,
-                     outputs, sampler, &shader->info.base, NULL);
+   lp_build_tgsi_soa(gallivm, tokens, &params,
+                     outputs);
  
     /* Alpha test */
     if (key->alpha.enabled) {
@@ -456,59 +535,51 @@ generate_fs_loop(struct gallivm_state *gallivm,
        }
     }
  
+   if (shader->info.base.writes_samplemask) {
+      int smaski = find_output_by_semantic(&shader->info.base,
+                                           TGSI_SEMANTIC_SAMPLEMASK,
+                                           0);
+      LLVMValueRef smask;
+      struct lp_build_context smask_bld;
+      lp_build_context_init(&smask_bld, gallivm, int_type);
+
+      assert(smaski >= 0);
+      smask = LLVMBuildLoad(builder, outputs[smaski][0], "smask");
+      /*
+       * Pixel is alive according to the first sample in the mask.
+       */
+      smask = LLVMBuildBitCast(builder, smask, smask_bld.vec_type, "");
+      smask = lp_build_and(&smask_bld, smask, smask_bld.one);
+      smask = lp_build_cmp(&smask_bld, PIPE_FUNC_NOTEQUAL, smask, smask_bld.zero);
+      lp_build_mask_update(&mask, smask);
+   }
+
     /* Late Z test */
     if (depth_mode & LATE_DEPTH_TEST) {
        int pos0 = find_output_by_semantic(&shader->info.base,
                                           TGSI_SEMANTIC_POSITION,
                                           0);
-
+      int s_out = find_output_by_semantic(&shader->info.base,
+                                          TGSI_SEMANTIC_STENCIL,
+                                          0);
        if (pos0 != -1 && outputs[pos0][2]) {
           z = LLVMBuildLoad(builder, outputs[pos0][2], "output.z");
+      }
+      /*
+       * Clamp according to ARB_depth_clamp semantics.
+       */
+      if (key->depth_clamp) {
+         z = lp_build_depth_clamp(gallivm, builder, type, context_ptr,
+                                  thread_data_ptr, z);
+      }
  
-         /*
-          * Clamp according to ARB_depth_clamp semantics.
-          */
-         if (key->depth_clamp) {
-            LLVMValueRef viewport, min_depth, max_depth;
-            LLVMValueRef viewport_index;
-            struct lp_build_context f32_bld;
-
-            assert(type.floating);
-            lp_build_context_init(&f32_bld, gallivm, type);
-
-            /*
-             * Assumes clamping of the viewport index will occur in setup/gs. Value
-             * is passed through the rasterization stage via lp_rast_shader_inputs.
-             *
-             * See: draw_clamp_viewport_idx and lp_clamp_viewport_idx for clamping
-             *      semantics.
-             */
-            viewport_index = lp_jit_thread_data_raster_state_viewport_index(gallivm,
-                                thread_data_ptr);
-
-            /*
-             * Load the min and max depth from the lp_jit_context.viewports
-             * array of lp_jit_viewport structures.
-             */
-            viewport = lp_llvm_viewport(context_ptr, gallivm, viewport_index);
-
-            /* viewports[viewport_index].min_depth */
-            min_depth = LLVMBuildExtractElement(builder, viewport,
-                           lp_build_const_int32(gallivm, LP_JIT_VIEWPORT_MIN_DEPTH),
-                           "");
-            min_depth = lp_build_broadcast_scalar(&f32_bld, min_depth);
-
-            /* viewports[viewport_index].max_depth */
-            max_depth = LLVMBuildExtractElement(builder, viewport,
-                           lp_build_const_int32(gallivm, LP_JIT_VIEWPORT_MAX_DEPTH),
-                           "");
-            max_depth = lp_build_broadcast_scalar(&f32_bld, max_depth);
-
-            /*
-             * Clamp to the min and max depth values for the given viewport.
-             */
-            z = lp_build_clamp(&f32_bld, z, min_depth, max_depth);
-         }
+      if (s_out != -1 && outputs[s_out][1]) {
+         /* there's only one value, and spec says to discard additional bits */
+         LLVMValueRef s_max_mask = lp_build_const_int_vec(gallivm, int_type, 255);
+         stencil_refs[0] = LLVMBuildLoad(builder, outputs[s_out][1], "output.s");
+         stencil_refs[0] = LLVMBuildBitCast(builder, stencil_refs[0], int_vec_type, "");
+         stencil_refs[0] = LLVMBuildAnd(builder, stencil_refs[0], s_max_mask, "");
+         stencil_refs[1] = stencil_refs[0];
        }
  
        lp_build_depth_stencil_load_swizzled(gallivm, type,
@@ -630,7 +701,7 @@ generate_fs_twiddle(struct gallivm_state *gallivm,
     src_count = num_fs * src_channels;
  
     assert(pixels == 2 || pixels == 1);
-   assert(num_fs * src_channels <= Elements(src));
+   assert(num_fs * src_channels <= ARRAY_SIZE(src));
  
     /*
      * Transpose from SoA -> AoS
@@ -701,6 +772,10 @@ generate_fs_twiddle(struct gallivm_state *gallivm,
        }
     } else if (twiddle) {
        /* Twiddle pixels across elements of array */
+      /*
+       * XXX: we should avoid this in some cases, but would need to tell
+       * lp_build_conv to reorder (or deal with it ourselves).
+       */
        lp_bld_quad_twiddle(gallivm, type, src, src_count, dst);
     } else {
        /* Do nothing */
@@ -731,6 +806,94 @@ generate_fs_twiddle(struct gallivm_state *gallivm,
  }
  
  
+/*
+ * Untwiddle and transpose, much like the above.
+ * However, this is after conversion, so we get packed vectors.
+ * At this time only handle 4x16i8 rgba / 2x16i8 rg / 1x16i8 r data,
+ * the vectors will look like:
+ * r0r1r4r5r2r3r6r7r8r9r12... (albeit color channels may
+ * be swizzled here). Extending to 16bit should be trivial.
+ * Should also be extended to handle twice wide vectors with AVX2...
+ */
+static void
+fs_twiddle_transpose(struct gallivm_state *gallivm,
+                     struct lp_type type,
+                     LLVMValueRef *src,
+                     unsigned src_count,
+                     LLVMValueRef *dst)
+{
+   unsigned i, j;
+   struct lp_type type64, type16, type32;
+   LLVMTypeRef type64_t, type8_t, type16_t, type32_t;
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef tmp[4], shuf[8];
+   for (j = 0; j < 2; j++) {
+      shuf[j*4 + 0] = lp_build_const_int32(gallivm, j*4 + 0);
+      shuf[j*4 + 1] = lp_build_const_int32(gallivm, j*4 + 2);
+      shuf[j*4 + 2] = lp_build_const_int32(gallivm, j*4 + 1);
+      shuf[j*4 + 3] = lp_build_const_int32(gallivm, j*4 + 3);
+   }
+
+   assert(src_count == 4 || src_count == 2 || src_count == 1);
+   assert(type.width == 8);
+   assert(type.length == 16);
+
+   type8_t = lp_build_vec_type(gallivm, type);
+
+   type64 = type;
+   type64.length /= 8;
+   type64.width *= 8;
+   type64_t = lp_build_vec_type(gallivm, type64);
+
+   type16 = type;
+   type16.length /= 2;
+   type16.width *= 2;
+   type16_t = lp_build_vec_type(gallivm, type16);
+
+   type32 = type;
+   type32.length /= 4;
+   type32.width *= 4;
+   type32_t = lp_build_vec_type(gallivm, type32);
+
+   lp_build_transpose_aos_n(gallivm, type, src, src_count, tmp);
+
+   if (src_count == 1) {
+      /* transpose was no-op, just untwiddle */
+      LLVMValueRef shuf_vec;
+      shuf_vec = LLVMConstVector(shuf, 8);
+      tmp[0] = LLVMBuildBitCast(builder, src[0], type16_t, "");
+      tmp[0] = LLVMBuildShuffleVector(builder, tmp[0], tmp[0], shuf_vec, "");
+      dst[0] = LLVMBuildBitCast(builder, tmp[0], type8_t, "");
+   } else if (src_count == 2) {
+      LLVMValueRef shuf_vec;
+      shuf_vec = LLVMConstVector(shuf, 4);
+
+      for (i = 0; i < 2; i++) {
+         tmp[i] = LLVMBuildBitCast(builder, tmp[i], type32_t, "");
+         tmp[i] = LLVMBuildShuffleVector(builder, tmp[i], tmp[i], shuf_vec, "");
+         dst[i] = LLVMBuildBitCast(builder, tmp[i], type8_t, "");
+      }
+   } else {
+      for (j = 0; j < 2; j++) {
+         LLVMValueRef lo, hi, lo2, hi2;
+          /*
+          * Note that if we only really have 3 valid channels (rgb)
+          * and we don't need alpha we could substitute a undef here
+          * for the respective channel (causing llvm to drop conversion
+          * for alpha).
+          */
+         /* we now have rgba0rgba1rgba4rgba5 etc, untwiddle */
+         lo2 = LLVMBuildBitCast(builder, tmp[j*2], type64_t, "");
+         hi2 = LLVMBuildBitCast(builder, tmp[j*2 + 1], type64_t, "");
+         lo = lp_build_interleave2(gallivm, type64, lo2, hi2, 0);
+         hi = lp_build_interleave2(gallivm, type64, lo2, hi2, 1);
+         dst[j*2] = LLVMBuildBitCast(builder, lo, type8_t, "");
+         dst[j*2 + 1] = LLVMBuildBitCast(builder, hi, type8_t, "");
+      }
+   }
+}
+
+
  /**
   * Load an unswizzled block of pixels from memory
   */
@@ -766,11 +929,12 @@ load_unswizzled_block(struct gallivm_state *gallivm,
        gep[1] = LLVMBuildAdd(builder, bx, by, "");
  
        dst_ptr = LLVMBuildGEP(builder, base_ptr, gep, 2, "");
-      dst_ptr = LLVMBuildBitCast(builder, dst_ptr, LLVMPointerType(lp_build_vec_type(gallivm, dst_type), 0), "");
+      dst_ptr = LLVMBuildBitCast(builder, dst_ptr,
+                                 LLVMPointerType(lp_build_vec_type(gallivm, dst_type), 0), "");
  
        dst[i] = LLVMBuildLoad(builder, dst_ptr, "");
  
-      lp_set_load_alignment(dst[i], dst_alignment);
+      LLVMSetAlignment(dst[i], dst_alignment);
     }
  }
  
@@ -810,11 +974,12 @@ store_unswizzled_block(struct gallivm_state *gallivm,
        gep[1] = LLVMBuildAdd(builder, bx, by, "");
  
        src_ptr = LLVMBuildGEP(builder, base_ptr, gep, 2, "");
-      src_ptr = LLVMBuildBitCast(builder, src_ptr, LLVMPointerType(lp_build_vec_type(gallivm, src_type), 0), "");
+      src_ptr = LLVMBuildBitCast(builder, src_ptr,
+                                 LLVMPointerType(lp_build_vec_type(gallivm, src_type), 0), "");
  
        src_ptr = LLVMBuildStore(builder, src[i], src_ptr);
  
-      lp_set_store_alignment(src_ptr, src_alignment);
+      LLVMSetAlignment(src_ptr, src_alignment);
     }
  }
  
@@ -824,7 +989,7 @@ store_unswizzled_block(struct gallivm_state *gallivm,
   *
   * A format which has irregular channel sizes such as R3_G3_B2 or R5_G6_B5.
   */
-static INLINE boolean
+static inline boolean
  is_arithmetic_format(const struct util_format_description *format_desc)
  {
     boolean arith = false;
@@ -844,7 +1009,7 @@ is_arithmetic_format(const struct util_format_description *format_desc)
   * to floats for blending, and furthermore has "natural" packed AoS -> unpacked
   * SoA conversion.
   */
-static INLINE boolean
+static inline boolean
  format_expands_to_float_soa(const struct util_format_description *format_desc)
  {
     if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT ||
@@ -860,7 +1025,7 @@ format_expands_to_float_soa(const struct util_format_description *format_desc)
   *
   * e.g. RGBA16F = 4x half-float and R3G3B2 = 1x byte
   */
-static INLINE void
+static inline void
  lp_mem_type_from_format_desc(const struct util_format_description *format_desc,
                               struct lp_type* type)
  {
@@ -908,7 +1073,7 @@ lp_mem_type_from_format_desc(const struct util_format_description *format_desc,
   *
   * e.g. RGBA16F = 4x float, R3G3B2 = 3x byte
   */
-static INLINE void
+static inline void
  lp_blend_type_from_format_desc(const struct util_format_description *format_desc,
                                 struct lp_type* type)
  {
@@ -980,7 +1145,7 @@ lp_blend_type_from_format_desc(const struct util_format_description *format_desc
   *
   * but we try to avoid division and multiplication through shifts.
   */
-static INLINE LLVMValueRef
+static inline LLVMValueRef
  scale_bits(struct gallivm_state *gallivm,
             int src_bits,
             int dst_bits,
@@ -1063,7 +1228,7 @@ scale_bits(struct gallivm_state *gallivm,
                              lp_build_const_int_vec(gallivm, src_type, db),
                              "");
  
-      if (db < src_bits) {
+      if (db <= src_bits) {
           /* Enough bits in src to fill the remainder */
           LLVMValueRef lower = LLVMBuildLShr(builder,
                                              src,
@@ -1092,7 +1257,7 @@ scale_bits(struct gallivm_state *gallivm,
  /**
   * If RT is a smallfloat (needing denorms) format
   */
-static INLINE int
+static inline int
  have_smallfloat_format(struct lp_type dst_type,
                         enum pipe_format format)
  {
@@ -1121,7 +1286,7 @@ convert_to_blend_type(struct gallivm_state *gallivm,
     LLVMBuilderRef builder = gallivm->builder;
     struct lp_type blend_type;
     struct lp_type mem_type;
-   unsigned i, j, k;
+   unsigned i, j;
     unsigned pixels = block_size / num_srcs;
     bool is_arith;
  
@@ -1234,9 +1399,7 @@ convert_to_blend_type(struct gallivm_state *gallivm,
           unsigned from_lsb = src_fmt->nr_channels - j - 1;
  #endif
  
-         for (k = 0; k < src_fmt->channel[j].size; ++k) {
-            mask |= 1 << k;
-         }
+         mask = (1 << src_fmt->channel[j].size) - 1;
  
           /* Extract bits from source */
           chans[j] = LLVMBuildLShr(builder,
@@ -1427,7 +1590,8 @@ convert_from_blend_type(struct gallivm_state *gallivm,
           /* Extract bits */
           chans[j] = LLVMBuildLShr(builder,
                                    dst[i],
-                                  lp_build_const_int_vec(gallivm, src_type, from_lsb * blend_type.width),
+                                  lp_build_const_int_vec(gallivm, src_type,
+                                                         from_lsb * blend_type.width),
                                    "");
  
           chans[j] = LLVMBuildAnd(builder,
@@ -1515,7 +1679,8 @@ convert_alpha(struct gallivm_state *gallivm,
        /* If there is a src for each pixel broadcast the alpha across whole row */
        if (src_count == block_size) {
           for (i = 0; i < src_count; ++i) {
-            src_alpha[i] = lp_build_broadcast(gallivm, lp_build_vec_type(gallivm, row_type), src_alpha[i]);
+            src_alpha[i] = lp_build_broadcast(gallivm,
+                              lp_build_vec_type(gallivm, row_type), src_alpha[i]);
           }
        } else {
           unsigned pixels = block_size / src_count;
@@ -1585,7 +1750,7 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
     LLVMValueRef fs_src[4][TGSI_NUM_CHANNELS];
     LLVMValueRef fs_src1[4][TGSI_NUM_CHANNELS];
     LLVMValueRef src_alpha[4 * 4];
-   LLVMValueRef src1_alpha[4 * 4];
+   LLVMValueRef src1_alpha[4 * 4] = { NULL };
     LLVMValueRef src_mask[4 * 4];
     LLVMValueRef src[4 * 4];
     LLVMValueRef src1[4 * 4];
@@ -1601,6 +1766,7 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
     struct lp_type blend_type;
     struct lp_type row_type;
     struct lp_type dst_type;
+   struct lp_type ls_type;
  
     unsigned char swizzle[TGSI_NUM_CHANNELS];
     unsigned vector_width;
@@ -1620,6 +1786,7 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
                                       util_blend_state_is_dual(&variant->key.blend, 0);
  
     const boolean is_1d = variant->key.resource_1d;
+   boolean twiddle_after_convert = FALSE;
     unsigned num_fullblock_fs = is_1d ? 2 * num_fs : num_fs;
     LLVMValueRef fpstate = 0;
  
@@ -1666,15 +1833,7 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
     partial_mask |= !variant->opaque;
     i32_zero = lp_build_const_int32(gallivm, 0);
  
-#if HAVE_LLVM < 0x0302
-   /*
-    * undef triggers a crash in LLVMBuildTrunc in convert_from_blend_type in some
-    * cases (seen with r10g10b10a2, 128bit wide vectors) (only used for 1d case).
-    */
-   undef_src_val = lp_build_zero(gallivm, fs_type);
-#else
     undef_src_val = lp_build_undef(gallivm, fs_type);
-#endif
  
     row_type.length = fs_type.length;
     vector_width    = dst_type.floating ? lp_native_vector_width : lp_integer_vector_width;
@@ -1723,13 +1882,23 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
     }
  
     /* If 3 channels then pad to include alpha for 4 element transpose */
-   if (dst_channels == 3 && !has_alpha) {
+   if (dst_channels == 3) {
+      assert (!has_alpha);
        for (i = 0; i < TGSI_NUM_CHANNELS; i++) {
           if (swizzle[i] > TGSI_NUM_CHANNELS)
              swizzle[i] = 3;
        }
        if (out_format_desc->nr_channels == 4) {
           dst_channels = 4;
+         /*
+          * We use alpha from the color conversion, not separate one.
+          * We had to include it for transpose, hence it will get converted
+          * too (albeit when doing transpose after conversion, that would
+          * no longer be the case necessarily).
+          * (It works only with 4 channel dsts, e.g. rgbx formats, because
+          * otherwise we really have padding, not alpha, included.)
+          */
+         has_alpha = true;
        }
     }
  
@@ -1761,6 +1930,7 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
        /*
         * XXX If we include that here maybe could actually use it instead of
         * separate alpha for blending?
+       * (Difficult though we actually convert pad channels, not alpha.)
         */
        if (dst_channels == 3 && !has_alpha) {
           fs_src[i][3] = alpha;
@@ -1768,11 +1938,14 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
  
        /* We split the row_mask and row_alpha as we want 128bit interleave */
        if (fs_type.length == 8) {
-         src_mask[i*2 + 0]  = lp_build_extract_range(gallivm, fs_mask[i], 0, src_channels);
-         src_mask[i*2 + 1]  = lp_build_extract_range(gallivm, fs_mask[i], src_channels, src_channels);
+         src_mask[i*2 + 0]  = lp_build_extract_range(gallivm, fs_mask[i],
+                                                     0, src_channels);
+         src_mask[i*2 + 1]  = lp_build_extract_range(gallivm, fs_mask[i],
+                                                     src_channels, src_channels);
  
           src_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha, 0, src_channels);
-         src_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha, src_channels, src_channels);
+         src_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha,
+                                                     src_channels, src_channels);
        } else {
           src_mask[i] = fs_mask[i];
           src_alpha[i] = alpha;
@@ -1803,7 +1976,8 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
           }
           if (fs_type.length == 8) {
              src1_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha, 0, src_channels);
-            src1_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha, src_channels, src_channels);
+            src1_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha,
+                                                         src_channels, src_channels);
           } else {
              src1_alpha[i] = alpha;
           }
@@ -1829,14 +2003,45 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
        }
     }
  
+   /*
+    * We actually should generally do conversion first (for non-1d cases)
+    * when the blend format is 8 or 16 bits. The reason is obvious,
+    * there's 2 or 4 times less vectors to deal with for the interleave...
+    * Albeit for the AVX (not AVX2) case there's no benefit with 16 bit
+    * vectors (as it can do 32bit unpack with 256bit vectors, but 8/16bit
+    * unpack only with 128bit vectors).
+    * Note: for 16bit sizes really need matching pack conversion code
+    */
+   if (!is_1d && dst_channels != 3 && dst_type.width == 8) {
+      twiddle_after_convert = TRUE;
+   }
+
     /*
      * Pixel twiddle from fragment shader order to memory order
      */
-   src_count = generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs,
-                                   dst_channels, fs_src, src, pad_inline);
-   if (dual_source_blend) {
-      generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, dst_channels,
-                          fs_src1, src1, pad_inline);
+   if (!twiddle_after_convert) {
+      src_count = generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs,
+                                      dst_channels, fs_src, src, pad_inline);
+      if (dual_source_blend) {
+         generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, dst_channels,
+                             fs_src1, src1, pad_inline);
+      }
+   } else {
+      src_count = num_fullblock_fs * dst_channels;
+      /*
+       * We reorder things a bit here, so the cases for 4-wide and 8-wide
+       * (AVX) turn out the same later when untwiddling/transpose (albeit
+       * for true AVX2 path untwiddle needs to be different).
+       * For now just order by colors first (so we can use unpack later).
+       */
+      for (j = 0; j < num_fullblock_fs; j++) {
+         for (i = 0; i < dst_channels; i++) {
+            src[i*num_fullblock_fs + j] = fs_src[j][i];
+            if (dual_source_blend) {
+               src1[i*num_fullblock_fs + j] = fs_src1[j][i];
+            }
+         }
+      }
     }
  
     src_channels = dst_channels < 3 ? dst_channels : 4;
@@ -1880,13 +2085,21 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
        assert(bits == 128 || bits == 256);
     }
  
+   if (twiddle_after_convert) {
+      fs_twiddle_transpose(gallivm, row_type, src, src_count, src);
+      if (dual_source_blend) {
+         fs_twiddle_transpose(gallivm, row_type, src1, src_count, src1);
+      }
+   }
  
     /*
      * Blend Colour conversion
      */
     blend_color = lp_jit_context_f_blend_color(gallivm, context_ptr);
-   blend_color = LLVMBuildPointerCast(builder, blend_color, LLVMPointerType(lp_build_vec_type(gallivm, fs_type), 0), "");
-   blend_color = LLVMBuildLoad(builder, LLVMBuildGEP(builder, blend_color, &i32_zero, 1, ""), "");
+   blend_color = LLVMBuildPointerCast(builder, blend_color,
+                    LLVMPointerType(lp_build_vec_type(gallivm, fs_type), 0), "");
+   blend_color = LLVMBuildLoad(builder, LLVMBuildGEP(builder, blend_color,
+                               &i32_zero, 1, ""), "");
  
     /* Convert */
     lp_build_conv(gallivm, fs_type, blend_type, &blend_color, 1, &blend_color, 1);
@@ -1963,13 +2176,19 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
           mask_type.length = pixels;
           mask_type.width = row_type.width * dst_channels;
  
-         src_mask[i] = LLVMBuildIntCast(builder, src_mask[i], lp_build_int_vec_type(gallivm, mask_type), "");
+         /*
+          * If mask_type width is smaller than 32bit, this doesn't quite
+          * generate the most efficient code (could use some pack).
+          */
+         src_mask[i] = LLVMBuildIntCast(builder, src_mask[i],
+                                        lp_build_int_vec_type(gallivm, mask_type), "");
  
           mask_type.length *= dst_channels;
           mask_type.width /= dst_channels;
        }
  
-      src_mask[i] = LLVMBuildBitCast(builder, src_mask[i], lp_build_int_vec_type(gallivm, mask_type), "");
+      src_mask[i] = LLVMBuildBitCast(builder, src_mask[i],
+                                     lp_build_int_vec_type(gallivm, mask_type), "");
        src_mask[i] = lp_build_pad_vector(gallivm, src_mask[i], row_type.length);
     }
  
@@ -2034,17 +2253,41 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
      */
     dst_alignment = MIN2(16, dst_alignment);
  
+   ls_type = dst_type;
+
+   if (dst_count > src_count) {
+      if ((dst_type.width == 8 || dst_type.width == 16) &&
+          util_is_power_of_two_or_zero(dst_type.length) &&
+          dst_type.length * dst_type.width < 128) {
+         /*
+          * Never try to load values as 4xi8 which we will then
+          * concatenate to larger vectors. This gives llvm a real
+          * headache (the problem is the type legalizer (?) will
+          * try to load that as 4xi8 zext to 4xi32 to fill the vector,
+          * then the shuffles to concatenate are more or less impossible
+          * - llvm is easily capable of generating a sequence of 32
+          * pextrb/pinsrb instructions for that. Albeit it appears to
+          * be fixed in llvm 4.0. So, load and concatenate with 32bit
+          * width to avoid the trouble (16bit seems not as bad, llvm
+          * probably recognizes the load+shuffle as only one shuffle
+          * is necessary, but we can do just the same anyway).
+          */
+         ls_type.length = dst_type.length * dst_type.width / 32;
+         ls_type.width = 32;
+      }
+   }
+
     if (is_1d) {
        load_unswizzled_block(gallivm, color_ptr, stride, block_width, 1,
-                            dst, dst_type, dst_count / 4, dst_alignment);
+                            dst, ls_type, dst_count / 4, dst_alignment);
        for (i = dst_count / 4; i < dst_count; i++) {
-         dst[i] = lp_build_undef(gallivm, dst_type);
+         dst[i] = lp_build_undef(gallivm, ls_type);
        }
  
     }
     else {
        load_unswizzled_block(gallivm, color_ptr, stride, block_width, block_height,
-                            dst, dst_type, dst_count, dst_alignment);
+                            dst, ls_type, dst_count, dst_alignment);
     }
  
  
@@ -2059,7 +2302,24 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
      * on all 16 pixels in that single vector at once.
      */
     if (dst_count > src_count) {
-      lp_build_concat_n(gallivm, dst_type, dst, 4, dst, src_count);
+      if (ls_type.length != dst_type.length && ls_type.length == 1) {
+         LLVMTypeRef elem_type = lp_build_elem_type(gallivm, ls_type);
+         LLVMTypeRef ls_vec_type = LLVMVectorType(elem_type, 1);
+         for (i = 0; i < dst_count; i++) {
+            dst[i] = LLVMBuildBitCast(builder, dst[i], ls_vec_type, "");
+         }
+      }
+
+      lp_build_concat_n(gallivm, ls_type, dst, 4, dst, src_count);
+
+      if (ls_type.length != dst_type.length) {
+         struct lp_type tmp_type = dst_type;
+         tmp_type.length = dst_type.length * 4 / src_count;
+         for (i = 0; i < src_count; i++) {
+            dst[i] = LLVMBuildBitCast(builder, dst[i],
+                                      lp_build_vec_type(gallivm, tmp_type), "");
+         }
+      }
     }
  
     /*
@@ -2074,7 +2334,8 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
      * It seems some cleanup could be done here (like skipping conversion/blend
      * when not needed).
      */
-   convert_to_blend_type(gallivm, block_size, out_format_desc, dst_type, row_type, dst, src_count);
+   convert_to_blend_type(gallivm, block_size, out_format_desc, dst_type,
+                         row_type, dst, src_count);
  
     /*
      * FIXME: Really should get logic ops / masks out of generic blend / row
@@ -2100,7 +2361,8 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
                                    pad_inline ? 4 : dst_channels);
     }
  
-   convert_from_blend_type(gallivm, block_size, out_format_desc, row_type, dst_type, dst, src_count);
+   convert_from_blend_type(gallivm, block_size, out_format_desc,
+                           row_type, dst_type, dst, src_count);
  
     /* Split the blend rows back to memory rows */
     if (dst_count > src_count) {
@@ -2213,7 +2475,7 @@ generate_fragment(struct llvmpipe_context *lp,
  
     /* check if writes to cbuf[0] are to be copied to all cbufs */
     cbuf0_write_all =
-     shader->info.base.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS][0];
+     shader->info.base.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS];
  
     /* TODO: actually pick these based on the fs and color buffer
      * characteristics. */
@@ -2241,8 +2503,8 @@ generate_fragment(struct llvmpipe_context *lp,
  
     blend_vec_type = lp_build_vec_type(gallivm, blend_type);
  
-   util_snprintf(func_name, sizeof(func_name), "fs%u_variant%u_%s",
-                 shader->no, variant->no, partial_mask ? "partial" : "whole");
+   snprintf(func_name, sizeof(func_name), "fs%u_variant%u_%s",
+            shader->no, variant->no, partial_mask ? "partial" : "whole");
  
     arg_types[0] = variant->jit_context_ptr_type;       /* context */
     arg_types[1] = int32_type;                          /* x */
@@ -2259,7 +2521,7 @@ generate_fragment(struct llvmpipe_context *lp,
     arg_types[12] = int32_type;                         /* depth_stride */
  
     func_type = LLVMFunctionType(LLVMVoidTypeInContext(gallivm->context),
-                                arg_types, Elements(arg_types), 0);
+                                arg_types, ARRAY_SIZE(arg_types), 0);
  
     function = LLVMAddFunction(gallivm->module, func_name, func_type);
     LLVMSetFunctionCallConv(function, LLVMCCallConv);
@@ -2269,9 +2531,9 @@ generate_fragment(struct llvmpipe_context *lp,
     /* XXX: need to propagate noalias down into color param now we are
      * passing a pointer-to-pointer?
      */
-   for(i = 0; i < Elements(arg_types); ++i)
+   for(i = 0; i < ARRAY_SIZE(arg_types); ++i)
        if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind)
-         LLVMAddAttribute(LLVMGetParam(function, i), LLVMNoAliasAttribute);
+         lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS);
  
     context_ptr  = LLVMGetParam(function, 0);
     x            = LLVMGetParam(function, 1);
@@ -2295,8 +2557,8 @@ generate_fragment(struct llvmpipe_context *lp,
     lp_build_name(dady_ptr, "dady");
     lp_build_name(color_ptr_ptr, "color_ptr_ptr");
     lp_build_name(depth_ptr, "depth");
-   lp_build_name(thread_data_ptr, "thread_data");
     lp_build_name(mask_input, "mask_input");
+   lp_build_name(thread_data_ptr, "thread_data");
     lp_build_name(stride_ptr, "stride_ptr");
     lp_build_name(depth_stride, "depth_stride");
  
@@ -2309,8 +2571,27 @@ generate_fragment(struct llvmpipe_context *lp,
     assert(builder);
     LLVMPositionBuilderAtEnd(builder, block);
  
+   /*
+    * Must not count ps invocations if there's a null shader.
+    * (It would be ok to count with null shader if there's d/s tests,
+    * but only if there's d/s buffers too, which is different
+    * to implicit rasterization disable which must not depend
+    * on the d/s buffers.)
+    * Could use popcount on mask, but pixel accuracy is not required.
+    * Could disable if there's no stats query, but maybe not worth it.
+    */
+   if (shader->info.base.num_instructions > 1) {
+      LLVMValueRef invocs, val;
+      invocs = lp_jit_thread_data_invocations(gallivm, thread_data_ptr);
+      val = LLVMBuildLoad(builder, invocs, "");
+      val = LLVMBuildAdd(builder, val,
+                         LLVMConstInt(LLVMInt64TypeInContext(gallivm->context), 1, 0),
+                         "invoc_count");
+      LLVMBuildStore(builder, val, invocs);
+   }
+
     /* code generated texture sampling */
-   sampler = lp_llvm_sampler_soa_create(key->state, context_ptr);
+   sampler = lp_llvm_sampler_soa_create(key->state);
  
     num_fs = 16 / fs_type.length; /* number of loops per 4x4 stamp */
     /* for 1d resources only run "upper half" of stamp */
@@ -2323,6 +2604,8 @@ generate_fragment(struct llvmpipe_context *lp,
        LLVMValueRef mask_store = lp_build_array_alloca(gallivm, mask_type,
                                                        num_loop, "mask_store");
        LLVMValueRef color_store[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS];
+      boolean pixel_center_integer =
+         shader->info.base.properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER];
  
        /*
         * The shader input interpolation info is not explicitely baked in the
@@ -2333,7 +2616,8 @@ generate_fragment(struct llvmpipe_context *lp,
                                 gallivm,
                                 shader->info.base.num_inputs,
                                 inputs,
-                               shader->info.base.pixel_center_integer,
+                               pixel_center_integer,
+                               key->depth_clamp,
                                 builder, fs_type,
                                 a0_ptr, dadx_ptr, dady_ptr,
                                 x, y);
@@ -2448,25 +2732,27 @@ dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key)
     for (i = 0; i < key->nr_cbufs; ++i) {
        debug_printf("cbuf_format[%u] = %s\n", i, util_format_name(key->cbuf_format[i]));
     }
-   if (key->depth.enabled) {
+   if (key->depth.enabled || key->stencil[0].enabled) {
        debug_printf("depth.format = %s\n", util_format_name(key->zsbuf_format));
-      debug_printf("depth.func = %s\n", util_dump_func(key->depth.func, TRUE));
+   }
+   if (key->depth.enabled) {
+      debug_printf("depth.func = %s\n", util_str_func(key->depth.func, TRUE));
        debug_printf("depth.writemask = %u\n", key->depth.writemask);
     }
  
     for (i = 0; i < 2; ++i) {
        if (key->stencil[i].enabled) {
-         debug_printf("stencil[%u].func = %s\n", i, util_dump_func(key->stencil[i].func, TRUE));
-         debug_printf("stencil[%u].fail_op = %s\n", i, util_dump_stencil_op(key->stencil[i].fail_op, TRUE));
-         debug_printf("stencil[%u].zpass_op = %s\n", i, util_dump_stencil_op(key->stencil[i].zpass_op, TRUE));
-         debug_printf("stencil[%u].zfail_op = %s\n", i, util_dump_stencil_op(key->stencil[i].zfail_op, TRUE));
+         debug_printf("stencil[%u].func = %s\n", i, util_str_func(key->stencil[i].func, TRUE));
+         debug_printf("stencil[%u].fail_op = %s\n", i, util_str_stencil_op(key->stencil[i].fail_op, TRUE));
+         debug_printf("stencil[%u].zpass_op = %s\n", i, util_str_stencil_op(key->stencil[i].zpass_op, TRUE));
+         debug_printf("stencil[%u].zfail_op = %s\n", i, util_str_stencil_op(key->stencil[i].zfail_op, TRUE));
           debug_printf("stencil[%u].valuemask = 0x%x\n", i, key->stencil[i].valuemask);
           debug_printf("stencil[%u].writemask = 0x%x\n", i, key->stencil[i].writemask);
        }
     }
  
     if (key->alpha.enabled) {
-      debug_printf("alpha.func = %s\n", util_dump_func(key->alpha.func, TRUE));
+      debug_printf("alpha.func = %s\n", util_str_func(key->alpha.func, TRUE));
     }
  
     if (key->occlusion_count) {
@@ -2474,15 +2760,15 @@ dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key)
     }
  
     if (key->blend.logicop_enable) {
-      debug_printf("blend.logicop_func = %s\n", util_dump_logicop(key->blend.logicop_func, TRUE));
+      debug_printf("blend.logicop_func = %s\n", util_str_logicop(key->blend.logicop_func, TRUE));
     }
     else if (key->blend.rt[0].blend_enable) {
-      debug_printf("blend.rgb_func = %s\n",   util_dump_blend_func  (key->blend.rt[0].rgb_func, TRUE));
-      debug_printf("blend.rgb_src_factor = %s\n",   util_dump_blend_factor(key->blend.rt[0].rgb_src_factor, TRUE));
-      debug_printf("blend.rgb_dst_factor = %s\n",   util_dump_blend_factor(key->blend.rt[0].rgb_dst_factor, TRUE));
-      debug_printf("blend.alpha_func = %s\n",       util_dump_blend_func  (key->blend.rt[0].alpha_func, TRUE));
-      debug_printf("blend.alpha_src_factor = %s\n", util_dump_blend_factor(key->blend.rt[0].alpha_src_factor, TRUE));
-      debug_printf("blend.alpha_dst_factor = %s\n", util_dump_blend_factor(key->blend.rt[0].alpha_dst_factor, TRUE));
+      debug_printf("blend.rgb_func = %s\n",   util_str_blend_func  (key->blend.rt[0].rgb_func, TRUE));
+      debug_printf("blend.rgb_src_factor = %s\n",   util_str_blend_factor(key->blend.rt[0].rgb_src_factor, TRUE));
+      debug_printf("blend.rgb_dst_factor = %s\n",   util_str_blend_factor(key->blend.rt[0].rgb_dst_factor, TRUE));
+      debug_printf("blend.alpha_func = %s\n",       util_str_blend_func  (key->blend.rt[0].alpha_func, TRUE));
+      debug_printf("blend.alpha_src_factor = %s\n", util_str_blend_factor(key->blend.rt[0].alpha_src_factor, TRUE));
+      debug_printf("blend.alpha_dst_factor = %s\n", util_str_blend_factor(key->blend.rt[0].alpha_dst_factor, TRUE));
     }
     debug_printf("blend.colormask = 0x%x\n", key->blend.rt[0].colormask);
     if (key->blend.alpha_to_coverage) {
@@ -2492,17 +2778,17 @@ dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key)
        const struct lp_static_sampler_state *sampler = &key->state[i].sampler_state;
        debug_printf("sampler[%u] = \n", i);
        debug_printf("  .wrap = %s %s %s\n",
-                   util_dump_tex_wrap(sampler->wrap_s, TRUE),
-                   util_dump_tex_wrap(sampler->wrap_t, TRUE),
-                   util_dump_tex_wrap(sampler->wrap_r, TRUE));
+                   util_str_tex_wrap(sampler->wrap_s, TRUE),
+                   util_str_tex_wrap(sampler->wrap_t, TRUE),
+                   util_str_tex_wrap(sampler->wrap_r, TRUE));
        debug_printf("  .min_img_filter = %s\n",
-                   util_dump_tex_filter(sampler->min_img_filter, TRUE));
+                   util_str_tex_filter(sampler->min_img_filter, TRUE));
        debug_printf("  .min_mip_filter = %s\n",
-                   util_dump_tex_mipfilter(sampler->min_mip_filter, TRUE));
+                   util_str_tex_mipfilter(sampler->min_mip_filter, TRUE));
        debug_printf("  .mag_img_filter = %s\n",
-                   util_dump_tex_filter(sampler->mag_img_filter, TRUE));
+                   util_str_tex_filter(sampler->mag_img_filter, TRUE));
        if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE)
-         debug_printf("  .compare_func = %s\n", util_dump_func(sampler->compare_func, TRUE));
+         debug_printf("  .compare_func = %s\n", util_str_func(sampler->compare_func, TRUE));
        debug_printf("  .normalized_coords = %u\n", sampler->normalized_coords);
        debug_printf("  .min_max_lod_equal = %u\n", sampler->min_max_lod_equal);
        debug_printf("  .lod_bias_non_zero = %u\n", sampler->lod_bias_non_zero);
@@ -2515,7 +2801,7 @@ dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key)
        debug_printf("  .format = %s\n",
                     util_format_name(texture->format));
        debug_printf("  .target = %s\n",
-                   util_dump_tex_target(texture->target, TRUE));
+                   util_str_tex_target(texture->target, TRUE));
        debug_printf("  .level_zero_only = %u\n",
                     texture->level_zero_only);
        debug_printf("  .pot = %u %u %u\n",
@@ -2548,16 +2834,16 @@ generate_variant(struct llvmpipe_context *lp,
                   const struct lp_fragment_shader_variant_key *key)
  {
     struct lp_fragment_shader_variant *variant;
-   const struct util_format_description *cbuf0_format_desc;
+   const struct util_format_description *cbuf0_format_desc = NULL;
     boolean fullcolormask;
     char module_name[64];
  
     variant = CALLOC_STRUCT(lp_fragment_shader_variant);
-   if(!variant)
+   if (!variant)
        return NULL;
  
-   util_snprintf(module_name, sizeof(module_name), "fs%u_variant%u",
-                 shader->no, shader->variants_created);
+   snprintf(module_name, sizeof(module_name), "fs%u_variant%u",
+            shader->no, shader->variants_created);
  
     variant->gallivm = gallivm_create(module_name, lp->context);
     if (!variant->gallivm) {
@@ -2589,16 +2875,10 @@ generate_variant(struct llvmpipe_context *lp,
           !key->alpha.enabled &&
           !key->blend.alpha_to_coverage &&
           !key->depth.enabled &&
-         !shader->info.base.uses_kill
+         !shader->info.base.uses_kill &&
+         !shader->info.base.writes_samplemask
        ? TRUE : FALSE;
  
-   if ((shader->info.base.num_tokens <= 1) &&
-       !key->depth.enabled && !key->stencil[0].enabled) {
-      variant->ps_inv_multiplier = 0;
-   } else {
-      variant->ps_inv_multiplier = 1;
-   }
-
     if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) {
        lp_debug_fs_variant(variant);
     }
@@ -2685,34 +2965,35 @@ llvmpipe_create_fs_state(struct pipe_context *pipe,
  
        switch (shader->info.base.input_interpolate[i]) {
        case TGSI_INTERPOLATE_CONSTANT:
-        shader->inputs[i].interp = LP_INTERP_CONSTANT;
-        break;
+         shader->inputs[i].interp = LP_INTERP_CONSTANT;
+         break;
        case TGSI_INTERPOLATE_LINEAR:
-        shader->inputs[i].interp = LP_INTERP_LINEAR;
-        break;
+         shader->inputs[i].interp = LP_INTERP_LINEAR;
+         break;
        case TGSI_INTERPOLATE_PERSPECTIVE:
-        shader->inputs[i].interp = LP_INTERP_PERSPECTIVE;
-        break;
+         shader->inputs[i].interp = LP_INTERP_PERSPECTIVE;
+         break;
        case TGSI_INTERPOLATE_COLOR:
-        shader->inputs[i].interp = LP_INTERP_COLOR;
-        break;
+         shader->inputs[i].interp = LP_INTERP_COLOR;
+         break;
        default:
-        assert(0);
-        break;
+         assert(0);
+         break;
        }
  
        switch (shader->info.base.input_semantic_name[i]) {
        case TGSI_SEMANTIC_FACE:
-        shader->inputs[i].interp = LP_INTERP_FACING;
-        break;
+         shader->inputs[i].interp = LP_INTERP_FACING;
+         break;
        case TGSI_SEMANTIC_POSITION:
-        /* Position was already emitted above
-         */
-        shader->inputs[i].interp = LP_INTERP_POSITION;
-        shader->inputs[i].src_index = 0;
-        continue;
+         /* Position was already emitted above
+          */
+         shader->inputs[i].interp = LP_INTERP_POSITION;
+         shader->inputs[i].src_index = 0;
+         continue;
        }
  
+      /* XXX this is a completely pointless index map... */
        shader->inputs[i].src_index = i+1;
     }
  
@@ -2759,18 +3040,17 @@ llvmpipe_bind_fs_state(struct pipe_context *pipe, void *fs)
   * Remove shader variant from two lists: the shader's variant list
   * and the context's variant list.
   */
-void
+static void
  llvmpipe_remove_shader_variant(struct llvmpipe_context *lp,
                                 struct lp_fragment_shader_variant *variant)
  {
-   if (gallivm_debug & GALLIVM_DEBUG_IR) {
-      debug_printf("llvmpipe: del fs #%u var #%u v created #%u v cached"
-                   " #%u v total cached #%u\n",
-                   variant->shader->no,
-                   variant->no,
+   if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) {
+      debug_printf("llvmpipe: del fs #%u var %u v created %u v cached %u "
+                   "v total cached %u inst %u total inst %u\n",
+                   variant->shader->no, variant->no,
                     variant->shader->variants_created,
                     variant->shader->variants_cached,
-                   lp->nr_fs_variants);
+                   lp->nr_fs_variants, variant->nr_instrs, lp->nr_fs_instrs);
     }
  
     gallivm_destroy(variant->gallivm);
@@ -2824,18 +3104,25 @@ llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
  
  static void
  llvmpipe_set_constant_buffer(struct pipe_context *pipe,
-                             uint shader, uint index,
-                             struct pipe_constant_buffer *cb)
+                             enum pipe_shader_type shader, uint index,
+                             const struct pipe_constant_buffer *cb)
  {
     struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
     struct pipe_resource *constants = cb ? cb->buffer : NULL;
  
     assert(shader < PIPE_SHADER_TYPES);
-   assert(index < Elements(llvmpipe->constants[shader]));
+   assert(index < ARRAY_SIZE(llvmpipe->constants[shader]));
  
     /* note: reference counting */
     util_copy_constant_buffer(&llvmpipe->constants[shader][index], cb);
  
+   if (constants) {
+       if (!(constants->bind & PIPE_BIND_CONSTANT_BUFFER)) {
+         debug_printf("Illegal set constant without bind flag\n");
+         constants->bind |= PIPE_BIND_CONSTANT_BUFFER;
+      }
+   }
+
     if (shader == PIPE_SHADER_VERTEX ||
         shader == PIPE_SHADER_GEOMETRY) {
        /* Pass the constants to the 'draw' module */
@@ -2858,19 +3145,48 @@ llvmpipe_set_constant_buffer(struct pipe_context *pipe,
        draw_set_mapped_constant_buffer(llvmpipe->draw, shader,
                                        index, data, size);
     }
-
-   llvmpipe->dirty |= LP_NEW_CONSTANTS;
+   else {
+      llvmpipe->dirty |= LP_NEW_FS_CONSTANTS;
+   }
  
     if (cb && cb->user_buffer) {
        pipe_resource_reference(&constants, NULL);
     }
  }
  
+static void
+llvmpipe_set_shader_buffers(struct pipe_context *pipe,
+                            enum pipe_shader_type shader, unsigned start_slot,
+                            unsigned count, const struct pipe_shader_buffer *buffers,
+                            unsigned writable_bitmask)
+{
+   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+   unsigned i, idx;
+   for (i = start_slot, idx = 0; i < start_slot + count; i++, idx++) {
+      const struct pipe_shader_buffer *buffer = buffers ? &buffers[idx] : NULL;
+
+      util_copy_shader_buffer(&llvmpipe->ssbos[shader][i], buffer);
+
+      if (shader == PIPE_SHADER_VERTEX ||
+          shader == PIPE_SHADER_GEOMETRY) {
+         const unsigned size = buffer ? buffer->buffer_size : 0;
+         const ubyte *data = NULL;
+         if (buffer && buffer->buffer)
+            data = (ubyte *) llvmpipe_resource_data(buffer->buffer);
+         if (data)
+            data += buffer->buffer_offset;
+         draw_set_mapped_shader_buffer(llvmpipe->draw, shader,
+                                       i, data, size);
+      } else if (shader == PIPE_SHADER_FRAGMENT) {
+         llvmpipe->dirty |= LP_NEW_FS_SSBOS;
+      }
+   }
+}
  
  /**
   * Return the blend factor equivalent to a destination alpha of one.
   */
-static INLINE unsigned
+static inline unsigned
  force_dst_alpha_one(unsigned factor, boolean clamped_zero)
  {
     switch(factor) {
@@ -2930,11 +3246,18 @@ make_variant_key(struct llvmpipe_context *lp,
      * depth_clip == 0 implies depth clamping is enabled.
      *
      * When clip_halfz is enabled, then always clamp the depth values.
+    *
+    * XXX: This is incorrect for GL, but correct for d3d10 (depth
+    * clamp is always active in d3d10, regardless if depth clip is
+    * enabled or not).
+    * (GL has an always-on [0,1] clamp on fs depth output instead
+    * to ensure the depth values stay in range. Doesn't look like
+    * we do that, though...)
      */
     if (lp->rasterizer->clip_halfz) {
        key->depth_clamp = 1;
     } else {
-      key->depth_clamp = (lp->rasterizer->depth_clip == 0) ? 1 : 0;
+      key->depth_clamp = (lp->rasterizer->depth_clip_near == 0) ? 1 : 0;
     }
  
     /* alpha test only applies if render buffer 0 is non-integer (or does not exist) */
@@ -3016,7 +3339,7 @@ make_variant_key(struct llvmpipe_context *lp,
            * Also, force rgb/alpha func/factors match, to make AoS blending
            * easier.
            */
-         if (format_desc->swizzle[3] > UTIL_FORMAT_SWIZZLE_W ||
+         if (format_desc->swizzle[3] > PIPE_SWIZZLE_W ||
               format_desc->swizzle[3] == format_desc->swizzle[0]) {
              /* Doesn't cover mixed snorm/unorm but can't render to them anyway */
              boolean clamped_zero = !util_format_is_float(format) &&
@@ -3057,7 +3380,12 @@ make_variant_key(struct llvmpipe_context *lp,
     if (shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] != -1) {
        key->nr_sampler_views = shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1;
        for(i = 0; i < key->nr_sampler_views; ++i) {
-         if(shader->info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1 << i)) {
+         /*
+          * Note sview may exceed what's representable by file_mask.
+          * This will still work, the only downside is that not actually
+          * used views may be included in the shader key.
+          */
+         if(shader->info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1u << (i & 31))) {
              lp_sampler_static_texture_state(&key->state[i].texture_state,
                                              lp->sampler_views[PIPE_SHADER_FRAGMENT][i]);
           }
@@ -3112,7 +3440,7 @@ llvmpipe_update_fs(struct llvmpipe_context *lp)
        unsigned i;
        unsigned variants_to_cull;
  
-      if (0) {
+      if (LP_DEBUG & DEBUG_FS) {
           debug_printf("%u variants,\t%u instrs,\t%u instrs/variant\n",
                        lp->nr_fs_variants,
                        lp->nr_fs_instrs,
@@ -3120,14 +3448,22 @@ llvmpipe_update_fs(struct llvmpipe_context *lp)
        }
  
        /* First, check if we've exceeded the max number of shader variants.
-       * If so, free 25% of them (the least recently used ones).
+       * If so, free 6.25% of them (the least recently used ones).
         */
-      variants_to_cull = lp->nr_fs_variants >= LP_MAX_SHADER_VARIANTS ? LP_MAX_SHADER_VARIANTS / 4 : 0;
+      variants_to_cull = lp->nr_fs_variants >= LP_MAX_SHADER_VARIANTS ? LP_MAX_SHADER_VARIANTS / 16 : 0;
  
        if (variants_to_cull ||
            lp->nr_fs_instrs >= LP_MAX_SHADER_INSTRUCTIONS) {
           struct pipe_context *pipe = &lp->pipe;
  
+         if (gallivm_debug & GALLIVM_DEBUG_PERF) {
+            debug_printf("Evicting FS: %u fs variants,\t%u total variants,"
+                         "\t%u instrs,\t%u instrs/variant\n",
+                         shader->variants_cached,
+                         lp->nr_fs_variants, lp->nr_fs_instrs,
+                         lp->nr_fs_instrs / lp->nr_fs_variants);
+         }
+
           /*
            * XXX: we need to flush the context until we have some sort of
            * reference counting in fragment shaders as they may still be binned
@@ -3189,19 +3525,8 @@ llvmpipe_init_fs_funcs(struct llvmpipe_context *llvmpipe)
     llvmpipe->pipe.delete_fs_state = llvmpipe_delete_fs_state;
  
     llvmpipe->pipe.set_constant_buffer = llvmpipe_set_constant_buffer;
+
+   llvmpipe->pipe.set_shader_buffers = llvmpipe_set_shader_buffers;
  }
  
-/*
- * Rasterization is disabled if there is no pixel shader and
- * both depth and stencil testing are disabled:
- * http://msdn.microsoft.com/en-us/library/windows/desktop/bb205125
- */
-boolean
-llvmpipe_rasterization_disabled(struct llvmpipe_context *lp)
-{
-   boolean null_fs = !lp->fs || lp->fs->info.base.num_tokens <= 1;
  
-   return (null_fs &&
-           !lp->depth_stencil->depth.enabled &&
-           !lp->depth_stencil->stencil[0].enabled);
-}