llvmpipe: Unbreak Z32_FLOAT.
[mesa.git] / src / gallium / drivers / llvmpipe / lp_state_fs.c
index ee912f73c7af6cdada066a0bd9cbfe658c902b55..c09835635dd0e7907a7d0c87d82b9214a69bc11b 100644 (file)
@@ -31,9 +31,6 @@
  * Code generate the whole fragment pipeline.
  *
  * The fragment pipeline consists of the following stages:
- * - triangle edge in/out testing
- * - scissor test
- * - stipple (TBI)
  * - early depth test
  * - fragment shader
  * - alpha test
 #include "pipe/p_defines.h"
 #include "util/u_inlines.h"
 #include "util/u_memory.h"
+#include "util/u_pointer.h"
 #include "util/u_format.h"
 #include "util/u_dump.h"
+#include "util/u_string.h"
+#include "util/u_simple_list.h"
 #include "os/os_time.h"
 #include "pipe/p_shader_tokens.h"
 #include "draw/draw_context.h"
@@ -75,6 +75,7 @@
 #include "gallivm/lp_bld_type.h"
 #include "gallivm/lp_bld_const.h"
 #include "gallivm/lp_bld_conv.h"
+#include "gallivm/lp_bld_init.h"
 #include "gallivm/lp_bld_intr.h"
 #include "gallivm/lp_bld_logic.h"
 #include "gallivm/lp_bld_tgsi.h"
 #include "lp_setup.h"
 #include "lp_state.h"
 #include "lp_tex_sample.h"
+#include "lp_flush.h"
+#include "lp_state_fs.h"
 
 
 #include <llvm-c/Analysis.h>
 
 
-static const unsigned char quad_offset_x[4] = {0, 1, 0, 1};
-static const unsigned char quad_offset_y[4] = {0, 0, 1, 1};
+static unsigned fs_no = 0;
 
 
-/*
- * Derive from the quad's upper left scalar coordinates the coordinates for
- * all other quad pixels
- */
-static void
-generate_pos0(LLVMBuilderRef builder,
-              LLVMValueRef x,
-              LLVMValueRef y,
-              LLVMValueRef *x0,
-              LLVMValueRef *y0)
-{
-   LLVMTypeRef int_elem_type = LLVMInt32Type();
-   LLVMTypeRef int_vec_type = LLVMVectorType(int_elem_type, QUAD_SIZE);
-   LLVMTypeRef elem_type = LLVMFloatType();
-   LLVMTypeRef vec_type = LLVMVectorType(elem_type, QUAD_SIZE);
-   LLVMValueRef x_offsets[QUAD_SIZE];
-   LLVMValueRef y_offsets[QUAD_SIZE];
-   unsigned i;
-
-   x = lp_build_broadcast(builder, int_vec_type, x);
-   y = lp_build_broadcast(builder, int_vec_type, y);
-
-   for(i = 0; i < QUAD_SIZE; ++i) {
-      x_offsets[i] = LLVMConstInt(int_elem_type, quad_offset_x[i], 0);
-      y_offsets[i] = LLVMConstInt(int_elem_type, quad_offset_y[i], 0);
-   }
-
-   x = LLVMBuildAdd(builder, x, LLVMConstVector(x_offsets, QUAD_SIZE), "");
-   y = LLVMBuildAdd(builder, y, LLVMConstVector(y_offsets, QUAD_SIZE), "");
-
-   *x0 = LLVMBuildSIToFP(builder, x, vec_type, "");
-   *y0 = LLVMBuildSIToFP(builder, y, vec_type, "");
-}
-
 
 /**
- * Generate the depth /stencil test code.
+ * Expand the relevant bits of mask_input to a 4-dword mask for the
+ * four pixels in a 2x2 quad.  This will set the four elements of the
+ * quad mask vector to 0 or ~0.
+ *
+ * \param quad  which quad of the quad group to test, in [0,3]
+ * \param mask_input  bitwise mask for the whole 4x4 stamp
  */
-static void
-generate_depth_stencil(LLVMBuilderRef builder,
-                       const struct lp_fragment_shader_variant_key *key,
-                       struct lp_type src_type,
-                       struct lp_build_mask_context *mask,
-                       LLVMValueRef stencil_refs[2],
-                       LLVMValueRef src,
-                       LLVMValueRef dst_ptr,
-                       LLVMValueRef facing,
-                       LLVMValueRef counter)
+static LLVMValueRef
+generate_quad_mask(LLVMBuilderRef builder,
+                   struct lp_type fs_type,
+                   unsigned quad,
+                   LLVMValueRef mask_input) /* int32 */
 {
-   const struct util_format_description *format_desc;
-   struct lp_type dst_type;
-
-   if (!key->depth.enabled && !key->stencil[0].enabled && !key->stencil[1].enabled)
-      return;
+   struct lp_type mask_type;
+   LLVMTypeRef i32t = LLVMInt32Type();
+   LLVMValueRef bits[4];
+   LLVMValueRef mask;
+   int shift;
 
-   format_desc = util_format_description(key->zsbuf_format);
-   assert(format_desc);
+   /*
+    * XXX: We'll need a different path for 16 x u8
+    */
+   assert(fs_type.width == 32);
+   assert(fs_type.length == 4);
+   mask_type = lp_int_type(fs_type);
 
    /*
-    * Depths are expected to be between 0 and 1, even if they are stored in
-    * floats. Setting these bits here will ensure that the lp_build_conv() call
-    * below won't try to unnecessarily clamp the incoming values.
+    * mask_input >>= shift, where shift is 0, 2, 8 or 10 for quad 0..3
     */
-   if(src_type.floating) {
-      src_type.sign = FALSE;
-      src_type.norm = TRUE;
-   }
-   else {
-      assert(!src_type.sign);
-      assert(src_type.norm);
+   
+   switch (quad) {
+   case 0:
+      shift = 0;
+      break;
+   case 1:
+      shift = 2;
+      break;
+   case 2:
+      shift = 8;
+      break;
+   case 3:
+      shift = 10;
+      break;
+   default:
+      assert(0);
+      shift = 0;
    }
 
-   /* Pick the depth type. */
-   dst_type = lp_depth_type(format_desc, src_type.width*src_type.length);
-
-   /* FIXME: Cope with a depth test type with a different bit width. */
-   assert(dst_type.width == src_type.width);
-   assert(dst_type.length == src_type.length);
-
-   /* Convert fragment Z from float to integer */
-   lp_build_conv(builder, src_type, dst_type, &src, 1, &src, 1);
-
-   dst_ptr = LLVMBuildBitCast(builder,
-                              dst_ptr,
-                              LLVMPointerType(lp_build_vec_type(dst_type), 0), "");
-   lp_build_depth_stencil_test(builder,
-                               &key->depth,
-                               key->stencil,
-                               dst_type,
-                               format_desc,
-                               mask,
-                               stencil_refs,
-                               src,
-                               dst_ptr,
-                               facing,
-                               counter);
-}
+   mask_input = LLVMBuildLShr(builder,
+                              mask_input,
+                              LLVMConstInt(i32t, shift, 0),
+                              "");
 
+   /*
+    * mask = { mask_input & (1 << i), for i in {0, 1, 4, 5} }
+    */
 
-/**
- * Generate the code to do inside/outside triangle testing for the
- * four pixels in a 2x2 quad.  This will set the four elements of the
- * quad mask vector to 0 or ~0.
- * \param i  which quad of the quad group to test, in [0,3]
- */
-static void
-generate_tri_edge_mask(LLVMBuilderRef builder,
-                       unsigned i,
-                       LLVMValueRef *mask,      /* ivec4, out */
-                       LLVMValueRef c0,         /* int32 */
-                       LLVMValueRef c1,         /* int32 */
-                       LLVMValueRef c2,         /* int32 */
-                       LLVMValueRef step0_ptr,  /* ivec4 */
-                       LLVMValueRef step1_ptr,  /* ivec4 */
-                       LLVMValueRef step2_ptr)  /* ivec4 */
-{
-#define OPTIMIZE_IN_OUT_TEST 0
-#if OPTIMIZE_IN_OUT_TEST
-   struct lp_build_if_state ifctx;
-   LLVMValueRef not_draw_all;
-#endif
-   struct lp_build_flow_context *flow;
-   struct lp_type i32_type;
-   LLVMTypeRef i32vec4_type;
-   LLVMValueRef c0_vec, c1_vec, c2_vec;
-   LLVMValueRef in_out_mask;
+   mask = lp_build_broadcast(builder, lp_build_vec_type(mask_type), mask_input);
 
-   assert(i < 4);
+   bits[0] = LLVMConstInt(i32t, 1 << 0, 0);
+   bits[1] = LLVMConstInt(i32t, 1 << 1, 0);
+   bits[2] = LLVMConstInt(i32t, 1 << 4, 0);
+   bits[3] = LLVMConstInt(i32t, 1 << 5, 0);
    
-   /* int32 vector type */
-   memset(&i32_type, 0, sizeof i32_type);
-   i32_type.floating = FALSE; /* values are integers */
-   i32_type.sign = TRUE;      /* values are signed */
-   i32_type.norm = FALSE;     /* values are not normalized */
-   i32_type.width = 32;       /* 32-bit int values */
-   i32_type.length = 4;       /* 4 elements per vector */
-
-   i32vec4_type = lp_build_int32_vec4_type();
+   mask = LLVMBuildAnd(builder, mask, LLVMConstVector(bits, 4), "");
 
    /*
-    * Use a conditional here to do detailed pixel in/out testing.
-    * We only have to do this if c0 != INT_MIN.
+    * mask = mask != 0 ? ~0 : 0
     */
-   flow = lp_build_flow_create(builder);
-   lp_build_flow_scope_begin(flow);
-
-   {
-#if OPTIMIZE_IN_OUT_TEST
-      /* not_draw_all = (c0 != INT_MIN) */
-      not_draw_all = LLVMBuildICmp(builder,
-                                   LLVMIntNE,
-                                   c0,
-                                   LLVMConstInt(LLVMInt32Type(), INT_MIN, 0),
-                                   "");
-
-      in_out_mask = lp_build_const_int_vec(i32_type, ~0);
-
-
-      lp_build_flow_scope_declare(flow, &in_out_mask);
-
-      /* if (not_draw_all) {... */
-      lp_build_if(&ifctx, flow, builder, not_draw_all);
-#endif
-      {
-         LLVMValueRef step0_vec, step1_vec, step2_vec;
-         LLVMValueRef m0_vec, m1_vec, m2_vec;
-         LLVMValueRef index, m;
 
-         /* c0_vec = {c0, c0, c0, c0}
-          * Note that we emit this code four times but LLVM optimizes away
-          * three instances of it.
-          */
-         c0_vec = lp_build_broadcast(builder, i32vec4_type, c0);
-         c1_vec = lp_build_broadcast(builder, i32vec4_type, c1);
-         c2_vec = lp_build_broadcast(builder, i32vec4_type, c2);
-         lp_build_name(c0_vec, "edgeconst0vec");
-         lp_build_name(c1_vec, "edgeconst1vec");
-         lp_build_name(c2_vec, "edgeconst2vec");
-
-         /* load step0vec, step1, step2 vec from memory */
-         index = LLVMConstInt(LLVMInt32Type(), i, 0);
-         step0_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step0_ptr, &index, 1, ""), "");
-         step1_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step1_ptr, &index, 1, ""), "");
-         step2_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step2_ptr, &index, 1, ""), "");
-         lp_build_name(step0_vec, "step0vec");
-         lp_build_name(step1_vec, "step1vec");
-         lp_build_name(step2_vec, "step2vec");
-
-         /* m0_vec = step0_ptr[i] > c0_vec */
-         m0_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, step0_vec, c0_vec);
-         m1_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, step1_vec, c1_vec);
-         m2_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, step2_vec, c2_vec);
-
-         /* in_out_mask = m0_vec & m1_vec & m2_vec */
-         m = LLVMBuildAnd(builder, m0_vec, m1_vec, "");
-         in_out_mask = LLVMBuildAnd(builder, m, m2_vec, "");
-         lp_build_name(in_out_mask, "inoutmaskvec");
-      }
-#if OPTIMIZE_IN_OUT_TEST
-      lp_build_endif(&ifctx);
-#endif
+   mask = lp_build_compare(builder,
+                           mask_type, PIPE_FUNC_NOTEQUAL,
+                           mask,
+                           lp_build_const_int_vec(mask_type, 0));
 
-   }
-   lp_build_flow_scope_end(flow);
-   lp_build_flow_destroy(flow);
-
-   /* This is the initial alive/dead pixel mask for a quad of four pixels.
-    * It's an int[4] vector with each word set to 0 or ~0.
-    * Words will get cleared when pixels faile the Z test, etc.
-    */
-   *mask = in_out_mask;
+   return mask;
 }
 
 
-static LLVMValueRef
-generate_scissor_test(LLVMBuilderRef builder,
-                      LLVMValueRef context_ptr,
-                      const struct lp_build_interp_soa_context *interp,
-                      struct lp_type type)
-{
-   LLVMTypeRef vec_type = lp_build_vec_type(type);
-   LLVMValueRef xpos = interp->pos[0], ypos = interp->pos[1];
-   LLVMValueRef xmin, ymin, xmax, ymax;
-   LLVMValueRef m0, m1, m2, m3, m;
-
-   /* xpos, ypos contain the window coords for the four pixels in the quad */
-   assert(xpos);
-   assert(ypos);
-
-   /* get the current scissor bounds, convert to vectors */
-   xmin = lp_jit_context_scissor_xmin_value(builder, context_ptr);
-   xmin = lp_build_broadcast(builder, vec_type, xmin);
-
-   ymin = lp_jit_context_scissor_ymin_value(builder, context_ptr);
-   ymin = lp_build_broadcast(builder, vec_type, ymin);
-
-   xmax = lp_jit_context_scissor_xmax_value(builder, context_ptr);
-   xmax = lp_build_broadcast(builder, vec_type, xmax);
-
-   ymax = lp_jit_context_scissor_ymax_value(builder, context_ptr);
-   ymax = lp_build_broadcast(builder, vec_type, ymax);
+#define EARLY_DEPTH_TEST  0x1
+#define LATE_DEPTH_TEST   0x2
+#define EARLY_DEPTH_WRITE 0x4
+#define LATE_DEPTH_WRITE  0x8
 
-   /* compare the fragment's position coordinates against the scissor bounds */
-   m0 = lp_build_compare(builder, type, PIPE_FUNC_GEQUAL, xpos, xmin);
-   m1 = lp_build_compare(builder, type, PIPE_FUNC_GEQUAL, ypos, ymin);
-   m2 = lp_build_compare(builder, type, PIPE_FUNC_LESS, xpos, xmax);
-   m3 = lp_build_compare(builder, type, PIPE_FUNC_LESS, ypos, ymax);
-
-   /* AND all the masks together */
-   m = LLVMBuildAnd(builder, m0, m1, "");
-   m = LLVMBuildAnd(builder, m, m2, "");
-   m = LLVMBuildAnd(builder, m, m3, "");
-
-   lp_build_name(m, "scissormask");
-
-   return m;
-}
+static int
+find_output_by_semantic( const struct tgsi_shader_info *info,
+                        unsigned semantic,
+                        unsigned index )
+{
+   int i;
 
+   for (i = 0; i < info->num_outputs; i++)
+      if (info->output_semantic_name[i] == semantic &&
+         info->output_semantic_index[i] == index)
+        return i;
 
-static LLVMValueRef
-build_int32_vec_const(int value)
-{
-   struct lp_type i32_type;
-
-   memset(&i32_type, 0, sizeof i32_type);
-   i32_type.floating = FALSE; /* values are integers */
-   i32_type.sign = TRUE;      /* values are signed */
-   i32_type.norm = FALSE;     /* values are not normalized */
-   i32_type.width = 32;       /* 32-bit int values */
-   i32_type.length = 4;       /* 4 elements per vector */
-   return lp_build_const_int_vec(i32_type, value);
+   return -1;
 }
 
 
-
 /**
  * Generate the fragment shader, depth/stencil test, and alpha tests.
  * \param i  which quad in the tile, in range [0,3]
- * \param do_tri_test  if 1, do triangle edge in/out testing
+ * \param partial_mask  if 1, do mask_input testing
  */
 static void
 generate_fs(struct llvmpipe_context *lp,
@@ -390,33 +219,62 @@ generate_fs(struct llvmpipe_context *lp,
             struct lp_type type,
             LLVMValueRef context_ptr,
             unsigned i,
-            const struct lp_build_interp_soa_context *interp,
+            struct lp_build_interp_soa_context *interp,
             struct lp_build_sampler_soa *sampler,
             LLVMValueRef *pmask,
             LLVMValueRef (*color)[4],
             LLVMValueRef depth_ptr,
             LLVMValueRef facing,
-            unsigned do_tri_test,
-            LLVMValueRef c0,
-            LLVMValueRef c1,
-            LLVMValueRef c2,
-            LLVMValueRef step0_ptr,
-            LLVMValueRef step1_ptr,
-            LLVMValueRef step2_ptr,
+            unsigned partial_mask,
+            LLVMValueRef mask_input,
             LLVMValueRef counter)
 {
+   const struct util_format_description *zs_format_desc = NULL;
    const struct tgsi_token *tokens = shader->base.tokens;
    LLVMTypeRef vec_type;
    LLVMValueRef consts_ptr;
    LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][NUM_CHANNELS];
-   LLVMValueRef z = interp->pos[2];
+   LLVMValueRef z;
+   LLVMValueRef zs_value = NULL;
    LLVMValueRef stencil_refs[2];
-   struct lp_build_flow_context *flow;
    struct lp_build_mask_context mask;
-   boolean early_depth_stencil_test;
+   boolean simple_shader = (shader->info.base.file_count[TGSI_FILE_SAMPLER] == 0 &&
+                            shader->info.base.num_inputs < 3 &&
+                            shader->info.base.num_instructions < 8);
    unsigned attrib;
    unsigned chan;
    unsigned cbuf;
+   unsigned depth_mode;
+
+   if (key->depth.enabled ||
+       key->stencil[0].enabled ||
+       key->stencil[1].enabled) {
+
+      zs_format_desc = util_format_description(key->zsbuf_format);
+      assert(zs_format_desc);
+
+      if (!shader->info.base.writes_z) {
+         if (key->alpha.enabled || shader->info.base.uses_kill)
+            /* With alpha test and kill, can do the depth test early
+             * and hopefully eliminate some quads.  But need to do a
+             * special deferred depth write once the final mask value
+             * is known.
+             */
+            depth_mode = EARLY_DEPTH_TEST | LATE_DEPTH_WRITE;
+         else
+            depth_mode = EARLY_DEPTH_TEST | EARLY_DEPTH_WRITE;
+      }
+      else {
+         depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE;
+      }
+
+      if (!(key->depth.enabled && key->depth.writemask) &&
+          !(key->stencil[0].enabled && key->stencil[0].writemask))
+         depth_mode &= ~(LATE_DEPTH_WRITE | EARLY_DEPTH_WRITE);
+   }
+   else {
+      depth_mode = 0;
+   }
 
    assert(i < 4);
 
@@ -427,106 +285,144 @@ generate_fs(struct llvmpipe_context *lp,
 
    consts_ptr = lp_jit_context_constants(builder, context_ptr);
 
-   flow = lp_build_flow_create(builder);
-
    memset(outputs, 0, sizeof outputs);
 
-   lp_build_flow_scope_begin(flow);
-
    /* Declare the color and z variables */
    for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
       for(chan = 0; chan < NUM_CHANNELS; ++chan) {
-        color[cbuf][chan] = LLVMGetUndef(vec_type);
-        lp_build_flow_scope_declare(flow, &color[cbuf][chan]);
+        color[cbuf][chan] = lp_build_alloca(builder, vec_type, "color");
       }
    }
-   lp_build_flow_scope_declare(flow, &z);
 
    /* do triangle edge testing */
-   if (do_tri_test) {
-      generate_tri_edge_mask(builder, i, pmask,
-                             c0, c1, c2, step0_ptr, step1_ptr, step2_ptr);
+   if (partial_mask) {
+      *pmask = generate_quad_mask(builder, type,
+                                  i, mask_input);
    }
    else {
-      *pmask = build_int32_vec_const(~0);
+      *pmask = lp_build_const_int_vec(type, ~0);
    }
 
    /* 'mask' will control execution based on quad's pixel alive/killed state */
-   lp_build_mask_begin(&mask, flow, type, *pmask);
-
-   if (key->scissor) {
-      LLVMValueRef smask =
-         generate_scissor_test(builder, context_ptr, interp, type);
-      lp_build_mask_update(&mask, smask);
+   lp_build_mask_begin(&mask, builder, type, *pmask);
+
+   if (!(depth_mode & EARLY_DEPTH_TEST) && !simple_shader)
+      lp_build_mask_check(&mask);
+
+   lp_build_interp_soa_update_pos(interp, i);
+   z = interp->pos[2];
+
+   if (depth_mode & EARLY_DEPTH_TEST) {
+      lp_build_depth_stencil_test(builder,
+                                  &key->depth,
+                                  key->stencil,
+                                  type,
+                                  zs_format_desc,
+                                  &mask,
+                                  stencil_refs,
+                                  z,
+                                  depth_ptr, facing,
+                                  &zs_value,
+                                  !simple_shader);
+
+      if (depth_mode & EARLY_DEPTH_WRITE) {
+         lp_build_depth_write(builder, zs_format_desc, depth_ptr, zs_value);
+      }
    }
 
-   early_depth_stencil_test =
-      (key->depth.enabled || key->stencil[0].enabled) &&
-      !key->alpha.enabled &&
-      !shader->info.uses_kill &&
-      !shader->info.writes_z;
-
-   if (early_depth_stencil_test)
-      generate_depth_stencil(builder, key,
-                             type, &mask,
-                             stencil_refs, z, depth_ptr, facing, counter);
-
+   lp_build_interp_soa_update_inputs(interp, i);
+   
+   /* Build the actual shader */
    lp_build_tgsi_soa(builder, tokens, type, &mask,
                      consts_ptr, interp->pos, interp->inputs,
-                     outputs, sampler, &shader->info);
+                     outputs, sampler, &shader->info.base);
 
-   /* loop over fragment shader outputs/results */
-   for (attrib = 0; attrib < shader->info.num_outputs; ++attrib) {
-      for(chan = 0; chan < NUM_CHANNELS; ++chan) {
-         if(outputs[attrib][chan]) {
-            LLVMValueRef out = LLVMBuildLoad(builder, outputs[attrib][chan], "");
-            lp_build_name(out, "output%u.%u.%c", i, attrib, "xyzw"[chan]);
-
-            switch (shader->info.output_semantic_name[attrib]) {
-            case TGSI_SEMANTIC_COLOR:
-               {
-                  unsigned cbuf = shader->info.output_semantic_index[attrib];
-
-                  lp_build_name(out, "color%u.%u.%c", i, attrib, "rgba"[chan]);
-
-                  /* Alpha test */
-                  /* XXX: should the alpha reference value be passed separately? */
-                 /* XXX: should only test the final assignment to alpha */
-                  if(cbuf == 0 && chan == 3) {
-                     LLVMValueRef alpha = out;
-                     LLVMValueRef alpha_ref_value;
-                     alpha_ref_value = lp_jit_context_alpha_ref_value(builder, context_ptr);
-                     alpha_ref_value = lp_build_broadcast(builder, vec_type, alpha_ref_value);
-                     lp_build_alpha_test(builder, &key->alpha, type,
-                                         &mask, alpha, alpha_ref_value);
-                  }
-
-                 color[cbuf][chan] = out;
-                  break;
-               }
-
-            case TGSI_SEMANTIC_POSITION:
-               if(chan == 2)
-                  z = out;
-               break;
-            }
-         }
+
+   /* Alpha test */
+   if (key->alpha.enabled) {
+      int color0 = find_output_by_semantic(&shader->info.base,
+                                           TGSI_SEMANTIC_COLOR,
+                                           0);
+
+      if (color0 != -1) {
+         LLVMValueRef alpha = LLVMBuildLoad(builder, outputs[color0][3], "alpha");
+         LLVMValueRef alpha_ref_value;
+
+         alpha_ref_value = lp_jit_context_alpha_ref_value(builder, context_ptr);
+         alpha_ref_value = lp_build_broadcast(builder, vec_type, alpha_ref_value);
+
+         lp_build_alpha_test(builder, key->alpha.func, type,
+                             &mask, alpha, alpha_ref_value,
+                             (depth_mode & LATE_DEPTH_TEST) != 0);
       }
    }
 
-   if (!early_depth_stencil_test)
-      generate_depth_stencil(builder, key,
-                             type, &mask,
-                             stencil_refs, z, depth_ptr, facing, counter);
+   /* Late Z test */
+   if (depth_mode & LATE_DEPTH_TEST) { 
+      int pos0 = find_output_by_semantic(&shader->info.base,
+                                         TGSI_SEMANTIC_POSITION,
+                                         0);
+         
+      if (pos0 != -1) {
+         z = LLVMBuildLoad(builder, outputs[pos0][2], "z");
+         lp_build_name(z, "output%u.%u.%c", i, pos0, "xyzw"[chan]);
+      }
 
-   lp_build_mask_end(&mask);
+      lp_build_depth_stencil_test(builder,
+                                  &key->depth,
+                                  key->stencil,
+                                  type,
+                                  zs_format_desc,
+                                  &mask,
+                                  stencil_refs,
+                                  z,
+                                  depth_ptr, facing,
+                                  &zs_value,
+                                  !simple_shader);
+      /* Late Z write */
+      if (depth_mode & LATE_DEPTH_WRITE) {
+         lp_build_depth_write(builder, zs_format_desc, depth_ptr, zs_value);
+      }
+   }
+   else if ((depth_mode & EARLY_DEPTH_TEST) &&
+            (depth_mode & LATE_DEPTH_WRITE))
+   {
+      /* Need to apply a reduced mask to the depth write.  Reload the
+       * depth value, update from zs_value with the new mask value and
+       * write that out.
+       */
+      lp_build_deferred_depth_write(builder,
+                                    type,
+                                    zs_format_desc,
+                                    &mask,
+                                    depth_ptr,
+                                    zs_value);
+   }
 
-   lp_build_flow_scope_end(flow);
 
-   lp_build_flow_destroy(flow);
+   /* Color write  */
+   for (attrib = 0; attrib < shader->info.base.num_outputs; ++attrib)
+   {
+      if (shader->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_COLOR)
+      {
+         unsigned cbuf = shader->info.base.output_semantic_index[attrib];
+         for(chan = 0; chan < NUM_CHANNELS; ++chan)
+         {
+            /* XXX: just initialize outputs to point at colors[] and
+             * skip this.
+             */
+            LLVMValueRef out = LLVMBuildLoad(builder, outputs[attrib][chan], "");
+            lp_build_name(out, "color%u.%u.%c", i, attrib, "rgba"[chan]);
+            LLVMBuildStore(builder, out, color[cbuf][chan]);
+         }
+      }
+   }
 
-   *pmask = mask.value;
+   if (counter)
+      lp_build_occlusion_count(builder, type,
+                               lp_build_mask_value(&mask), counter);
 
+   *pmask = lp_build_mask_end(&mask);
 }
 
 
@@ -547,10 +443,10 @@ generate_blend(const struct pipe_blend_state *blend,
                LLVMValueRef context_ptr,
                LLVMValueRef mask,
                LLVMValueRef *src,
-               LLVMValueRef dst_ptr)
+               LLVMValueRef dst_ptr,
+               boolean do_branch)
 {
    struct lp_build_context bld;
-   struct lp_build_flow_context *flow;
    struct lp_build_mask_context mask_ctx;
    LLVMTypeRef vec_type;
    LLVMValueRef const_ptr;
@@ -561,10 +457,9 @@ generate_blend(const struct pipe_blend_state *blend,
 
    lp_build_context_init(&bld, builder, type);
 
-   flow = lp_build_flow_create(builder);
-
-   /* we'll use this mask context to skip blending if all pixels are dead */
-   lp_build_mask_begin(&mask_ctx, flow, type, mask);
+   lp_build_mask_begin(&mask_ctx, builder, type, mask);
+   if (do_branch)
+      lp_build_mask_check(&mask_ctx);
 
    vec_type = lp_build_vec_type(type);
 
@@ -597,7 +492,6 @@ generate_blend(const struct pipe_blend_state *blend,
    }
 
    lp_build_mask_end(&mask_ctx);
-   lp_build_flow_destroy(flow);
 }
 
 
@@ -611,18 +505,18 @@ static void
 generate_fragment(struct llvmpipe_context *lp,
                   struct lp_fragment_shader *shader,
                   struct lp_fragment_shader_variant *variant,
-                  unsigned do_tri_test)
+                  unsigned partial_mask)
 {
    struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen);
    const struct lp_fragment_shader_variant_key *key = &variant->key;
+   char func_name[256];
    struct lp_type fs_type;
    struct lp_type blend_type;
    LLVMTypeRef fs_elem_type;
    LLVMTypeRef fs_int_vec_type;
    LLVMTypeRef blend_vec_type;
-   LLVMTypeRef arg_types[16];
+   LLVMTypeRef arg_types[11];
    LLVMTypeRef func_type;
-   LLVMTypeRef int32_vec4_type = lp_build_int32_vec4_type();
    LLVMValueRef context_ptr;
    LLVMValueRef x;
    LLVMValueRef y;
@@ -631,11 +525,10 @@ generate_fragment(struct llvmpipe_context *lp,
    LLVMValueRef dady_ptr;
    LLVMValueRef color_ptr_ptr;
    LLVMValueRef depth_ptr;
-   LLVMValueRef c0, c1, c2, step0_ptr, step1_ptr, step2_ptr, counter = NULL;
+   LLVMValueRef mask_input;
+   LLVMValueRef counter = NULL;
    LLVMBasicBlockRef block;
    LLVMBuilderRef builder;
-   LLVMValueRef x0;
-   LLVMValueRef y0;
    struct lp_build_sampler_soa *sampler;
    struct lp_build_interp_soa_context interp;
    LLVMValueRef fs_mask[LP_MAX_VECTOR_LENGTH];
@@ -643,6 +536,7 @@ generate_fragment(struct llvmpipe_context *lp,
    LLVMValueRef blend_mask;
    LLVMValueRef function;
    LLVMValueRef facing;
+   const struct util_format_description *zs_format_desc;
    unsigned num_fs;
    unsigned i;
    unsigned chan;
@@ -677,6 +571,9 @@ generate_fragment(struct llvmpipe_context *lp,
 
    blend_vec_type = lp_build_vec_type(blend_type);
 
+   util_snprintf(func_name, sizeof(func_name), "fs%u_variant%u_%s", 
+                shader->no, variant->no, partial_mask ? "partial" : "whole");
+
    arg_types[0] = screen->context_ptr_type;            /* context */
    arg_types[1] = LLVMInt32Type();                     /* x */
    arg_types[2] = LLVMInt32Type();                     /* y */
@@ -685,24 +582,16 @@ generate_fragment(struct llvmpipe_context *lp,
    arg_types[5] = LLVMPointerType(fs_elem_type, 0);    /* dadx */
    arg_types[6] = LLVMPointerType(fs_elem_type, 0);    /* dady */
    arg_types[7] = LLVMPointerType(LLVMPointerType(blend_vec_type, 0), 0);  /* color */
-   arg_types[8] = LLVMPointerType(fs_int_vec_type, 0); /* depth */
-   arg_types[9] = LLVMInt32Type();                     /* c0 */
-   arg_types[10] = LLVMInt32Type();                    /* c1 */
-   arg_types[11] = LLVMInt32Type();                    /* c2 */
-   /* Note: the step arrays are built as int32[16] but we interpret
-    * them here as int32_vec4[4].
-    */
-   arg_types[12] = LLVMPointerType(int32_vec4_type, 0);/* step0 */
-   arg_types[13] = LLVMPointerType(int32_vec4_type, 0);/* step1 */
-   arg_types[14] = LLVMPointerType(int32_vec4_type, 0);/* step2 */
-   arg_types[15] = LLVMPointerType(LLVMInt32Type(), 0);/* counter */
+   arg_types[8] = LLVMPointerType(LLVMInt8Type(), 0);  /* depth */
+   arg_types[9] = LLVMInt32Type();                     /* mask_input */
+   arg_types[10] = LLVMPointerType(LLVMInt32Type(), 0);/* counter */
 
    func_type = LLVMFunctionType(LLVMVoidType(), arg_types, Elements(arg_types), 0);
 
-   function = LLVMAddFunction(screen->module, "shader", func_type);
+   function = LLVMAddFunction(screen->module, func_name, func_type);
    LLVMSetFunctionCallConv(function, LLVMCCallConv);
 
-   variant->function[do_tri_test] = function;
+   variant->function[partial_mask] = function;
 
 
    /* XXX: need to propagate noalias down into color param now we are
@@ -721,12 +610,7 @@ generate_fragment(struct llvmpipe_context *lp,
    dady_ptr     = LLVMGetParam(function, 6);
    color_ptr_ptr = LLVMGetParam(function, 7);
    depth_ptr    = LLVMGetParam(function, 8);
-   c0           = LLVMGetParam(function, 9);
-   c1           = LLVMGetParam(function, 10);
-   c2           = LLVMGetParam(function, 11);
-   step0_ptr    = LLVMGetParam(function, 12);
-   step1_ptr    = LLVMGetParam(function, 13);
-   step2_ptr    = LLVMGetParam(function, 14);
+   mask_input   = LLVMGetParam(function, 9);
 
    lp_build_name(context_ptr, "context");
    lp_build_name(x, "x");
@@ -736,15 +620,10 @@ generate_fragment(struct llvmpipe_context *lp,
    lp_build_name(dady_ptr, "dady");
    lp_build_name(color_ptr_ptr, "color_ptr_ptr");
    lp_build_name(depth_ptr, "depth");
-   lp_build_name(c0, "c0");
-   lp_build_name(c1, "c1");
-   lp_build_name(c2, "c2");
-   lp_build_name(step0_ptr, "step0");
-   lp_build_name(step1_ptr, "step1");
-   lp_build_name(step2_ptr, "step2");
+   lp_build_name(mask_input, "mask_input");
 
    if (key->occlusion_count) {
-      counter = LLVMGetParam(function, 15);
+      counter = LLVMGetParam(function, 10);
       lp_build_name(counter, "counter");
    }
 
@@ -756,28 +635,32 @@ generate_fragment(struct llvmpipe_context *lp,
    builder = LLVMCreateBuilder();
    LLVMPositionBuilderAtEnd(builder, block);
 
-   generate_pos0(builder, x, y, &x0, &y0);
-
+   /*
+    * The shader input interpolation info is not explicitly baked in the
+    * shader key, but everything it derives from (TGSI, and flatshade) is
+    * already included in the shader key.
+    */
    lp_build_interp_soa_init(&interp, 
-                            shader->base.tokens,
-                            key->flatshade,
+                            lp->num_inputs,
+                            lp->inputs,
                             builder, fs_type,
                             a0_ptr, dadx_ptr, dady_ptr,
-                            x0, y0);
+                            x, y);
 
    /* code generated texture sampling */
    sampler = lp_llvm_sampler_soa_create(key->sampler, context_ptr);
 
    /* loop over quads in the block */
+   zs_format_desc = util_format_description(key->zsbuf_format);
+
    for(i = 0; i < num_fs; ++i) {
-      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+      LLVMValueRef depth_offset = LLVMConstInt(LLVMInt32Type(),
+                                               i*fs_type.length*zs_format_desc->block.bits/8,
+                                               0);
       LLVMValueRef out_color[PIPE_MAX_COLOR_BUFS][NUM_CHANNELS];
       LLVMValueRef depth_ptr_i;
 
-      if(i != 0)
-         lp_build_interp_soa_update(&interp, i);
-
-      depth_ptr_i = LLVMBuildGEP(builder, depth_ptr, &index, 1, "");
+      depth_ptr_i = LLVMBuildGEP(builder, depth_ptr, &depth_offset, 1, "");
 
       generate_fs(lp, shader, key,
                   builder,
@@ -790,9 +673,9 @@ generate_fragment(struct llvmpipe_context *lp,
                   out_color,
                   depth_ptr_i,
                   facing,
-                  do_tri_test,
-                  c0, c1, c2,
-                  step0_ptr, step1_ptr, step2_ptr, counter);
+                  partial_mask,
+                  mask_input,
+                  counter);
 
       for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++)
         for(chan = 0; chan < NUM_CHANNELS; ++chan)
@@ -813,15 +696,28 @@ generate_fragment(struct llvmpipe_context *lp,
        * Convert the fs's output color and mask to fit to the blending type. 
        */
       for(chan = 0; chan < NUM_CHANNELS; ++chan) {
+         LLVMValueRef fs_color_vals[LP_MAX_VECTOR_LENGTH];
+         
+         for (i = 0; i < num_fs; i++) {
+            fs_color_vals[i] =
+               LLVMBuildLoad(builder, fs_out_color[cbuf][chan][i], "fs_color_vals");
+         }
+
         lp_build_conv(builder, fs_type, blend_type,
-                      fs_out_color[cbuf][chan], num_fs,
+                       fs_color_vals,
+                       num_fs,
                       &blend_in_color[chan], 1);
+
         lp_build_name(blend_in_color[chan], "color%d.%c", cbuf, "rgba"[chan]);
       }
 
-      lp_build_conv_mask(builder, fs_type, blend_type,
-                        fs_mask, num_fs,
-                        &blend_mask, 1);
+      if (partial_mask || !variant->opaque) {
+         lp_build_conv_mask(builder, fs_type, blend_type,
+                            fs_mask, num_fs,
+                            &blend_mask, 1);
+      } else {
+         blend_mask = lp_build_const_int_vec(blend_type, ~0);
+      }
 
       color_ptr = LLVMBuildLoad(builder, 
                                LLVMBuildGEP(builder, color_ptr_ptr, &index, 1, ""),
@@ -834,16 +730,30 @@ generate_fragment(struct llvmpipe_context *lp,
       /*
        * Blending.
        */
-      generate_blend(&key->blend,
-                     rt,
-                    builder,
-                    blend_type,
-                    context_ptr,
-                    blend_mask,
-                    blend_in_color,
-                    color_ptr);
+      {
+         /* Could the 4x4 have been killed?
+          */
+         boolean do_branch = ((key->depth.enabled || key->stencil[0].enabled) &&
+                              !key->alpha.enabled &&
+                              !shader->info.base.uses_kill);
+
+         generate_blend(&key->blend,
+                        rt,
+                        builder,
+                        blend_type,
+                        context_ptr,
+                        blend_mask,
+                        blend_in_color,
+                        color_ptr,
+                        do_branch);
+      }
    }
 
+#ifdef PIPE_ARCH_X86
+   /* Avoid corrupting the FPU stack on 32bit OSes. */
+   lp_build_intrinsic(builder, "llvm.x86.mmx.emms", LLVMVoidType(), NULL, 0);
+#endif
+
    LLVMBuildRetVoid(builder);
 
    LLVMDisposeBuilder(builder);
@@ -859,10 +769,9 @@ generate_fragment(struct llvmpipe_context *lp,
 #endif
 
    /* Apply optimizations to LLVM IR */
-   if (1)
-      LLVMRunFunctionPassManager(screen->pass, function);
+   LLVMRunFunctionPassManager(screen->pass, function);
 
-   if (gallivm_debug & GALLIVM_DEBUG_IR) {
+   if ((gallivm_debug & GALLIVM_DEBUG_IR) || (LP_DEBUG & DEBUG_FS)) {
       /* Print the LLVM IR to stderr */
       lp_debug_dump_value(function);
       debug_printf("\n");
@@ -874,109 +783,163 @@ generate_fragment(struct llvmpipe_context *lp,
    {
       void *f = LLVMGetPointerToGlobal(screen->engine, function);
 
-      variant->jit_function[do_tri_test] = cast_voidptr_to_lp_jit_frag_func(f);
+      variant->jit_function[partial_mask] = (lp_jit_frag_func)pointer_to_func(f);
 
-      if (gallivm_debug & GALLIVM_DEBUG_ASM) {
+      if ((gallivm_debug & GALLIVM_DEBUG_ASM) || (LP_DEBUG & DEBUG_FS)) {
          lp_disassemble(f);
       }
+      lp_func_delete_body(function);
+   }
+}
+
+
+static void
+dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key)
+{
+   unsigned i;
+
+   debug_printf("fs variant %p:\n", (void *) key);
+
+   if (key->flatshade) {
+      debug_printf("flatshade = 1\n");
+   }
+   for (i = 0; i < key->nr_cbufs; ++i) {
+      debug_printf("cbuf_format[%u] = %s\n", i, util_format_name(key->cbuf_format[i]));
+   }
+   if (key->depth.enabled) {
+      debug_printf("depth.format = %s\n", util_format_name(key->zsbuf_format));
+      debug_printf("depth.func = %s\n", util_dump_func(key->depth.func, TRUE));
+      debug_printf("depth.writemask = %u\n", key->depth.writemask);
+   }
+
+   for (i = 0; i < 2; ++i) {
+      if (key->stencil[i].enabled) {
+         debug_printf("stencil[%u].func = %s\n", i, util_dump_func(key->stencil[i].func, TRUE));
+         debug_printf("stencil[%u].fail_op = %s\n", i, util_dump_stencil_op(key->stencil[i].fail_op, TRUE));
+         debug_printf("stencil[%u].zpass_op = %s\n", i, util_dump_stencil_op(key->stencil[i].zpass_op, TRUE));
+         debug_printf("stencil[%u].zfail_op = %s\n", i, util_dump_stencil_op(key->stencil[i].zfail_op, TRUE));
+         debug_printf("stencil[%u].valuemask = 0x%x\n", i, key->stencil[i].valuemask);
+         debug_printf("stencil[%u].writemask = 0x%x\n", i, key->stencil[i].writemask);
+      }
+   }
+
+   if (key->alpha.enabled) {
+      debug_printf("alpha.func = %s\n", util_dump_func(key->alpha.func, TRUE));
+   }
+
+   if (key->occlusion_count) {
+      debug_printf("occlusion_count = 1\n");
+   }
+
+   if (key->blend.logicop_enable) {
+      debug_printf("blend.logicop_func = %s\n", util_dump_logicop(key->blend.logicop_func, TRUE));
+   }
+   else if (key->blend.rt[0].blend_enable) {
+      debug_printf("blend.rgb_func = %s\n",   util_dump_blend_func  (key->blend.rt[0].rgb_func, TRUE));
+      debug_printf("blend.rgb_src_factor = %s\n",   util_dump_blend_factor(key->blend.rt[0].rgb_src_factor, TRUE));
+      debug_printf("blend.rgb_dst_factor = %s\n",   util_dump_blend_factor(key->blend.rt[0].rgb_dst_factor, TRUE));
+      debug_printf("blend.alpha_func = %s\n",       util_dump_blend_func  (key->blend.rt[0].alpha_func, TRUE));
+      debug_printf("blend.alpha_src_factor = %s\n", util_dump_blend_factor(key->blend.rt[0].alpha_src_factor, TRUE));
+      debug_printf("blend.alpha_dst_factor = %s\n", util_dump_blend_factor(key->blend.rt[0].alpha_dst_factor, TRUE));
+   }
+   debug_printf("blend.colormask = 0x%x\n", key->blend.rt[0].colormask);
+   for (i = 0; i < key->nr_samplers; ++i) {
+      debug_printf("sampler[%u] = \n", i);
+      debug_printf("  .format = %s\n",
+                   util_format_name(key->sampler[i].format));
+      debug_printf("  .target = %s\n",
+                   util_dump_tex_target(key->sampler[i].target, TRUE));
+      debug_printf("  .pot = %u %u %u\n",
+                   key->sampler[i].pot_width,
+                   key->sampler[i].pot_height,
+                   key->sampler[i].pot_depth);
+      debug_printf("  .wrap = %s %s %s\n",
+                   util_dump_tex_wrap(key->sampler[i].wrap_s, TRUE),
+                   util_dump_tex_wrap(key->sampler[i].wrap_t, TRUE),
+                   util_dump_tex_wrap(key->sampler[i].wrap_r, TRUE));
+      debug_printf("  .min_img_filter = %s\n",
+                   util_dump_tex_filter(key->sampler[i].min_img_filter, TRUE));
+      debug_printf("  .min_mip_filter = %s\n",
+                   util_dump_tex_mipfilter(key->sampler[i].min_mip_filter, TRUE));
+      debug_printf("  .mag_img_filter = %s\n",
+                   util_dump_tex_filter(key->sampler[i].mag_img_filter, TRUE));
+      if (key->sampler[i].compare_mode != PIPE_TEX_COMPARE_NONE)
+         debug_printf("  .compare_func = %s\n", util_dump_func(key->sampler[i].compare_func, TRUE));
+      debug_printf("  .normalized_coords = %u\n", key->sampler[i].normalized_coords);
+      debug_printf("  .min_max_lod_equal = %u\n", key->sampler[i].min_max_lod_equal);
+      debug_printf("  .lod_bias_non_zero = %u\n", key->sampler[i].lod_bias_non_zero);
+      debug_printf("  .apply_min_lod = %u\n", key->sampler[i].apply_min_lod);
+      debug_printf("  .apply_max_lod = %u\n", key->sampler[i].apply_max_lod);
    }
 }
 
 
+void
+lp_debug_fs_variant(const struct lp_fragment_shader_variant *variant)
+{
+   debug_printf("llvmpipe: Fragment shader #%u variant #%u:\n", 
+                variant->shader->no, variant->no);
+   tgsi_dump(variant->shader->base.tokens, 0);
+   dump_fs_variant_key(&variant->key);
+   debug_printf("variant->opaque = %u\n", variant->opaque);
+   debug_printf("\n");
+}
+
 static struct lp_fragment_shader_variant *
 generate_variant(struct llvmpipe_context *lp,
                  struct lp_fragment_shader *shader,
                  const struct lp_fragment_shader_variant_key *key)
 {
    struct lp_fragment_shader_variant *variant;
-
-   if (gallivm_debug & GALLIVM_DEBUG_IR) {
-      unsigned i;
-
-      tgsi_dump(shader->base.tokens, 0);
-      if(key->depth.enabled) {
-         debug_printf("depth.format = %s\n", util_format_name(key->zsbuf_format));
-         debug_printf("depth.func = %s\n", util_dump_func(key->depth.func, TRUE));
-         debug_printf("depth.writemask = %u\n", key->depth.writemask);
-      }
-      for (i = 0; i < 2; ++i) {
-         if(key->stencil[i].enabled) {
-            debug_printf("stencil[%u].func = %s\n", i, util_dump_func(key->stencil[i].func, TRUE));
-            debug_printf("stencil[%u].fail_op = %s\n", i, util_dump_stencil_op(key->stencil[i].fail_op, TRUE));
-            debug_printf("stencil[%u].zpass_op = %s\n", i, util_dump_stencil_op(key->stencil[i].zpass_op, TRUE));
-            debug_printf("stencil[%u].zfail_op = %s\n", i, util_dump_stencil_op(key->stencil[i].zfail_op, TRUE));
-            debug_printf("stencil[%u].valuemask = 0x%x\n", i, key->stencil[i].valuemask);
-            debug_printf("stencil[%u].writemask = 0x%x\n", i, key->stencil[i].writemask);
-         }
-      }
-      if(key->alpha.enabled) {
-         debug_printf("alpha.func = %s\n", util_dump_func(key->alpha.func, TRUE));
-         debug_printf("alpha.ref_value = %f\n", key->alpha.ref_value);
-      }
-      if(key->blend.logicop_enable) {
-         debug_printf("blend.logicop_func = %u\n", key->blend.logicop_func);
-      }
-      else if(key->blend.rt[0].blend_enable) {
-         debug_printf("blend.rgb_func = %s\n",   util_dump_blend_func  (key->blend.rt[0].rgb_func, TRUE));
-         debug_printf("rgb_src_factor = %s\n",   util_dump_blend_factor(key->blend.rt[0].rgb_src_factor, TRUE));
-         debug_printf("rgb_dst_factor = %s\n",   util_dump_blend_factor(key->blend.rt[0].rgb_dst_factor, TRUE));
-         debug_printf("alpha_func = %s\n",       util_dump_blend_func  (key->blend.rt[0].alpha_func, TRUE));
-         debug_printf("alpha_src_factor = %s\n", util_dump_blend_factor(key->blend.rt[0].alpha_src_factor, TRUE));
-         debug_printf("alpha_dst_factor = %s\n", util_dump_blend_factor(key->blend.rt[0].alpha_dst_factor, TRUE));
-      }
-      debug_printf("blend.colormask = 0x%x\n", key->blend.rt[0].colormask);
-      for(i = 0; i < PIPE_MAX_SAMPLERS; ++i) {
-         if(key->sampler[i].format) {
-            debug_printf("sampler[%u] = \n", i);
-            debug_printf("  .format = %s\n",
-                         util_format_name(key->sampler[i].format));
-            debug_printf("  .target = %s\n",
-                         util_dump_tex_target(key->sampler[i].target, TRUE));
-            debug_printf("  .pot = %u %u %u\n",
-                         key->sampler[i].pot_width,
-                         key->sampler[i].pot_height,
-                         key->sampler[i].pot_depth);
-            debug_printf("  .wrap = %s %s %s\n",
-                         util_dump_tex_wrap(key->sampler[i].wrap_s, TRUE),
-                         util_dump_tex_wrap(key->sampler[i].wrap_t, TRUE),
-                         util_dump_tex_wrap(key->sampler[i].wrap_r, TRUE));
-            debug_printf("  .min_img_filter = %s\n",
-                         util_dump_tex_filter(key->sampler[i].min_img_filter, TRUE));
-            debug_printf("  .min_mip_filter = %s\n",
-                         util_dump_tex_mipfilter(key->sampler[i].min_mip_filter, TRUE));
-            debug_printf("  .mag_img_filter = %s\n",
-                         util_dump_tex_filter(key->sampler[i].mag_img_filter, TRUE));
-            if(key->sampler[i].compare_mode != PIPE_TEX_COMPARE_NONE)
-               debug_printf("  .compare_func = %s\n", util_dump_func(key->sampler[i].compare_func, TRUE));
-            debug_printf("  .normalized_coords = %u\n", key->sampler[i].normalized_coords);
-         }
-      }
-   }
+   boolean fullcolormask;
 
    variant = CALLOC_STRUCT(lp_fragment_shader_variant);
    if(!variant)
       return NULL;
 
-   memcpy(&variant->key, key, sizeof *key);
+   variant->shader = shader;
+   variant->list_item_global.base = variant;
+   variant->list_item_local.base = variant;
+   variant->no = shader->variants_created++;
 
-   generate_fragment(lp, shader, variant, RAST_WHOLE);
-   generate_fragment(lp, shader, variant, RAST_EDGE_TEST);
+   memcpy(&variant->key, key, shader->variant_key_size);
+
+   /*
+    * Determine whether we are touching all channels in the color buffer.
+    */
+   fullcolormask = FALSE;
+   if (key->nr_cbufs == 1) {
+      const struct util_format_description *format_desc;
+      format_desc = util_format_description(key->cbuf_format[0]);
+      if ((~key->blend.rt[0].colormask &
+           util_format_colormask(format_desc)) == 0) {
+         fullcolormask = TRUE;
+      }
+   }
 
-   /* TODO: most of these can be relaxed, in particular the colormask */
    variant->opaque =
          !key->blend.logicop_enable &&
          !key->blend.rt[0].blend_enable &&
-         key->blend.rt[0].colormask == 0xf &&
+         fullcolormask &&
          !key->stencil[0].enabled &&
          !key->alpha.enabled &&
          !key->depth.enabled &&
-         !key->scissor &&
-         !shader->info.uses_kill
+         !shader->info.base.uses_kill
          ? TRUE : FALSE;
 
-   /* insert new variant into linked list */
-   variant->next = shader->variants;
-   shader->variants = variant;
+
+   if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) {
+      lp_debug_fs_variant(variant);
+   }
+
+   generate_fragment(lp, shader, variant, RAST_EDGE_TEST);
+
+   if (variant->opaque) {
+      /* Specialized shader, which doesn't need to read the color buffer. */
+      generate_fragment(lp, shader, variant, RAST_WHOLE);
+   } else {
+      variant->jit_function[RAST_WHOLE] = variant->jit_function[RAST_EDGE_TEST];
+   }
 
    return variant;
 }
@@ -986,21 +949,50 @@ static void *
 llvmpipe_create_fs_state(struct pipe_context *pipe,
                          const struct pipe_shader_state *templ)
 {
+   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
    struct lp_fragment_shader *shader;
+   int nr_samplers;
 
    shader = CALLOC_STRUCT(lp_fragment_shader);
    if (!shader)
       return NULL;
 
+   shader->no = fs_no++;
+   make_empty_list(&shader->variants);
+
    /* get/save the summary info for this shader */
-   tgsi_scan_shader(templ->tokens, &shader->info);
+   lp_build_tgsi_info(templ->tokens, &shader->info);
 
    /* we need to keep a local copy of the tokens */
    shader->base.tokens = tgsi_dup_tokens(templ->tokens);
 
-   if (gallivm_debug & GALLIVM_DEBUG_TGSI) {
-      debug_printf("llvmpipe: Create fragment shader %p:\n", (void *) shader);
+   shader->draw_data = draw_create_fragment_shader(llvmpipe->draw, templ);
+   if (shader->draw_data == NULL) {
+      FREE((void *) shader->base.tokens);
+      FREE(shader);
+      return NULL;
+   }
+
+   nr_samplers = shader->info.base.file_max[TGSI_FILE_SAMPLER] + 1;
+
+   shader->variant_key_size = Offset(struct lp_fragment_shader_variant_key,
+                                    sampler[nr_samplers]);
+
+   if (LP_DEBUG & DEBUG_TGSI) {
+      unsigned attrib;
+      debug_printf("llvmpipe: Create fragment shader #%u %p:\n", shader->no, (void *) shader);
       tgsi_dump(templ->tokens, 0);
+      debug_printf("usage masks:\n");
+      for (attrib = 0; attrib < shader->info.base.num_inputs; ++attrib) {
+         unsigned usage_mask = shader->info.base.input_usage_mask[attrib];
+         debug_printf("  IN[%u].%s%s%s%s\n",
+                      attrib,
+                      usage_mask & TGSI_WRITEMASK_X ? "x" : "",
+                      usage_mask & TGSI_WRITEMASK_Y ? "y" : "",
+                      usage_mask & TGSI_WRITEMASK_Z ? "z" : "",
+                      usage_mask & TGSI_WRITEMASK_W ? "w" : "");
+      }
+      debug_printf("\n");
    }
 
    return shader;
@@ -1017,19 +1009,47 @@ llvmpipe_bind_fs_state(struct pipe_context *pipe, void *fs)
 
    draw_flush(llvmpipe->draw);
 
+   draw_bind_fragment_shader(llvmpipe->draw,
+                             (llvmpipe->fs ? llvmpipe->fs->draw_data : NULL));
+
    llvmpipe->fs = fs;
 
    llvmpipe->dirty |= LP_NEW_FS;
 }
 
+static void
+remove_shader_variant(struct llvmpipe_context *lp,
+                      struct lp_fragment_shader_variant *variant)
+{
+   struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen);
+   unsigned i;
+
+   if (gallivm_debug & GALLIVM_DEBUG_IR) {
+      debug_printf("llvmpipe: del fs #%u var #%u v created #%u v cached #%u v total cached #%u\n",
+                    variant->shader->no, variant->no, variant->shader->variants_created,
+                    variant->shader->variants_cached, lp->nr_fs_variants);
+   }
+   for (i = 0; i < Elements(variant->function); i++) {
+      if (variant->function[i]) {
+         if (variant->jit_function[i])
+            LLVMFreeMachineCodeForFunction(screen->engine,
+                                           variant->function[i]);
+         LLVMDeleteFunction(variant->function[i]);
+      }
+   }
+   remove_from_list(&variant->list_item_local);
+   variant->shader->variants_cached--;
+   remove_from_list(&variant->list_item_global);
+   lp->nr_fs_variants--;
+   FREE(variant);
+}
 
 static void
 llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
 {
    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
-   struct llvmpipe_screen *screen = llvmpipe_screen(pipe->screen);
    struct lp_fragment_shader *shader = fs;
-   struct lp_fragment_shader_variant *variant;
+   struct lp_fs_variant_list_item *li;
 
    assert(fs != llvmpipe->fs);
    (void) llvmpipe;
@@ -1037,29 +1057,21 @@ llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
    /*
     * XXX: we need to flush the context until we have some sort of reference
     * counting in fragment shaders as they may still be binned
+    * Flushing alone might not be sufficient; we need to wait on it too.
     */
-   draw_flush(llvmpipe->draw);
-   lp_setup_flush(llvmpipe->setup, 0);
-
-   variant = shader->variants;
-   while(variant) {
-      struct lp_fragment_shader_variant *next = variant->next;
-      unsigned i;
-
-      for (i = 0; i < Elements(variant->function); i++) {
-         if (variant->function[i]) {
-            if (variant->jit_function[i])
-               LLVMFreeMachineCodeForFunction(screen->engine,
-                                              variant->function[i]);
-            LLVMDeleteFunction(variant->function[i]);
-         }
-      }
 
-      FREE(variant);
+   llvmpipe_finish(pipe, __FUNCTION__);
 
-      variant = next;
+   li = first_elem(&shader->variants);
+   while(!at_end(&shader->variants, li)) {
+      struct lp_fs_variant_list_item *next = next_elem(li);
+      remove_shader_variant(llvmpipe, li->base);
+      li = next;
    }
 
+   draw_delete_fragment_shader(llvmpipe->draw, shader->draw_data);
+
+   assert(shader->variants_cached == 0);
    FREE((void *) shader->base.tokens);
    FREE(shader);
 }
@@ -1076,19 +1088,20 @@ llvmpipe_set_constant_buffer(struct pipe_context *pipe,
    const void *data = constants ? llvmpipe_resource_data(constants) : NULL;
 
    assert(shader < PIPE_SHADER_TYPES);
-   assert(index == 0);
+   assert(index < PIPE_MAX_CONSTANT_BUFFERS);
 
-   if(llvmpipe->constants[shader] == constants)
+   if(llvmpipe->constants[shader][index] == constants)
       return;
 
    draw_flush(llvmpipe->draw);
 
    /* note: reference counting */
-   pipe_resource_reference(&llvmpipe->constants[shader], constants);
+   pipe_resource_reference(&llvmpipe->constants[shader][index], constants);
 
-   if(shader == PIPE_SHADER_VERTEX) {
-      draw_set_mapped_constant_buffer(llvmpipe->draw, PIPE_SHADER_VERTEX, 0,
-                                      data, size);
+   if(shader == PIPE_SHADER_VERTEX ||
+      shader == PIPE_SHADER_GEOMETRY) {
+      draw_set_mapped_constant_buffer(llvmpipe->draw, shader,
+                                      index, data, size);
    }
 
    llvmpipe->dirty |= LP_NEW_CONSTANTS;
@@ -1099,7 +1112,7 @@ llvmpipe_set_constant_buffer(struct pipe_context *pipe,
  * Return the blend factor equivalent to a destination alpha of one.
  */
 static INLINE unsigned
-force_dst_alpha_one(unsigned factor, boolean alpha)
+force_dst_alpha_one(unsigned factor)
 {
    switch(factor) {
    case PIPE_BLENDFACTOR_DST_ALPHA:
@@ -1110,15 +1123,6 @@ force_dst_alpha_one(unsigned factor, boolean alpha)
       return PIPE_BLENDFACTOR_ZERO;
    }
 
-   if (alpha) {
-      switch(factor) {
-      case PIPE_BLENDFACTOR_DST_COLOR:
-         return PIPE_BLENDFACTOR_ONE;
-      case PIPE_BLENDFACTOR_INV_DST_COLOR:
-         return PIPE_BLENDFACTOR_ZERO;
-      }
-   }
-
    return factor;
 }
 
@@ -1137,7 +1141,7 @@ make_variant_key(struct llvmpipe_context *lp,
 {
    unsigned i;
 
-   memset(key, 0, sizeof *key);
+   memset(key, 0, shader->variant_key_size);
 
    if (lp->framebuffer.zsbuf) {
       if (lp->depth_stencil->depth.enabled) {
@@ -1156,7 +1160,6 @@ make_variant_key(struct llvmpipe_context *lp,
    /* alpha.ref_value is passed in jit_context */
 
    key->flatshade = lp->rasterizer->flatshade;
-   key->scissor = lp->rasterizer->scissor;
    if (lp->active_query_count) {
       key->occlusion_count = TRUE;
    }
@@ -1167,25 +1170,22 @@ make_variant_key(struct llvmpipe_context *lp,
 
    key->nr_cbufs = lp->framebuffer.nr_cbufs;
    for (i = 0; i < lp->framebuffer.nr_cbufs; i++) {
+      enum pipe_format format = lp->framebuffer.cbufs[i]->format;
       struct pipe_rt_blend_state *blend_rt = &key->blend.rt[i];
       const struct util_format_description *format_desc;
-      unsigned chan;
 
-      format_desc = util_format_description(lp->framebuffer.cbufs[i]->format);
+      key->cbuf_format[i] = format;
+
+      format_desc = util_format_description(format);
       assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
              format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB);
 
       blend_rt->colormask = lp->blend->rt[i].colormask;
 
-      /* mask out color channels not present in the color buffer.
-       * Should be simple to incorporate per-cbuf writemasks:
+      /*
+       * Mask out color channels not present in the color buffer.
        */
-      for(chan = 0; chan < 4; ++chan) {
-         enum util_format_swizzle swizzle = format_desc->swizzle[chan];
-
-         if(swizzle > UTIL_FORMAT_SWIZZLE_W)
-            blend_rt->colormask &= ~(1 << chan);
-      }
+      blend_rt->colormask &= util_format_colormask(format_desc);
 
       /*
        * Our swizzled render tiles always have an alpha channel, but the linear
@@ -1199,20 +1199,30 @@ make_variant_key(struct llvmpipe_context *lp,
        *
        * TODO: This should be generalized to all channels for better
        * performance, but only alpha causes correctness issues.
+       *
+       * Also force the rgb/alpha func/factors to match, to ease AoS blending.
        */
       if (format_desc->swizzle[3] > UTIL_FORMAT_SWIZZLE_W) {
-         blend_rt->rgb_src_factor = force_dst_alpha_one(blend_rt->rgb_src_factor, FALSE);
-         blend_rt->rgb_dst_factor = force_dst_alpha_one(blend_rt->rgb_dst_factor, FALSE);
-         blend_rt->alpha_src_factor = force_dst_alpha_one(blend_rt->alpha_src_factor, TRUE);
-         blend_rt->alpha_dst_factor = force_dst_alpha_one(blend_rt->alpha_dst_factor, TRUE);
+         blend_rt->rgb_src_factor   = force_dst_alpha_one(blend_rt->rgb_src_factor);
+         blend_rt->rgb_dst_factor   = force_dst_alpha_one(blend_rt->rgb_dst_factor);
+         blend_rt->alpha_func       = blend_rt->rgb_func;
+         blend_rt->alpha_src_factor = blend_rt->rgb_src_factor;
+         blend_rt->alpha_dst_factor = blend_rt->rgb_dst_factor;
       }
    }
 
-   for(i = 0; i < PIPE_MAX_SAMPLERS; ++i)
-      if(shader->info.file_mask[TGSI_FILE_SAMPLER] & (1 << i))
-         lp_sampler_static_state(&key->sampler[i], lp->fragment_sampler_views[i], lp->sampler[i]);
-}
+   /* This value will be the same for all the variants of a given shader:
+    */
+   key->nr_samplers = shader->info.base.file_max[TGSI_FILE_SAMPLER] + 1;
 
+   for(i = 0; i < key->nr_samplers; ++i) {
+      if(shader->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) {
+         lp_sampler_static_state(&key->sampler[i],
+                                lp->fragment_sampler_views[i],
+                                lp->sampler[i]);
+      }
+   }
+}
 
 /**
  * Update fragment state.  This is called just prior to drawing
@@ -1223,21 +1233,42 @@ llvmpipe_update_fs(struct llvmpipe_context *lp)
 {
    struct lp_fragment_shader *shader = lp->fs;
    struct lp_fragment_shader_variant_key key;
-   struct lp_fragment_shader_variant *variant;
+   struct lp_fragment_shader_variant *variant = NULL;
+   struct lp_fs_variant_list_item *li;
 
    make_variant_key(lp, shader, &key);
 
-   variant = shader->variants;
-   while(variant) {
-      if(memcmp(&variant->key, &key, sizeof key) == 0)
+   li = first_elem(&shader->variants);
+   while(!at_end(&shader->variants, li)) {
+      if(memcmp(&li->base->key, &key, shader->variant_key_size) == 0) {
+         variant = li->base;
          break;
-
-      variant = variant->next;
+      }
+      li = next_elem(li);
    }
 
-   if (!variant) {
+   if (variant) {
+      move_to_head(&lp->fs_variants_list, &variant->list_item_global);
+   }
+   else {
       int64_t t0, t1;
       int64_t dt;
+      unsigned i;
+      if (lp->nr_fs_variants >= LP_MAX_SHADER_VARIANTS) {
+         struct pipe_context *pipe = &lp->pipe;
+
+         /*
+          * XXX: we need to flush the context until we have some sort of reference
+          * counting in fragment shaders as they may still be binned
+          * Flushing alone might not be sufficient; we need to wait on it too.
+          */
+         llvmpipe_finish(pipe, __FUNCTION__);
+
+         for (i = 0; i < LP_MAX_SHADER_VARIANTS / 4; i++) {
+            struct lp_fs_variant_list_item *item = last_elem(&lp->fs_variants_list);
+            remove_shader_variant(lp, item->base);
+         }
+      }
       t0 = os_time_get();
 
       variant = generate_variant(lp, shader, &key);
@@ -1246,12 +1277,16 @@ llvmpipe_update_fs(struct llvmpipe_context *lp)
       dt = t1 - t0;
       LP_COUNT_ADD(llvm_compile_time, dt);
       LP_COUNT_ADD(nr_llvm_compiles, 2);  /* emit vs. omit in/out test */
+
+      if (variant) {
+         insert_at_head(&shader->variants, &variant->list_item_local);
+         insert_at_head(&lp->fs_variants_list, &variant->list_item_global);
+         lp->nr_fs_variants++;
+         shader->variants_cached++;
+      }
    }
 
-   lp_setup_set_fs_functions(lp->setup, 
-                             variant->jit_function[RAST_WHOLE],
-                             variant->jit_function[RAST_EDGE_TEST],
-                             variant->opaque);
+   lp_setup_set_fs_variant(lp->setup, variant);
 }