X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fllvmpipe%2Flp_bld_depth.c;h=b25e041375070a4bdc2209b2c2a722a723775bb1;hb=d6408372eb359d972614838f838776f1695e3c99;hp=e05bbe5011a40c79a227685552c78e90d9c5138a;hpb=86afe8250edaa2e6129c937a62a695f616c48d70;p=mesa.git

diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index e05bbe5011a..b25e0413750 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -1,6 +1,6 @@
 /**************************************************************************
  *
- * Copyright 2009 VMware, Inc.
+ * Copyright 2009-2010 VMware, Inc.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -36,45 +36,34 @@
  * flushing would avoid this, but it would most likely result in depth fighting
  * artifacts.
  *
- * We are free to use a different pixel layout though. Since our basic
- * processing unit is a quad (2x2 pixel block) we store the depth/stencil
- * values tiled, a quad at time. That is, a depth buffer containing 
+ * Since we're using linear layout for everything, but we need to deal with
+ * 2x2 quads, we need to load/store multiple values and swizzle them into
+ * place (we could avoid this by doing depth/stencil testing in linear format,
+ * which would be easy for late depth/stencil test as we could do that after
+ * the fragment shader loop just as we do for color buffers, but more tricky
+ * for early depth test as we'd need both masks and interpolated depth in
+ * linear format).
  *
- *  Z11 Z12 Z13 Z14 ...
- *  Z21 Z22 Z23 Z24 ...
- *  Z31 Z32 Z33 Z34 ...
- *  Z41 Z42 Z43 Z44 ...
- *  ... ... ... ... ...
- *
- * will actually be stored in memory as
- *
- *  Z11 Z12 Z21 Z22 Z13 Z14 Z23 Z24 ...
- *  Z31 Z32 Z41 Z42 Z33 Z34 Z43 Z44 ...
- *  ... ... ... ... ... ... ... ... ...
- *
- *
- * Stencil test:
- * Two-sided stencil test is supported but probably not as efficient as
- * it could be.  Currently, we use if/then/else constructs to do the
- * operations for front vs. back-facing polygons.  We could probably do
- * both the front and back arithmetic then use a Select() instruction to
- * choose the result depending on polyon orientation.  We'd have to
- * measure performance both ways and see which is better.
  *
  * @author Jose Fonseca <jfonseca@vmware.com>
+ * @author Brian Paul <jfonseca@vmware.com>
  */
 
 #include "pipe/p_state.h"
 #include "util/u_format.h"
+#include "util/u_cpu_detect.h"
 
 #include "gallivm/lp_bld_type.h"
 #include "gallivm/lp_bld_arit.h"
+#include "gallivm/lp_bld_bitarit.h"
 #include "gallivm/lp_bld_const.h"
+#include "gallivm/lp_bld_conv.h"
 #include "gallivm/lp_bld_logic.h"
 #include "gallivm/lp_bld_flow.h"
 #include "gallivm/lp_bld_intr.h"
 #include "gallivm/lp_bld_debug.h"
 #include "gallivm/lp_bld_swizzle.h"
+#include "gallivm/lp_bld_pack.h"
 
 #include "lp_bld_depth.h"
 
@@ -102,20 +91,30 @@ lp_build_stencil_test_single(struct lp_build_context *bld,
                              LLVMValueRef stencilRef,
                              LLVMValueRef stencilVals)
 {
+   LLVMBuilderRef builder = bld->gallivm->builder;
    const unsigned stencilMax = 255; /* XXX fix */
    struct lp_type type = bld->type;
    LLVMValueRef res;
 
-   assert(type.sign);
+   /*
+    * SSE2 has intrinsics for signed comparisons, but not unsigned ones. Values
+    * are between 0..255 so ensure we generate the fastest comparisons for
+    * wider elements.
+    */
+   if (type.width <= 8) {
+      assert(!type.sign);
+   } else {
+      assert(type.sign);
+   }
 
    assert(stencil->enabled);
 
    if (stencil->valuemask != stencilMax) {
       /* compute stencilRef = stencilRef & valuemask */
-      LLVMValueRef valuemask = lp_build_const_int_vec(type, stencil->valuemask);
-      stencilRef = LLVMBuildAnd(bld->builder, stencilRef, valuemask, "");
+      LLVMValueRef valuemask = lp_build_const_int_vec(bld->gallivm, type, stencil->valuemask);
+      stencilRef = LLVMBuildAnd(builder, stencilRef, valuemask, "");
       /* compute stencilVals = stencilVals & valuemask */
-      stencilVals = LLVMBuildAnd(bld->builder, stencilVals, valuemask, "");
+      stencilVals = LLVMBuildAnd(builder, stencilVals, valuemask, "");
    }
 
    res = lp_build_cmp(bld, stencil->func, stencilRef, stencilVals);
@@ -127,57 +126,32 @@ lp_build_stencil_test_single(struct lp_build_context *bld,
 /**
  * Do the one or two-sided stencil test comparison.
  * \sa lp_build_stencil_test_single
- * \param face  an integer indicating front (+) or back (-) facing polygon.
- *              If NULL, assume front-facing.
+ * \param front_facing  an integer vector mask, indicating front (~0) or back
+ *                      (0) facing polygon. If NULL, assume front-facing.
  */
 static LLVMValueRef
 lp_build_stencil_test(struct lp_build_context *bld,
                       const struct pipe_stencil_state stencil[2],
                       LLVMValueRef stencilRefs[2],
                       LLVMValueRef stencilVals,
-                      LLVMValueRef face)
+                      LLVMValueRef front_facing)
 {
    LLVMValueRef res;
 
    assert(stencil[0].enabled);
 
-   if (stencil[1].enabled && face) {
-      /* do two-sided test */
-      struct lp_build_flow_context *flow_ctx;
-      struct lp_build_if_state if_ctx;
-      LLVMValueRef front_facing;
-      LLVMValueRef zero = LLVMConstReal(LLVMFloatType(), 0.0);
-      LLVMValueRef result = bld->undef;
-
-      flow_ctx = lp_build_flow_create(bld->builder);
-      lp_build_flow_scope_begin(flow_ctx);
-
-      lp_build_flow_scope_declare(flow_ctx, &result);
+   /* do front face test */
+   res = lp_build_stencil_test_single(bld, &stencil[0],
+                                      stencilRefs[0], stencilVals);
 
-      /* front_facing = face > 0.0 */
-      front_facing = LLVMBuildFCmp(bld->builder, LLVMRealUGT, face, zero, "");
-
-      lp_build_if(&if_ctx, flow_ctx, bld->builder, front_facing);
-      {
-         result = lp_build_stencil_test_single(bld, &stencil[0],
-                                               stencilRefs[0], stencilVals);
-      }
-      lp_build_else(&if_ctx);
-      {
-         result = lp_build_stencil_test_single(bld, &stencil[1],
-                                               stencilRefs[1], stencilVals);
-      }
-      lp_build_endif(&if_ctx);
+   if (stencil[1].enabled && front_facing != NULL) {
+      /* do back face test */
+      LLVMValueRef back_res;
 
-      lp_build_flow_scope_end(flow_ctx);
-      lp_build_flow_destroy(flow_ctx);
+      back_res = lp_build_stencil_test_single(bld, &stencil[1],
+                                              stencilRefs[1], stencilVals);
 
-      res = result;
-   }
-   else {
-      /* do single-side test */
-      res = lp_build_stencil_test_single(bld, &stencil[0],
-                                         stencilRefs[0], stencilVals);
+      res = lp_build_select(bld, front_facing, res, back_res);
    }
 
    return res;
@@ -194,14 +168,13 @@ lp_build_stencil_op_single(struct lp_build_context *bld,
                            const struct pipe_stencil_state *stencil,
                            enum stencil_op op,
                            LLVMValueRef stencilRef,
-                           LLVMValueRef stencilVals,
-                           LLVMValueRef mask)
+                           LLVMValueRef stencilVals)
 
 {
-   const unsigned stencilMax = 255; /* XXX fix */
+   LLVMBuilderRef builder = bld->gallivm->builder;
    struct lp_type type = bld->type;
    LLVMValueRef res;
-   LLVMValueRef max = lp_build_const_int_vec(type, stencilMax);
+   LLVMValueRef max = lp_build_const_int_vec(bld->gallivm, type, 0xff);
    unsigned stencil_op;
 
    assert(type.sign);
@@ -242,33 +215,21 @@ lp_build_stencil_op_single(struct lp_build_context *bld,
       break;
    case PIPE_STENCIL_OP_INCR_WRAP:
       res = lp_build_add(bld, stencilVals, bld->one);
-      res = LLVMBuildAnd(bld->builder, res, max, "");
+      res = LLVMBuildAnd(builder, res, max, "");
       break;
    case PIPE_STENCIL_OP_DECR_WRAP:
       res = lp_build_sub(bld, stencilVals, bld->one);
-      res = LLVMBuildAnd(bld->builder, res, max, "");
+      res = LLVMBuildAnd(builder, res, max, "");
       break;
    case PIPE_STENCIL_OP_INVERT:
-      res = LLVMBuildNot(bld->builder, stencilVals, "");
-      res = LLVMBuildAnd(bld->builder, res, max, "");
+      res = LLVMBuildNot(builder, stencilVals, "");
+      res = LLVMBuildAnd(builder, res, max, "");
       break;
    default:
       assert(0 && "bad stencil op mode");
-      res = NULL;
+      res = bld->undef;
    }
 
-   if (stencil->writemask != stencilMax) {
-      /* compute res = (res & mask) | (stencilVals & ~mask) */
-      LLVMValueRef mask = lp_build_const_int_vec(type, stencil->writemask);
-      LLVMValueRef cmask = LLVMBuildNot(bld->builder, mask, "notWritemask");
-      LLVMValueRef t1 = LLVMBuildAnd(bld->builder, res, mask, "t1");
-      LLVMValueRef t2 = LLVMBuildAnd(bld->builder, stencilVals, cmask, "t2");
-      res = LLVMBuildOr(bld->builder, t1, t2, "t1_or_t2");
-   }
-
-   /* only the update the vector elements enabled by 'mask' */
-   res = lp_build_select(bld, mask, res, stencilVals);
-
    return res;
 }
 
@@ -283,88 +244,92 @@ lp_build_stencil_op(struct lp_build_context *bld,
                     LLVMValueRef stencilRefs[2],
                     LLVMValueRef stencilVals,
                     LLVMValueRef mask,
-                    LLVMValueRef face)
+                    LLVMValueRef front_facing)
 
 {
+   LLVMBuilderRef builder = bld->gallivm->builder;
+   LLVMValueRef res;
+
    assert(stencil[0].enabled);
 
-   if (stencil[1].enabled && face) {
-      /* do two-sided op */
-      struct lp_build_flow_context *flow_ctx;
-      struct lp_build_if_state if_ctx;
-      LLVMValueRef front_facing;
-      LLVMValueRef zero = LLVMConstReal(LLVMFloatType(), 0.0);
-      LLVMValueRef result = bld->undef;
+   /* do front face op */
+   res = lp_build_stencil_op_single(bld, &stencil[0], op,
+                                     stencilRefs[0], stencilVals);
 
-      flow_ctx = lp_build_flow_create(bld->builder);
-      lp_build_flow_scope_begin(flow_ctx);
+   if (stencil[1].enabled && front_facing != NULL) {
+      /* do back face op */
+      LLVMValueRef back_res;
 
-      lp_build_flow_scope_declare(flow_ctx, &result);
+      back_res = lp_build_stencil_op_single(bld, &stencil[1], op,
+                                            stencilRefs[1], stencilVals);
 
-      /* front_facing = face > 0.0 */
-      front_facing = LLVMBuildFCmp(bld->builder, LLVMRealUGT, face, zero, "");
+      res = lp_build_select(bld, front_facing, res, back_res);
+   }
 
-      lp_build_if(&if_ctx, flow_ctx, bld->builder, front_facing);
-      {
-         result = lp_build_stencil_op_single(bld, &stencil[0], op,
-                                             stencilRefs[0], stencilVals, mask);
-      }
-      lp_build_else(&if_ctx);
-      {
-         result = lp_build_stencil_op_single(bld, &stencil[1], op,
-                                             stencilRefs[1], stencilVals, mask);
+   if (stencil[0].writemask != 0xff ||
+       (stencil[1].enabled && front_facing != NULL && stencil[1].writemask != 0xff)) {
+      /* mask &= stencil[0].writemask */
+      LLVMValueRef writemask = lp_build_const_int_vec(bld->gallivm, bld->type,
+                                                      stencil[0].writemask);
+      if (stencil[1].enabled && stencil[1].writemask != stencil[0].writemask && front_facing != NULL) {
+         LLVMValueRef back_writemask = lp_build_const_int_vec(bld->gallivm, bld->type,
+                                                         stencil[1].writemask);
+         writemask = lp_build_select(bld, front_facing, writemask, back_writemask);
       }
-      lp_build_endif(&if_ctx);
-
-      lp_build_flow_scope_end(flow_ctx);
-      lp_build_flow_destroy(flow_ctx);
 
-      return result;
+      mask = LLVMBuildAnd(builder, mask, writemask, "");
+      /* res = (res & mask) | (stencilVals & ~mask) */
+      res = lp_build_select_bitwise(bld, mask, res, stencilVals);
    }
    else {
-      /* do single-sided op */
-      return lp_build_stencil_op_single(bld, &stencil[0], op,
-                                        stencilRefs[0], stencilVals, mask);
+      /* res = mask ? res : stencilVals */
+      res = lp_build_select(bld, mask, res, stencilVals);
    }
+
+   return res;
 }
 
 
 
 /**
- * Return a type appropriate for depth/stencil testing.
+ * Return a type that matches the depth/stencil format.
  */
 struct lp_type
 lp_depth_type(const struct util_format_description *format_desc,
               unsigned length)
 {
    struct lp_type type;
-   unsigned swizzle;
+   unsigned z_swizzle;
 
    assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
    assert(format_desc->block.width == 1);
    assert(format_desc->block.height == 1);
 
-   swizzle = format_desc->swizzle[0];
-   assert(swizzle < 4);
-
    memset(&type, 0, sizeof type);
    type.width = format_desc->block.bits;
 
-   if(format_desc->channel[swizzle].type == UTIL_FORMAT_TYPE_FLOAT) {
-      type.floating = TRUE;
-      assert(swizzle == 0);
-      assert(format_desc->channel[swizzle].size == format_desc->block.bits);
-   }
-   else if(format_desc->channel[swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED) {
-      assert(format_desc->block.bits <= 32);
-      if(format_desc->channel[swizzle].normalized)
-         type.norm = TRUE;
+   z_swizzle = format_desc->swizzle[0];
+   if (z_swizzle < 4) {
+      if (format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_FLOAT) {
+         type.floating = TRUE;
+         assert(z_swizzle == 0);
+         assert(format_desc->channel[z_swizzle].size == 32);
+      }
+      else if(format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED) {
+         assert(format_desc->block.bits <= 32);
+         assert(format_desc->channel[z_swizzle].normalized);
+         if (format_desc->channel[z_swizzle].size < format_desc->block.bits) {
+            /* Prefer signed integers when possible, as SSE has less support
+             * for unsigned comparison;
+             */
+            type.sign = TRUE;
+         }
+      }
+      else
+         assert(0);
    }
-   else
-      assert(0);
 
-   assert(type.width <= length);
-   type.length = length / type.width;
+   type.length = length;
 
    return type;
 }
@@ -380,40 +345,33 @@ lp_depth_type(const struct util_format_description *format_desc,
  */
 static boolean
 get_z_shift_and_mask(const struct util_format_description *format_desc,
-                     unsigned *shift, unsigned *mask)
+                     unsigned *shift, unsigned *width, unsigned *mask)
 {
-   const unsigned total_bits = format_desc->block.bits;
+   unsigned total_bits;
    unsigned z_swizzle;
-   unsigned chan;
-   unsigned padding_left, padding_right;
-   
+
    assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
    assert(format_desc->block.width == 1);
    assert(format_desc->block.height == 1);
 
+   /* 64bit d/s format is special already extracted 32 bits */
+   total_bits = format_desc->block.bits > 32 ? 32 : format_desc->block.bits;
+
    z_swizzle = format_desc->swizzle[0];
 
    if (z_swizzle == UTIL_FORMAT_SWIZZLE_NONE)
       return FALSE;
 
-   padding_right = 0;
-   for (chan = 0; chan < z_swizzle; ++chan)
-      padding_right += format_desc->channel[chan].size;
+   *width = format_desc->channel[z_swizzle].size;
+   /* & 31 is for the same reason as the 32-bit limit above */
+   *shift = format_desc->channel[z_swizzle].shift & 31;
 
-   padding_left =
-      total_bits - (padding_right + format_desc->channel[z_swizzle].size);
-
-   if (padding_left || padding_right) {
-      unsigned long long mask_left = (1ULL << (total_bits - padding_left)) - 1;
-      unsigned long long mask_right = (1ULL << (padding_right)) - 1;
-      *mask = mask_left ^ mask_right;
-   }
-   else {
+   if (*width == total_bits) {
       *mask = 0xffffffff;
+   } else {
+      *mask = ((1 << *width) - 1) << *shift;
    }
 
-   *shift = padding_left;
-
    return TRUE;
 }
 
@@ -428,17 +386,23 @@ get_s_shift_and_mask(const struct util_format_description *format_desc,
                      unsigned *shift, unsigned *mask)
 {
    unsigned s_swizzle;
-   unsigned chan, sz;
+   unsigned sz;
 
    s_swizzle = format_desc->swizzle[1];
 
    if (s_swizzle == UTIL_FORMAT_SWIZZLE_NONE)
       return FALSE;
 
-   *shift = 0;
-   for (chan = 0; chan < s_swizzle; chan++)
-      *shift += format_desc->channel[chan].size;
+   /* just special case 64bit d/s format */
+   if (format_desc->block.bits > 32) {
+      /* XXX big-endian? */
+      assert(format_desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
+      *shift = 0;
+      *mask = 0xff;
+      return TRUE;
+   }
 
+   *shift = format_desc->channel[s_swizzle].shift;
    sz = format_desc->channel[s_swizzle].size;
    *mask = (1U << sz) - 1U;
 
@@ -451,41 +415,391 @@ get_s_shift_and_mask(const struct util_format_description *format_desc,
  * Test the depth mask. Add the number of channel which has none zero mask
  * into the occlusion counter. e.g. maskvalue is {-1, -1, -1, -1}.
  * The counter will add 4.
+ * TODO: could get that out of the fs loop.
  *
  * \param type holds element type of the mask vector.
  * \param maskvalue is the depth test mask.
  * \param counter is a pointer of the uint32 counter.
  */
-static void
-lp_build_occlusion_count(LLVMBuilderRef builder,
+void
+lp_build_occlusion_count(struct gallivm_state *gallivm,
                          struct lp_type type,
                          LLVMValueRef maskvalue,
                          LLVMValueRef counter)
 {
-   LLVMValueRef countmask = lp_build_const_int_vec(type, 1);
-   LLVMValueRef countv = LLVMBuildAnd(builder, maskvalue, countmask, "countv");
-   LLVMTypeRef i8v16 = LLVMVectorType(LLVMInt8Type(), 16);
-   LLVMValueRef counti = LLVMBuildBitCast(builder, countv, i8v16, "counti");
-   LLVMValueRef maskarray[4] = {
-      LLVMConstInt(LLVMInt32Type(), 0, 0),
-      LLVMConstInt(LLVMInt32Type(), 4, 0),
-      LLVMConstInt(LLVMInt32Type(), 8, 0),
-      LLVMConstInt(LLVMInt32Type(), 12, 0),
-   };
-   LLVMValueRef shufflemask = LLVMConstVector(maskarray, 4);
-   LLVMValueRef shufflev =  LLVMBuildShuffleVector(builder, counti, LLVMGetUndef(i8v16), shufflemask, "shufflev");
-   LLVMValueRef shuffle = LLVMBuildBitCast(builder, shufflev, LLVMInt32Type(), "shuffle");
-   LLVMValueRef count = lp_build_intrinsic_unary(builder, "llvm.ctpop.i32", LLVMInt32Type(), shuffle);
-   LLVMValueRef orig = LLVMBuildLoad(builder, counter, "orig");
-   LLVMValueRef incr = LLVMBuildAdd(builder, orig, count, "incr");
-   LLVMBuildStore(builder, incr, counter);
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMContextRef context = gallivm->context;
+   LLVMValueRef countmask = lp_build_const_int_vec(gallivm, type, 1);
+   LLVMValueRef count, newcount;
+
+   assert(type.length <= 16);
+   assert(type.floating);
+
+   if(util_cpu_caps.has_sse && type.length == 4) {
+      const char *movmskintr = "llvm.x86.sse.movmsk.ps";
+      const char *popcntintr = "llvm.ctpop.i32";
+      LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue,
+                                           lp_build_vec_type(gallivm, type), "");
+      bits = lp_build_intrinsic_unary(builder, movmskintr,
+                                      LLVMInt32TypeInContext(context), bits);
+      count = lp_build_intrinsic_unary(builder, popcntintr,
+                                       LLVMInt32TypeInContext(context), bits);
+      count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), "");
+   }
+   else if(util_cpu_caps.has_avx && type.length == 8) {
+      const char *movmskintr = "llvm.x86.avx.movmsk.ps.256";
+      const char *popcntintr = "llvm.ctpop.i32";
+      LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue,
+                                           lp_build_vec_type(gallivm, type), "");
+      bits = lp_build_intrinsic_unary(builder, movmskintr,
+                                      LLVMInt32TypeInContext(context), bits);
+      count = lp_build_intrinsic_unary(builder, popcntintr,
+                                       LLVMInt32TypeInContext(context), bits);
+      count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), "");
+   }
+   else {
+      unsigned i;
+      LLVMValueRef countv = LLVMBuildAnd(builder, maskvalue, countmask, "countv");
+      LLVMTypeRef counttype = LLVMIntTypeInContext(context, type.length * 8);
+      LLVMTypeRef i8vntype = LLVMVectorType(LLVMInt8TypeInContext(context), type.length * 4);
+      LLVMValueRef shufflev, countd;
+      LLVMValueRef shuffles[16];
+      const char *popcntintr = NULL;
+
+      countv = LLVMBuildBitCast(builder, countv, i8vntype, "");
+
+       for (i = 0; i < type.length; i++) {
+          shuffles[i] = lp_build_const_int32(gallivm, 4*i);
+       }
+
+       shufflev = LLVMConstVector(shuffles, type.length);
+       countd = LLVMBuildShuffleVector(builder, countv, LLVMGetUndef(i8vntype), shufflev, "");
+       countd = LLVMBuildBitCast(builder, countd, counttype, "countd");
+
+       /*
+        * XXX FIXME
+        * this is bad on cpus without popcount (on x86 supported by intel
+        * nehalem, amd barcelona, and up - not tied to sse42).
+        * Would be much faster to just sum the 4 elements of the vector with
+        * some horizontal add (shuffle/add/shuffle/add after the initial and).
+        */
+       switch (type.length) {
+       case 4:
+          popcntintr = "llvm.ctpop.i32";
+          break;
+       case 8:
+          popcntintr = "llvm.ctpop.i64";
+          break;
+       case 16:
+          popcntintr = "llvm.ctpop.i128";
+          break;
+       default:
+          assert(0);
+       }
+       count = lp_build_intrinsic_unary(builder, popcntintr, counttype, countd);
+
+       if (type.length > 8) {
+          count = LLVMBuildTrunc(builder, count, LLVMIntTypeInContext(context, 64), "");
+       }
+       else if (type.length < 8) {
+          count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), "");
+       }
+   }
+   newcount = LLVMBuildLoad(builder, counter, "origcount");
+   newcount = LLVMBuildAdd(builder, newcount, count, "newcount");
+   LLVMBuildStore(builder, newcount, counter);
+}
+
+
+/**
+ * Load depth/stencil values.
+ * The stored values are linear, swizzle them.
+ *
+ * \param type  the data type of the fragment depth/stencil values
+ * \param format_desc  description of the depth/stencil surface
+ * \param is_1d  whether this resource has only one dimension
+ * \param loop_counter  the current loop iteration
+ * \param depth_ptr  pointer to the depth/stencil values of this 4x4 block
+ * \param depth_stride  stride of the depth/stencil buffer
+ * \param z_fb  contains z values loaded from fb (may include padding)
+ * \param s_fb  contains s values loaded from fb (may include padding)
+ */
+void
+lp_build_depth_stencil_load_swizzled(struct gallivm_state *gallivm,
+                                     struct lp_type z_src_type,
+                                     const struct util_format_description *format_desc,
+                                     boolean is_1d,
+                                     LLVMValueRef depth_ptr,
+                                     LLVMValueRef depth_stride,
+                                     LLVMValueRef *z_fb,
+                                     LLVMValueRef *s_fb,
+                                     LLVMValueRef loop_counter)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4];
+   LLVMValueRef zs_dst1, zs_dst2;
+   LLVMValueRef zs_dst_ptr;
+   LLVMValueRef depth_offset1, depth_offset2;
+   LLVMTypeRef load_ptr_type;
+   unsigned depth_bytes = format_desc->block.bits / 8;
+   struct lp_type zs_type = lp_depth_type(format_desc, z_src_type.length);
+   struct lp_type zs_load_type = zs_type;
+
+   zs_load_type.length = zs_load_type.length / 2;
+   load_ptr_type = LLVMPointerType(lp_build_vec_type(gallivm, zs_load_type), 0);
+
+   if (z_src_type.length == 4) {
+      unsigned i;
+      LLVMValueRef looplsb = LLVMBuildAnd(builder, loop_counter,
+                                          lp_build_const_int32(gallivm, 1), "");
+      LLVMValueRef loopmsb = LLVMBuildAnd(builder, loop_counter,
+                                          lp_build_const_int32(gallivm, 2), "");
+      LLVMValueRef offset2 = LLVMBuildMul(builder, loopmsb,
+                                          depth_stride, "");
+      depth_offset1 = LLVMBuildMul(builder, looplsb,
+                                   lp_build_const_int32(gallivm, depth_bytes * 2), "");
+      depth_offset1 = LLVMBuildAdd(builder, depth_offset1, offset2, "");
+
+      /* just concatenate the loaded 2x2 values into 4-wide vector */
+      for (i = 0; i < 4; i++) {
+         shuffles[i] = lp_build_const_int32(gallivm, i);
+      }
+   }
+   else {
+      unsigned i;
+      LLVMValueRef loopx2 = LLVMBuildShl(builder, loop_counter,
+                                         lp_build_const_int32(gallivm, 1), "");
+      assert(z_src_type.length == 8);
+      depth_offset1 = LLVMBuildMul(builder, loopx2, depth_stride, "");
+      /*
+       * We load 2x4 values, and need to swizzle them (order
+       * 0,1,4,5,2,3,6,7) - not so hot with avx unfortunately.
+       */
+      for (i = 0; i < 8; i++) {
+         shuffles[i] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2);
+      }
+   }
+
+   depth_offset2 = LLVMBuildAdd(builder, depth_offset1, depth_stride, "");
+
+   /* Load current z/stencil values from z/stencil buffer */
+   zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, &depth_offset1, 1, "");
+   zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, "");
+   zs_dst1 = LLVMBuildLoad(builder, zs_dst_ptr, "");
+   if (is_1d) {
+      zs_dst2 = lp_build_undef(gallivm, zs_load_type);
+   }
+   else {
+      zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, &depth_offset2, 1, "");
+      zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, "");
+      zs_dst2 = LLVMBuildLoad(builder, zs_dst_ptr, "");
+   }
+
+   *z_fb = LLVMBuildShuffleVector(builder, zs_dst1, zs_dst2,
+                                  LLVMConstVector(shuffles, zs_type.length), "");
+   *s_fb = *z_fb;
+
+   if (format_desc->block.bits < z_src_type.width) {
+      /* Extend destination ZS values (e.g., when reading from Z16_UNORM) */
+      *z_fb = LLVMBuildZExt(builder, *z_fb,
+                            lp_build_int_vec_type(gallivm, z_src_type), "");
+   }
+
+   else if (format_desc->block.bits > 32) {
+      /* rely on llvm to handle too wide vector we have here nicely */
+      unsigned i;
+      struct lp_type typex2 = zs_type;
+      struct lp_type s_type = zs_type;
+      LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 4];
+      LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 4];
+      LLVMValueRef tmp;
+
+      typex2.width = typex2.width / 2;
+      typex2.length = typex2.length * 2;
+      s_type.width = s_type.width / 2;
+      s_type.floating = 0;
+
+      tmp = LLVMBuildBitCast(builder, *z_fb,
+                             lp_build_vec_type(gallivm, typex2), "");
+
+      for (i = 0; i < zs_type.length; i++) {
+         shuffles1[i] = lp_build_const_int32(gallivm, i * 2);
+         shuffles2[i] = lp_build_const_int32(gallivm, i * 2 + 1);
+      }
+      *z_fb = LLVMBuildShuffleVector(builder, tmp, tmp,
+                                     LLVMConstVector(shuffles1, zs_type.length), "");
+      *s_fb = LLVMBuildShuffleVector(builder, tmp, tmp,
+                                     LLVMConstVector(shuffles2, zs_type.length), "");
+      *s_fb = LLVMBuildBitCast(builder, *s_fb,
+                               lp_build_vec_type(gallivm, s_type), "");
+      lp_build_name(*s_fb, "s_dst");
+   }
+
+   lp_build_name(*z_fb, "z_dst");
+   lp_build_name(*s_fb, "s_dst");
+   lp_build_name(*z_fb, "z_dst");
 }
 
+/**
+ * Store depth/stencil values.
+ * Incoming values are swizzled (typically n 2x2 quads), stored linear.
+ * If there's a mask it will do select/store otherwise just store.
+ *
+ * \param type  the data type of the fragment depth/stencil values
+ * \param format_desc  description of the depth/stencil surface
+ * \param is_1d  whether this resource has only one dimension
+ * \param mask  the alive/dead pixel mask for the quad (vector)
+ * \param z_fb  z values read from fb (with padding)
+ * \param s_fb  s values read from fb (with padding)
+ * \param loop_counter  the current loop iteration
+ * \param depth_ptr  pointer to the depth/stencil values of this 4x4 block
+ * \param depth_stride  stride of the depth/stencil buffer
+ * \param z_value the depth values to store (with padding)
+ * \param s_value the stencil values to store (with padding)
+ */
+void
+lp_build_depth_stencil_write_swizzled(struct gallivm_state *gallivm,
+                                      struct lp_type z_src_type,
+                                      const struct util_format_description *format_desc,
+                                      boolean is_1d,
+                                      struct lp_build_mask_context *mask,
+                                      LLVMValueRef z_fb,
+                                      LLVMValueRef s_fb,
+                                      LLVMValueRef loop_counter,
+                                      LLVMValueRef depth_ptr,
+                                      LLVMValueRef depth_stride,
+                                      LLVMValueRef z_value,
+                                      LLVMValueRef s_value)
+{
+   struct lp_build_context z_bld;
+   LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4];
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef mask_value = NULL;
+   LLVMValueRef zs_dst1, zs_dst2;
+   LLVMValueRef zs_dst_ptr1, zs_dst_ptr2;
+   LLVMValueRef depth_offset1, depth_offset2;
+   LLVMTypeRef load_ptr_type;
+   unsigned depth_bytes = format_desc->block.bits / 8;
+   struct lp_type zs_type = lp_depth_type(format_desc, z_src_type.length);
+   struct lp_type z_type = zs_type;
+   struct lp_type zs_load_type = zs_type;
+
+   zs_load_type.length = zs_load_type.length / 2;
+   load_ptr_type = LLVMPointerType(lp_build_vec_type(gallivm, zs_load_type), 0);
+
+   z_type.width = z_src_type.width;
+
+   lp_build_context_init(&z_bld, gallivm, z_type);
+
+   /*
+    * This is far from ideal, at least for late depth write we should do this
+    * outside the fs loop to avoid all the swizzle stuff.
+    */
+   if (z_src_type.length == 4) {
+      LLVMValueRef looplsb = LLVMBuildAnd(builder, loop_counter,
+                                          lp_build_const_int32(gallivm, 1), "");
+      LLVMValueRef loopmsb = LLVMBuildAnd(builder, loop_counter,
+                                          lp_build_const_int32(gallivm, 2), "");
+      LLVMValueRef offset2 = LLVMBuildMul(builder, loopmsb,
+                                          depth_stride, "");
+      depth_offset1 = LLVMBuildMul(builder, looplsb,
+                                   lp_build_const_int32(gallivm, depth_bytes * 2), "");
+      depth_offset1 = LLVMBuildAdd(builder, depth_offset1, offset2, "");
+   }
+   else {
+      unsigned i;
+      LLVMValueRef loopx2 = LLVMBuildShl(builder, loop_counter,
+                                         lp_build_const_int32(gallivm, 1), "");
+      assert(z_src_type.length == 8);
+      depth_offset1 = LLVMBuildMul(builder, loopx2, depth_stride, "");
+      /*
+       * We load 2x4 values, and need to swizzle them (order
+       * 0,1,4,5,2,3,6,7) - not so hot with avx unfortunately.
+       */
+      for (i = 0; i < 8; i++) {
+         shuffles[i] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2);
+      }
+   }
+
+   depth_offset2 = LLVMBuildAdd(builder, depth_offset1, depth_stride, "");
+
+   zs_dst_ptr1 = LLVMBuildGEP(builder, depth_ptr, &depth_offset1, 1, "");
+   zs_dst_ptr1 = LLVMBuildBitCast(builder, zs_dst_ptr1, load_ptr_type, "");
+   zs_dst_ptr2 = LLVMBuildGEP(builder, depth_ptr, &depth_offset2, 1, "");
+   zs_dst_ptr2 = LLVMBuildBitCast(builder, zs_dst_ptr2, load_ptr_type, "");
+
+   if (format_desc->block.bits > 32) {
+      s_value = LLVMBuildBitCast(builder, s_value, z_bld.vec_type, "");
+   }
+
+   if (mask) {
+      mask_value = lp_build_mask_value(mask);
+      z_value = lp_build_select(&z_bld, mask_value, z_value, z_fb);
+      if (format_desc->block.bits > 32) {
+         s_fb = LLVMBuildBitCast(builder, s_fb, z_bld.vec_type, "");
+         s_value = lp_build_select(&z_bld, mask_value, s_value, s_fb);
+      }
+   }
+
+   if (zs_type.width < z_src_type.width) {
+      /* Truncate ZS values (e.g., when writing to Z16_UNORM) */
+      z_value = LLVMBuildTrunc(builder, z_value,
+                               lp_build_int_vec_type(gallivm, zs_type), "");
+   }
+
+   if (format_desc->block.bits <= 32) {
+      if (z_src_type.length == 4) {
+         zs_dst1 = lp_build_extract_range(gallivm, z_value, 0, 2);
+         zs_dst2 = lp_build_extract_range(gallivm, z_value, 2, 2);
+      }
+      else {
+         assert(z_src_type.length == 8);
+         zs_dst1 = LLVMBuildShuffleVector(builder, z_value, z_value,
+                                          LLVMConstVector(&shuffles[0],
+                                                          zs_load_type.length), "");
+         zs_dst2 = LLVMBuildShuffleVector(builder, z_value, z_value,
+                                          LLVMConstVector(&shuffles[4],
+                                                          zs_load_type.length), "");
+      }
+   }
+   else {
+      if (z_src_type.length == 4) {
+         zs_dst1 = lp_build_interleave2(gallivm, z_type,
+                                        z_value, s_value, 0);
+         zs_dst2 = lp_build_interleave2(gallivm, z_type,
+                                        z_value, s_value, 1);
+      }
+      else {
+         unsigned i;
+         LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 2];
+         assert(z_src_type.length == 8);
+         for (i = 0; i < 8; i++) {
+            shuffles[i*2] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2);
+            shuffles[i*2+1] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2 +
+                                                   z_src_type.length);
+         }
+         zs_dst1 = LLVMBuildShuffleVector(builder, z_value, s_value,
+                                          LLVMConstVector(&shuffles[0],
+                                                          z_src_type.length), "");
+         zs_dst2 = LLVMBuildShuffleVector(builder, z_value, s_value,
+                                          LLVMConstVector(&shuffles[8],
+                                                          z_src_type.length), "");
+      }
+      zs_dst1 = LLVMBuildBitCast(builder, zs_dst1,
+                                 lp_build_vec_type(gallivm, zs_load_type), "");
+      zs_dst2 = LLVMBuildBitCast(builder, zs_dst2,
+                                 lp_build_vec_type(gallivm, zs_load_type), "");
+   }
 
+   LLVMBuildStore(builder, zs_dst1, zs_dst_ptr1);
+   if (!is_1d) {
+      LLVMBuildStore(builder, zs_dst2, zs_dst_ptr2);
+   }
+}
 
 /**
  * Generate code for performing depth and/or stencil tests.
- * We operate on a vector of values (typically a 2x2 quad).
+ * We operate on a vector of values (typically n 2x2 quads).
  *
  * \param depth  the depth test state
  * \param stencil  the front/back stencil state
@@ -493,31 +807,66 @@ lp_build_occlusion_count(LLVMBuilderRef builder,
  * \param format_desc  description of the depth/stencil surface
  * \param mask  the alive/dead pixel mask for the quad (vector)
  * \param stencil_refs  the front/back stencil ref values (scalar)
- * \param z_src  the incoming depth/stencil values (a 2x2 quad)
- * \param zs_dst_ptr  pointer to depth/stencil values in framebuffer
- * \param facing  contains float value indicating front/back facing polygon
+ * \param z_src  the incoming depth/stencil values (n 2x2 quad values, float32)
+ * \param zs_dst  the depth/stencil values in framebuffer
+ * \param face  contains boolean value indicating front/back facing polygon
  */
 void
-lp_build_depth_stencil_test(LLVMBuilderRef builder,
+lp_build_depth_stencil_test(struct gallivm_state *gallivm,
                             const struct pipe_depth_state *depth,
                             const struct pipe_stencil_state stencil[2],
-                            struct lp_type type,
+                            struct lp_type z_src_type,
                             const struct util_format_description *format_desc,
                             struct lp_build_mask_context *mask,
                             LLVMValueRef stencil_refs[2],
                             LLVMValueRef z_src,
-                            LLVMValueRef zs_dst_ptr,
+                            LLVMValueRef z_fb,
+                            LLVMValueRef s_fb,
                             LLVMValueRef face,
-                            LLVMValueRef counter)
+                            LLVMValueRef *z_value,
+                            LLVMValueRef *s_value,
+                            boolean do_branch)
 {
-   struct lp_build_context bld;
-   struct lp_build_context sbld;
+   LLVMBuilderRef builder = gallivm->builder;
+   struct lp_type z_type;
+   struct lp_build_context z_bld;
+   struct lp_build_context s_bld;
    struct lp_type s_type;
-   LLVMValueRef zs_dst, z_dst = NULL;
+   unsigned z_shift = 0, z_width = 0, z_mask = 0;
+   LLVMValueRef z_dst = NULL;
    LLVMValueRef stencil_vals = NULL;
    LLVMValueRef z_bitmask = NULL, stencil_shift = NULL;
    LLVMValueRef z_pass = NULL, s_pass_mask = NULL;
-   LLVMValueRef orig_mask = mask->value;
+   LLVMValueRef current_mask = lp_build_mask_value(mask);
+   LLVMValueRef front_facing = NULL;
+   boolean have_z, have_s;
+
+   /*
+    * Depths are expected to be between 0 and 1, even if they are stored in
+    * floats. Setting these bits here will ensure that the lp_build_conv() call
+    * below won't try to unnecessarily clamp the incoming values.
+    */
+   if(z_src_type.floating) {
+      z_src_type.sign = FALSE;
+      z_src_type.norm = TRUE;
+   }
+   else {
+      assert(!z_src_type.sign);
+      assert(z_src_type.norm);
+   }
+
+   /* Pick the type matching the depth-stencil format. */
+   z_type = lp_depth_type(format_desc, z_src_type.length);
+
+   /* Pick the intermediate type for depth operations. */
+   z_type.width = z_src_type.width;
+   assert(z_type.length == z_src_type.length);
+
+   /* FIXME: for non-float depth/stencil might generate better code
+    * if we'd always split it up to use 128bit operations.
+    * For stencil we'd almost certainly want to pack to 8xi16 values,
+    * for z just run twice.
+    */
 
    /* Sanity checking */
    {
@@ -534,108 +883,156 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
       assert(format_desc->block.height == 1);
 
       if (stencil[0].enabled) {
-         assert(format_desc->format == PIPE_FORMAT_Z24_UNORM_S8_USCALED ||
-                format_desc->format == PIPE_FORMAT_S8_USCALED_Z24_UNORM);
+         assert(s_swizzle < 4);
+         assert(format_desc->channel[s_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED);
+         assert(format_desc->channel[s_swizzle].pure_integer);
+         assert(!format_desc->channel[s_swizzle].normalized);
+         assert(format_desc->channel[s_swizzle].size == 8);
       }
 
-      assert(z_swizzle < 4);
-      assert(format_desc->block.bits == type.width);
-      if (type.floating) {
-         assert(z_swizzle == 0);
-         assert(format_desc->channel[z_swizzle].type ==
-                UTIL_FORMAT_TYPE_FLOAT);
-         assert(format_desc->channel[z_swizzle].size ==
-                format_desc->block.bits);
-      }
-      else {
-         assert(format_desc->channel[z_swizzle].type ==
-                UTIL_FORMAT_TYPE_UNSIGNED);
-         assert(format_desc->channel[z_swizzle].normalized);
-         assert(!type.fixed);
-         assert(!type.sign);
-         assert(type.norm);
+      if (depth->enabled) {
+         assert(z_swizzle < 4);
+         if (z_type.floating) {
+            assert(z_swizzle == 0);
+            assert(format_desc->channel[z_swizzle].type ==
+                   UTIL_FORMAT_TYPE_FLOAT);
+            assert(format_desc->channel[z_swizzle].size == 32);
+         }
+         else {
+            assert(format_desc->channel[z_swizzle].type ==
+                   UTIL_FORMAT_TYPE_UNSIGNED);
+            assert(format_desc->channel[z_swizzle].normalized);
+            assert(!z_type.fixed);
+         }
       }
    }
 
 
    /* Setup build context for Z vals */
-   lp_build_context_init(&bld, builder, type);
+   lp_build_context_init(&z_bld, gallivm, z_type);
 
    /* Setup build context for stencil vals */
-   s_type = lp_type_int_vec(type.width);
-   lp_build_context_init(&sbld, builder, s_type);
-
-   /* Load current z/stencil value from z/stencil buffer */
-   zs_dst = LLVMBuildLoad(builder, zs_dst_ptr, "");
-
-   lp_build_name(zs_dst, "zsbufval");
-
+   s_type = lp_int_type(z_type);
+   lp_build_context_init(&s_bld, gallivm, s_type);
 
    /* Compute and apply the Z/stencil bitmasks and shifts.
     */
    {
-      unsigned z_shift, z_mask;
       unsigned s_shift, s_mask;
 
-      if (get_z_shift_and_mask(format_desc, &z_shift, &z_mask)) {
-         if (z_shift) {
-            LLVMValueRef shift = lp_build_const_int_vec(type, z_shift);
-            z_src = LLVMBuildLShr(builder, z_src, shift, "");
-         }
+      z_dst = z_fb;
+      stencil_vals = s_fb;
+
+      have_z = get_z_shift_and_mask(format_desc, &z_shift, &z_width, &z_mask);
+      have_s = get_s_shift_and_mask(format_desc, &s_shift, &s_mask);
 
+      if (have_z) {
          if (z_mask != 0xffffffff) {
-            LLVMValueRef mask = lp_build_const_int_vec(type, z_mask);
-            z_src = LLVMBuildAnd(builder, z_src, mask, "");
-            z_dst = LLVMBuildAnd(builder, zs_dst, mask, "");
-            z_bitmask = mask;  /* used below */
-         }
-         else {
-            z_dst = zs_dst;
+            z_bitmask = lp_build_const_int_vec(gallivm, z_type, z_mask);
          }
 
-         lp_build_name(z_dst, "zsbuf.z");
+         /*
+          * Align the framebuffer Z 's LSB to the right.
+          */
+         if (z_shift) {
+            LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_type, z_shift);
+            z_dst = LLVMBuildLShr(builder, z_dst, shift, "z_dst");
+         } else if (z_bitmask) {
+            z_dst = LLVMBuildAnd(builder, z_dst, z_bitmask, "z_dst");
+         } else {
+            lp_build_name(z_dst, "z_dst");
+         }
       }
 
-      if (get_s_shift_and_mask(format_desc, &s_shift, &s_mask)) {
+      if (have_s) {
          if (s_shift) {
-            LLVMValueRef shift = lp_build_const_int_vec(type, s_shift);
-            stencil_vals = LLVMBuildLShr(builder, zs_dst, shift, "");
+            LLVMValueRef shift = lp_build_const_int_vec(gallivm, s_type, s_shift);
+            stencil_vals = LLVMBuildLShr(builder, stencil_vals, shift, "");
             stencil_shift = shift;  /* used below */
          }
-         else {
-            stencil_vals = zs_dst;
-         }
 
          if (s_mask != 0xffffffff) {
-            LLVMValueRef mask = lp_build_const_int_vec(type, s_mask);
+            LLVMValueRef mask = lp_build_const_int_vec(gallivm, s_type, s_mask);
             stencil_vals = LLVMBuildAnd(builder, stencil_vals, mask, "");
          }
 
-         lp_build_name(stencil_vals, "stencil");
+         lp_build_name(stencil_vals, "s_dst");
       }
    }
 
-
    if (stencil[0].enabled) {
-      /* convert scalar stencil refs into vectors */
-      stencil_refs[0] = lp_build_broadcast_scalar(&bld, stencil_refs[0]);
-      stencil_refs[1] = lp_build_broadcast_scalar(&bld, stencil_refs[1]);
 
-      s_pass_mask = lp_build_stencil_test(&sbld, stencil,
-                                          stencil_refs, stencil_vals, face);
+      if (face) {
+         LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
+
+         /* front_facing = face != 0 ? ~0 : 0 */
+         front_facing = LLVMBuildICmp(builder, LLVMIntNE, face, zero, "");
+         front_facing = LLVMBuildSExt(builder, front_facing,
+                                      LLVMIntTypeInContext(gallivm->context,
+                                             s_bld.type.length*s_bld.type.width),
+                                      "");
+         front_facing = LLVMBuildBitCast(builder, front_facing,
+                                         s_bld.int_vec_type, "");
+      }
+
+      s_pass_mask = lp_build_stencil_test(&s_bld, stencil,
+                                          stencil_refs, stencil_vals,
+                                          front_facing);
 
       /* apply stencil-fail operator */
       {
-         LLVMValueRef s_fail_mask = lp_build_andc(&bld, orig_mask, s_pass_mask);
-         stencil_vals = lp_build_stencil_op(&sbld, stencil, S_FAIL_OP,
+         LLVMValueRef s_fail_mask = lp_build_andnot(&s_bld, current_mask, s_pass_mask);
+         stencil_vals = lp_build_stencil_op(&s_bld, stencil, S_FAIL_OP,
                                             stencil_refs, stencil_vals,
-                                            s_fail_mask, face);
+                                            s_fail_mask, front_facing);
       }
    }
 
    if (depth->enabled) {
+      /*
+       * Convert fragment Z to the desired type, aligning the LSB to the right.
+       */
+
+      assert(z_type.width == z_src_type.width);
+      assert(z_type.length == z_src_type.length);
+      assert(lp_check_value(z_src_type, z_src));
+      if (z_src_type.floating) {
+         /*
+          * Convert from floating point values
+          */
+
+         if (!z_type.floating) {
+            z_src = lp_build_clamped_float_to_unsigned_norm(gallivm,
+                                                            z_src_type,
+                                                            z_width,
+                                                            z_src);
+         }
+      } else {
+         /*
+          * Convert from unsigned normalized values.
+          */
+
+         assert(!z_src_type.sign);
+         assert(!z_src_type.fixed);
+         assert(z_src_type.norm);
+         assert(!z_type.floating);
+         if (z_src_type.width > z_width) {
+            LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_src_type,
+                                                        z_src_type.width - z_width);
+            z_src = LLVMBuildLShr(builder, z_src, shift, "");
+         }
+      }
+      assert(lp_check_value(z_type, z_src));
+
+      lp_build_name(z_src, "z_src");
+
       /* compare src Z to dst Z, returning 'pass' mask */
-      z_pass = lp_build_cmp(&bld, depth->func, z_src, z_dst);
+      z_pass = lp_build_cmp(&z_bld, depth->func, z_src, z_dst);
+
+      /* mask off bits that failed stencil test */
+      if (s_pass_mask) {
+         current_mask = LLVMBuildAnd(builder, current_mask, s_pass_mask, "");
+      }
 
       if (!stencil[0].enabled) {
          /* We can potentially skip all remaining operations here, but only
@@ -643,28 +1040,22 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
           * buffer values.  Don't need to update Z buffer values.
           */
          lp_build_mask_update(mask, z_pass);
+
+         if (do_branch) {
+            lp_build_mask_check(mask);
+         }
       }
 
       if (depth->writemask) {
-         LLVMValueRef zselectmask = mask->value;
+         LLVMValueRef z_pass_mask;
 
          /* mask off bits that failed Z test */
-         zselectmask = LLVMBuildAnd(builder, zselectmask, z_pass, "");
-
-         /* mask off bits that failed stencil test */
-         if (s_pass_mask) {
-            zselectmask = LLVMBuildAnd(builder, zselectmask, s_pass_mask, "");
-         }
-
-         /* if combined Z/stencil format, mask off the stencil bits */
-         if (z_bitmask) {
-            zselectmask = LLVMBuildAnd(builder, zselectmask, z_bitmask, "");
-         }
+         z_pass_mask = LLVMBuildAnd(builder, current_mask, z_pass, "");
 
          /* Mix the old and new Z buffer values.
           * z_dst[i] = zselectmask[i] ? z_src[i] : z_dst[i]
           */
-         z_dst = lp_build_select(&bld, zselectmask, z_src, z_dst);
+         z_dst = lp_build_select(&z_bld, z_pass_mask, z_src, z_dst);
       }
 
       if (stencil[0].enabled) {
@@ -672,47 +1063,50 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
          LLVMValueRef z_fail_mask, z_pass_mask;
 
          /* apply Z-fail operator */
-         z_fail_mask = lp_build_andc(&bld, orig_mask, z_pass);
-         stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_FAIL_OP,
+         z_fail_mask = lp_build_andnot(&s_bld, current_mask, z_pass);
+         stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_FAIL_OP,
                                             stencil_refs, stencil_vals,
-                                            z_fail_mask, face);
+                                            z_fail_mask, front_facing);
 
          /* apply Z-pass operator */
-         z_pass_mask = LLVMBuildAnd(bld.builder, orig_mask, z_pass, "");
-         stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_PASS_OP,
+         z_pass_mask = LLVMBuildAnd(builder, current_mask, z_pass, "");
+         stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP,
                                             stencil_refs, stencil_vals,
-                                            z_pass_mask, face);
+                                            z_pass_mask, front_facing);
       }
    }
    else {
       /* No depth test: apply Z-pass operator to stencil buffer values which
        * passed the stencil test.
        */
-      s_pass_mask = LLVMBuildAnd(bld.builder, orig_mask, s_pass_mask, "");
-      stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_PASS_OP,
+      s_pass_mask = LLVMBuildAnd(builder, current_mask, s_pass_mask, "");
+      stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP,
                                          stencil_refs, stencil_vals,
-                                         s_pass_mask, face);
+                                         s_pass_mask, front_facing);
    }
 
-   /* The Z bits are already in the right place but we may need to shift the
-    * stencil bits before ORing Z with Stencil to make the final pixel value.
-    */
+   /* Put Z and stencil bits in the right place */
+   if (have_z && z_shift) {
+      LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_type, z_shift);
+      z_dst = LLVMBuildShl(builder, z_dst, shift, "");
+   }
    if (stencil_vals && stencil_shift)
-      stencil_vals = LLVMBuildShl(bld.builder, stencil_vals,
+      stencil_vals = LLVMBuildShl(builder, stencil_vals,
                                   stencil_shift, "");
 
-   /* Finally, merge/store the z/stencil values */
-   if ((depth->enabled && depth->writemask) ||
-       (stencil[0].enabled && stencil[0].writemask)) {
-
-      if (z_dst && stencil_vals)
-         zs_dst = LLVMBuildOr(bld.builder, z_dst, stencil_vals, "");
-      else if (z_dst)
-         zs_dst = z_dst;
+   /* Finally, merge the z/stencil values */
+   if (format_desc->block.bits <= 32) {
+      if (have_z && have_s)
+         *z_value = LLVMBuildOr(builder, z_dst, stencil_vals, "");
+      else if (have_z)
+         *z_value = z_dst;
       else
-         zs_dst = stencil_vals;
-
-      LLVMBuildStore(builder, zs_dst, zs_dst_ptr);
+         *z_value = stencil_vals;
+      *s_value = *z_value;
+   }
+   else {
+      *z_value = z_dst;
+      *s_value = stencil_vals;
    }
 
    if (s_pass_mask)
@@ -720,7 +1114,5 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
 
    if (depth->enabled && stencil[0].enabled)
       lp_build_mask_update(mask, z_pass);
-
-   if (counter)
-      lp_build_occlusion_count(builder, type, mask->value, counter);
 }
+