util: Move gallium's PIPE_FORMAT utils to /util/format/
[mesa.git] / src / gallium / drivers / llvmpipe / lp_bld_depth.c
index 1cd36b87909665907b6646e3a0e3cf066a919c3a..63c2fb5d891757c073e8fa4ca8f1e9c28bca8b80 100644 (file)
@@ -50,7 +50,7 @@
  */
 
 #include "pipe/p_state.h"
-#include "util/u_format.h"
+#include "util/format/u_format.h"
 #include "util/u_cpu_detect.h"
 
 #include "gallivm/lp_bld_type.h"
@@ -313,7 +313,7 @@ lp_depth_type(const struct util_format_description *format_desc,
       if (format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_FLOAT) {
          type.floating = TRUE;
          assert(z_swizzle == 0);
-         assert(format_desc->channel[z_swizzle].size == format_desc->block.bits);
+         assert(format_desc->channel[z_swizzle].size == 32);
       }
       else if(format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED) {
          assert(format_desc->block.bits <= 32);
@@ -347,40 +347,31 @@ static boolean
 get_z_shift_and_mask(const struct util_format_description *format_desc,
                      unsigned *shift, unsigned *width, unsigned *mask)
 {
-   const unsigned total_bits = format_desc->block.bits;
+   unsigned total_bits;
    unsigned z_swizzle;
-   unsigned chan;
-   unsigned padding_left, padding_right;
-   
+
    assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
    assert(format_desc->block.width == 1);
    assert(format_desc->block.height == 1);
 
+   /* 64bit d/s format is special: the z value was already extracted into 32 bits */
+   total_bits = format_desc->block.bits > 32 ? 32 : format_desc->block.bits;
+
    z_swizzle = format_desc->swizzle[0];
 
-   if (z_swizzle == UTIL_FORMAT_SWIZZLE_NONE)
+   if (z_swizzle == PIPE_SWIZZLE_NONE)
       return FALSE;
 
    *width = format_desc->channel[z_swizzle].size;
+   /* & 31 is for the same reason as the 32-bit limit above */
+   *shift = format_desc->channel[z_swizzle].shift & 31;
 
-   padding_right = 0;
-   for (chan = 0; chan < z_swizzle; ++chan)
-      padding_right += format_desc->channel[chan].size;
-
-   padding_left =
-      total_bits - (padding_right + *width);
-
-   if (padding_left || padding_right) {
-      unsigned long long mask_left = (1ULL << (total_bits - padding_left)) - 1;
-      unsigned long long mask_right = (1ULL << (padding_right)) - 1;
-      *mask = mask_left ^ mask_right;
-   }
-   else {
+   if (*width == total_bits) {
       *mask = 0xffffffff;
+   } else {
+      *mask = ((1 << *width) - 1) << *shift;
    }
 
-   *shift = padding_right;
-
    return TRUE;
 }
 
@@ -395,17 +386,23 @@ get_s_shift_and_mask(const struct util_format_description *format_desc,
                      unsigned *shift, unsigned *mask)
 {
    unsigned s_swizzle;
-   unsigned chan, sz;
+   unsigned sz;
 
    s_swizzle = format_desc->swizzle[1];
 
-   if (s_swizzle == UTIL_FORMAT_SWIZZLE_NONE)
+   if (s_swizzle == PIPE_SWIZZLE_NONE)
       return FALSE;
 
-   *shift = 0;
-   for (chan = 0; chan < s_swizzle; chan++)
-      *shift += format_desc->channel[chan].size;
+   /* just special case 64bit d/s format */
+   if (format_desc->block.bits > 32) {
+      /* XXX big-endian? */
+      assert(format_desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
+      *shift = 0;
+      *mask = 0xff;
+      return TRUE;
+   }
 
+   *shift = format_desc->channel[s_swizzle].shift;
    sz = format_desc->channel[s_swizzle].size;
    *mask = (1U << sz) - 1U;
 
@@ -418,6 +415,7 @@ get_s_shift_and_mask(const struct util_format_description *format_desc,
  * Test the depth mask. Add the number of channel which has none zero mask
  * into the occlusion counter. e.g. maskvalue is {-1, -1, -1, -1}.
  * The counter will add 4.
+ * TODO: could get that out of the fs loop.
  *
  * \param type holds element type of the mask vector.
  * \param maskvalue is the depth test mask.
@@ -446,6 +444,7 @@ lp_build_occlusion_count(struct gallivm_state *gallivm,
                                       LLVMInt32TypeInContext(context), bits);
       count = lp_build_intrinsic_unary(builder, popcntintr,
                                        LLVMInt32TypeInContext(context), bits);
+      count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), "");
    }
    else if(util_cpu_caps.has_avx && type.length == 8) {
       const char *movmskintr = "llvm.x86.avx.movmsk.ps.256";
@@ -456,6 +455,7 @@ lp_build_occlusion_count(struct gallivm_state *gallivm,
                                       LLVMInt32TypeInContext(context), bits);
       count = lp_build_intrinsic_unary(builder, popcntintr,
                                        LLVMInt32TypeInContext(context), bits);
+      count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), "");
    }
    else {
       unsigned i;
@@ -498,8 +498,11 @@ lp_build_occlusion_count(struct gallivm_state *gallivm,
        }
        count = lp_build_intrinsic_unary(builder, popcntintr, counttype, countd);
 
-       if (type.length > 4) {
-          count = LLVMBuildTrunc(builder, count, LLVMIntTypeInContext(context, 32), "");
+       if (type.length > 8) {
+          count = LLVMBuildTrunc(builder, count, LLVMIntTypeInContext(context, 64), "");
+       }
+       else if (type.length < 8) {
+          count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), "");
        }
    }
    newcount = LLVMBuildLoad(builder, counter, "origcount");
@@ -514,27 +517,36 @@ lp_build_occlusion_count(struct gallivm_state *gallivm,
  *
  * \param type  the data type of the fragment depth/stencil values
  * \param format_desc  description of the depth/stencil surface
+ * \param is_1d  whether this resource has only one dimension
  * \param loop_counter  the current loop iteration
  * \param depth_ptr  pointer to the depth/stencil values of this 4x4 block
  * \param depth_stride  stride of the depth/stencil buffer
+ * \param z_fb  contains z values loaded from fb (may include padding)
+ * \param s_fb  contains s values loaded from fb (may include padding)
  */
-LLVMValueRef
+void
 lp_build_depth_stencil_load_swizzled(struct gallivm_state *gallivm,
                                      struct lp_type z_src_type,
                                      const struct util_format_description *format_desc,
+                                     boolean is_1d,
                                      LLVMValueRef depth_ptr,
                                      LLVMValueRef depth_stride,
+                                     LLVMValueRef *z_fb,
+                                     LLVMValueRef *s_fb,
                                      LLVMValueRef loop_counter)
 {
    LLVMBuilderRef builder = gallivm->builder;
    LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4];
-   LLVMValueRef zs_dst, zs_dst1, zs_dst2;
+   LLVMValueRef zs_dst1, zs_dst2;
    LLVMValueRef zs_dst_ptr;
    LLVMValueRef depth_offset1, depth_offset2;
-   unsigned depth_bits = format_desc->block.bits/8;
+   LLVMTypeRef load_ptr_type;
+   unsigned depth_bytes = format_desc->block.bits / 8;
    struct lp_type zs_type = lp_depth_type(format_desc, z_src_type.length);
    struct lp_type zs_load_type = zs_type;
+
    zs_load_type.length = zs_load_type.length / 2;
+   load_ptr_type = LLVMPointerType(lp_build_vec_type(gallivm, zs_load_type), 0);
 
    if (z_src_type.length == 4) {
       unsigned i;
@@ -545,7 +557,7 @@ lp_build_depth_stencil_load_swizzled(struct gallivm_state *gallivm,
       LLVMValueRef offset2 = LLVMBuildMul(builder, loopmsb,
                                           depth_stride, "");
       depth_offset1 = LLVMBuildMul(builder, looplsb,
-                                   lp_build_const_int32(gallivm, depth_bits * 2), "");
+                                   lp_build_const_int32(gallivm, depth_bytes * 2), "");
       depth_offset1 = LLVMBuildAdd(builder, depth_offset1, offset2, "");
 
       /* just concatenate the loaded 2x2 values into 4-wide vector */
@@ -564,7 +576,6 @@ lp_build_depth_stencil_load_swizzled(struct gallivm_state *gallivm,
        * 0,1,4,5,2,3,6,7) - not so hot with avx unfortunately.
        */
       for (i = 0; i < 8; i++) {
-
          shuffles[i] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2);
       }
    }
@@ -573,72 +584,118 @@ lp_build_depth_stencil_load_swizzled(struct gallivm_state *gallivm,
 
    /* Load current z/stencil values from z/stencil buffer */
    zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, &depth_offset1, 1, "");
-   zs_dst_ptr = LLVMBuildBitCast(builder,
-                                 zs_dst_ptr,
-                                 LLVMPointerType(lp_build_vec_type(gallivm, zs_load_type), 0), "");
+   zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, "");
    zs_dst1 = LLVMBuildLoad(builder, zs_dst_ptr, "");
-   zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, &depth_offset2, 1, "");
-   zs_dst_ptr = LLVMBuildBitCast(builder,
-                                 zs_dst_ptr,
-                                 LLVMPointerType(lp_build_vec_type(gallivm, zs_load_type), 0), "");
-   zs_dst2 = LLVMBuildLoad(builder, zs_dst_ptr, "");
+   if (is_1d) {
+      zs_dst2 = lp_build_undef(gallivm, zs_load_type);
+   }
+   else {
+      zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, &depth_offset2, 1, "");
+      zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, "");
+      zs_dst2 = LLVMBuildLoad(builder, zs_dst_ptr, "");
+   }
 
-   zs_dst = LLVMBuildShuffleVector(builder, zs_dst1, zs_dst2,
-                                   LLVMConstVector(shuffles, zs_type.length), "");
+   *z_fb = LLVMBuildShuffleVector(builder, zs_dst1, zs_dst2,
+                                  LLVMConstVector(shuffles, zs_type.length), "");
+   *s_fb = *z_fb;
 
    if (format_desc->block.bits < z_src_type.width) {
       /* Extend destination ZS values (e.g., when reading from Z16_UNORM) */
-      zs_dst = LLVMBuildZExt(builder, zs_dst, lp_build_int_vec_type(gallivm, z_src_type), "");
+      *z_fb = LLVMBuildZExt(builder, *z_fb,
+                            lp_build_int_vec_type(gallivm, z_src_type), "");
    }
 
-   lp_build_name(zs_dst, "zs_dst");
+   else if (format_desc->block.bits > 32) {
+      /* rely on llvm to nicely handle the too-wide vector we have here */
+      unsigned i;
+      struct lp_type typex2 = zs_type;
+      struct lp_type s_type = zs_type;
+      LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 4];
+      LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 4];
+      LLVMValueRef tmp;
+
+      typex2.width = typex2.width / 2;
+      typex2.length = typex2.length * 2;
+      s_type.width = s_type.width / 2;
+      s_type.floating = 0;
+
+      tmp = LLVMBuildBitCast(builder, *z_fb,
+                             lp_build_vec_type(gallivm, typex2), "");
+
+      for (i = 0; i < zs_type.length; i++) {
+         shuffles1[i] = lp_build_const_int32(gallivm, i * 2);
+         shuffles2[i] = lp_build_const_int32(gallivm, i * 2 + 1);
+      }
+      *z_fb = LLVMBuildShuffleVector(builder, tmp, tmp,
+                                     LLVMConstVector(shuffles1, zs_type.length), "");
+      *s_fb = LLVMBuildShuffleVector(builder, tmp, tmp,
+                                     LLVMConstVector(shuffles2, zs_type.length), "");
+      *s_fb = LLVMBuildBitCast(builder, *s_fb,
+                               lp_build_vec_type(gallivm, s_type), "");
+      lp_build_name(*s_fb, "s_dst");
+   }
 
-   return zs_dst;
+   lp_build_name(*z_fb, "z_dst");
+   lp_build_name(*s_fb, "s_dst");
+   lp_build_name(*z_fb, "z_dst");
 }
 
 /**
  * Store depth/stencil values.
  * Incoming values are swizzled (typically n 2x2 quads), stored linear.
- * If there's a mask it will do reload/select/store otherwise just store.
+ * If there's a mask it will do select/store, otherwise just store.
  *
  * \param type  the data type of the fragment depth/stencil values
  * \param format_desc  description of the depth/stencil surface
+ * \param is_1d  whether this resource has only one dimension
  * \param mask  the alive/dead pixel mask for the quad (vector)
+ * \param z_fb  z values read from fb (with padding)
+ * \param s_fb  s values read from fb (with padding)
  * \param loop_counter  the current loop iteration
  * \param depth_ptr  pointer to the depth/stencil values of this 4x4 block
  * \param depth_stride  stride of the depth/stencil buffer
- * \param zs_value  the depth/stencil values to store
+ * \param z_value the depth values to store (with padding)
+ * \param s_value the stencil values to store (with padding)
  */
 void
 lp_build_depth_stencil_write_swizzled(struct gallivm_state *gallivm,
                                       struct lp_type z_src_type,
                                       const struct util_format_description *format_desc,
+                                      boolean is_1d,
                                       struct lp_build_mask_context *mask,
+                                      LLVMValueRef z_fb,
+                                      LLVMValueRef s_fb,
                                       LLVMValueRef loop_counter,
                                       LLVMValueRef depth_ptr,
                                       LLVMValueRef depth_stride,
-                                      LLVMValueRef zs_value)
+                                      LLVMValueRef z_value,
+                                      LLVMValueRef s_value)
 {
    struct lp_build_context z_bld;
    LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4];
    LLVMBuilderRef builder = gallivm->builder;
    LLVMValueRef mask_value = NULL;
-   LLVMValueRef zs_dst = NULL, zs_dst1, zs_dst2;
+   LLVMValueRef zs_dst1, zs_dst2;
    LLVMValueRef zs_dst_ptr1, zs_dst_ptr2;
    LLVMValueRef depth_offset1, depth_offset2;
-   unsigned depth_bits = format_desc->block.bits/8;
+   LLVMTypeRef load_ptr_type;
+   unsigned depth_bytes = format_desc->block.bits / 8;
    struct lp_type zs_type = lp_depth_type(format_desc, z_src_type.length);
+   struct lp_type z_type = zs_type;
    struct lp_type zs_load_type = zs_type;
+
    zs_load_type.length = zs_load_type.length / 2;
+   load_ptr_type = LLVMPointerType(lp_build_vec_type(gallivm, zs_load_type), 0);
 
-   lp_build_context_init(&z_bld, gallivm, zs_type);
+   z_type.width = z_src_type.width;
+
+   lp_build_context_init(&z_bld, gallivm, z_type);
 
    /*
     * This is far from ideal, at least for late depth write we should do this
     * outside the fs loop to avoid all the swizzle stuff.
     */
    if (z_src_type.length == 4) {
-      unsigned i;
       LLVMValueRef looplsb = LLVMBuildAnd(builder, loop_counter,
                                           lp_build_const_int32(gallivm, 1), "");
       LLVMValueRef loopmsb = LLVMBuildAnd(builder, loop_counter,
@@ -646,13 +703,8 @@ lp_build_depth_stencil_write_swizzled(struct gallivm_state *gallivm,
       LLVMValueRef offset2 = LLVMBuildMul(builder, loopmsb,
                                           depth_stride, "");
       depth_offset1 = LLVMBuildMul(builder, looplsb,
-                                   lp_build_const_int32(gallivm, depth_bits * 2), "");
+                                   lp_build_const_int32(gallivm, depth_bytes * 2), "");
       depth_offset1 = LLVMBuildAdd(builder, depth_offset1, offset2, "");
-
-      /* just concatenate the loaded 2x2 values into 4-wide vector */
-      for (i = 0; i < 4; i++) {
-         shuffles[i] = lp_build_const_int32(gallivm, i);
-      }
    }
    else {
       unsigned i;
@@ -669,57 +721,80 @@ lp_build_depth_stencil_write_swizzled(struct gallivm_state *gallivm,
       }
    }
 
-
    depth_offset2 = LLVMBuildAdd(builder, depth_offset1, depth_stride, "");
 
    zs_dst_ptr1 = LLVMBuildGEP(builder, depth_ptr, &depth_offset1, 1, "");
-   zs_dst_ptr1 = LLVMBuildBitCast(builder,
-                                 zs_dst_ptr1,
-                                 LLVMPointerType(lp_build_vec_type(gallivm, zs_load_type), 0), "");
+   zs_dst_ptr1 = LLVMBuildBitCast(builder, zs_dst_ptr1, load_ptr_type, "");
    zs_dst_ptr2 = LLVMBuildGEP(builder, depth_ptr, &depth_offset2, 1, "");
-   zs_dst_ptr2 = LLVMBuildBitCast(builder,
-                                  zs_dst_ptr2,
-                                  LLVMPointerType(lp_build_vec_type(gallivm, zs_load_type), 0), "");
+   zs_dst_ptr2 = LLVMBuildBitCast(builder, zs_dst_ptr2, load_ptr_type, "");
 
-   if (mask) {
-      zs_dst1 = LLVMBuildLoad(builder, zs_dst_ptr1, "");
-      zs_dst2 = LLVMBuildLoad(builder, zs_dst_ptr2, "");
-      zs_dst = LLVMBuildShuffleVector(builder, zs_dst1, zs_dst2,
-                                      LLVMConstVector(shuffles, zs_type.length),
-                                      "zsbufval");
+   if (format_desc->block.bits > 32) {
+      s_value = LLVMBuildBitCast(builder, s_value, z_bld.vec_type, "");
+   }
 
+   if (mask) {
       mask_value = lp_build_mask_value(mask);
+      z_value = lp_build_select(&z_bld, mask_value, z_value, z_fb);
+      if (format_desc->block.bits > 32) {
+         s_fb = LLVMBuildBitCast(builder, s_fb, z_bld.vec_type, "");
+         s_value = lp_build_select(&z_bld, mask_value, s_value, s_fb);
+      }
    }
 
    if (zs_type.width < z_src_type.width) {
-      /* Truncate incoming ZS and mask values (e.g., when writing to Z16_UNORM) */
-      zs_value = LLVMBuildTrunc(builder, zs_value, z_bld.vec_type, "");
-      if (mask)
-         mask_value = LLVMBuildTrunc(builder, mask_value, z_bld.vec_type, "");
+      /* Truncate ZS values (e.g., when writing to Z16_UNORM) */
+      z_value = LLVMBuildTrunc(builder, z_value,
+                               lp_build_int_vec_type(gallivm, zs_type), "");
    }
 
-   if (mask) {
-      zs_value = lp_build_select(&z_bld, mask_value, zs_value, zs_dst);
-   }
-
-   if (z_src_type.length == 4) {
-      zs_dst1 = lp_build_extract_range(gallivm, zs_value, 0, 2);
-      zs_dst2 = lp_build_extract_range(gallivm, zs_value, 2, 2);
+   if (format_desc->block.bits <= 32) {
+      if (z_src_type.length == 4) {
+         zs_dst1 = lp_build_extract_range(gallivm, z_value, 0, 2);
+         zs_dst2 = lp_build_extract_range(gallivm, z_value, 2, 2);
+      }
+      else {
+         assert(z_src_type.length == 8);
+         zs_dst1 = LLVMBuildShuffleVector(builder, z_value, z_value,
+                                          LLVMConstVector(&shuffles[0],
+                                                          zs_load_type.length), "");
+         zs_dst2 = LLVMBuildShuffleVector(builder, z_value, z_value,
+                                          LLVMConstVector(&shuffles[4],
+                                                          zs_load_type.length), "");
+      }
    }
    else {
-      assert(z_src_type.length == 8);
-      zs_dst1 = LLVMBuildShuffleVector(builder, zs_value, zs_value,
-                                       LLVMConstVector(&shuffles[0],
-                                                       zs_load_type.length),
-                                       "");
-      zs_dst2 = LLVMBuildShuffleVector(builder, zs_value, zs_value,
-                                       LLVMConstVector(&shuffles[4],
-                                                       zs_load_type.length),
-                                       "");
-
+      if (z_src_type.length == 4) {
+         zs_dst1 = lp_build_interleave2(gallivm, z_type,
+                                        z_value, s_value, 0);
+         zs_dst2 = lp_build_interleave2(gallivm, z_type,
+                                        z_value, s_value, 1);
+      }
+      else {
+         unsigned i;
+         LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 2];
+         assert(z_src_type.length == 8);
+         for (i = 0; i < 8; i++) {
+            shuffles[i*2] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2);
+            shuffles[i*2+1] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2 +
+                                                   z_src_type.length);
+         }
+         zs_dst1 = LLVMBuildShuffleVector(builder, z_value, s_value,
+                                          LLVMConstVector(&shuffles[0],
+                                                          z_src_type.length), "");
+         zs_dst2 = LLVMBuildShuffleVector(builder, z_value, s_value,
+                                          LLVMConstVector(&shuffles[8],
+                                                          z_src_type.length), "");
+      }
+      zs_dst1 = LLVMBuildBitCast(builder, zs_dst1,
+                                 lp_build_vec_type(gallivm, zs_load_type), "");
+      zs_dst2 = LLVMBuildBitCast(builder, zs_dst2,
+                                 lp_build_vec_type(gallivm, zs_load_type), "");
    }
+
    LLVMBuildStore(builder, zs_dst1, zs_dst_ptr1);
-   LLVMBuildStore(builder, zs_dst2, zs_dst_ptr2);
+   if (!is_1d) {
+      LLVMBuildStore(builder, zs_dst2, zs_dst_ptr2);
+   }
 }
 
 /**
@@ -745,13 +820,14 @@ lp_build_depth_stencil_test(struct gallivm_state *gallivm,
                             struct lp_build_mask_context *mask,
                             LLVMValueRef stencil_refs[2],
                             LLVMValueRef z_src,
-                            LLVMValueRef zs_dst,
+                            LLVMValueRef z_fb,
+                            LLVMValueRef s_fb,
                             LLVMValueRef face,
-                            LLVMValueRef *zs_value,
+                            LLVMValueRef *z_value,
+                            LLVMValueRef *s_value,
                             boolean do_branch)
 {
    LLVMBuilderRef builder = gallivm->builder;
-   struct lp_type zs_type;
    struct lp_type z_type;
    struct lp_build_context z_bld;
    struct lp_build_context s_bld;
@@ -761,9 +837,9 @@ lp_build_depth_stencil_test(struct gallivm_state *gallivm,
    LLVMValueRef stencil_vals = NULL;
    LLVMValueRef z_bitmask = NULL, stencil_shift = NULL;
    LLVMValueRef z_pass = NULL, s_pass_mask = NULL;
-   LLVMValueRef orig_mask = lp_build_mask_value(mask);
+   LLVMValueRef current_mask = lp_build_mask_value(mask);
    LLVMValueRef front_facing = NULL;
-
+   boolean have_z, have_s;
 
    /*
     * Depths are expected to be between 0 and 1, even if they are stored in
@@ -780,12 +856,9 @@ lp_build_depth_stencil_test(struct gallivm_state *gallivm,
    }
 
    /* Pick the type matching the depth-stencil format. */
-   zs_type = lp_depth_type(format_desc, z_src_type.length);
+   z_type = lp_depth_type(format_desc, z_src_type.length);
 
    /* Pick the intermediate type for depth operations. */
-   z_type = zs_type;
-   /* FIXME: Cope with a depth test type with higher bit width. */
-   assert(zs_type.width <= z_src_type.width);
    z_type.width = z_src_type.width;
    assert(z_type.length == z_src_type.length);
 
@@ -800,8 +873,8 @@ lp_build_depth_stencil_test(struct gallivm_state *gallivm,
       const unsigned z_swizzle = format_desc->swizzle[0];
       const unsigned s_swizzle = format_desc->swizzle[1];
 
-      assert(z_swizzle != UTIL_FORMAT_SWIZZLE_NONE ||
-             s_swizzle != UTIL_FORMAT_SWIZZLE_NONE);
+      assert(z_swizzle != PIPE_SWIZZLE_NONE ||
+             s_swizzle != PIPE_SWIZZLE_NONE);
 
       assert(depth->enabled || stencil[0].enabled);
 
@@ -819,13 +892,11 @@ lp_build_depth_stencil_test(struct gallivm_state *gallivm,
 
       if (depth->enabled) {
          assert(z_swizzle < 4);
-         assert(format_desc->block.bits <= z_type.width);
          if (z_type.floating) {
             assert(z_swizzle == 0);
             assert(format_desc->channel[z_swizzle].type ==
                    UTIL_FORMAT_TYPE_FLOAT);
-            assert(format_desc->channel[z_swizzle].size ==
-                   format_desc->block.bits);
+            assert(format_desc->channel[z_swizzle].size == 32);
          }
          else {
             assert(format_desc->channel[z_swizzle].type ==
@@ -849,7 +920,13 @@ lp_build_depth_stencil_test(struct gallivm_state *gallivm,
    {
       unsigned s_shift, s_mask;
 
-      if (get_z_shift_and_mask(format_desc, &z_shift, &z_width, &z_mask)) {
+      z_dst = z_fb;
+      stencil_vals = s_fb;
+
+      have_z = get_z_shift_and_mask(format_desc, &z_shift, &z_width, &z_mask);
+      have_s = get_s_shift_and_mask(format_desc, &s_shift, &s_mask);
+
+      if (have_z) {
          if (z_mask != 0xffffffff) {
             z_bitmask = lp_build_const_int_vec(gallivm, z_type, z_mask);
          }
@@ -859,26 +936,20 @@ lp_build_depth_stencil_test(struct gallivm_state *gallivm,
           */
          if (z_shift) {
             LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_type, z_shift);
-            z_dst = LLVMBuildLShr(builder, zs_dst, shift, "z_dst");
+            z_dst = LLVMBuildLShr(builder, z_dst, shift, "z_dst");
          } else if (z_bitmask) {
-           /* TODO: Instead of loading a mask from memory and ANDing, it's
-            * probably faster to just shake the bits with two shifts. */
-            z_dst = LLVMBuildAnd(builder, zs_dst, z_bitmask, "z_dst");
+            z_dst = LLVMBuildAnd(builder, z_dst, z_bitmask, "z_dst");
          } else {
-            z_dst = zs_dst;
             lp_build_name(z_dst, "z_dst");
          }
       }
 
-      if (get_s_shift_and_mask(format_desc, &s_shift, &s_mask)) {
+      if (have_s) {
          if (s_shift) {
             LLVMValueRef shift = lp_build_const_int_vec(gallivm, s_type, s_shift);
-            stencil_vals = LLVMBuildLShr(builder, zs_dst, shift, "");
+            stencil_vals = LLVMBuildLShr(builder, stencil_vals, shift, "");
             stencil_shift = shift;  /* used below */
          }
-         else {
-            stencil_vals = zs_dst;
-         }
 
          if (s_mask != 0xffffffff) {
             LLVMValueRef mask = lp_build_const_int_vec(gallivm, s_type, s_mask);
@@ -892,21 +963,49 @@ lp_build_depth_stencil_test(struct gallivm_state *gallivm,
    if (stencil[0].enabled) {
 
       if (face) {
-         LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
-
-         /* front_facing = face != 0 ? ~0 : 0 */
-         front_facing = LLVMBuildICmp(builder, LLVMIntNE, face, zero, "");
-         front_facing = LLVMBuildSExt(builder, front_facing,
-                                      LLVMIntTypeInContext(gallivm->context,
-                                             s_bld.type.length*s_bld.type.width),
-                                      "");
-         front_facing = LLVMBuildBitCast(builder, front_facing,
-                                         s_bld.int_vec_type, "");
-      }
+         if (0) {
+            /*
+             * XXX: the scalar expansion below produces atrocious code
+             * (basically producing a 64bit scalar value, then moving the 2
+             * 32bit pieces separately to simd, plus 4 shuffles, which is
+             * seriously lame). But the scalar-simd transitions are always
+             * tricky, so no big surprise there.
+             * This here would be way better, however llvm has some serious
+             * trouble later using it in the select, probably because it will
+             * recognize the expression as constant and move the simd value
+             * away (out of the loop) - and then it will suddenly try
+             * constructing i1 high-bit masks out of it later...
+             * (Try piglit stencil-twoside.)
+             * Note this is NOT due to using SExt/Trunc, it fails exactly the
+             * same even when using native compare/select.
+             * I cannot reproduce this problem when using stand-alone compiler
+             * though, suggesting some problem with optimization passes...
+             * (With stand-alone compilation, the construction of this mask
+             * value, no matter if the easy 3 instruction here or the complex
+             * 16+ one below, never gets separated from where it's used.)
+             * The scalar code still has the same problem, but the generated
+             * code looks a bit better at least for some reason, even if
+             * mostly by luck (the fundamental issue clearly is the same).
+             */
+            front_facing = lp_build_broadcast(gallivm, s_bld.vec_type, face);
+            /* front_facing = face != 0 ? ~0 : 0 */
+            front_facing = lp_build_compare(gallivm, s_bld.type,
+                                            PIPE_FUNC_NOTEQUAL,
+                                            front_facing, s_bld.zero);
+         } else {
+            LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
 
-      /* convert scalar stencil refs into vectors */
-      stencil_refs[0] = lp_build_broadcast_scalar(&s_bld, stencil_refs[0]);
-      stencil_refs[1] = lp_build_broadcast_scalar(&s_bld, stencil_refs[1]);
+            /* front_facing = face != 0 ? ~0 : 0 */
+            front_facing = LLVMBuildICmp(builder, LLVMIntNE, face, zero, "");
+            front_facing = LLVMBuildSExt(builder, front_facing,
+                                         LLVMIntTypeInContext(gallivm->context,
+                                                s_bld.type.length*s_bld.type.width),
+                                         "");
+            front_facing = LLVMBuildBitCast(builder, front_facing,
+                                            s_bld.int_vec_type, "");
+
+         }
+      }
 
       s_pass_mask = lp_build_stencil_test(&s_bld, stencil,
                                           stencil_refs, stencil_vals,
@@ -914,7 +1013,7 @@ lp_build_depth_stencil_test(struct gallivm_state *gallivm,
 
       /* apply stencil-fail operator */
       {
-         LLVMValueRef s_fail_mask = lp_build_andnot(&s_bld, orig_mask, s_pass_mask);
+         LLVMValueRef s_fail_mask = lp_build_andnot(&s_bld, current_mask, s_pass_mask);
          stencil_vals = lp_build_stencil_op(&s_bld, stencil, S_FAIL_OP,
                                             stencil_refs, stencil_vals,
                                             s_fail_mask, front_facing);
@@ -962,6 +1061,11 @@ lp_build_depth_stencil_test(struct gallivm_state *gallivm,
       /* compare src Z to dst Z, returning 'pass' mask */
       z_pass = lp_build_cmp(&z_bld, depth->func, z_src, z_dst);
 
+      /* mask off bits that failed stencil test */
+      if (s_pass_mask) {
+         current_mask = LLVMBuildAnd(builder, current_mask, s_pass_mask, "");
+      }
+
       if (!stencil[0].enabled) {
          /* We can potentially skip all remaining operations here, but only
           * if stencil is disabled because we still need to update the stencil
@@ -971,25 +1075,19 @@ lp_build_depth_stencil_test(struct gallivm_state *gallivm,
 
          if (do_branch) {
             lp_build_mask_check(mask);
-            do_branch = FALSE;
          }
       }
 
       if (depth->writemask) {
-         LLVMValueRef zselectmask;
+         LLVMValueRef z_pass_mask;
 
          /* mask off bits that failed Z test */
-         zselectmask = LLVMBuildAnd(builder, orig_mask, z_pass, "");
-
-         /* mask off bits that failed stencil test */
-         if (s_pass_mask) {
-            zselectmask = LLVMBuildAnd(builder, zselectmask, s_pass_mask, "");
-         }
+         z_pass_mask = LLVMBuildAnd(builder, current_mask, z_pass, "");
 
          /* Mix the old and new Z buffer values.
           * z_dst[i] = zselectmask[i] ? z_src[i] : z_dst[i]
           */
-         z_dst = lp_build_select(&z_bld, zselectmask, z_src, z_dst);
+         z_dst = lp_build_select(&z_bld, z_pass_mask, z_src, z_dst);
       }
 
       if (stencil[0].enabled) {
@@ -997,13 +1095,13 @@ lp_build_depth_stencil_test(struct gallivm_state *gallivm,
          LLVMValueRef z_fail_mask, z_pass_mask;
 
          /* apply Z-fail operator */
-         z_fail_mask = lp_build_andnot(&z_bld, orig_mask, z_pass);
+         z_fail_mask = lp_build_andnot(&s_bld, current_mask, z_pass);
          stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_FAIL_OP,
                                             stencil_refs, stencil_vals,
                                             z_fail_mask, front_facing);
 
          /* apply Z-pass operator */
-         z_pass_mask = LLVMBuildAnd(builder, orig_mask, z_pass, "");
+         z_pass_mask = LLVMBuildAnd(builder, current_mask, z_pass, "");
          stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP,
                                             stencil_refs, stencil_vals,
                                             z_pass_mask, front_facing);
@@ -1013,14 +1111,14 @@ lp_build_depth_stencil_test(struct gallivm_state *gallivm,
       /* No depth test: apply Z-pass operator to stencil buffer values which
        * passed the stencil test.
        */
-      s_pass_mask = LLVMBuildAnd(builder, orig_mask, s_pass_mask, "");
+      s_pass_mask = LLVMBuildAnd(builder, current_mask, s_pass_mask, "");
       stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP,
                                          stencil_refs, stencil_vals,
                                          s_pass_mask, front_facing);
    }
 
-   /* Put Z and ztencil bits in the right place */
-   if (z_dst && z_shift) {
+   /* Put Z and stencil bits in the right place */
+   if (have_z && z_shift) {
       LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_type, z_shift);
       z_dst = LLVMBuildShl(builder, z_dst, shift, "");
    }
@@ -1028,18 +1126,19 @@ lp_build_depth_stencil_test(struct gallivm_state *gallivm,
       stencil_vals = LLVMBuildShl(builder, stencil_vals,
                                   stencil_shift, "");
 
-   /* Finally, merge/store the z/stencil values */
-   if ((depth->enabled && depth->writemask) ||
-       (stencil[0].enabled && stencil[0].writemask)) {
-
-      if (z_dst && stencil_vals)
-         zs_dst = LLVMBuildOr(builder, z_dst, stencil_vals, "");
-      else if (z_dst)
-         zs_dst = z_dst;
+   /* Finally, merge the z/stencil values */
+   if (format_desc->block.bits <= 32) {
+      if (have_z && have_s)
+         *z_value = LLVMBuildOr(builder, z_dst, stencil_vals, "");
+      else if (have_z)
+         *z_value = z_dst;
       else
-         zs_dst = stencil_vals;
-
-      *zs_value = zs_dst;
+         *z_value = stencil_vals;
+      *s_value = *z_value;
+   }
+   else {
+      *z_value = z_dst;
+      *s_value = stencil_vals;
    }
 
    if (s_pass_mask)
@@ -1047,9 +1146,5 @@ lp_build_depth_stencil_test(struct gallivm_state *gallivm,
 
    if (depth->enabled && stencil[0].enabled)
       lp_build_mask_update(mask, z_pass);
-
-   if (do_branch)
-      lp_build_mask_check(mask);
-
 }