/**************************************************************************
*
- * Copyright 2009 VMware, Inc.
+ * Copyright 2009-2010 VMware, Inc.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* flushing would avoid this, but it would most likely result in depth fighting
* artifacts.
*
- * We are free to use a different pixel layout though. Since our basic
- * processing unit is a quad (2x2 pixel block) we store the depth/stencil
- * values tiled, a quad at time. That is, a depth buffer containing
+ * Since we're using linear layout for everything, but we need to deal with
+ * 2x2 quads, we need to load/store multiple values and swizzle them into
+ * place (we could avoid this by doing depth/stencil testing in linear format,
+ * which would be easy for late depth/stencil test as we could do that after
+ * the fragment shader loop just as we do for color buffers, but more tricky
+ * for early depth test as we'd need both masks and interpolated depth in
+ * linear format).
*
- * Z11 Z12 Z13 Z14 ...
- * Z21 Z22 Z23 Z24 ...
- * Z31 Z32 Z33 Z34 ...
- * Z41 Z42 Z43 Z44 ...
- * ... ... ... ... ...
- *
- * will actually be stored in memory as
- *
- * Z11 Z12 Z21 Z22 Z13 Z14 Z23 Z24 ...
- * Z31 Z32 Z41 Z42 Z33 Z34 Z43 Z44 ...
- * ... ... ... ... ... ... ... ... ...
- *
- *
- * Stencil test:
- * Two-sided stencil test is supported but probably not as efficient as
- * it could be. Currently, we use if/then/else constructs to do the
- * operations for front vs. back-facing polygons. We could probably do
- * both the front and back arithmetic then use a Select() instruction to
- * choose the result depending on polyon orientation. We'd have to
- * measure performance both ways and see which is better.
*
* @author Jose Fonseca <jfonseca@vmware.com>
+ * @author Brian Paul <jfonseca@vmware.com>
*/
#include "pipe/p_state.h"
#include "util/u_format.h"
+#include "util/u_cpu_detect.h"
#include "gallivm/lp_bld_type.h"
#include "gallivm/lp_bld_arit.h"
#include "gallivm/lp_bld_intr.h"
#include "gallivm/lp_bld_debug.h"
#include "gallivm/lp_bld_swizzle.h"
+#include "gallivm/lp_bld_pack.h"
#include "lp_bld_depth.h"
LLVMValueRef stencilRef,
LLVMValueRef stencilVals)
{
+ LLVMBuilderRef builder = bld->gallivm->builder;
const unsigned stencilMax = 255; /* XXX fix */
struct lp_type type = bld->type;
LLVMValueRef res;
- assert(type.sign);
+ /*
+ * SSE2 has intrinsics for signed comparisons, but not unsigned ones. Values
+ * are between 0..255 so ensure we generate the fastest comparisons for
+ * wider elements.
+ */
+ if (type.width <= 8) {
+ assert(!type.sign);
+ } else {
+ assert(type.sign);
+ }
assert(stencil->enabled);
if (stencil->valuemask != stencilMax) {
/* compute stencilRef = stencilRef & valuemask */
- LLVMValueRef valuemask = lp_build_const_int_vec(type, stencil->valuemask);
- stencilRef = LLVMBuildAnd(bld->builder, stencilRef, valuemask, "");
+ LLVMValueRef valuemask = lp_build_const_int_vec(bld->gallivm, type, stencil->valuemask);
+ stencilRef = LLVMBuildAnd(builder, stencilRef, valuemask, "");
/* compute stencilVals = stencilVals & valuemask */
- stencilVals = LLVMBuildAnd(bld->builder, stencilVals, valuemask, "");
+ stencilVals = LLVMBuildAnd(builder, stencilVals, valuemask, "");
}
res = lp_build_cmp(bld, stencil->func, stencilRef, stencilVals);
res = lp_build_stencil_test_single(bld, &stencil[0],
stencilRefs[0], stencilVals);
- if (stencil[1].enabled && front_facing) {
+ if (stencil[1].enabled && front_facing != NULL) {
/* do back face test */
LLVMValueRef back_res;
LLVMValueRef stencilVals)
{
+ LLVMBuilderRef builder = bld->gallivm->builder;
struct lp_type type = bld->type;
LLVMValueRef res;
- LLVMValueRef max = lp_build_const_int_vec(type, 0xff);
+ LLVMValueRef max = lp_build_const_int_vec(bld->gallivm, type, 0xff);
unsigned stencil_op;
assert(type.sign);
break;
case PIPE_STENCIL_OP_INCR_WRAP:
res = lp_build_add(bld, stencilVals, bld->one);
- res = LLVMBuildAnd(bld->builder, res, max, "");
+ res = LLVMBuildAnd(builder, res, max, "");
break;
case PIPE_STENCIL_OP_DECR_WRAP:
res = lp_build_sub(bld, stencilVals, bld->one);
- res = LLVMBuildAnd(bld->builder, res, max, "");
+ res = LLVMBuildAnd(builder, res, max, "");
break;
case PIPE_STENCIL_OP_INVERT:
- res = LLVMBuildNot(bld->builder, stencilVals, "");
- res = LLVMBuildAnd(bld->builder, res, max, "");
+ res = LLVMBuildNot(builder, stencilVals, "");
+ res = LLVMBuildAnd(builder, res, max, "");
break;
default:
assert(0 && "bad stencil op mode");
LLVMValueRef front_facing)
{
+ LLVMBuilderRef builder = bld->gallivm->builder;
LLVMValueRef res;
assert(stencil[0].enabled);
res = lp_build_stencil_op_single(bld, &stencil[0], op,
stencilRefs[0], stencilVals);
- if (stencil[1].enabled && front_facing) {
+ if (stencil[1].enabled && front_facing != NULL) {
/* do back face op */
LLVMValueRef back_res;
res = lp_build_select(bld, front_facing, res, back_res);
}
- if (stencil->writemask != 0xff) {
- /* mask &= stencil->writemask */
- LLVMValueRef writemask = lp_build_const_int_vec(bld->type, stencil->writemask);
- mask = LLVMBuildAnd(bld->builder, mask, writemask, "");
+ if (stencil[0].writemask != 0xff ||
+ (stencil[1].enabled && front_facing != NULL && stencil[1].writemask != 0xff)) {
+ /* mask &= stencil[0].writemask */
+ LLVMValueRef writemask = lp_build_const_int_vec(bld->gallivm, bld->type,
+ stencil[0].writemask);
+ if (stencil[1].enabled && stencil[1].writemask != stencil[0].writemask && front_facing != NULL) {
+ LLVMValueRef back_writemask = lp_build_const_int_vec(bld->gallivm, bld->type,
+ stencil[1].writemask);
+ writemask = lp_build_select(bld, front_facing, writemask, back_writemask);
+ }
+
+ mask = LLVMBuildAnd(builder, mask, writemask, "");
/* res = (res & mask) | (stencilVals & ~mask) */
- res = lp_build_select_bitwise(bld, writemask, res, stencilVals);
+ res = lp_build_select_bitwise(bld, mask, res, stencilVals);
}
else {
/* res = mask ? res : stencilVals */
/**
- * Return a type appropriate for depth/stencil testing.
+ * Return a type that matches the depth/stencil format.
*/
struct lp_type
lp_depth_type(const struct util_format_description *format_desc,
unsigned length)
{
struct lp_type type;
- unsigned swizzle;
+ unsigned z_swizzle;
assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
assert(format_desc->block.width == 1);
assert(format_desc->block.height == 1);
- swizzle = format_desc->swizzle[0];
- assert(swizzle < 4);
-
memset(&type, 0, sizeof type);
type.width = format_desc->block.bits;
- if(format_desc->channel[swizzle].type == UTIL_FORMAT_TYPE_FLOAT) {
- type.floating = TRUE;
- assert(swizzle == 0);
- assert(format_desc->channel[swizzle].size == format_desc->block.bits);
- }
- else if(format_desc->channel[swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED) {
- assert(format_desc->block.bits <= 32);
- if(format_desc->channel[swizzle].normalized)
- type.norm = TRUE;
+ z_swizzle = format_desc->swizzle[0];
+ if (z_swizzle < 4) {
+ if (format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_FLOAT) {
+ type.floating = TRUE;
+ assert(z_swizzle == 0);
+ assert(format_desc->channel[z_swizzle].size == 32);
+ }
+ else if(format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED) {
+ assert(format_desc->block.bits <= 32);
+ assert(format_desc->channel[z_swizzle].normalized);
+ if (format_desc->channel[z_swizzle].size < format_desc->block.bits) {
+ /* Prefer signed integers when possible, as SSE has less support
+ * for unsigned comparison;
+ */
+ type.sign = TRUE;
+ }
+ }
+ else
+ assert(0);
}
- else
- assert(0);
- assert(type.width <= length);
- type.length = length / type.width;
+ type.length = length;
return type;
}
*/
static boolean
get_z_shift_and_mask(const struct util_format_description *format_desc,
- unsigned *shift, unsigned *mask)
+ unsigned *shift, unsigned *width, unsigned *mask)
{
- const unsigned total_bits = format_desc->block.bits;
+ unsigned total_bits;
unsigned z_swizzle;
- unsigned chan;
- unsigned padding_left, padding_right;
-
+
assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
assert(format_desc->block.width == 1);
assert(format_desc->block.height == 1);
+ /* 64bit d/s format is special already extracted 32 bits */
+ total_bits = format_desc->block.bits > 32 ? 32 : format_desc->block.bits;
+
z_swizzle = format_desc->swizzle[0];
- if (z_swizzle == UTIL_FORMAT_SWIZZLE_NONE)
+ if (z_swizzle == PIPE_SWIZZLE_NONE)
return FALSE;
- padding_right = 0;
- for (chan = 0; chan < z_swizzle; ++chan)
- padding_right += format_desc->channel[chan].size;
+ *width = format_desc->channel[z_swizzle].size;
+ /* & 31 is for the same reason as the 32-bit limit above */
+ *shift = format_desc->channel[z_swizzle].shift & 31;
- padding_left =
- total_bits - (padding_right + format_desc->channel[z_swizzle].size);
-
- if (padding_left || padding_right) {
- unsigned long long mask_left = (1ULL << (total_bits - padding_left)) - 1;
- unsigned long long mask_right = (1ULL << (padding_right)) - 1;
- *mask = mask_left ^ mask_right;
- }
- else {
+ if (*width == total_bits) {
*mask = 0xffffffff;
+ } else {
+ *mask = ((1 << *width) - 1) << *shift;
}
- *shift = padding_left;
-
return TRUE;
}
unsigned *shift, unsigned *mask)
{
unsigned s_swizzle;
- unsigned chan, sz;
+ unsigned sz;
s_swizzle = format_desc->swizzle[1];
- if (s_swizzle == UTIL_FORMAT_SWIZZLE_NONE)
+ if (s_swizzle == PIPE_SWIZZLE_NONE)
return FALSE;
- *shift = 0;
- for (chan = 0; chan < s_swizzle; chan++)
- *shift += format_desc->channel[chan].size;
+ /* just special case 64bit d/s format */
+ if (format_desc->block.bits > 32) {
+ /* XXX big-endian? */
+ assert(format_desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
+ *shift = 0;
+ *mask = 0xff;
+ return TRUE;
+ }
+ *shift = format_desc->channel[s_swizzle].shift;
sz = format_desc->channel[s_swizzle].size;
*mask = (1U << sz) - 1U;
* Test the depth mask. Add the number of channel which has none zero mask
* into the occlusion counter. e.g. maskvalue is {-1, -1, -1, -1}.
* The counter will add 4.
+ * TODO: could get that out of the fs loop.
*
* \param type holds element type of the mask vector.
* \param maskvalue is the depth test mask.
* \param counter is a pointer of the uint32 counter.
*/
void
-lp_build_occlusion_count(LLVMBuilderRef builder,
+lp_build_occlusion_count(struct gallivm_state *gallivm,
struct lp_type type,
LLVMValueRef maskvalue,
LLVMValueRef counter)
{
- LLVMValueRef countmask = lp_build_const_int_vec(type, 1);
- LLVMValueRef countv = LLVMBuildAnd(builder, maskvalue, countmask, "countv");
- LLVMTypeRef i8v16 = LLVMVectorType(LLVMInt8Type(), 16);
- LLVMValueRef counti = LLVMBuildBitCast(builder, countv, i8v16, "counti");
- LLVMValueRef maskarray[4] = {
- LLVMConstInt(LLVMInt32Type(), 0, 0),
- LLVMConstInt(LLVMInt32Type(), 4, 0),
- LLVMConstInt(LLVMInt32Type(), 8, 0),
- LLVMConstInt(LLVMInt32Type(), 12, 0),
- };
- LLVMValueRef shufflemask = LLVMConstVector(maskarray, 4);
- LLVMValueRef shufflev = LLVMBuildShuffleVector(builder, counti, LLVMGetUndef(i8v16), shufflemask, "shufflev");
- LLVMValueRef shuffle = LLVMBuildBitCast(builder, shufflev, LLVMInt32Type(), "shuffle");
- LLVMValueRef count = lp_build_intrinsic_unary(builder, "llvm.ctpop.i32", LLVMInt32Type(), shuffle);
- LLVMValueRef orig = LLVMBuildLoad(builder, counter, "orig");
- LLVMValueRef incr = LLVMBuildAdd(builder, orig, count, "incr");
- LLVMBuildStore(builder, incr, counter);
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMContextRef context = gallivm->context;
+ LLVMValueRef countmask = lp_build_const_int_vec(gallivm, type, 1);
+ LLVMValueRef count, newcount;
+
+ assert(type.length <= 16);
+ assert(type.floating);
+
+ if(util_cpu_caps.has_sse && type.length == 4) {
+ const char *movmskintr = "llvm.x86.sse.movmsk.ps";
+ const char *popcntintr = "llvm.ctpop.i32";
+ LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue,
+ lp_build_vec_type(gallivm, type), "");
+ bits = lp_build_intrinsic_unary(builder, movmskintr,
+ LLVMInt32TypeInContext(context), bits);
+ count = lp_build_intrinsic_unary(builder, popcntintr,
+ LLVMInt32TypeInContext(context), bits);
+ count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), "");
+ }
+ else if(util_cpu_caps.has_avx && type.length == 8) {
+ const char *movmskintr = "llvm.x86.avx.movmsk.ps.256";
+ const char *popcntintr = "llvm.ctpop.i32";
+ LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue,
+ lp_build_vec_type(gallivm, type), "");
+ bits = lp_build_intrinsic_unary(builder, movmskintr,
+ LLVMInt32TypeInContext(context), bits);
+ count = lp_build_intrinsic_unary(builder, popcntintr,
+ LLVMInt32TypeInContext(context), bits);
+ count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), "");
+ }
+ else {
+ unsigned i;
+ LLVMValueRef countv = LLVMBuildAnd(builder, maskvalue, countmask, "countv");
+ LLVMTypeRef counttype = LLVMIntTypeInContext(context, type.length * 8);
+ LLVMTypeRef i8vntype = LLVMVectorType(LLVMInt8TypeInContext(context), type.length * 4);
+ LLVMValueRef shufflev, countd;
+ LLVMValueRef shuffles[16];
+ const char *popcntintr = NULL;
+
+ countv = LLVMBuildBitCast(builder, countv, i8vntype, "");
+
+ for (i = 0; i < type.length; i++) {
+ shuffles[i] = lp_build_const_int32(gallivm, 4*i);
+ }
+
+ shufflev = LLVMConstVector(shuffles, type.length);
+ countd = LLVMBuildShuffleVector(builder, countv, LLVMGetUndef(i8vntype), shufflev, "");
+ countd = LLVMBuildBitCast(builder, countd, counttype, "countd");
+
+ /*
+ * XXX FIXME
+ * this is bad on cpus without popcount (on x86 supported by intel
+ * nehalem, amd barcelona, and up - not tied to sse42).
+ * Would be much faster to just sum the 4 elements of the vector with
+ * some horizontal add (shuffle/add/shuffle/add after the initial and).
+ */
+ switch (type.length) {
+ case 4:
+ popcntintr = "llvm.ctpop.i32";
+ break;
+ case 8:
+ popcntintr = "llvm.ctpop.i64";
+ break;
+ case 16:
+ popcntintr = "llvm.ctpop.i128";
+ break;
+ default:
+ assert(0);
+ }
+ count = lp_build_intrinsic_unary(builder, popcntintr, counttype, countd);
+
+ if (type.length > 8) {
+ count = LLVMBuildTrunc(builder, count, LLVMIntTypeInContext(context, 64), "");
+ }
+ else if (type.length < 8) {
+ count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), "");
+ }
+ }
+ newcount = LLVMBuildLoad(builder, counter, "origcount");
+ newcount = LLVMBuildAdd(builder, newcount, count, "newcount");
+ LLVMBuildStore(builder, newcount, counter);
}
+/**
+ * Load depth/stencil values.
+ * The stored values are linear, swizzle them.
+ *
+ * \param type the data type of the fragment depth/stencil values
+ * \param format_desc description of the depth/stencil surface
+ * \param is_1d whether this resource has only one dimension
+ * \param loop_counter the current loop iteration
+ * \param depth_ptr pointer to the depth/stencil values of this 4x4 block
+ * \param depth_stride stride of the depth/stencil buffer
+ * \param z_fb contains z values loaded from fb (may include padding)
+ * \param s_fb contains s values loaded from fb (may include padding)
+ */
+void
+lp_build_depth_stencil_load_swizzled(struct gallivm_state *gallivm,
+ struct lp_type z_src_type,
+ const struct util_format_description *format_desc,
+ boolean is_1d,
+ LLVMValueRef depth_ptr,
+ LLVMValueRef depth_stride,
+ LLVMValueRef *z_fb,
+ LLVMValueRef *s_fb,
+ LLVMValueRef loop_counter)
+{
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4];
+ LLVMValueRef zs_dst1, zs_dst2;
+ LLVMValueRef zs_dst_ptr;
+ LLVMValueRef depth_offset1, depth_offset2;
+ LLVMTypeRef load_ptr_type;
+ unsigned depth_bytes = format_desc->block.bits / 8;
+ struct lp_type zs_type = lp_depth_type(format_desc, z_src_type.length);
+ struct lp_type zs_load_type = zs_type;
+
+ zs_load_type.length = zs_load_type.length / 2;
+ load_ptr_type = LLVMPointerType(lp_build_vec_type(gallivm, zs_load_type), 0);
+
+ if (z_src_type.length == 4) {
+ unsigned i;
+ LLVMValueRef looplsb = LLVMBuildAnd(builder, loop_counter,
+ lp_build_const_int32(gallivm, 1), "");
+ LLVMValueRef loopmsb = LLVMBuildAnd(builder, loop_counter,
+ lp_build_const_int32(gallivm, 2), "");
+ LLVMValueRef offset2 = LLVMBuildMul(builder, loopmsb,
+ depth_stride, "");
+ depth_offset1 = LLVMBuildMul(builder, looplsb,
+ lp_build_const_int32(gallivm, depth_bytes * 2), "");
+ depth_offset1 = LLVMBuildAdd(builder, depth_offset1, offset2, "");
+
+ /* just concatenate the loaded 2x2 values into 4-wide vector */
+ for (i = 0; i < 4; i++) {
+ shuffles[i] = lp_build_const_int32(gallivm, i);
+ }
+ }
+ else {
+ unsigned i;
+ LLVMValueRef loopx2 = LLVMBuildShl(builder, loop_counter,
+ lp_build_const_int32(gallivm, 1), "");
+ assert(z_src_type.length == 8);
+ depth_offset1 = LLVMBuildMul(builder, loopx2, depth_stride, "");
+ /*
+ * We load 2x4 values, and need to swizzle them (order
+ * 0,1,4,5,2,3,6,7) - not so hot with avx unfortunately.
+ */
+ for (i = 0; i < 8; i++) {
+ shuffles[i] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2);
+ }
+ }
+
+ depth_offset2 = LLVMBuildAdd(builder, depth_offset1, depth_stride, "");
+
+ /* Load current z/stencil values from z/stencil buffer */
+ zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, &depth_offset1, 1, "");
+ zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, "");
+ zs_dst1 = LLVMBuildLoad(builder, zs_dst_ptr, "");
+ if (is_1d) {
+ zs_dst2 = lp_build_undef(gallivm, zs_load_type);
+ }
+ else {
+ zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, &depth_offset2, 1, "");
+ zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, "");
+ zs_dst2 = LLVMBuildLoad(builder, zs_dst_ptr, "");
+ }
+
+ *z_fb = LLVMBuildShuffleVector(builder, zs_dst1, zs_dst2,
+ LLVMConstVector(shuffles, zs_type.length), "");
+ *s_fb = *z_fb;
+
+ if (format_desc->block.bits < z_src_type.width) {
+ /* Extend destination ZS values (e.g., when reading from Z16_UNORM) */
+ *z_fb = LLVMBuildZExt(builder, *z_fb,
+ lp_build_int_vec_type(gallivm, z_src_type), "");
+ }
+
+ else if (format_desc->block.bits > 32) {
+ /* rely on llvm to handle too wide vector we have here nicely */
+ unsigned i;
+ struct lp_type typex2 = zs_type;
+ struct lp_type s_type = zs_type;
+ LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 4];
+ LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 4];
+ LLVMValueRef tmp;
+
+ typex2.width = typex2.width / 2;
+ typex2.length = typex2.length * 2;
+ s_type.width = s_type.width / 2;
+ s_type.floating = 0;
+
+ tmp = LLVMBuildBitCast(builder, *z_fb,
+ lp_build_vec_type(gallivm, typex2), "");
+
+ for (i = 0; i < zs_type.length; i++) {
+ shuffles1[i] = lp_build_const_int32(gallivm, i * 2);
+ shuffles2[i] = lp_build_const_int32(gallivm, i * 2 + 1);
+ }
+ *z_fb = LLVMBuildShuffleVector(builder, tmp, tmp,
+ LLVMConstVector(shuffles1, zs_type.length), "");
+ *s_fb = LLVMBuildShuffleVector(builder, tmp, tmp,
+ LLVMConstVector(shuffles2, zs_type.length), "");
+ *s_fb = LLVMBuildBitCast(builder, *s_fb,
+ lp_build_vec_type(gallivm, s_type), "");
+ lp_build_name(*s_fb, "s_dst");
+ }
+
+ lp_build_name(*z_fb, "z_dst");
+ lp_build_name(*s_fb, "s_dst");
+ lp_build_name(*z_fb, "z_dst");
+}
+
+/**
+ * Store depth/stencil values.
+ * Incoming values are swizzled (typically n 2x2 quads), stored linear.
+ * If there's a mask it will do select/store otherwise just store.
+ *
+ * \param type the data type of the fragment depth/stencil values
+ * \param format_desc description of the depth/stencil surface
+ * \param is_1d whether this resource has only one dimension
+ * \param mask the alive/dead pixel mask for the quad (vector)
+ * \param z_fb z values read from fb (with padding)
+ * \param s_fb s values read from fb (with padding)
+ * \param loop_counter the current loop iteration
+ * \param depth_ptr pointer to the depth/stencil values of this 4x4 block
+ * \param depth_stride stride of the depth/stencil buffer
+ * \param z_value the depth values to store (with padding)
+ * \param s_value the stencil values to store (with padding)
+ */
+void
+lp_build_depth_stencil_write_swizzled(struct gallivm_state *gallivm,
+ struct lp_type z_src_type,
+ const struct util_format_description *format_desc,
+ boolean is_1d,
+ struct lp_build_mask_context *mask,
+ LLVMValueRef z_fb,
+ LLVMValueRef s_fb,
+ LLVMValueRef loop_counter,
+ LLVMValueRef depth_ptr,
+ LLVMValueRef depth_stride,
+ LLVMValueRef z_value,
+ LLVMValueRef s_value)
+{
+ struct lp_build_context z_bld;
+ LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4];
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef mask_value = NULL;
+ LLVMValueRef zs_dst1, zs_dst2;
+ LLVMValueRef zs_dst_ptr1, zs_dst_ptr2;
+ LLVMValueRef depth_offset1, depth_offset2;
+ LLVMTypeRef load_ptr_type;
+ unsigned depth_bytes = format_desc->block.bits / 8;
+ struct lp_type zs_type = lp_depth_type(format_desc, z_src_type.length);
+ struct lp_type z_type = zs_type;
+ struct lp_type zs_load_type = zs_type;
+
+ zs_load_type.length = zs_load_type.length / 2;
+ load_ptr_type = LLVMPointerType(lp_build_vec_type(gallivm, zs_load_type), 0);
+
+ z_type.width = z_src_type.width;
+
+ lp_build_context_init(&z_bld, gallivm, z_type);
+
+ /*
+ * This is far from ideal, at least for late depth write we should do this
+ * outside the fs loop to avoid all the swizzle stuff.
+ */
+ if (z_src_type.length == 4) {
+ LLVMValueRef looplsb = LLVMBuildAnd(builder, loop_counter,
+ lp_build_const_int32(gallivm, 1), "");
+ LLVMValueRef loopmsb = LLVMBuildAnd(builder, loop_counter,
+ lp_build_const_int32(gallivm, 2), "");
+ LLVMValueRef offset2 = LLVMBuildMul(builder, loopmsb,
+ depth_stride, "");
+ depth_offset1 = LLVMBuildMul(builder, looplsb,
+ lp_build_const_int32(gallivm, depth_bytes * 2), "");
+ depth_offset1 = LLVMBuildAdd(builder, depth_offset1, offset2, "");
+ }
+ else {
+ unsigned i;
+ LLVMValueRef loopx2 = LLVMBuildShl(builder, loop_counter,
+ lp_build_const_int32(gallivm, 1), "");
+ assert(z_src_type.length == 8);
+ depth_offset1 = LLVMBuildMul(builder, loopx2, depth_stride, "");
+ /*
+ * We load 2x4 values, and need to swizzle them (order
+ * 0,1,4,5,2,3,6,7) - not so hot with avx unfortunately.
+ */
+ for (i = 0; i < 8; i++) {
+ shuffles[i] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2);
+ }
+ }
+
+ depth_offset2 = LLVMBuildAdd(builder, depth_offset1, depth_stride, "");
+
+ zs_dst_ptr1 = LLVMBuildGEP(builder, depth_ptr, &depth_offset1, 1, "");
+ zs_dst_ptr1 = LLVMBuildBitCast(builder, zs_dst_ptr1, load_ptr_type, "");
+ zs_dst_ptr2 = LLVMBuildGEP(builder, depth_ptr, &depth_offset2, 1, "");
+ zs_dst_ptr2 = LLVMBuildBitCast(builder, zs_dst_ptr2, load_ptr_type, "");
+
+ if (format_desc->block.bits > 32) {
+ s_value = LLVMBuildBitCast(builder, s_value, z_bld.vec_type, "");
+ }
+
+ if (mask) {
+ mask_value = lp_build_mask_value(mask);
+ z_value = lp_build_select(&z_bld, mask_value, z_value, z_fb);
+ if (format_desc->block.bits > 32) {
+ s_fb = LLVMBuildBitCast(builder, s_fb, z_bld.vec_type, "");
+ s_value = lp_build_select(&z_bld, mask_value, s_value, s_fb);
+ }
+ }
+
+ if (zs_type.width < z_src_type.width) {
+ /* Truncate ZS values (e.g., when writing to Z16_UNORM) */
+ z_value = LLVMBuildTrunc(builder, z_value,
+ lp_build_int_vec_type(gallivm, zs_type), "");
+ }
+
+ if (format_desc->block.bits <= 32) {
+ if (z_src_type.length == 4) {
+ zs_dst1 = lp_build_extract_range(gallivm, z_value, 0, 2);
+ zs_dst2 = lp_build_extract_range(gallivm, z_value, 2, 2);
+ }
+ else {
+ assert(z_src_type.length == 8);
+ zs_dst1 = LLVMBuildShuffleVector(builder, z_value, z_value,
+ LLVMConstVector(&shuffles[0],
+ zs_load_type.length), "");
+ zs_dst2 = LLVMBuildShuffleVector(builder, z_value, z_value,
+ LLVMConstVector(&shuffles[4],
+ zs_load_type.length), "");
+ }
+ }
+ else {
+ if (z_src_type.length == 4) {
+ zs_dst1 = lp_build_interleave2(gallivm, z_type,
+ z_value, s_value, 0);
+ zs_dst2 = lp_build_interleave2(gallivm, z_type,
+ z_value, s_value, 1);
+ }
+ else {
+ unsigned i;
+ LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 2];
+ assert(z_src_type.length == 8);
+ for (i = 0; i < 8; i++) {
+ shuffles[i*2] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2);
+ shuffles[i*2+1] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2 +
+ z_src_type.length);
+ }
+ zs_dst1 = LLVMBuildShuffleVector(builder, z_value, s_value,
+ LLVMConstVector(&shuffles[0],
+ z_src_type.length), "");
+ zs_dst2 = LLVMBuildShuffleVector(builder, z_value, s_value,
+ LLVMConstVector(&shuffles[8],
+ z_src_type.length), "");
+ }
+ zs_dst1 = LLVMBuildBitCast(builder, zs_dst1,
+ lp_build_vec_type(gallivm, zs_load_type), "");
+ zs_dst2 = LLVMBuildBitCast(builder, zs_dst2,
+ lp_build_vec_type(gallivm, zs_load_type), "");
+ }
+
+ LLVMBuildStore(builder, zs_dst1, zs_dst_ptr1);
+ if (!is_1d) {
+ LLVMBuildStore(builder, zs_dst2, zs_dst_ptr2);
+ }
+}
/**
* Generate code for performing depth and/or stencil tests.
- * We operate on a vector of values (typically a 2x2 quad).
+ * We operate on a vector of values (typically n 2x2 quads).
*
* \param depth the depth test state
* \param stencil the front/back stencil state
* \param format_desc description of the depth/stencil surface
* \param mask the alive/dead pixel mask for the quad (vector)
* \param stencil_refs the front/back stencil ref values (scalar)
- * \param z_src the incoming depth/stencil values (a 2x2 quad, float32)
- * \param zs_dst_ptr pointer to depth/stencil values in framebuffer
- * \param facing contains float value indicating front/back facing polygon
+ * \param z_src the incoming depth/stencil values (n 2x2 quad values, float32)
+ * \param zs_dst the depth/stencil values in framebuffer
+ * \param face contains boolean value indicating front/back facing polygon
*/
void
-lp_build_depth_stencil_test(LLVMBuilderRef builder,
+lp_build_depth_stencil_test(struct gallivm_state *gallivm,
const struct pipe_depth_state *depth,
const struct pipe_stencil_state stencil[2],
struct lp_type z_src_type,
struct lp_build_mask_context *mask,
LLVMValueRef stencil_refs[2],
LLVMValueRef z_src,
- LLVMValueRef zs_dst_ptr,
+ LLVMValueRef z_fb,
+ LLVMValueRef s_fb,
LLVMValueRef face,
- LLVMValueRef *zs_value,
+ LLVMValueRef *z_value,
+ LLVMValueRef *s_value,
boolean do_branch)
{
- struct lp_type type;
- struct lp_build_context bld;
- struct lp_build_context sbld;
+ LLVMBuilderRef builder = gallivm->builder;
+ struct lp_type z_type;
+ struct lp_build_context z_bld;
+ struct lp_build_context s_bld;
struct lp_type s_type;
- LLVMValueRef zs_dst, z_dst = NULL;
+ unsigned z_shift = 0, z_width = 0, z_mask = 0;
+ LLVMValueRef z_dst = NULL;
LLVMValueRef stencil_vals = NULL;
LLVMValueRef z_bitmask = NULL, stencil_shift = NULL;
LLVMValueRef z_pass = NULL, s_pass_mask = NULL;
- LLVMValueRef orig_mask = lp_build_mask_value(mask);
+ LLVMValueRef current_mask = lp_build_mask_value(mask);
LLVMValueRef front_facing = NULL;
-
- /* Prototype a simpler path:
- */
- if (z_src_type.floating &&
- format_desc->format == PIPE_FORMAT_X8Z24_UNORM &&
- depth->enabled)
- {
- LLVMValueRef zscaled;
- LLVMValueRef const_ffffff_float;
- LLVMValueRef const_8_int;
- LLVMTypeRef int32_vec_type;
-
- /* We know the values in z_dst are all >= 0, so allow
- * lp_build_compare to use signed compare intrinsics:
- */
- type.floating = 0;
- type.fixed = 0;
- type.sign = 1;
- type.norm = 1;
- type.width = 32;
- type.length = z_src_type.length;
-
- int32_vec_type = LLVMVectorType(LLVMInt32Type(), z_src_type.length);
-
- const_8_int = lp_build_const_int_vec(type, 8);
- const_ffffff_float = lp_build_const_vec(z_src_type, (float)0xffffff);
-
- zscaled = LLVMBuildFMul(builder, z_src, const_ffffff_float, "zscaled");
- z_src = LLVMBuildFPToSI(builder, zscaled, int32_vec_type, "z_src");
-
- /* Load current z/stencil value from z/stencil buffer */
- z_dst = LLVMBuildLoad(builder, zs_dst_ptr, "zsbufval");
- z_dst = LLVMBuildLShr(builder, z_dst, const_8_int, "z_dst");
-
- /* compare src Z to dst Z, returning 'pass' mask */
- z_pass = lp_build_compare(builder,
- type,
- depth->func, z_src, z_dst);
-
- lp_build_mask_update(mask, z_pass);
-
- if (do_branch)
- lp_build_mask_check(mask);
-
- /* No need to worry about old stencil contents, just blend the
- * old and new values and shift into the correct position for
- * storage.
- */
- if (depth->writemask) {
- type.sign = 1;
- lp_build_context_init(&bld, builder, type);
-
- z_dst = lp_build_select(&bld, lp_build_mask_value(mask), z_src, z_dst);
- z_dst = LLVMBuildShl(builder, z_dst, const_8_int, "z_dst");
- *zs_value = z_dst;
- }
-
- return;
- }
+ boolean have_z, have_s;
/*
* Depths are expected to be between 0 and 1, even if they are stored in
assert(z_src_type.norm);
}
- /* Pick the depth type. */
- type = lp_depth_type(format_desc, z_src_type.width*z_src_type.length);
-
- /* FIXME: Cope with a depth test type with a different bit width. */
- assert(type.width == z_src_type.width);
- assert(type.length == z_src_type.length);
-
- /* Convert fragment Z from float to integer */
- lp_build_conv(builder, z_src_type, type, &z_src, 1, &z_src, 1);
-
- zs_dst_ptr = LLVMBuildBitCast(builder,
- zs_dst_ptr,
- LLVMPointerType(lp_build_vec_type(type), 0), "");
+ /* Pick the type matching the depth-stencil format. */
+ z_type = lp_depth_type(format_desc, z_src_type.length);
+ /* Pick the intermediate type for depth operations. */
+ z_type.width = z_src_type.width;
+ assert(z_type.length == z_src_type.length);
+ /* FIXME: for non-float depth/stencil might generate better code
+ * if we'd always split it up to use 128bit operations.
+ * For stencil we'd almost certainly want to pack to 8xi16 values,
+ * for z just run twice.
+ */
/* Sanity checking */
{
const unsigned z_swizzle = format_desc->swizzle[0];
const unsigned s_swizzle = format_desc->swizzle[1];
- assert(z_swizzle != UTIL_FORMAT_SWIZZLE_NONE ||
- s_swizzle != UTIL_FORMAT_SWIZZLE_NONE);
+ assert(z_swizzle != PIPE_SWIZZLE_NONE ||
+ s_swizzle != PIPE_SWIZZLE_NONE);
assert(depth->enabled || stencil[0].enabled);
assert(format_desc->block.height == 1);
if (stencil[0].enabled) {
- assert(format_desc->format == PIPE_FORMAT_Z24_UNORM_S8_USCALED ||
- format_desc->format == PIPE_FORMAT_S8_USCALED_Z24_UNORM);
+ assert(s_swizzle < 4);
+ assert(format_desc->channel[s_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED);
+ assert(format_desc->channel[s_swizzle].pure_integer);
+ assert(!format_desc->channel[s_swizzle].normalized);
+ assert(format_desc->channel[s_swizzle].size == 8);
}
- assert(z_swizzle < 4);
- assert(format_desc->block.bits == type.width);
- if (type.floating) {
- assert(z_swizzle == 0);
- assert(format_desc->channel[z_swizzle].type ==
- UTIL_FORMAT_TYPE_FLOAT);
- assert(format_desc->channel[z_swizzle].size ==
- format_desc->block.bits);
- }
- else {
- assert(format_desc->channel[z_swizzle].type ==
- UTIL_FORMAT_TYPE_UNSIGNED);
- assert(format_desc->channel[z_swizzle].normalized);
- assert(!type.fixed);
- assert(!type.sign);
- assert(type.norm);
+ if (depth->enabled) {
+ assert(z_swizzle < 4);
+ if (z_type.floating) {
+ assert(z_swizzle == 0);
+ assert(format_desc->channel[z_swizzle].type ==
+ UTIL_FORMAT_TYPE_FLOAT);
+ assert(format_desc->channel[z_swizzle].size == 32);
+ }
+ else {
+ assert(format_desc->channel[z_swizzle].type ==
+ UTIL_FORMAT_TYPE_UNSIGNED);
+ assert(format_desc->channel[z_swizzle].normalized);
+ assert(!z_type.fixed);
+ }
}
}
/* Setup build context for Z vals */
- lp_build_context_init(&bld, builder, type);
+ lp_build_context_init(&z_bld, gallivm, z_type);
/* Setup build context for stencil vals */
- s_type = lp_type_int_vec(type.width);
- lp_build_context_init(&sbld, builder, s_type);
-
- /* Load current z/stencil value from z/stencil buffer */
- zs_dst = LLVMBuildLoad(builder, zs_dst_ptr, "");
-
- lp_build_name(zs_dst, "zsbufval");
-
+ s_type = lp_int_type(z_type);
+ lp_build_context_init(&s_bld, gallivm, s_type);
/* Compute and apply the Z/stencil bitmasks and shifts.
*/
{
- unsigned z_shift, z_mask;
unsigned s_shift, s_mask;
- if (get_z_shift_and_mask(format_desc, &z_shift, &z_mask)) {
- if (z_shift) {
- LLVMValueRef shift = lp_build_const_int_vec(type, z_shift);
- z_src = LLVMBuildLShr(builder, z_src, shift, "");
- }
+ z_dst = z_fb;
+ stencil_vals = s_fb;
+ have_z = get_z_shift_and_mask(format_desc, &z_shift, &z_width, &z_mask);
+ have_s = get_s_shift_and_mask(format_desc, &s_shift, &s_mask);
+
+ if (have_z) {
if (z_mask != 0xffffffff) {
- LLVMValueRef mask = lp_build_const_int_vec(type, z_mask);
- z_src = LLVMBuildAnd(builder, z_src, mask, "");
- z_dst = LLVMBuildAnd(builder, zs_dst, mask, "");
- z_bitmask = mask; /* used below */
- }
- else {
- z_dst = zs_dst;
+ z_bitmask = lp_build_const_int_vec(gallivm, z_type, z_mask);
}
- lp_build_name(z_dst, "zsbuf.z");
+ /*
+ * Align the framebuffer Z 's LSB to the right.
+ */
+ if (z_shift) {
+ LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_type, z_shift);
+ z_dst = LLVMBuildLShr(builder, z_dst, shift, "z_dst");
+ } else if (z_bitmask) {
+ z_dst = LLVMBuildAnd(builder, z_dst, z_bitmask, "z_dst");
+ } else {
+ lp_build_name(z_dst, "z_dst");
+ }
}
- if (get_s_shift_and_mask(format_desc, &s_shift, &s_mask)) {
+ if (have_s) {
if (s_shift) {
- LLVMValueRef shift = lp_build_const_int_vec(type, s_shift);
- stencil_vals = LLVMBuildLShr(builder, zs_dst, shift, "");
+ LLVMValueRef shift = lp_build_const_int_vec(gallivm, s_type, s_shift);
+ stencil_vals = LLVMBuildLShr(builder, stencil_vals, shift, "");
stencil_shift = shift; /* used below */
}
- else {
- stencil_vals = zs_dst;
- }
if (s_mask != 0xffffffff) {
- LLVMValueRef mask = lp_build_const_int_vec(type, s_mask);
+ LLVMValueRef mask = lp_build_const_int_vec(gallivm, s_type, s_mask);
stencil_vals = LLVMBuildAnd(builder, stencil_vals, mask, "");
}
- lp_build_name(stencil_vals, "stencil");
+ lp_build_name(stencil_vals, "s_dst");
}
}
if (stencil[0].enabled) {
if (face) {
- LLVMValueRef zero = LLVMConstReal(LLVMFloatType(), 0.0);
-
- /* front_facing = face > 0.0 ? ~0 : 0 */
- front_facing = LLVMBuildFCmp(builder, LLVMRealUGT, face, zero, "");
- front_facing = LLVMBuildSExt(builder, front_facing,
- LLVMIntType(bld.type.length*bld.type.width),
- "");
- front_facing = LLVMBuildBitCast(builder, front_facing,
- bld.int_vec_type, "");
- }
+ if (0) {
+ /*
+ * XXX: the scalar expansion below produces atrocious code
+ * (basically producing a 64bit scalar value, then moving the 2
+ * 32bit pieces separately to simd, plus 4 shuffles, which is
+ * seriously lame). But the scalar-simd transitions are always
+ * tricky, so no big surprise there.
+ * This here would be way better, however llvm has some serious
+ * trouble later using it in the select, probably because it will
+ * recognize the expression as constant and move the simd value
+ * away (out of the loop) - and then it will suddenly try
+ * constructing i1 high-bit masks out of it later...
+ * (Try piglit stencil-twoside.)
+ * Note this is NOT due to using SExt/Trunc, it fails exactly the
+ * same even when using native compare/select.
+ * I cannot reproduce this problem when using stand-alone compiler
+ * though, suggesting some problem with optimization passes...
+ * (With stand-alone compilation, the construction of this mask
+ * value, no matter if the easy 3 instruction here or the complex
+ * 16+ one below, never gets separated from where it's used.)
+ * The scalar code still has the same problem, but the generated
+ * code looks a bit better at least for some reason, even if
+ * mostly by luck (the fundamental issue clearly is the same).
+ */
+ front_facing = lp_build_broadcast(gallivm, s_bld.vec_type, face);
+ /* front_facing = face != 0 ? ~0 : 0 */
+ front_facing = lp_build_compare(gallivm, s_bld.type,
+ PIPE_FUNC_NOTEQUAL,
+ front_facing, s_bld.zero);
+ } else {
+ LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
+
+ /* front_facing = face != 0 ? ~0 : 0 */
+ front_facing = LLVMBuildICmp(builder, LLVMIntNE, face, zero, "");
+ front_facing = LLVMBuildSExt(builder, front_facing,
+ LLVMIntTypeInContext(gallivm->context,
+ s_bld.type.length*s_bld.type.width),
+ "");
+ front_facing = LLVMBuildBitCast(builder, front_facing,
+ s_bld.int_vec_type, "");
- /* convert scalar stencil refs into vectors */
- stencil_refs[0] = lp_build_broadcast_scalar(&bld, stencil_refs[0]);
- stencil_refs[1] = lp_build_broadcast_scalar(&bld, stencil_refs[1]);
+ }
+ }
- s_pass_mask = lp_build_stencil_test(&sbld, stencil,
+ s_pass_mask = lp_build_stencil_test(&s_bld, stencil,
stencil_refs, stencil_vals,
front_facing);
/* apply stencil-fail operator */
{
- LLVMValueRef s_fail_mask = lp_build_andnot(&bld, orig_mask, s_pass_mask);
- stencil_vals = lp_build_stencil_op(&sbld, stencil, S_FAIL_OP,
+ LLVMValueRef s_fail_mask = lp_build_andnot(&s_bld, current_mask, s_pass_mask);
+ stencil_vals = lp_build_stencil_op(&s_bld, stencil, S_FAIL_OP,
stencil_refs, stencil_vals,
s_fail_mask, front_facing);
}
}
if (depth->enabled) {
+ /*
+ * Convert fragment Z to the desired type, aligning the LSB to the right.
+ */
+
+ assert(z_type.width == z_src_type.width);
+ assert(z_type.length == z_src_type.length);
+ assert(lp_check_value(z_src_type, z_src));
+ if (z_src_type.floating) {
+ /*
+ * Convert from floating point values
+ */
+
+ if (!z_type.floating) {
+ z_src = lp_build_clamped_float_to_unsigned_norm(gallivm,
+ z_src_type,
+ z_width,
+ z_src);
+ }
+ } else {
+ /*
+ * Convert from unsigned normalized values.
+ */
+
+ assert(!z_src_type.sign);
+ assert(!z_src_type.fixed);
+ assert(z_src_type.norm);
+ assert(!z_type.floating);
+ if (z_src_type.width > z_width) {
+ LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_src_type,
+ z_src_type.width - z_width);
+ z_src = LLVMBuildLShr(builder, z_src, shift, "");
+ }
+ }
+ assert(lp_check_value(z_type, z_src));
+
+ lp_build_name(z_src, "z_src");
+
/* compare src Z to dst Z, returning 'pass' mask */
- z_pass = lp_build_cmp(&bld, depth->func, z_src, z_dst);
+ z_pass = lp_build_cmp(&z_bld, depth->func, z_src, z_dst);
+
+ /* mask off bits that failed stencil test */
+ if (s_pass_mask) {
+ current_mask = LLVMBuildAnd(builder, current_mask, s_pass_mask, "");
+ }
if (!stencil[0].enabled) {
/* We can potentially skip all remaining operations here, but only
if (do_branch) {
lp_build_mask_check(mask);
- do_branch = FALSE;
}
}
if (depth->writemask) {
- LLVMValueRef zselectmask = lp_build_mask_value(mask);
+ LLVMValueRef z_pass_mask;
/* mask off bits that failed Z test */
- zselectmask = LLVMBuildAnd(builder, zselectmask, z_pass, "");
-
- /* mask off bits that failed stencil test */
- if (s_pass_mask) {
- zselectmask = LLVMBuildAnd(builder, zselectmask, s_pass_mask, "");
- }
-
- /* if combined Z/stencil format, mask off the stencil bits */
- if (z_bitmask) {
- zselectmask = LLVMBuildAnd(builder, zselectmask, z_bitmask, "");
- }
+ z_pass_mask = LLVMBuildAnd(builder, current_mask, z_pass, "");
/* Mix the old and new Z buffer values.
- * z_dst[i] = (zselectmask[i] & z_src[i]) | (~zselectmask[i] & z_dst[i])
+ * z_dst[i] = zselectmask[i] ? z_src[i] : z_dst[i]
*/
- z_dst = lp_build_select_bitwise(&bld, zselectmask, z_src, z_dst);
+ z_dst = lp_build_select(&z_bld, z_pass_mask, z_src, z_dst);
}
if (stencil[0].enabled) {
LLVMValueRef z_fail_mask, z_pass_mask;
/* apply Z-fail operator */
- z_fail_mask = lp_build_andnot(&bld, orig_mask, z_pass);
- stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_FAIL_OP,
+ z_fail_mask = lp_build_andnot(&s_bld, current_mask, z_pass);
+ stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_FAIL_OP,
stencil_refs, stencil_vals,
z_fail_mask, front_facing);
/* apply Z-pass operator */
- z_pass_mask = LLVMBuildAnd(bld.builder, orig_mask, z_pass, "");
- stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_PASS_OP,
+ z_pass_mask = LLVMBuildAnd(builder, current_mask, z_pass, "");
+ stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP,
stencil_refs, stencil_vals,
z_pass_mask, front_facing);
}
/* No depth test: apply Z-pass operator to stencil buffer values which
* passed the stencil test.
*/
- s_pass_mask = LLVMBuildAnd(bld.builder, orig_mask, s_pass_mask, "");
- stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_PASS_OP,
+ s_pass_mask = LLVMBuildAnd(builder, current_mask, s_pass_mask, "");
+ stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP,
stencil_refs, stencil_vals,
s_pass_mask, front_facing);
}
- /* The Z bits are already in the right place but we may need to shift the
- * stencil bits before ORing Z with Stencil to make the final pixel value.
- */
+ /* Put Z and stencil bits in the right place */
+ if (have_z && z_shift) {
+ LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_type, z_shift);
+ z_dst = LLVMBuildShl(builder, z_dst, shift, "");
+ }
if (stencil_vals && stencil_shift)
- stencil_vals = LLVMBuildShl(bld.builder, stencil_vals,
+ stencil_vals = LLVMBuildShl(builder, stencil_vals,
stencil_shift, "");
- /* Finally, merge/store the z/stencil values */
- if ((depth->enabled && depth->writemask) ||
- (stencil[0].enabled && stencil[0].writemask)) {
-
- if (z_dst && stencil_vals)
- zs_dst = LLVMBuildOr(bld.builder, z_dst, stencil_vals, "");
- else if (z_dst)
- zs_dst = z_dst;
+ /* Finally, merge the z/stencil values */
+ if (format_desc->block.bits <= 32) {
+ if (have_z && have_s)
+ *z_value = LLVMBuildOr(builder, z_dst, stencil_vals, "");
+ else if (have_z)
+ *z_value = z_dst;
else
- zs_dst = stencil_vals;
-
- *zs_value = zs_dst;
+ *z_value = stencil_vals;
+ *s_value = *z_value;
+ }
+ else {
+ *z_value = z_dst;
+ *s_value = stencil_vals;
}
if (s_pass_mask)
if (depth->enabled && stencil[0].enabled)
lp_build_mask_update(mask, z_pass);
-
- if (do_branch)
- lp_build_mask_check(mask);
-
}
-
-
-void
-lp_build_deferred_depth_write(LLVMBuilderRef builder,
- struct lp_type z_src_type,
- const struct util_format_description *format_desc,
- struct lp_build_mask_context *mask,
- LLVMValueRef zs_dst_ptr,
- LLVMValueRef zs_value)
-{
- struct lp_type type;
- struct lp_build_context bld;
- LLVMValueRef z_dst;
-
- /* XXX: pointlessly redo type logic:
- */
- type = lp_depth_type(format_desc, z_src_type.width*z_src_type.length);
- lp_build_context_init(&bld, builder, type);
-
- z_dst = LLVMBuildLoad(builder, zs_dst_ptr, "zsbufval");
- z_dst = lp_build_select(&bld, lp_build_mask_value(mask), zs_value, z_dst);
-
- LLVMBuildStore(builder, z_dst, zs_dst_ptr);
-}