X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fllvmpipe%2Flp_bld_depth.c;h=63c2fb5d891757c073e8fa4ca8f1e9c28bca8b80;hb=882ca6dfb0f3d17e0f8bc917307d915ab1718069;hp=e768493103e164e7af3509bc0ac4e9ba57bbee41;hpb=cc40abad519cc0f765c6d8f6fad4154bed8dd9c2;p=mesa.git diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c index e768493103e..63c2fb5d891 100644 --- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c +++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright 2009 VMware, Inc. + * Copyright 2009-2010 VMware, Inc. * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a @@ -36,36 +36,22 @@ * flushing would avoid this, but it would most likely result in depth fighting * artifacts. * - * We are free to use a different pixel layout though. Since our basic - * processing unit is a quad (2x2 pixel block) we store the depth/stencil - * values tiled, a quad at time. That is, a depth buffer containing + * Since we're using linear layout for everything, but we need to deal with + * 2x2 quads, we need to load/store multiple values and swizzle them into + * place (we could avoid this by doing depth/stencil testing in linear format, + * which would be easy for late depth/stencil test as we could do that after + * the fragment shader loop just as we do for color buffers, but more tricky + * for early depth test as we'd need both masks and interpolated depth in + * linear format). * - * Z11 Z12 Z13 Z14 ... - * Z21 Z22 Z23 Z24 ... - * Z31 Z32 Z33 Z34 ... - * Z41 Z42 Z43 Z44 ... - * ... ... ... ... ... - * - * will actually be stored in memory as - * - * Z11 Z12 Z21 Z22 Z13 Z14 Z23 Z24 ... - * Z31 Z32 Z41 Z42 Z33 Z34 Z43 Z44 ... - * ... ... ... ... ... ... ... ... ... - * - * - * Stencil test: - * Two-sided stencil test is supported but probably not as efficient as - * it could be. Currently, we use if/then/else constructs to do the - * operations for front vs. back-facing polygons. We could probably do - * both the front and back arithmetic then use a Select() instruction to - * choose the result depending on polyon orientation. We'd have to - * measure performance both ways and see which is better. * * @author Jose Fonseca + * @author Brian Paul */ #include "pipe/p_state.h" -#include "util/u_format.h" +#include "util/format/u_format.h" +#include "util/u_cpu_detect.h" #include "gallivm/lp_bld_type.h" #include "gallivm/lp_bld_arit.h" @@ -77,6 +63,7 @@ #include "gallivm/lp_bld_intr.h" #include "gallivm/lp_bld_debug.h" #include "gallivm/lp_bld_swizzle.h" +#include "gallivm/lp_bld_pack.h" #include "lp_bld_depth.h" @@ -104,20 +91,30 @@ lp_build_stencil_test_single(struct lp_build_context *bld, LLVMValueRef stencilRef, LLVMValueRef stencilVals) { + LLVMBuilderRef builder = bld->gallivm->builder; const unsigned stencilMax = 255; /* XXX fix */ struct lp_type type = bld->type; LLVMValueRef res; - assert(type.sign); + /* + * SSE2 has intrinsics for signed comparisons, but not unsigned ones. Values + * are between 0..255 so ensure we generate the fastest comparisons for + * wider elements. + */ + if (type.width <= 8) { + assert(!type.sign); + } else { + assert(type.sign); + } assert(stencil->enabled); if (stencil->valuemask != stencilMax) { /* compute stencilRef = stencilRef & valuemask */ - LLVMValueRef valuemask = lp_build_const_int_vec(type, stencil->valuemask); - stencilRef = LLVMBuildAnd(bld->builder, stencilRef, valuemask, ""); + LLVMValueRef valuemask = lp_build_const_int_vec(bld->gallivm, type, stencil->valuemask); + stencilRef = LLVMBuildAnd(builder, stencilRef, valuemask, ""); /* compute stencilVals = stencilVals & valuemask */ - stencilVals = LLVMBuildAnd(bld->builder, stencilVals, valuemask, ""); + stencilVals = LLVMBuildAnd(builder, stencilVals, valuemask, ""); } res = lp_build_cmp(bld, stencil->func, stencilRef, stencilVals); @@ -147,7 +144,7 @@ lp_build_stencil_test(struct lp_build_context *bld, res = lp_build_stencil_test_single(bld, &stencil[0], stencilRefs[0], stencilVals); - if (stencil[1].enabled && front_facing) { + if (stencil[1].enabled && front_facing != NULL) { /* do back face test */ LLVMValueRef back_res; @@ -174,9 +171,10 @@ lp_build_stencil_op_single(struct lp_build_context *bld, LLVMValueRef stencilVals) { + LLVMBuilderRef builder = bld->gallivm->builder; struct lp_type type = bld->type; LLVMValueRef res; - LLVMValueRef max = lp_build_const_int_vec(type, 0xff); + LLVMValueRef max = lp_build_const_int_vec(bld->gallivm, type, 0xff); unsigned stencil_op; assert(type.sign); @@ -217,15 +215,15 @@ lp_build_stencil_op_single(struct lp_build_context *bld, break; case PIPE_STENCIL_OP_INCR_WRAP: res = lp_build_add(bld, stencilVals, bld->one); - res = LLVMBuildAnd(bld->builder, res, max, ""); + res = LLVMBuildAnd(builder, res, max, ""); break; case PIPE_STENCIL_OP_DECR_WRAP: res = lp_build_sub(bld, stencilVals, bld->one); - res = LLVMBuildAnd(bld->builder, res, max, ""); + res = LLVMBuildAnd(builder, res, max, ""); break; case PIPE_STENCIL_OP_INVERT: - res = LLVMBuildNot(bld->builder, stencilVals, ""); - res = LLVMBuildAnd(bld->builder, res, max, ""); + res = LLVMBuildNot(builder, stencilVals, ""); + res = LLVMBuildAnd(builder, res, max, ""); break; default: assert(0 && "bad stencil op mode"); @@ -249,6 +247,7 @@ lp_build_stencil_op(struct lp_build_context *bld, LLVMValueRef front_facing) { + LLVMBuilderRef builder = bld->gallivm->builder; LLVMValueRef res; assert(stencil[0].enabled); @@ -257,7 +256,7 @@ lp_build_stencil_op(struct lp_build_context *bld, res = lp_build_stencil_op_single(bld, &stencil[0], op, stencilRefs[0], stencilVals); - if (stencil[1].enabled && front_facing) { + if (stencil[1].enabled && front_facing != NULL) { /* do back face op */ LLVMValueRef back_res; @@ -267,12 +266,20 @@ lp_build_stencil_op(struct lp_build_context *bld, res = lp_build_select(bld, front_facing, res, back_res); } - if (stencil->writemask != 0xff) { - /* mask &= stencil->writemask */ - LLVMValueRef writemask = lp_build_const_int_vec(bld->type, stencil->writemask); - mask = LLVMBuildAnd(bld->builder, mask, writemask, ""); + if (stencil[0].writemask != 0xff || + (stencil[1].enabled && front_facing != NULL && stencil[1].writemask != 0xff)) { + /* mask &= stencil[0].writemask */ + LLVMValueRef writemask = lp_build_const_int_vec(bld->gallivm, bld->type, + stencil[0].writemask); + if (stencil[1].enabled && stencil[1].writemask != stencil[0].writemask && front_facing != NULL) { + LLVMValueRef back_writemask = lp_build_const_int_vec(bld->gallivm, bld->type, + stencil[1].writemask); + writemask = lp_build_select(bld, front_facing, writemask, back_writemask); + } + + mask = LLVMBuildAnd(builder, mask, writemask, ""); /* res = (res & mask) | (stencilVals & ~mask) */ - res = lp_build_select_bitwise(bld, writemask, res, stencilVals); + res = lp_build_select_bitwise(bld, mask, res, stencilVals); } else { /* res = mask ? res : stencilVals */ @@ -285,40 +292,44 @@ lp_build_stencil_op(struct lp_build_context *bld, /** - * Return a type appropriate for depth/stencil testing. + * Return a type that matches the depth/stencil format. */ struct lp_type lp_depth_type(const struct util_format_description *format_desc, unsigned length) { struct lp_type type; - unsigned swizzle; + unsigned z_swizzle; assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS); assert(format_desc->block.width == 1); assert(format_desc->block.height == 1); - swizzle = format_desc->swizzle[0]; - assert(swizzle < 4); - memset(&type, 0, sizeof type); type.width = format_desc->block.bits; - if(format_desc->channel[swizzle].type == UTIL_FORMAT_TYPE_FLOAT) { - type.floating = TRUE; - assert(swizzle == 0); - assert(format_desc->channel[swizzle].size == format_desc->block.bits); - } - else if(format_desc->channel[swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED) { - assert(format_desc->block.bits <= 32); - if(format_desc->channel[swizzle].normalized) - type.norm = TRUE; + z_swizzle = format_desc->swizzle[0]; + if (z_swizzle < 4) { + if (format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_FLOAT) { + type.floating = TRUE; + assert(z_swizzle == 0); + assert(format_desc->channel[z_swizzle].size == 32); + } + else if(format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED) { + assert(format_desc->block.bits <= 32); + assert(format_desc->channel[z_swizzle].normalized); + if (format_desc->channel[z_swizzle].size < format_desc->block.bits) { + /* Prefer signed integers when possible, as SSE has less support + * for unsigned comparison; + */ + type.sign = TRUE; + } + } + else + assert(0); } - else - assert(0); - assert(type.width <= length); - type.length = length / type.width; + type.length = length; return type; } @@ -334,40 +345,33 @@ lp_depth_type(const struct util_format_description *format_desc, */ static boolean get_z_shift_and_mask(const struct util_format_description *format_desc, - unsigned *shift, unsigned *mask) + unsigned *shift, unsigned *width, unsigned *mask) { - const unsigned total_bits = format_desc->block.bits; + unsigned total_bits; unsigned z_swizzle; - unsigned chan; - unsigned padding_left, padding_right; - + assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS); assert(format_desc->block.width == 1); assert(format_desc->block.height == 1); + /* 64bit d/s format is special already extracted 32 bits */ + total_bits = format_desc->block.bits > 32 ? 32 : format_desc->block.bits; + z_swizzle = format_desc->swizzle[0]; - if (z_swizzle == UTIL_FORMAT_SWIZZLE_NONE) + if (z_swizzle == PIPE_SWIZZLE_NONE) return FALSE; - padding_right = 0; - for (chan = 0; chan < z_swizzle; ++chan) - padding_right += format_desc->channel[chan].size; + *width = format_desc->channel[z_swizzle].size; + /* & 31 is for the same reason as the 32-bit limit above */ + *shift = format_desc->channel[z_swizzle].shift & 31; - padding_left = - total_bits - (padding_right + format_desc->channel[z_swizzle].size); - - if (padding_left || padding_right) { - unsigned long long mask_left = (1ULL << (total_bits - padding_left)) - 1; - unsigned long long mask_right = (1ULL << (padding_right)) - 1; - *mask = mask_left ^ mask_right; - } - else { + if (*width == total_bits) { *mask = 0xffffffff; + } else { + *mask = ((1 << *width) - 1) << *shift; } - *shift = padding_left; - return TRUE; } @@ -382,17 +386,23 @@ get_s_shift_and_mask(const struct util_format_description *format_desc, unsigned *shift, unsigned *mask) { unsigned s_swizzle; - unsigned chan, sz; + unsigned sz; s_swizzle = format_desc->swizzle[1]; - if (s_swizzle == UTIL_FORMAT_SWIZZLE_NONE) + if (s_swizzle == PIPE_SWIZZLE_NONE) return FALSE; - *shift = 0; - for (chan = 0; chan < s_swizzle; chan++) - *shift += format_desc->channel[chan].size; + /* just special case 64bit d/s format */ + if (format_desc->block.bits > 32) { + /* XXX big-endian? */ + assert(format_desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT); + *shift = 0; + *mask = 0xff; + return TRUE; + } + *shift = format_desc->channel[s_swizzle].shift; sz = format_desc->channel[s_swizzle].size; *mask = (1U << sz) - 1U; @@ -405,41 +415,391 @@ get_s_shift_and_mask(const struct util_format_description *format_desc, * Test the depth mask. Add the number of channel which has none zero mask * into the occlusion counter. e.g. maskvalue is {-1, -1, -1, -1}. * The counter will add 4. + * TODO: could get that out of the fs loop. * * \param type holds element type of the mask vector. * \param maskvalue is the depth test mask. * \param counter is a pointer of the uint32 counter. */ void -lp_build_occlusion_count(LLVMBuilderRef builder, +lp_build_occlusion_count(struct gallivm_state *gallivm, struct lp_type type, LLVMValueRef maskvalue, LLVMValueRef counter) { - LLVMValueRef countmask = lp_build_const_int_vec(type, 1); - LLVMValueRef countv = LLVMBuildAnd(builder, maskvalue, countmask, "countv"); - LLVMTypeRef i8v16 = LLVMVectorType(LLVMInt8Type(), 16); - LLVMValueRef counti = LLVMBuildBitCast(builder, countv, i8v16, "counti"); - LLVMValueRef maskarray[4] = { - LLVMConstInt(LLVMInt32Type(), 0, 0), - LLVMConstInt(LLVMInt32Type(), 4, 0), - LLVMConstInt(LLVMInt32Type(), 8, 0), - LLVMConstInt(LLVMInt32Type(), 12, 0), - }; - LLVMValueRef shufflemask = LLVMConstVector(maskarray, 4); - LLVMValueRef shufflev = LLVMBuildShuffleVector(builder, counti, LLVMGetUndef(i8v16), shufflemask, "shufflev"); - LLVMValueRef shuffle = LLVMBuildBitCast(builder, shufflev, LLVMInt32Type(), "shuffle"); - LLVMValueRef count = lp_build_intrinsic_unary(builder, "llvm.ctpop.i32", LLVMInt32Type(), shuffle); - LLVMValueRef orig = LLVMBuildLoad(builder, counter, "orig"); - LLVMValueRef incr = LLVMBuildAdd(builder, orig, count, "incr"); - LLVMBuildStore(builder, incr, counter); + LLVMBuilderRef builder = gallivm->builder; + LLVMContextRef context = gallivm->context; + LLVMValueRef countmask = lp_build_const_int_vec(gallivm, type, 1); + LLVMValueRef count, newcount; + + assert(type.length <= 16); + assert(type.floating); + + if(util_cpu_caps.has_sse && type.length == 4) { + const char *movmskintr = "llvm.x86.sse.movmsk.ps"; + const char *popcntintr = "llvm.ctpop.i32"; + LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue, + lp_build_vec_type(gallivm, type), ""); + bits = lp_build_intrinsic_unary(builder, movmskintr, + LLVMInt32TypeInContext(context), bits); + count = lp_build_intrinsic_unary(builder, popcntintr, + LLVMInt32TypeInContext(context), bits); + count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), ""); + } + else if(util_cpu_caps.has_avx && type.length == 8) { + const char *movmskintr = "llvm.x86.avx.movmsk.ps.256"; + const char *popcntintr = "llvm.ctpop.i32"; + LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue, + lp_build_vec_type(gallivm, type), ""); + bits = lp_build_intrinsic_unary(builder, movmskintr, + LLVMInt32TypeInContext(context), bits); + count = lp_build_intrinsic_unary(builder, popcntintr, + LLVMInt32TypeInContext(context), bits); + count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), ""); + } + else { + unsigned i; + LLVMValueRef countv = LLVMBuildAnd(builder, maskvalue, countmask, "countv"); + LLVMTypeRef counttype = LLVMIntTypeInContext(context, type.length * 8); + LLVMTypeRef i8vntype = LLVMVectorType(LLVMInt8TypeInContext(context), type.length * 4); + LLVMValueRef shufflev, countd; + LLVMValueRef shuffles[16]; + const char *popcntintr = NULL; + + countv = LLVMBuildBitCast(builder, countv, i8vntype, ""); + + for (i = 0; i < type.length; i++) { + shuffles[i] = lp_build_const_int32(gallivm, 4*i); + } + + shufflev = LLVMConstVector(shuffles, type.length); + countd = LLVMBuildShuffleVector(builder, countv, LLVMGetUndef(i8vntype), shufflev, ""); + countd = LLVMBuildBitCast(builder, countd, counttype, "countd"); + + /* + * XXX FIXME + * this is bad on cpus without popcount (on x86 supported by intel + * nehalem, amd barcelona, and up - not tied to sse42). + * Would be much faster to just sum the 4 elements of the vector with + * some horizontal add (shuffle/add/shuffle/add after the initial and). + */ + switch (type.length) { + case 4: + popcntintr = "llvm.ctpop.i32"; + break; + case 8: + popcntintr = "llvm.ctpop.i64"; + break; + case 16: + popcntintr = "llvm.ctpop.i128"; + break; + default: + assert(0); + } + count = lp_build_intrinsic_unary(builder, popcntintr, counttype, countd); + + if (type.length > 8) { + count = LLVMBuildTrunc(builder, count, LLVMIntTypeInContext(context, 64), ""); + } + else if (type.length < 8) { + count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), ""); + } + } + newcount = LLVMBuildLoad(builder, counter, "origcount"); + newcount = LLVMBuildAdd(builder, newcount, count, "newcount"); + LLVMBuildStore(builder, newcount, counter); } +/** + * Load depth/stencil values. + * The stored values are linear, swizzle them. + * + * \param type the data type of the fragment depth/stencil values + * \param format_desc description of the depth/stencil surface + * \param is_1d whether this resource has only one dimension + * \param loop_counter the current loop iteration + * \param depth_ptr pointer to the depth/stencil values of this 4x4 block + * \param depth_stride stride of the depth/stencil buffer + * \param z_fb contains z values loaded from fb (may include padding) + * \param s_fb contains s values loaded from fb (may include padding) + */ +void +lp_build_depth_stencil_load_swizzled(struct gallivm_state *gallivm, + struct lp_type z_src_type, + const struct util_format_description *format_desc, + boolean is_1d, + LLVMValueRef depth_ptr, + LLVMValueRef depth_stride, + LLVMValueRef *z_fb, + LLVMValueRef *s_fb, + LLVMValueRef loop_counter) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4]; + LLVMValueRef zs_dst1, zs_dst2; + LLVMValueRef zs_dst_ptr; + LLVMValueRef depth_offset1, depth_offset2; + LLVMTypeRef load_ptr_type; + unsigned depth_bytes = format_desc->block.bits / 8; + struct lp_type zs_type = lp_depth_type(format_desc, z_src_type.length); + struct lp_type zs_load_type = zs_type; + + zs_load_type.length = zs_load_type.length / 2; + load_ptr_type = LLVMPointerType(lp_build_vec_type(gallivm, zs_load_type), 0); + + if (z_src_type.length == 4) { + unsigned i; + LLVMValueRef looplsb = LLVMBuildAnd(builder, loop_counter, + lp_build_const_int32(gallivm, 1), ""); + LLVMValueRef loopmsb = LLVMBuildAnd(builder, loop_counter, + lp_build_const_int32(gallivm, 2), ""); + LLVMValueRef offset2 = LLVMBuildMul(builder, loopmsb, + depth_stride, ""); + depth_offset1 = LLVMBuildMul(builder, looplsb, + lp_build_const_int32(gallivm, depth_bytes * 2), ""); + depth_offset1 = LLVMBuildAdd(builder, depth_offset1, offset2, ""); + + /* just concatenate the loaded 2x2 values into 4-wide vector */ + for (i = 0; i < 4; i++) { + shuffles[i] = lp_build_const_int32(gallivm, i); + } + } + else { + unsigned i; + LLVMValueRef loopx2 = LLVMBuildShl(builder, loop_counter, + lp_build_const_int32(gallivm, 1), ""); + assert(z_src_type.length == 8); + depth_offset1 = LLVMBuildMul(builder, loopx2, depth_stride, ""); + /* + * We load 2x4 values, and need to swizzle them (order + * 0,1,4,5,2,3,6,7) - not so hot with avx unfortunately. + */ + for (i = 0; i < 8; i++) { + shuffles[i] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2); + } + } + + depth_offset2 = LLVMBuildAdd(builder, depth_offset1, depth_stride, ""); + + /* Load current z/stencil values from z/stencil buffer */ + zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, &depth_offset1, 1, ""); + zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, ""); + zs_dst1 = LLVMBuildLoad(builder, zs_dst_ptr, ""); + if (is_1d) { + zs_dst2 = lp_build_undef(gallivm, zs_load_type); + } + else { + zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, &depth_offset2, 1, ""); + zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, ""); + zs_dst2 = LLVMBuildLoad(builder, zs_dst_ptr, ""); + } + + *z_fb = LLVMBuildShuffleVector(builder, zs_dst1, zs_dst2, + LLVMConstVector(shuffles, zs_type.length), ""); + *s_fb = *z_fb; + + if (format_desc->block.bits < z_src_type.width) { + /* Extend destination ZS values (e.g., when reading from Z16_UNORM) */ + *z_fb = LLVMBuildZExt(builder, *z_fb, + lp_build_int_vec_type(gallivm, z_src_type), ""); + } + + else if (format_desc->block.bits > 32) { + /* rely on llvm to handle too wide vector we have here nicely */ + unsigned i; + struct lp_type typex2 = zs_type; + struct lp_type s_type = zs_type; + LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 4]; + LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 4]; + LLVMValueRef tmp; + + typex2.width = typex2.width / 2; + typex2.length = typex2.length * 2; + s_type.width = s_type.width / 2; + s_type.floating = 0; + + tmp = LLVMBuildBitCast(builder, *z_fb, + lp_build_vec_type(gallivm, typex2), ""); + + for (i = 0; i < zs_type.length; i++) { + shuffles1[i] = lp_build_const_int32(gallivm, i * 2); + shuffles2[i] = lp_build_const_int32(gallivm, i * 2 + 1); + } + *z_fb = LLVMBuildShuffleVector(builder, tmp, tmp, + LLVMConstVector(shuffles1, zs_type.length), ""); + *s_fb = LLVMBuildShuffleVector(builder, tmp, tmp, + LLVMConstVector(shuffles2, zs_type.length), ""); + *s_fb = LLVMBuildBitCast(builder, *s_fb, + lp_build_vec_type(gallivm, s_type), ""); + lp_build_name(*s_fb, "s_dst"); + } + + lp_build_name(*z_fb, "z_dst"); + lp_build_name(*s_fb, "s_dst"); + lp_build_name(*z_fb, "z_dst"); +} + +/** + * Store depth/stencil values. + * Incoming values are swizzled (typically n 2x2 quads), stored linear. + * If there's a mask it will do select/store otherwise just store. + * + * \param type the data type of the fragment depth/stencil values + * \param format_desc description of the depth/stencil surface + * \param is_1d whether this resource has only one dimension + * \param mask the alive/dead pixel mask for the quad (vector) + * \param z_fb z values read from fb (with padding) + * \param s_fb s values read from fb (with padding) + * \param loop_counter the current loop iteration + * \param depth_ptr pointer to the depth/stencil values of this 4x4 block + * \param depth_stride stride of the depth/stencil buffer + * \param z_value the depth values to store (with padding) + * \param s_value the stencil values to store (with padding) + */ +void +lp_build_depth_stencil_write_swizzled(struct gallivm_state *gallivm, + struct lp_type z_src_type, + const struct util_format_description *format_desc, + boolean is_1d, + struct lp_build_mask_context *mask, + LLVMValueRef z_fb, + LLVMValueRef s_fb, + LLVMValueRef loop_counter, + LLVMValueRef depth_ptr, + LLVMValueRef depth_stride, + LLVMValueRef z_value, + LLVMValueRef s_value) +{ + struct lp_build_context z_bld; + LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4]; + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef mask_value = NULL; + LLVMValueRef zs_dst1, zs_dst2; + LLVMValueRef zs_dst_ptr1, zs_dst_ptr2; + LLVMValueRef depth_offset1, depth_offset2; + LLVMTypeRef load_ptr_type; + unsigned depth_bytes = format_desc->block.bits / 8; + struct lp_type zs_type = lp_depth_type(format_desc, z_src_type.length); + struct lp_type z_type = zs_type; + struct lp_type zs_load_type = zs_type; + + zs_load_type.length = zs_load_type.length / 2; + load_ptr_type = LLVMPointerType(lp_build_vec_type(gallivm, zs_load_type), 0); + + z_type.width = z_src_type.width; + + lp_build_context_init(&z_bld, gallivm, z_type); + + /* + * This is far from ideal, at least for late depth write we should do this + * outside the fs loop to avoid all the swizzle stuff. + */ + if (z_src_type.length == 4) { + LLVMValueRef looplsb = LLVMBuildAnd(builder, loop_counter, + lp_build_const_int32(gallivm, 1), ""); + LLVMValueRef loopmsb = LLVMBuildAnd(builder, loop_counter, + lp_build_const_int32(gallivm, 2), ""); + LLVMValueRef offset2 = LLVMBuildMul(builder, loopmsb, + depth_stride, ""); + depth_offset1 = LLVMBuildMul(builder, looplsb, + lp_build_const_int32(gallivm, depth_bytes * 2), ""); + depth_offset1 = LLVMBuildAdd(builder, depth_offset1, offset2, ""); + } + else { + unsigned i; + LLVMValueRef loopx2 = LLVMBuildShl(builder, loop_counter, + lp_build_const_int32(gallivm, 1), ""); + assert(z_src_type.length == 8); + depth_offset1 = LLVMBuildMul(builder, loopx2, depth_stride, ""); + /* + * We load 2x4 values, and need to swizzle them (order + * 0,1,4,5,2,3,6,7) - not so hot with avx unfortunately. + */ + for (i = 0; i < 8; i++) { + shuffles[i] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2); + } + } + + depth_offset2 = LLVMBuildAdd(builder, depth_offset1, depth_stride, ""); + + zs_dst_ptr1 = LLVMBuildGEP(builder, depth_ptr, &depth_offset1, 1, ""); + zs_dst_ptr1 = LLVMBuildBitCast(builder, zs_dst_ptr1, load_ptr_type, ""); + zs_dst_ptr2 = LLVMBuildGEP(builder, depth_ptr, &depth_offset2, 1, ""); + zs_dst_ptr2 = LLVMBuildBitCast(builder, zs_dst_ptr2, load_ptr_type, ""); + + if (format_desc->block.bits > 32) { + s_value = LLVMBuildBitCast(builder, s_value, z_bld.vec_type, ""); + } + + if (mask) { + mask_value = lp_build_mask_value(mask); + z_value = lp_build_select(&z_bld, mask_value, z_value, z_fb); + if (format_desc->block.bits > 32) { + s_fb = LLVMBuildBitCast(builder, s_fb, z_bld.vec_type, ""); + s_value = lp_build_select(&z_bld, mask_value, s_value, s_fb); + } + } + + if (zs_type.width < z_src_type.width) { + /* Truncate ZS values (e.g., when writing to Z16_UNORM) */ + z_value = LLVMBuildTrunc(builder, z_value, + lp_build_int_vec_type(gallivm, zs_type), ""); + } + + if (format_desc->block.bits <= 32) { + if (z_src_type.length == 4) { + zs_dst1 = lp_build_extract_range(gallivm, z_value, 0, 2); + zs_dst2 = lp_build_extract_range(gallivm, z_value, 2, 2); + } + else { + assert(z_src_type.length == 8); + zs_dst1 = LLVMBuildShuffleVector(builder, z_value, z_value, + LLVMConstVector(&shuffles[0], + zs_load_type.length), ""); + zs_dst2 = LLVMBuildShuffleVector(builder, z_value, z_value, + LLVMConstVector(&shuffles[4], + zs_load_type.length), ""); + } + } + else { + if (z_src_type.length == 4) { + zs_dst1 = lp_build_interleave2(gallivm, z_type, + z_value, s_value, 0); + zs_dst2 = lp_build_interleave2(gallivm, z_type, + z_value, s_value, 1); + } + else { + unsigned i; + LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 2]; + assert(z_src_type.length == 8); + for (i = 0; i < 8; i++) { + shuffles[i*2] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2); + shuffles[i*2+1] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2 + + z_src_type.length); + } + zs_dst1 = LLVMBuildShuffleVector(builder, z_value, s_value, + LLVMConstVector(&shuffles[0], + z_src_type.length), ""); + zs_dst2 = LLVMBuildShuffleVector(builder, z_value, s_value, + LLVMConstVector(&shuffles[8], + z_src_type.length), ""); + } + zs_dst1 = LLVMBuildBitCast(builder, zs_dst1, + lp_build_vec_type(gallivm, zs_load_type), ""); + zs_dst2 = LLVMBuildBitCast(builder, zs_dst2, + lp_build_vec_type(gallivm, zs_load_type), ""); + } + + LLVMBuildStore(builder, zs_dst1, zs_dst_ptr1); + if (!is_1d) { + LLVMBuildStore(builder, zs_dst2, zs_dst_ptr2); + } +} /** * Generate code for performing depth and/or stencil tests. - * We operate on a vector of values (typically a 2x2 quad). + * We operate on a vector of values (typically n 2x2 quads). * * \param depth the depth test state * \param stencil the front/back stencil state @@ -447,12 +807,12 @@ lp_build_occlusion_count(LLVMBuilderRef builder, * \param format_desc description of the depth/stencil surface * \param mask the alive/dead pixel mask for the quad (vector) * \param stencil_refs the front/back stencil ref values (scalar) - * \param z_src the incoming depth/stencil values (a 2x2 quad, float32) - * \param zs_dst_ptr pointer to depth/stencil values in framebuffer - * \param facing contains float value indicating front/back facing polygon + * \param z_src the incoming depth/stencil values (n 2x2 quad values, float32) + * \param zs_dst the depth/stencil values in framebuffer + * \param face contains boolean value indicating front/back facing polygon */ void -lp_build_depth_stencil_test(LLVMBuilderRef builder, +lp_build_depth_stencil_test(struct gallivm_state *gallivm, const struct pipe_depth_state *depth, const struct pipe_stencil_state stencil[2], struct lp_type z_src_type, @@ -460,80 +820,26 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder, struct lp_build_mask_context *mask, LLVMValueRef stencil_refs[2], LLVMValueRef z_src, - LLVMValueRef zs_dst_ptr, + LLVMValueRef z_fb, + LLVMValueRef s_fb, LLVMValueRef face, - LLVMValueRef *zs_value, + LLVMValueRef *z_value, + LLVMValueRef *s_value, boolean do_branch) { - struct lp_type type; - struct lp_build_context bld; - struct lp_build_context sbld; + LLVMBuilderRef builder = gallivm->builder; + struct lp_type z_type; + struct lp_build_context z_bld; + struct lp_build_context s_bld; struct lp_type s_type; - LLVMValueRef zs_dst, z_dst = NULL; + unsigned z_shift = 0, z_width = 0, z_mask = 0; + LLVMValueRef z_dst = NULL; LLVMValueRef stencil_vals = NULL; LLVMValueRef z_bitmask = NULL, stencil_shift = NULL; LLVMValueRef z_pass = NULL, s_pass_mask = NULL; - LLVMValueRef orig_mask = lp_build_mask_value(mask); + LLVMValueRef current_mask = lp_build_mask_value(mask); LLVMValueRef front_facing = NULL; - - /* Prototype a simpler path: - */ - if (z_src_type.floating && - format_desc->format == PIPE_FORMAT_X8Z24_UNORM && - depth->enabled) - { - LLVMValueRef zscaled; - LLVMValueRef const_ffffff_float; - LLVMValueRef const_8_int; - LLVMTypeRef int32_vec_type; - - /* We know the values in z_dst are all >= 0, so allow - * lp_build_compare to use signed compare intrinsics: - */ - type.floating = 0; - type.fixed = 0; - type.sign = 1; - type.norm = 1; - type.width = 32; - type.length = z_src_type.length; - - int32_vec_type = LLVMVectorType(LLVMInt32Type(), z_src_type.length); - - const_8_int = lp_build_const_int_vec(type, 8); - const_ffffff_float = lp_build_const_vec(z_src_type, (float)0xffffff); - - zscaled = LLVMBuildFMul(builder, z_src, const_ffffff_float, "zscaled"); - z_src = LLVMBuildFPToSI(builder, zscaled, int32_vec_type, "z_src"); - - /* Load current z/stencil value from z/stencil buffer */ - z_dst = LLVMBuildLoad(builder, zs_dst_ptr, "zsbufval"); - z_dst = LLVMBuildLShr(builder, z_dst, const_8_int, "z_dst"); - - /* compare src Z to dst Z, returning 'pass' mask */ - z_pass = lp_build_compare(builder, - type, - depth->func, z_src, z_dst); - - lp_build_mask_update(mask, z_pass); - - if (do_branch) - lp_build_mask_check(mask); - - /* No need to worry about old stencil contents, just blend the - * old and new values and shift into the correct position for - * storage. - */ - if (depth->writemask) { - type.sign = 1; - lp_build_context_init(&bld, builder, type); - - z_dst = lp_build_select(&bld, lp_build_mask_value(mask), z_src, z_dst); - z_dst = LLVMBuildShl(builder, z_dst, const_8_int, "z_dst"); - *zs_value = z_dst; - } - - return; - } + boolean have_z, have_s; /* * Depths are expected to be between 0 and 1, even if they are stored in @@ -549,29 +855,26 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder, assert(z_src_type.norm); } - /* Pick the depth type. */ - type = lp_depth_type(format_desc, z_src_type.width*z_src_type.length); - - /* FIXME: Cope with a depth test type with a different bit width. */ - assert(type.width == z_src_type.width); - assert(type.length == z_src_type.length); - - /* Convert fragment Z from float to integer */ - lp_build_conv(builder, z_src_type, type, &z_src, 1, &z_src, 1); - - zs_dst_ptr = LLVMBuildBitCast(builder, - zs_dst_ptr, - LLVMPointerType(lp_build_vec_type(type), 0), ""); + /* Pick the type matching the depth-stencil format. */ + z_type = lp_depth_type(format_desc, z_src_type.length); + /* Pick the intermediate type for depth operations. */ + z_type.width = z_src_type.width; + assert(z_type.length == z_src_type.length); + /* FIXME: for non-float depth/stencil might generate better code + * if we'd always split it up to use 128bit operations. + * For stencil we'd almost certainly want to pack to 8xi16 values, + * for z just run twice. + */ /* Sanity checking */ { const unsigned z_swizzle = format_desc->swizzle[0]; const unsigned s_swizzle = format_desc->swizzle[1]; - assert(z_swizzle != UTIL_FORMAT_SWIZZLE_NONE || - s_swizzle != UTIL_FORMAT_SWIZZLE_NONE); + assert(z_swizzle != PIPE_SWIZZLE_NONE || + s_swizzle != PIPE_SWIZZLE_NONE); assert(depth->enabled || stencil[0].enabled); @@ -580,121 +883,188 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder, assert(format_desc->block.height == 1); if (stencil[0].enabled) { - assert(format_desc->format == PIPE_FORMAT_Z24_UNORM_S8_USCALED || - format_desc->format == PIPE_FORMAT_S8_USCALED_Z24_UNORM); + assert(s_swizzle < 4); + assert(format_desc->channel[s_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED); + assert(format_desc->channel[s_swizzle].pure_integer); + assert(!format_desc->channel[s_swizzle].normalized); + assert(format_desc->channel[s_swizzle].size == 8); } - assert(z_swizzle < 4); - assert(format_desc->block.bits == type.width); - if (type.floating) { - assert(z_swizzle == 0); - assert(format_desc->channel[z_swizzle].type == - UTIL_FORMAT_TYPE_FLOAT); - assert(format_desc->channel[z_swizzle].size == - format_desc->block.bits); - } - else { - assert(format_desc->channel[z_swizzle].type == - UTIL_FORMAT_TYPE_UNSIGNED); - assert(format_desc->channel[z_swizzle].normalized); - assert(!type.fixed); - assert(!type.sign); - assert(type.norm); + if (depth->enabled) { + assert(z_swizzle < 4); + if (z_type.floating) { + assert(z_swizzle == 0); + assert(format_desc->channel[z_swizzle].type == + UTIL_FORMAT_TYPE_FLOAT); + assert(format_desc->channel[z_swizzle].size == 32); + } + else { + assert(format_desc->channel[z_swizzle].type == + UTIL_FORMAT_TYPE_UNSIGNED); + assert(format_desc->channel[z_swizzle].normalized); + assert(!z_type.fixed); + } } } /* Setup build context for Z vals */ - lp_build_context_init(&bld, builder, type); + lp_build_context_init(&z_bld, gallivm, z_type); /* Setup build context for stencil vals */ - s_type = lp_type_int_vec(type.width); - lp_build_context_init(&sbld, builder, s_type); - - /* Load current z/stencil value from z/stencil buffer */ - zs_dst = LLVMBuildLoad(builder, zs_dst_ptr, ""); - - lp_build_name(zs_dst, "zsbufval"); - + s_type = lp_int_type(z_type); + lp_build_context_init(&s_bld, gallivm, s_type); /* Compute and apply the Z/stencil bitmasks and shifts. */ { - unsigned z_shift, z_mask; unsigned s_shift, s_mask; - if (get_z_shift_and_mask(format_desc, &z_shift, &z_mask)) { - if (z_shift) { - LLVMValueRef shift = lp_build_const_int_vec(type, z_shift); - z_src = LLVMBuildLShr(builder, z_src, shift, ""); - } + z_dst = z_fb; + stencil_vals = s_fb; + have_z = get_z_shift_and_mask(format_desc, &z_shift, &z_width, &z_mask); + have_s = get_s_shift_and_mask(format_desc, &s_shift, &s_mask); + + if (have_z) { if (z_mask != 0xffffffff) { - LLVMValueRef mask = lp_build_const_int_vec(type, z_mask); - z_src = LLVMBuildAnd(builder, z_src, mask, ""); - z_dst = LLVMBuildAnd(builder, zs_dst, mask, ""); - z_bitmask = mask; /* used below */ - } - else { - z_dst = zs_dst; + z_bitmask = lp_build_const_int_vec(gallivm, z_type, z_mask); } - lp_build_name(z_dst, "zsbuf.z"); + /* + * Align the framebuffer Z 's LSB to the right. + */ + if (z_shift) { + LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_type, z_shift); + z_dst = LLVMBuildLShr(builder, z_dst, shift, "z_dst"); + } else if (z_bitmask) { + z_dst = LLVMBuildAnd(builder, z_dst, z_bitmask, "z_dst"); + } else { + lp_build_name(z_dst, "z_dst"); + } } - if (get_s_shift_and_mask(format_desc, &s_shift, &s_mask)) { + if (have_s) { if (s_shift) { - LLVMValueRef shift = lp_build_const_int_vec(type, s_shift); - stencil_vals = LLVMBuildLShr(builder, zs_dst, shift, ""); + LLVMValueRef shift = lp_build_const_int_vec(gallivm, s_type, s_shift); + stencil_vals = LLVMBuildLShr(builder, stencil_vals, shift, ""); stencil_shift = shift; /* used below */ } - else { - stencil_vals = zs_dst; - } if (s_mask != 0xffffffff) { - LLVMValueRef mask = lp_build_const_int_vec(type, s_mask); + LLVMValueRef mask = lp_build_const_int_vec(gallivm, s_type, s_mask); stencil_vals = LLVMBuildAnd(builder, stencil_vals, mask, ""); } - lp_build_name(stencil_vals, "stencil"); + lp_build_name(stencil_vals, "s_dst"); } } if (stencil[0].enabled) { if (face) { - LLVMValueRef zero = LLVMConstReal(LLVMFloatType(), 0.0); - - /* front_facing = face > 0.0 ? ~0 : 0 */ - front_facing = LLVMBuildFCmp(builder, LLVMRealUGT, face, zero, ""); - front_facing = LLVMBuildSExt(builder, front_facing, - LLVMIntType(bld.type.length*bld.type.width), - ""); - front_facing = LLVMBuildBitCast(builder, front_facing, - bld.int_vec_type, ""); - } + if (0) { + /* + * XXX: the scalar expansion below produces atrocious code + * (basically producing a 64bit scalar value, then moving the 2 + * 32bit pieces separately to simd, plus 4 shuffles, which is + * seriously lame). But the scalar-simd transitions are always + * tricky, so no big surprise there. + * This here would be way better, however llvm has some serious + * trouble later using it in the select, probably because it will + * recognize the expression as constant and move the simd value + * away (out of the loop) - and then it will suddenly try + * constructing i1 high-bit masks out of it later... + * (Try piglit stencil-twoside.) + * Note this is NOT due to using SExt/Trunc, it fails exactly the + * same even when using native compare/select. + * I cannot reproduce this problem when using stand-alone compiler + * though, suggesting some problem with optimization passes... + * (With stand-alone compilation, the construction of this mask + * value, no matter if the easy 3 instruction here or the complex + * 16+ one below, never gets separated from where it's used.) + * The scalar code still has the same problem, but the generated + * code looks a bit better at least for some reason, even if + * mostly by luck (the fundamental issue clearly is the same). + */ + front_facing = lp_build_broadcast(gallivm, s_bld.vec_type, face); + /* front_facing = face != 0 ? ~0 : 0 */ + front_facing = lp_build_compare(gallivm, s_bld.type, + PIPE_FUNC_NOTEQUAL, + front_facing, s_bld.zero); + } else { + LLVMValueRef zero = lp_build_const_int32(gallivm, 0); + + /* front_facing = face != 0 ? ~0 : 0 */ + front_facing = LLVMBuildICmp(builder, LLVMIntNE, face, zero, ""); + front_facing = LLVMBuildSExt(builder, front_facing, + LLVMIntTypeInContext(gallivm->context, + s_bld.type.length*s_bld.type.width), + ""); + front_facing = LLVMBuildBitCast(builder, front_facing, + s_bld.int_vec_type, ""); - /* convert scalar stencil refs into vectors */ - stencil_refs[0] = lp_build_broadcast_scalar(&bld, stencil_refs[0]); - stencil_refs[1] = lp_build_broadcast_scalar(&bld, stencil_refs[1]); + } + } - s_pass_mask = lp_build_stencil_test(&sbld, stencil, + s_pass_mask = lp_build_stencil_test(&s_bld, stencil, stencil_refs, stencil_vals, front_facing); /* apply stencil-fail operator */ { - LLVMValueRef s_fail_mask = lp_build_andnot(&bld, orig_mask, s_pass_mask); - stencil_vals = lp_build_stencil_op(&sbld, stencil, S_FAIL_OP, + LLVMValueRef s_fail_mask = lp_build_andnot(&s_bld, current_mask, s_pass_mask); + stencil_vals = lp_build_stencil_op(&s_bld, stencil, S_FAIL_OP, stencil_refs, stencil_vals, s_fail_mask, front_facing); } } if (depth->enabled) { + /* + * Convert fragment Z to the desired type, aligning the LSB to the right. + */ + + assert(z_type.width == z_src_type.width); + assert(z_type.length == z_src_type.length); + assert(lp_check_value(z_src_type, z_src)); + if (z_src_type.floating) { + /* + * Convert from floating point values + */ + + if (!z_type.floating) { + z_src = lp_build_clamped_float_to_unsigned_norm(gallivm, + z_src_type, + z_width, + z_src); + } + } else { + /* + * Convert from unsigned normalized values. + */ + + assert(!z_src_type.sign); + assert(!z_src_type.fixed); + assert(z_src_type.norm); + assert(!z_type.floating); + if (z_src_type.width > z_width) { + LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_src_type, + z_src_type.width - z_width); + z_src = LLVMBuildLShr(builder, z_src, shift, ""); + } + } + assert(lp_check_value(z_type, z_src)); + + lp_build_name(z_src, "z_src"); + /* compare src Z to dst Z, returning 'pass' mask */ - z_pass = lp_build_cmp(&bld, depth->func, z_src, z_dst); + z_pass = lp_build_cmp(&z_bld, depth->func, z_src, z_dst); + + /* mask off bits that failed stencil test */ + if (s_pass_mask) { + current_mask = LLVMBuildAnd(builder, current_mask, s_pass_mask, ""); + } if (!stencil[0].enabled) { /* We can potentially skip all remaining operations here, but only @@ -705,30 +1075,19 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder, if (do_branch) { lp_build_mask_check(mask); - do_branch = FALSE; } } if (depth->writemask) { - LLVMValueRef zselectmask = lp_build_mask_value(mask); + LLVMValueRef z_pass_mask; /* mask off bits that failed Z test */ - zselectmask = LLVMBuildAnd(builder, zselectmask, z_pass, ""); - - /* mask off bits that failed stencil test */ - if (s_pass_mask) { - zselectmask = LLVMBuildAnd(builder, zselectmask, s_pass_mask, ""); - } - - /* if combined Z/stencil format, mask off the stencil bits */ - if (z_bitmask) { - zselectmask = LLVMBuildAnd(builder, zselectmask, z_bitmask, ""); - } + z_pass_mask = LLVMBuildAnd(builder, current_mask, z_pass, ""); /* Mix the old and new Z buffer values. - * z_dst[i] = (zselectmask[i] & z_src[i]) | (~zselectmask[i] & z_dst[i]) + * z_dst[i] = zselectmask[i] ? z_src[i] : z_dst[i] */ - z_dst = lp_build_select_bitwise(&bld, zselectmask, z_src, z_dst); + z_dst = lp_build_select(&z_bld, z_pass_mask, z_src, z_dst); } if (stencil[0].enabled) { @@ -736,14 +1095,14 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder, LLVMValueRef z_fail_mask, z_pass_mask; /* apply Z-fail operator */ - z_fail_mask = lp_build_andnot(&bld, orig_mask, z_pass); - stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_FAIL_OP, + z_fail_mask = lp_build_andnot(&s_bld, current_mask, z_pass); + stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_FAIL_OP, stencil_refs, stencil_vals, z_fail_mask, front_facing); /* apply Z-pass operator */ - z_pass_mask = LLVMBuildAnd(bld.builder, orig_mask, z_pass, ""); - stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_PASS_OP, + z_pass_mask = LLVMBuildAnd(builder, current_mask, z_pass, ""); + stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP, stencil_refs, stencil_vals, z_pass_mask, front_facing); } @@ -752,31 +1111,34 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder, /* No depth test: apply Z-pass operator to stencil buffer values which * passed the stencil test. */ - s_pass_mask = LLVMBuildAnd(bld.builder, orig_mask, s_pass_mask, ""); - stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_PASS_OP, + s_pass_mask = LLVMBuildAnd(builder, current_mask, s_pass_mask, ""); + stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP, stencil_refs, stencil_vals, s_pass_mask, front_facing); } - /* The Z bits are already in the right place but we may need to shift the - * stencil bits before ORing Z with Stencil to make the final pixel value. - */ + /* Put Z and stencil bits in the right place */ + if (have_z && z_shift) { + LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_type, z_shift); + z_dst = LLVMBuildShl(builder, z_dst, shift, ""); + } if (stencil_vals && stencil_shift) - stencil_vals = LLVMBuildShl(bld.builder, stencil_vals, + stencil_vals = LLVMBuildShl(builder, stencil_vals, stencil_shift, ""); - /* Finally, merge/store the z/stencil values */ - if ((depth->enabled && depth->writemask) || - (stencil[0].enabled && stencil[0].writemask)) { - - if (z_dst && stencil_vals) - zs_dst = LLVMBuildOr(bld.builder, z_dst, stencil_vals, ""); - else if (z_dst) - zs_dst = z_dst; + /* Finally, merge the z/stencil values */ + if (format_desc->block.bits <= 32) { + if (have_z && have_s) + *z_value = LLVMBuildOr(builder, z_dst, stencil_vals, ""); + else if (have_z) + *z_value = z_dst; else - zs_dst = stencil_vals; - - *zs_value = zs_dst; + *z_value = stencil_vals; + *s_value = *z_value; + } + else { + *z_value = z_dst; + *s_value = stencil_vals; } if (s_pass_mask) @@ -784,33 +1146,5 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder, if (depth->enabled && stencil[0].enabled) lp_build_mask_update(mask, z_pass); - - if (do_branch) - lp_build_mask_check(mask); - } - - -void -lp_build_deferred_depth_write(LLVMBuilderRef builder, - struct lp_type z_src_type, - const struct util_format_description *format_desc, - struct lp_build_mask_context *mask, - LLVMValueRef zs_dst_ptr, - LLVMValueRef zs_value) -{ - struct lp_type type; - struct lp_build_context bld; - LLVMValueRef z_dst; - - /* XXX: pointlessly redo type logic: - */ - type = lp_depth_type(format_desc, z_src_type.width*z_src_type.length); - lp_build_context_init(&bld, builder, type); - - z_dst = LLVMBuildLoad(builder, zs_dst_ptr, "zsbufval"); - z_dst = lp_build_select(&bld, lp_build_mask_value(mask), zs_value, z_dst); - - LLVMBuildStore(builder, z_dst, zs_dst_ptr); -}