+/**
+ * Perform the occlusion test and increase the counter.
+ * Test the depth mask. Add the number of channels which have a non-zero mask
+ * into the occlusion counter. e.g. maskvalue is {-1, -1, -1, -1}.
+ * The counter will add 4.
+ * TODO: could get that out of the fs loop.
+ *
+ * One of three instruction sequences is emitted, chosen from the host cpu
+ * caps and the vector length: the "llvm.x86.sse.movmsk.ps" intrinsic plus a
+ * 32-bit popcount for 4-wide vectors with SSE, "llvm.x86.avx.movmsk.ps.256"
+ * plus a 32-bit popcount for 8-wide vectors with AVX, and a generic
+ * and/shuffle/popcount sequence otherwise. In every case the count ends up
+ * as a 64-bit integer which is added to the value loaded from the counter
+ * pointer and stored back.
+ *
+ * \param gallivm provides the LLVM builder and context the IR is emitted into.
+ * \param type holds element type of the mask vector (asserted to be floating
+ *             and at most 16 elements long).
+ * \param maskvalue is the depth test mask.
+ * \param counter is a pointer to the counter. The value added to it is always
+ *                an i64, so the pointee is presumably a 64-bit integer even
+ *                though this comment used to say uint32 - TODO confirm
+ *                against the callers.
+ */
+void
+lp_build_occlusion_count(struct gallivm_state *gallivm,
+ struct lp_type type,
+ LLVMValueRef maskvalue,
+ LLVMValueRef counter)
+{
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMContextRef context = gallivm->context;
+ LLVMValueRef countmask = lp_build_const_int_vec(gallivm, type, 1); /* only used by the generic path */
+ LLVMValueRef count, newcount;
+
+ assert(type.length <= 16); /* the generic path fills shuffles[16] */
+ assert(type.floating);
+
+ if(util_cpu_caps.has_sse && type.length == 4) { /* 4-wide: movmsk to an i32, then popcount */
+ const char *movmskintr = "llvm.x86.sse.movmsk.ps";
+ const char *popcntintr = "llvm.ctpop.i32";
+ LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue,
+ lp_build_vec_type(gallivm, type), ""); /* view the mask as a vector of `type` for the intrinsic */
+ bits = lp_build_intrinsic_unary(builder, movmskintr,
+ LLVMInt32TypeInContext(context), bits);
+ count = lp_build_intrinsic_unary(builder, popcntintr,
+ LLVMInt32TypeInContext(context), bits);
+ count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), ""); /* widen i32 count to i64 */
+ }
+ else if(util_cpu_caps.has_avx && type.length == 8) { /* 8-wide: same sequence with the 256-bit movmsk */
+ const char *movmskintr = "llvm.x86.avx.movmsk.ps.256";
+ const char *popcntintr = "llvm.ctpop.i32";
+ LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue,
+ lp_build_vec_type(gallivm, type), "");
+ bits = lp_build_intrinsic_unary(builder, movmskintr,
+ LLVMInt32TypeInContext(context), bits);
+ count = lp_build_intrinsic_unary(builder, popcntintr,
+ LLVMInt32TypeInContext(context), bits);
+ count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), ""); /* widen i32 count to i64 */
+ }
+ else { /* generic path: and with 1, gather one byte per element, popcount the packed integer */
+ unsigned i;
+ LLVMValueRef countv = LLVMBuildAnd(builder, maskvalue, countmask, "countv"); /* and the mask with countmask (built from the constant 1) */
+ LLVMTypeRef counttype = LLVMIntTypeInContext(context, type.length * 8); /* one byte per element, as a single integer */
+ LLVMTypeRef i8vntype = LLVMVectorType(LLVMInt8TypeInContext(context), type.length * 4); /* NOTE(review): 4 bytes per element, i.e. assumes 32-bit elements - not asserted here */
+ LLVMValueRef shufflev, countd;
+ LLVMValueRef shuffles[16];
+ const char *popcntintr = NULL;
+
+ countv = LLVMBuildBitCast(builder, countv, i8vntype, "");
+
+ for (i = 0; i < type.length; i++) { /* select byte 4*i: one byte out of every 4-byte group */
+ shuffles[i] = lp_build_const_int32(gallivm, 4*i);
+ }
+
+ shufflev = LLVMConstVector(shuffles, type.length);
+ countd = LLVMBuildShuffleVector(builder, countv, LLVMGetUndef(i8vntype), shufflev, "");
+ countd = LLVMBuildBitCast(builder, countd, counttype, "countd"); /* type.length bytes reinterpreted as one integer */
+
+ /*
+ * XXX FIXME
+ * this is bad on cpus without popcount (on x86 supported by intel
+ * nehalem, amd barcelona, and up - not tied to sse42).
+ * Would be much faster to just sum the 4 elements of the vector with
+ * some horizontal add (shuffle/add/shuffle/add after the initial and).
+ *
+ * The popcount intrinsic width must match counttype, i.e.
+ * type.length * 8 bits.
+ */
+ switch (type.length) {
+ case 4:
+ popcntintr = "llvm.ctpop.i32";
+ break;
+ case 8:
+ popcntintr = "llvm.ctpop.i64";
+ break;
+ case 16:
+ popcntintr = "llvm.ctpop.i128";
+ break;
+ default: /* NOTE(review): with asserts compiled out popcntintr stays NULL and is still passed on below */
+ assert(0);
+ }
+ count = lp_build_intrinsic_unary(builder, popcntintr, counttype, countd);
+
+ if (type.length > 8) { /* result wider than 64 bits (i128 for 16-wide): narrow to i64 */
+ count = LLVMBuildTrunc(builder, count, LLVMIntTypeInContext(context, 64), "");
+ }
+ else if (type.length < 8) { /* result narrower than 64 bits: widen; length 8 is already i64 */
+ count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), "");
+ }
+ }
+ newcount = LLVMBuildLoad(builder, counter, "origcount"); /* read-modify-write of the counter */
+ newcount = LLVMBuildAdd(builder, newcount, count, "newcount");
+ LLVMBuildStore(builder, newcount, counter);
+}
+
+
+/**
+ * Load depth/stencil values.
+ * The stored values are linear, swizzle them.
+ *
+ * Two half-length vectors are loaded, the second one depth_stride past the
+ * first (it is left undefined for 1d resources), and combined with a shuffle
+ * into one vector of zs_type.length elements. For formats narrower than the
+ * fragment type the z values are zero-extended; for formats wider than 32
+ * bits each element is split into separate z and s vectors.
+ *
+ * \param gallivm the gallivm state providing the LLVM builder
+ * \param z_src_type the data type of the fragment depth/stencil values
+ * \param format_desc description of the depth/stencil surface
+ * \param is_1d whether this resource has only one dimension
+ * \param depth_ptr pointer to the depth/stencil values of this 4x4 block
+ * \param depth_stride stride of the depth/stencil buffer
+ * \param z_fb output: z values loaded from fb (may include padding)
+ * \param s_fb output: s values loaded from fb (may include padding)
+ * \param loop_counter the current loop iteration
+ */
+void
+lp_build_depth_stencil_load_swizzled(struct gallivm_state *gallivm,
+ struct lp_type z_src_type,
+ const struct util_format_description *format_desc,
+ boolean is_1d,
+ LLVMValueRef depth_ptr,
+ LLVMValueRef depth_stride,
+ LLVMValueRef *z_fb,
+ LLVMValueRef *s_fb,
+ LLVMValueRef loop_counter)
+{
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4]; /* up to 8 entries are written below - assumes the array is at least that big, TODO confirm */
+ LLVMValueRef zs_dst1, zs_dst2;
+ LLVMValueRef zs_dst_ptr;
+ LLVMValueRef depth_offset1, depth_offset2;
+ LLVMTypeRef load_ptr_type;
+ unsigned depth_bytes = format_desc->block.bits / 8;
+ struct lp_type zs_type = lp_depth_type(format_desc, z_src_type.length);
+ struct lp_type zs_load_type = zs_type;
+
+ zs_load_type.length = zs_load_type.length / 2; /* each of the two loads fetches half a vector */
+ load_ptr_type = LLVMPointerType(lp_build_vec_type(gallivm, zs_load_type), 0);
+
+ if (z_src_type.length == 4) {
+ unsigned i;
+ LLVMValueRef looplsb = LLVMBuildAnd(builder, loop_counter,
+ lp_build_const_int32(gallivm, 1), ""); /* 0 or 1 */
+ LLVMValueRef loopmsb = LLVMBuildAnd(builder, loop_counter,
+ lp_build_const_int32(gallivm, 2), ""); /* 0 or 2 */
+ LLVMValueRef offset2 = LLVMBuildMul(builder, loopmsb,
+ depth_stride, ""); /* 0 or 2 * depth_stride */
+ depth_offset1 = LLVMBuildMul(builder, looplsb,
+ lp_build_const_int32(gallivm, depth_bytes * 2), ""); /* 0 or the size of two values */
+ depth_offset1 = LLVMBuildAdd(builder, depth_offset1, offset2, "");
+
+ /* just concatenate the loaded 2x2 values into 4-wide vector */
+ for (i = 0; i < 4; i++) {
+ shuffles[i] = lp_build_const_int32(gallivm, i);
+ }
+ }
+ else {
+ unsigned i;
+ LLVMValueRef loopx2 = LLVMBuildShl(builder, loop_counter,
+ lp_build_const_int32(gallivm, 1), ""); /* loop_counter * 2 */
+ assert(z_src_type.length == 8);
+ depth_offset1 = LLVMBuildMul(builder, loopx2, depth_stride, ""); /* advance two strides per iteration */
+ /*
+ * We load 2x4 values, and need to swizzle them (order
+ * 0,1,4,5,2,3,6,7) - not so hot with avx unfortunately.
+ * The index expression below evaluates to exactly that sequence
+ * for i = 0..7.
+ */
+ for (i = 0; i < 8; i++) {
+ shuffles[i] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2);
+ }
+ }
+
+ depth_offset2 = LLVMBuildAdd(builder, depth_offset1, depth_stride, ""); /* second load starts depth_stride past the first */
+
+ /*
+ * Load current z/stencil values from z/stencil buffer
+ * NOTE(review): the offsets are computed from depth_bytes and
+ * depth_stride and applied before the bitcast, which presumably means
+ * depth_ptr is a byte pointer - confirm against the callers.
+ */
+ zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, &depth_offset1, 1, "");
+ zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, "");
+ zs_dst1 = LLVMBuildLoad(builder, zs_dst_ptr, "");
+ if (is_1d) { /* no second load for 1d resources: use an undefined vector instead */
+ zs_dst2 = lp_build_undef(gallivm, zs_load_type);
+ }
+ else {
+ zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, &depth_offset2, 1, "");
+ zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, "");
+ zs_dst2 = LLVMBuildLoad(builder, zs_dst_ptr, "");
+ }
+
+ *z_fb = LLVMBuildShuffleVector(builder, zs_dst1, zs_dst2,
+ LLVMConstVector(shuffles, zs_type.length), ""); /* combine both halves in swizzled order */
+ *s_fb = *z_fb; /* both outputs start out as the same value */
+
+ if (format_desc->block.bits < z_src_type.width) {
+ /* Extend destination ZS values (e.g., when reading from Z16_UNORM) */
+ *z_fb = LLVMBuildZExt(builder, *z_fb,
+ lp_build_int_vec_type(gallivm, z_src_type), "");
+ }
+
+ else if (format_desc->block.bits > 32) {
+ /*
+ * rely on llvm to handle too wide vector we have here nicely
+ * Each element is viewed as two half-width elements; the even ones
+ * become the z vector and the odd ones the s vector.
+ */
+ unsigned i;
+ struct lp_type typex2 = zs_type;
+ struct lp_type s_type = zs_type;
+ LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 4];
+ LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 4];
+ LLVMValueRef tmp;
+
+ typex2.width = typex2.width / 2; /* same bits, twice as many elements of half the width */
+ typex2.length = typex2.length * 2;
+ s_type.width = s_type.width / 2; /* stencil vector: half width, non-floating */
+ s_type.floating = 0;
+
+ tmp = LLVMBuildBitCast(builder, *z_fb,
+ lp_build_vec_type(gallivm, typex2), "");
+
+ for (i = 0; i < zs_type.length; i++) {
+ shuffles1[i] = lp_build_const_int32(gallivm, i * 2); /* even half-elements */
+ shuffles2[i] = lp_build_const_int32(gallivm, i * 2 + 1); /* odd half-elements */
+ }
+ *z_fb = LLVMBuildShuffleVector(builder, tmp, tmp,
+ LLVMConstVector(shuffles1, zs_type.length), "");
+ *s_fb = LLVMBuildShuffleVector(builder, tmp, tmp,
+ LLVMConstVector(shuffles2, zs_type.length), "");
+ *s_fb = LLVMBuildBitCast(builder, *s_fb,
+ lp_build_vec_type(gallivm, s_type), "");
+ lp_build_name(*s_fb, "s_dst");
+ }
+
+ lp_build_name(*z_fb, "z_dst"); /* NOTE(review): the naming calls here partly repeat each other (and the
+ * one in the branch above). Their order matters when neither branch ran:
+ * *z_fb and *s_fb are then the same value, and the final call makes
+ * "z_dst" the last name applied to it. */
+ lp_build_name(*s_fb, "s_dst");
+ lp_build_name(*z_fb, "z_dst");
+}
+
+/**
+ * Store depth/stencil values.
+ * Incoming values are swizzled (typically n 2x2 quads), stored linear.
+ * If there's a mask it will do select/store otherwise just store.
+ *
+ * The values are split into two half-length vectors. The first is always
+ * stored; the second is stored depth_stride past the first unless the
+ * resource is 1d. For formats wider than 32 bits the z and s values are
+ * interleaved before the store.
+ *
+ * \param gallivm the gallivm state providing the LLVM builder
+ * \param z_src_type the data type of the fragment depth/stencil values
+ * \param format_desc description of the depth/stencil surface
+ * \param is_1d whether this resource has only one dimension
+ * \param mask the alive/dead pixel mask for the quad (vector); may be NULL,
+ *             in which case no select against the fb values is emitted
+ * \param z_fb z values read from fb (with padding)
+ * \param s_fb s values read from fb (with padding); only used for formats
+ *             wider than 32 bits
+ * \param loop_counter the current loop iteration
+ * \param depth_ptr pointer to the depth/stencil values of this 4x4 block
+ * \param depth_stride stride of the depth/stencil buffer
+ * \param z_value the depth values to store (with padding)
+ * \param s_value the stencil values to store (with padding); only used for
+ *                formats wider than 32 bits
+ */
+void
+lp_build_depth_stencil_write_swizzled(struct gallivm_state *gallivm,
+ struct lp_type z_src_type,
+ const struct util_format_description *format_desc,
+ boolean is_1d,
+ struct lp_build_mask_context *mask,
+ LLVMValueRef z_fb,
+ LLVMValueRef s_fb,
+ LLVMValueRef loop_counter,
+ LLVMValueRef depth_ptr,
+ LLVMValueRef depth_stride,
+ LLVMValueRef z_value,
+ LLVMValueRef s_value)
+{
+ struct lp_build_context z_bld;
+ LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4]; /* only filled and used on the 8-wide path */
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef mask_value = NULL;
+ LLVMValueRef zs_dst1, zs_dst2;
+ LLVMValueRef zs_dst_ptr1, zs_dst_ptr2;
+ LLVMValueRef depth_offset1, depth_offset2;
+ LLVMTypeRef load_ptr_type;
+ unsigned depth_bytes = format_desc->block.bits / 8;
+ struct lp_type zs_type = lp_depth_type(format_desc, z_src_type.length);
+ struct lp_type z_type = zs_type;
+ struct lp_type zs_load_type = zs_type;
+
+ zs_load_type.length = zs_load_type.length / 2; /* each of the two stores writes half a vector */
+ load_ptr_type = LLVMPointerType(lp_build_vec_type(gallivm, zs_load_type), 0);
+
+ z_type.width = z_src_type.width; /* selects below operate at the fragment value width */
+
+ lp_build_context_init(&z_bld, gallivm, z_type);
+
+ /*
+ * This is far from ideal, at least for late depth write we should do this
+ * outside the fs loop to avoid all the swizzle stuff.
+ *
+ * 4-wide: loop_counter bit 0 adds the size of two values, bit 1 adds
+ * 2 * depth_stride. 8-wide: each iteration advances 2 * depth_stride.
+ */
+ if (z_src_type.length == 4) {
+ LLVMValueRef looplsb = LLVMBuildAnd(builder, loop_counter,
+ lp_build_const_int32(gallivm, 1), ""); /* 0 or 1 */
+ LLVMValueRef loopmsb = LLVMBuildAnd(builder, loop_counter,
+ lp_build_const_int32(gallivm, 2), ""); /* 0 or 2 */
+ LLVMValueRef offset2 = LLVMBuildMul(builder, loopmsb,
+ depth_stride, "");
+ depth_offset1 = LLVMBuildMul(builder, looplsb,
+ lp_build_const_int32(gallivm, depth_bytes * 2), "");
+ depth_offset1 = LLVMBuildAdd(builder, depth_offset1, offset2, "");
+ }
+ else {
+ unsigned i;
+ LLVMValueRef loopx2 = LLVMBuildShl(builder, loop_counter,
+ lp_build_const_int32(gallivm, 1), ""); /* loop_counter * 2 */
+ assert(z_src_type.length == 8);
+ depth_offset1 = LLVMBuildMul(builder, loopx2, depth_stride, "");
+ /*
+ * We load 2x4 values, and need to swizzle them (order
+ * 0,1,4,5,2,3,6,7) - not so hot with avx unfortunately.
+ * The index expression below evaluates to exactly that sequence
+ * for i = 0..7.
+ */
+ for (i = 0; i < 8; i++) {
+ shuffles[i] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2);
+ }
+ }
+
+ depth_offset2 = LLVMBuildAdd(builder, depth_offset1, depth_stride, ""); /* second store goes depth_stride past the first */
+
+ zs_dst_ptr1 = LLVMBuildGEP(builder, depth_ptr, &depth_offset1, 1, "");
+ zs_dst_ptr1 = LLVMBuildBitCast(builder, zs_dst_ptr1, load_ptr_type, "");
+ zs_dst_ptr2 = LLVMBuildGEP(builder, depth_ptr, &depth_offset2, 1, ""); /* computed even for 1d; only the store is skipped */
+ zs_dst_ptr2 = LLVMBuildBitCast(builder, zs_dst_ptr2, load_ptr_type, "");
+
+ if (format_desc->block.bits > 32) { /* bring s_value to the same vector type as z_value */
+ s_value = LLVMBuildBitCast(builder, s_value, z_bld.vec_type, "");
+ }
+
+ if (mask) { /* where the mask is not set, keep the values read from the fb */
+ mask_value = lp_build_mask_value(mask);
+ z_value = lp_build_select(&z_bld, mask_value, z_value, z_fb);
+ if (format_desc->block.bits > 32) {
+ s_fb = LLVMBuildBitCast(builder, s_fb, z_bld.vec_type, "");
+ s_value = lp_build_select(&z_bld, mask_value, s_value, s_fb);
+ }
+ }
+
+ if (zs_type.width < z_src_type.width) {
+ /* Truncate ZS values (e.g., when writing to Z16_UNORM) */
+ z_value = LLVMBuildTrunc(builder, z_value,
+ lp_build_int_vec_type(gallivm, zs_type), "");
+ }
+
+ if (format_desc->block.bits <= 32) { /* only z_value is written: just split it in two halves */
+ if (z_src_type.length == 4) {
+ zs_dst1 = lp_build_extract_range(gallivm, z_value, 0, 2); /* elements 0..1 */
+ zs_dst2 = lp_build_extract_range(gallivm, z_value, 2, 2); /* elements 2..3 */
+ }
+ else {
+ assert(z_src_type.length == 8);
+ zs_dst1 = LLVMBuildShuffleVector(builder, z_value, z_value,
+ LLVMConstVector(&shuffles[0],
+ zs_load_type.length), ""); /* indices starting at shuffles[0]: 0,1,4,5,... */
+ zs_dst2 = LLVMBuildShuffleVector(builder, z_value, z_value,
+ LLVMConstVector(&shuffles[4],
+ zs_load_type.length), ""); /* indices starting at shuffles[4]: 2,3,6,7 */
+ }
+ }
+ else { /* wide formats: z and s elements are interleaved pairwise */
+ if (z_src_type.length == 4) {
+ zs_dst1 = lp_build_interleave2(gallivm, z_type,
+ z_value, s_value, 0); /* presumably the low half of the interleave - TODO confirm */
+ zs_dst2 = lp_build_interleave2(gallivm, z_type,
+ z_value, s_value, 1); /* presumably the high half of the interleave - TODO confirm */
+ }
+ else {
+ unsigned i;
+ LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 2]; /* NOTE(review): shadows the function-scope shuffles array */
+ assert(z_src_type.length == 8);
+ for (i = 0; i < 8; i++) { /* entry pairs: swizzled index, then the same index + z_src_type.length */
+ shuffles[i*2] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2);
+ shuffles[i*2+1] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2 +
+ z_src_type.length); /* presumably the matching lane of s_value, the second shuffle operand - confirm z_value is z_src_type.length wide */
+ }
+ zs_dst1 = LLVMBuildShuffleVector(builder, z_value, s_value,
+ LLVMConstVector(&shuffles[0],
+ z_src_type.length), ""); /* first 8 indices */
+ zs_dst2 = LLVMBuildShuffleVector(builder, z_value, s_value,
+ LLVMConstVector(&shuffles[8],
+ z_src_type.length), ""); /* last 8 indices */
+ }
+ zs_dst1 = LLVMBuildBitCast(builder, zs_dst1,
+ lp_build_vec_type(gallivm, zs_load_type), ""); /* reinterpret as the vector type that is stored */
+ zs_dst2 = LLVMBuildBitCast(builder, zs_dst2,
+ lp_build_vec_type(gallivm, zs_load_type), "");
+ }
+
+ LLVMBuildStore(builder, zs_dst1, zs_dst_ptr1);
+ if (!is_1d) { /* 1d resources get only the first store */
+ LLVMBuildStore(builder, zs_dst2, zs_dst_ptr2);
+ }
+}