+ LLVMValueRef count, newcount;
+
+ assert(type.length <= 16);
+ assert(type.floating);
+
+ if(util_cpu_caps.has_sse && type.length == 4) {
+ const char *movmskintr = "llvm.x86.sse.movmsk.ps";
+ const char *popcntintr = "llvm.ctpop.i32";
+ LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue,
+ lp_build_vec_type(gallivm, type), "");
+ bits = lp_build_intrinsic_unary(builder, movmskintr,
+ LLVMInt32TypeInContext(context), bits);
+ count = lp_build_intrinsic_unary(builder, popcntintr,
+ LLVMInt32TypeInContext(context), bits);
+ count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), "");
+ }
+ else if(util_cpu_caps.has_avx && type.length == 8) {
+ const char *movmskintr = "llvm.x86.avx.movmsk.ps.256";
+ const char *popcntintr = "llvm.ctpop.i32";
+ LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue,
+ lp_build_vec_type(gallivm, type), "");
+ bits = lp_build_intrinsic_unary(builder, movmskintr,
+ LLVMInt32TypeInContext(context), bits);
+ count = lp_build_intrinsic_unary(builder, popcntintr,
+ LLVMInt32TypeInContext(context), bits);
+ count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), "");
+ }
+ else {
+ unsigned i;
+ LLVMValueRef countv = LLVMBuildAnd(builder, maskvalue, countmask, "countv");
+ LLVMTypeRef counttype = LLVMIntTypeInContext(context, type.length * 8);
+ LLVMTypeRef i8vntype = LLVMVectorType(LLVMInt8TypeInContext(context), type.length * 4);
+ LLVMValueRef shufflev, countd;
+ LLVMValueRef shuffles[16];
+ const char *popcntintr = NULL;
+
+ countv = LLVMBuildBitCast(builder, countv, i8vntype, "");
+
+ for (i = 0; i < type.length; i++) {
+ shuffles[i] = lp_build_const_int32(gallivm, 4*i);
+ }
+
+ shufflev = LLVMConstVector(shuffles, type.length);
+ countd = LLVMBuildShuffleVector(builder, countv, LLVMGetUndef(i8vntype), shufflev, "");
+ countd = LLVMBuildBitCast(builder, countd, counttype, "countd");
+
+ /*
+ * XXX FIXME
+ * this is bad on cpus without popcount (on x86 supported by intel
+ * nehalem, amd barcelona, and up - not tied to sse42).
+ * Would be much faster to just sum the 4 elements of the vector with
+ * some horizontal add (shuffle/add/shuffle/add after the initial and).
+ */
+ switch (type.length) {
+ case 4:
+ popcntintr = "llvm.ctpop.i32";
+ break;
+ case 8:
+ popcntintr = "llvm.ctpop.i64";
+ break;
+ case 16:
+ popcntintr = "llvm.ctpop.i128";
+ break;
+ default:
+ assert(0);
+ }
+ count = lp_build_intrinsic_unary(builder, popcntintr, counttype, countd);
+
+ if (type.length > 8) {
+ count = LLVMBuildTrunc(builder, count, LLVMIntTypeInContext(context, 64), "");
+ }
+ else if (type.length < 8) {
+ count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), "");
+ }
+ }
+ newcount = LLVMBuildLoad(builder, counter, "origcount");
+ newcount = LLVMBuildAdd(builder, newcount, count, "newcount");
+ LLVMBuildStore(builder, newcount, counter);
+}
+
+
+/**
+ * Load depth/stencil values.
+ * The stored values are linear, swizzle them.
+ *
+ * \param type the data type of the fragment depth/stencil values
+ * \param format_desc description of the depth/stencil surface
+ * \param is_1d whether this resource has only one dimension
+ * \param loop_counter the current loop iteration
+ * \param depth_ptr pointer to the depth/stencil values of this 4x4 block
+ * \param depth_stride stride of the depth/stencil buffer
+ * \param z_fb contains z values loaded from fb (may include padding)
+ * \param s_fb contains s values loaded from fb (may include padding)
+ */
+void
+lp_build_depth_stencil_load_swizzled(struct gallivm_state *gallivm,
+ struct lp_type z_src_type,
+ const struct util_format_description *format_desc,
+ boolean is_1d,
+ LLVMValueRef depth_ptr,
+ LLVMValueRef depth_stride,
+ LLVMValueRef *z_fb,
+ LLVMValueRef *s_fb,
+ LLVMValueRef loop_counter)
+{
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4];
+ LLVMValueRef zs_dst1, zs_dst2;
+ LLVMValueRef zs_dst_ptr;
+ LLVMValueRef depth_offset1, depth_offset2;
+ LLVMTypeRef load_ptr_type;
+ unsigned depth_bytes = format_desc->block.bits / 8;
+ struct lp_type zs_type = lp_depth_type(format_desc, z_src_type.length);
+ struct lp_type zs_load_type = zs_type;
+
+ zs_load_type.length = zs_load_type.length / 2;
+ load_ptr_type = LLVMPointerType(lp_build_vec_type(gallivm, zs_load_type), 0);
+
+ if (z_src_type.length == 4) {
+ unsigned i;
+ LLVMValueRef looplsb = LLVMBuildAnd(builder, loop_counter,
+ lp_build_const_int32(gallivm, 1), "");
+ LLVMValueRef loopmsb = LLVMBuildAnd(builder, loop_counter,
+ lp_build_const_int32(gallivm, 2), "");
+ LLVMValueRef offset2 = LLVMBuildMul(builder, loopmsb,
+ depth_stride, "");
+ depth_offset1 = LLVMBuildMul(builder, looplsb,
+ lp_build_const_int32(gallivm, depth_bytes * 2), "");
+ depth_offset1 = LLVMBuildAdd(builder, depth_offset1, offset2, "");
+
+ /* just concatenate the loaded 2x2 values into 4-wide vector */
+ for (i = 0; i < 4; i++) {
+ shuffles[i] = lp_build_const_int32(gallivm, i);
+ }
+ }
+ else {
+ unsigned i;
+ LLVMValueRef loopx2 = LLVMBuildShl(builder, loop_counter,
+ lp_build_const_int32(gallivm, 1), "");
+ assert(z_src_type.length == 8);
+ depth_offset1 = LLVMBuildMul(builder, loopx2, depth_stride, "");
+ /*
+ * We load 2x4 values, and need to swizzle them (order
+ * 0,1,4,5,2,3,6,7) - not so hot with avx unfortunately.
+ */
+ for (i = 0; i < 8; i++) {
+ shuffles[i] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2);
+ }
+ }
+
+ depth_offset2 = LLVMBuildAdd(builder, depth_offset1, depth_stride, "");
+
+ /* Load current z/stencil values from z/stencil buffer */
+ zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, &depth_offset1, 1, "");
+ zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, "");
+ zs_dst1 = LLVMBuildLoad(builder, zs_dst_ptr, "");
+ if (is_1d) {
+ zs_dst2 = lp_build_undef(gallivm, zs_load_type);
+ }
+ else {
+ zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, &depth_offset2, 1, "");
+ zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, "");
+ zs_dst2 = LLVMBuildLoad(builder, zs_dst_ptr, "");
+ }
+
+ *z_fb = LLVMBuildShuffleVector(builder, zs_dst1, zs_dst2,
+ LLVMConstVector(shuffles, zs_type.length), "");
+ *s_fb = *z_fb;
+
+ if (format_desc->block.bits < z_src_type.width) {
+ /* Extend destination ZS values (e.g., when reading from Z16_UNORM) */
+ *z_fb = LLVMBuildZExt(builder, *z_fb,
+ lp_build_int_vec_type(gallivm, z_src_type), "");
+ }
+
+ else if (format_desc->block.bits > 32) {
+ /* rely on llvm to handle too wide vector we have here nicely */
+ unsigned i;
+ struct lp_type typex2 = zs_type;
+ struct lp_type s_type = zs_type;
+ LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 4];
+ LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 4];
+ LLVMValueRef tmp;
+
+ typex2.width = typex2.width / 2;
+ typex2.length = typex2.length * 2;
+ s_type.width = s_type.width / 2;
+ s_type.floating = 0;
+
+ tmp = LLVMBuildBitCast(builder, *z_fb,
+ lp_build_vec_type(gallivm, typex2), "");
+
+ for (i = 0; i < zs_type.length; i++) {
+ shuffles1[i] = lp_build_const_int32(gallivm, i * 2);
+ shuffles2[i] = lp_build_const_int32(gallivm, i * 2 + 1);
+ }
+ *z_fb = LLVMBuildShuffleVector(builder, tmp, tmp,
+ LLVMConstVector(shuffles1, zs_type.length), "");
+ *s_fb = LLVMBuildShuffleVector(builder, tmp, tmp,
+ LLVMConstVector(shuffles2, zs_type.length), "");
+ *s_fb = LLVMBuildBitCast(builder, *s_fb,
+ lp_build_vec_type(gallivm, s_type), "");
+ lp_build_name(*s_fb, "s_dst");
+ }
+
+ lp_build_name(*z_fb, "z_dst");
+ lp_build_name(*s_fb, "s_dst");
+ lp_build_name(*z_fb, "z_dst");