}
/**
- * Build shuffle vectors that match PACKxx instructions.
+ * Build shuffle vectors that match PACKxx (SSE) instructions or
+ * VPERM (Altivec).
*/
static LLVMValueRef
lp_build_const_pack_shuffle(struct gallivm_state *gallivm, unsigned n)
assert(n <= LP_MAX_VECTOR_LENGTH);
for(i = 0; i < n; ++i)
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
elems[i] = lp_build_const_int32(gallivm, 2*i);
+#else
+ elems[i] = lp_build_const_int32(gallivm, 2*i+1);
+#endif
return LLVMConstVector(elems, n);
}
return tmp[0];
}
+
+/**
+ * Combines vectors to reduce from num_srcs to num_dsts.
+ * Returns the number of src vectors concatenated in a single dst.
+ *
+ * num_dsts must be non-zero and num_srcs must be exactly divisible
+ * by num_dsts.
+ *
+ * e.g. For num_srcs = 4 and src = [x, y, z, w]
+ *      num_dsts = 1  dst = [xyzw]    return = 4
+ *      num_dsts = 2  dst = [xy, zw]  return = 2
+ */
+int
+lp_build_concat_n(struct gallivm_state *gallivm,
+                  struct lp_type src_type,
+                  LLVMValueRef *src,
+                  unsigned num_srcs,
+                  LLVMValueRef *dst,
+                  unsigned num_dsts)
+{
+   int size;
+   int i;
+
+   /* Validate the documented precondition BEFORE dividing: num_dsts must
+    * be non-zero and must evenly divide num_srcs.  (Asserting
+    * num_srcs % size instead would wrongly accept e.g. num_srcs = 6,
+    * num_dsts = 4, where size = 1 and two source vectors would silently
+    * be dropped.)
+    */
+   assert(num_dsts > 0);
+   assert(num_srcs >= num_dsts);
+   assert((num_srcs % num_dsts) == 0);
+
+   size = num_srcs / num_dsts;
+
+   /* Nothing to combine: pass the source vectors through unchanged. */
+   if (num_srcs == num_dsts) {
+      for (i = 0; i < num_dsts; ++i) {
+         dst[i] = src[i];
+      }
+      return 1;
+   }
+
+   /* Each dst is the concatenation of `size` consecutive src vectors. */
+   for (i = 0; i < num_dsts; ++i) {
+      dst[i] = lp_build_concat(gallivm, &src[i * size], src_type, size);
+   }
+
+   return size;
+}
+
+
/**
* Interleave vector elements.
*
- * Matches the PUNPCKLxx and PUNPCKHxx SSE instructions.
+ * Matches the PUNPCKLxx and PUNPCKHxx SSE instructions
+ * (but not for 256bit AVX vectors).
*/
LLVMValueRef
lp_build_interleave2(struct gallivm_state *gallivm,
{
LLVMValueRef shuffle;
+ if (type.length == 2 && type.width == 128 && util_cpu_caps.has_avx) {
+ /*
+ * XXX: This is a workaround for llvm code generation deficiency. Strangely
+ * enough, while this needs vinsertf128/vextractf128 instructions (hence
+ * a natural match when using 2x128bit vectors) the "normal" unpack shuffle
+ * generates code ranging from atrocious (llvm 3.1) to terrible (llvm 3.2, 3.3).
+ * So use some different shuffles instead (the exact shuffles don't seem to
+ * matter, as long as not using 128bit wide vectors, works with 8x32 or 4x64).
+ */
+ struct lp_type tmp_type = type;
+ LLVMValueRef srchalf[2], tmpdst;
+ tmp_type.length = 4;
+ tmp_type.width = 64;
+ a = LLVMBuildBitCast(gallivm->builder, a, lp_build_vec_type(gallivm, tmp_type), "");
+ b = LLVMBuildBitCast(gallivm->builder, b, lp_build_vec_type(gallivm, tmp_type), "");
+ srchalf[0] = lp_build_extract_range(gallivm, a, lo_hi * 2, 2);
+ srchalf[1] = lp_build_extract_range(gallivm, b, lo_hi * 2, 2);
+ tmp_type.length = 2;
+ tmpdst = lp_build_concat(gallivm, srchalf, tmp_type, 2);
+ return LLVMBuildBitCast(gallivm->builder, tmpdst, lp_build_vec_type(gallivm, type), "");
+ }
+
shuffle = lp_build_const_unpack_shuffle(gallivm, type.length, lo_hi);
return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
assert(src_type.length * 2 == dst_type.length);
/* Check for special cases first */
- if(util_cpu_caps.has_sse2 && src_type.width * src_type.length >= 128) {
+ if((util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec) &&
+ src_type.width * src_type.length >= 128) {
const char *intrinsic = NULL;
switch(src_type.width) {
case 32:
- if(dst_type.sign) {
- intrinsic = "llvm.x86.sse2.packssdw.128";
- }
- else {
- if (util_cpu_caps.has_sse4_1) {
- intrinsic = "llvm.x86.sse41.packusdw";
+ if (util_cpu_caps.has_sse2) {
+ if(dst_type.sign) {
+ intrinsic = "llvm.x86.sse2.packssdw.128";
+ }
+ else {
+ if (util_cpu_caps.has_sse4_1) {
+ intrinsic = "llvm.x86.sse41.packusdw";
#if HAVE_LLVM < 0x0207
- /* llvm < 2.7 has inconsistent signatures except for packusdw */
- intr_type = dst_type;
+ /* llvm < 2.7 has inconsistent signatures except for packusdw */
+ intr_type = dst_type;
#endif
- }
+ }
+ }
+ } else if (util_cpu_caps.has_altivec) {
+ if (dst_type.sign) {
+ intrinsic = "llvm.ppc.altivec.vpkswus";
+ } else {
+ intrinsic = "llvm.ppc.altivec.vpkuwus";
+ }
}
break;
case 16:
if (dst_type.sign) {
- intrinsic = "llvm.x86.sse2.packsswb.128";
- }
- else {
- intrinsic = "llvm.x86.sse2.packuswb.128";
+ if (util_cpu_caps.has_sse2) {
+ intrinsic = "llvm.x86.sse2.packsswb.128";
+ } else if (util_cpu_caps.has_altivec) {
+ intrinsic = "llvm.ppc.altivec.vpkshss";
+ }
+ } else {
+ if (util_cpu_caps.has_sse2) {
+ intrinsic = "llvm.x86.sse2.packuswb.128";
+ } else if (util_cpu_caps.has_altivec) {
+ intrinsic = "llvm.ppc.altivec.vpkshus";
+ }
}
break;
/* default uses generic shuffle below */