+/*
+ * Untwiddle and transpose, much like the above.
+ * However, this is after conversion, so we get packed vectors.
+ * At this time only handle 4x16i8 rgba / 2x16i8 rg / 1x16i8 r data,
+ * the vectors will look like:
+ * r0r1r4r5r2r3r6r7r8r9r12... (albeit color channels may
+ * be swizzled here). Extending to 16bit should be trivial.
+ * Should also be extended to handle twice wide vectors with AVX2...
+ */
+static void
+fs_twiddle_transpose(struct gallivm_state *gallivm,
+ struct lp_type type,
+ LLVMValueRef *src,
+ unsigned src_count,
+ LLVMValueRef *dst)
+{
+ unsigned i, j;
+ struct lp_type type64, type16, type32;
+ LLVMTypeRef type64_t, type8_t, type16_t, type32_t;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef tmp[4], shuf[8];
+ for (j = 0; j < 2; j++) {
+ shuf[j*4 + 0] = lp_build_const_int32(gallivm, j*4 + 0);
+ shuf[j*4 + 1] = lp_build_const_int32(gallivm, j*4 + 2);
+ shuf[j*4 + 2] = lp_build_const_int32(gallivm, j*4 + 1);
+ shuf[j*4 + 3] = lp_build_const_int32(gallivm, j*4 + 3);
+ }
+
+ assert(src_count == 4 || src_count == 2 || src_count == 1);
+ assert(type.width == 8);
+ assert(type.length == 16);
+
+ type8_t = lp_build_vec_type(gallivm, type);
+
+ type64 = type;
+ type64.length /= 8;
+ type64.width *= 8;
+ type64_t = lp_build_vec_type(gallivm, type64);
+
+ type16 = type;
+ type16.length /= 2;
+ type16.width *= 2;
+ type16_t = lp_build_vec_type(gallivm, type16);
+
+ type32 = type;
+ type32.length /= 4;
+ type32.width *= 4;
+ type32_t = lp_build_vec_type(gallivm, type32);
+
+ lp_build_transpose_aos_n(gallivm, type, src, src_count, tmp);
+
+ if (src_count == 1) {
+ /* transpose was no-op, just untwiddle */
+ LLVMValueRef shuf_vec;
+ shuf_vec = LLVMConstVector(shuf, 8);
+ tmp[0] = LLVMBuildBitCast(builder, src[0], type16_t, "");
+ tmp[0] = LLVMBuildShuffleVector(builder, tmp[0], tmp[0], shuf_vec, "");
+ dst[0] = LLVMBuildBitCast(builder, tmp[0], type8_t, "");
+ } else if (src_count == 2) {
+ LLVMValueRef shuf_vec;
+ shuf_vec = LLVMConstVector(shuf, 4);
+
+ for (i = 0; i < 2; i++) {
+ tmp[i] = LLVMBuildBitCast(builder, tmp[i], type32_t, "");
+ tmp[i] = LLVMBuildShuffleVector(builder, tmp[i], tmp[i], shuf_vec, "");
+ dst[i] = LLVMBuildBitCast(builder, tmp[i], type8_t, "");
+ }
+ } else {
+ for (j = 0; j < 2; j++) {
+ LLVMValueRef lo, hi, lo2, hi2;
+ /*
+ * Note that if we only really have 3 valid channels (rgb)
+ * and we don't need alpha we could substitute a undef here
+ * for the respective channel (causing llvm to drop conversion
+ * for alpha).
+ */
+ /* we now have rgba0rgba1rgba4rgba5 etc, untwiddle */
+ lo2 = LLVMBuildBitCast(builder, tmp[j*2], type64_t, "");
+ hi2 = LLVMBuildBitCast(builder, tmp[j*2 + 1], type64_t, "");
+ lo = lp_build_interleave2(gallivm, type64, lo2, hi2, 0);
+ hi = lp_build_interleave2(gallivm, type64, lo2, hi2, 1);
+ dst[j*2] = LLVMBuildBitCast(builder, lo, type8_t, "");
+ dst[j*2 + 1] = LLVMBuildBitCast(builder, hi, type8_t, "");
+ }
+ }
+}
+
+