- shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(type, 16), "");
- shift = LLVMBuildAdd(builder, shift, lp_build_const_int_vec(type, 8), "");
- *y = LLVMBuildLShr(builder, packed, shift, "");
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+ /*
+ * Avoid a shift with a per-element count.
+ * There is no support for it on x86, so it gets translated to roughly
+ * 5 instructions per element. Performance was not measured, but this
+ * cuts shader size by quite a bit (the difference is smaller if the
+ * CPU has no SSE4.1 support).
+ */
+ if (util_cpu_caps.has_sse2 && n > 1) {
+ LLVMValueRef sel, tmp, tmp2;
+ struct lp_build_context bld32;
+
+ lp_build_context_init(&bld32, gallivm, type);
+
+ tmp = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(gallivm, type, 8), "");
+ tmp2 = LLVMBuildLShr(builder, tmp, lp_build_const_int_vec(gallivm, type, 16), "");
+ sel = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL, i, lp_build_const_int_vec(gallivm, type, 0));
+ *y = lp_build_select(&bld32, sel, tmp, tmp2);
+ } else
+#endif
+ {
+ LLVMValueRef shift;
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+ shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(gallivm, type, 16), "");
+ shift = LLVMBuildAdd(builder, shift, lp_build_const_int_vec(gallivm, type, 8), "");
+#else
+ shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(gallivm, type, -16), "");
+ shift = LLVMBuildAdd(builder, shift, lp_build_const_int_vec(gallivm, type, 16), "");
+#endif
+ *y = LLVMBuildLShr(builder, packed, shift, "");
+ }
+
+#ifdef PIPE_ARCH_LITTLE_ENDIAN