+
+   /* Special case 4x4f --> 1x16ub
+    *
+    * Four vectors of four signed, unnormalized 32-bit floats are converted
+    * into one vector of sixteen unsigned, normalized 8-bit integers: each
+    * source vector is multiplied by 255, rounded to 32-bit integers, and
+    * narrowed 32 -> 16 -> 8 bits with lp_build_pack2().  Only taken when
+    * SSE2 is available.
+    */
+   if (src_type.floating == 1 &&
+       src_type.fixed == 0 &&
+       src_type.sign == 1 &&
+       src_type.norm == 0 &&
+       src_type.width == 32 &&
+       src_type.length == 4 &&
+
+       dst_type.floating == 0 &&
+       dst_type.fixed == 0 &&
+       dst_type.sign == 0 &&
+       dst_type.norm == 1 &&
+       dst_type.width == 8 &&
+       dst_type.length == 16 &&
+
+       util_cpu_caps.has_sse2)
+   {
+      struct lp_build_context bld;
+      struct lp_type int16_type = dst_type;
+      struct lp_type int32_type = dst_type;
+      LLVMValueRef const_255f;
+      int i;
+
+      /* Signed intermediates for the two packing steps: 8 x 16 bits ... */
+      int16_type.width *= 2;
+      int16_type.length /= 2;
+      int16_type.sign = 1;
+
+      /* ... and 4 x 32 bits, matching the source vector layout. */
+      int32_type.width *= 4;
+      int32_type.length /= 4;
+      int32_type.sign = 1;
+
+      /* Everything below is loop-invariant, so build it once up front. */
+      const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);
+
+      /* Float context for lp_build_iround(); only the members assigned here
+       * are initialized.
+       */
+      bld.gallivm = gallivm;
+      bld.type = src_type;
+      bld.vec_type = lp_build_vec_type(gallivm, src_type);
+      bld.int_elem_type = lp_build_elem_type(gallivm, int32_type);
+      bld.int_vec_type = lp_build_vec_type(gallivm, int32_type);
+      bld.undef = lp_build_undef(gallivm, src_type);
+      bld.zero = lp_build_zero(gallivm, src_type);
+      bld.one = lp_build_one(gallivm, src_type);
+
+      /* Each destination vector consumes four consecutive source vectors. */
+      for (i = 0; i < num_dsts; i++, src += 4) {
+         LLVMValueRef scaled[4];
+         LLVMValueRef rounded[4];
+         LLVMValueRef lo, hi;
+         int j;
+
+         /* Emit all four multiplies first, then all four roundings. */
+         for (j = 0; j < 4; j++) {
+            scaled[j] = LLVMBuildFMul(builder, src[j], const_255f, "");
+         }
+
+         for (j = 0; j < 4; j++) {
+            rounded[j] = lp_build_iround(&bld, scaled[j]);
+         }
+
+         /* relying on clamping behavior of sse2 intrinsics here */
+         lo = lp_build_pack2(gallivm, int32_type, int16_type, rounded[0], rounded[1]);
+         hi = lp_build_pack2(gallivm, int32_type, int16_type, rounded[2], rounded[3]);
+         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
+      }
+
+      return;
+   }
+