assert(lp_check_value(type, a));
assert(lp_check_value(type, b));
- if(a == bld->zero)
+ if (a == bld->zero)
return b;
- if(b == bld->zero)
+ if (b == bld->zero)
return a;
- if(a == bld->undef || b == bld->undef)
+ if (a == bld->undef || b == bld->undef)
return bld->undef;
- if(bld->type.norm) {
+ if (type.norm) {
const char *intrinsic = NULL;
- if(a == bld->one || b == bld->one)
+ if (!type.sign && (a == bld->one || b == bld->one))
return bld->one;
if (!type.floating && !type.fixed) {
if (type.width * type.length == 128) {
- if(util_cpu_caps.has_sse2) {
- if(type.width == 8)
- intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
- if(type.width == 16)
- intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
+ if (util_cpu_caps.has_sse2) {
+ if (type.width == 8)
+ intrinsic = type.sign ? "llvm.x86.sse2.padds.b" :
+ HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.paddus.b" : NULL;
+ if (type.width == 16)
+ intrinsic = type.sign ? "llvm.x86.sse2.padds.w" :
+ HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.paddus.w" : NULL;
} else if (util_cpu_caps.has_altivec) {
- if(type.width == 8)
- intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
- if(type.width == 16)
- intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
+ if (type.width == 8)
+ intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
+ if (type.width == 16)
+ intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
}
}
if (type.width * type.length == 256) {
- if(util_cpu_caps.has_avx2) {
- if(type.width == 8)
- intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
- if(type.width == 16)
- intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
+ if (util_cpu_caps.has_avx2) {
+ if (type.width == 8)
+ intrinsic = type.sign ? "llvm.x86.avx2.padds.b" :
+ HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.paddus.b" : NULL;
+ if (type.width == 16)
+ intrinsic = type.sign ? "llvm.x86.avx2.padds.w" :
+ HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.paddus.w" : NULL;
}
}
}
LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
- } else {
- a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
}
}
if(bld->type.norm && (bld->type.floating || bld->type.fixed))
res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
+ if (type.norm && !type.floating && !type.fixed) {
+ if (!type.sign) {
+ /*
+ * Newer LLVM versions no longer support the intrinsics but recognize
+ * the pattern. Since auto-upgrade of intrinsics doesn't work for JIT
+ * code, it is important that we match the pattern LLVM uses (and pray
+ * LLVM doesn't change it - and hope they decide on the same pattern
+ * for all backends supporting it...).
+ * NOTE: cmp/select does sext/trunc of the mask. This does not seem to
+ * interfere with LLVM's ability to recognize the pattern, but it
+ * seems a bit brittle.
+ */
+ LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
+ res = lp_build_select(bld, overflowed,
+ LLVMConstAllOnes(bld->int_vec_type), res);
+ }
+ }
+
/* XXX clamp to floor of -1 or 0??? */
return res;
assert(lp_check_value(type, a));
assert(lp_check_value(type, b));
- if(b == bld->zero)
+ if (b == bld->zero)
return a;
- if(a == bld->undef || b == bld->undef)
+ if (a == bld->undef || b == bld->undef)
return bld->undef;
- if(a == b)
+ if (a == b)
return bld->zero;
- if(bld->type.norm) {
+ if (type.norm) {
const char *intrinsic = NULL;
- if(b == bld->one)
+ if (!type.sign && b == bld->one)
return bld->zero;
if (!type.floating && !type.fixed) {
if (type.width * type.length == 128) {
if (util_cpu_caps.has_sse2) {
- if(type.width == 8)
- intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
- if(type.width == 16)
- intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
+ if (type.width == 8)
+ intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" :
+ HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.psubus.b" : NULL;
+ if (type.width == 16)
+ intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" :
+ HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.psubus.w" : NULL;
} else if (util_cpu_caps.has_altivec) {
- if(type.width == 8)
- intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
- if(type.width == 16)
- intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
+ if (type.width == 8)
+ intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
+ if (type.width == 16)
+ intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
}
}
if (type.width * type.length == 256) {
if (util_cpu_caps.has_avx2) {
- if(type.width == 8)
- intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
- if(type.width == 16)
- intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
+ if (type.width == 8)
+ intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" :
+ HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.psubus.b" : NULL;
+ if (type.width == 16)
+ intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" :
+ HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.psubus.w" : NULL;
}
}
}
LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
} else {
- a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
+ /*
+ * This must match the LLVM pattern for saturated unsigned subtraction.
+ * (lp_build_max_simple actually does the job with its current
+ * definition, but we do it explicitly here.)
+ * NOTE: cmp/select does sext/trunc of the mask. This does not seem to
+ * interfere with LLVM's ability to recognize the pattern, but it
+ * seems a bit brittle.
+ */
+ LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
+ a = lp_build_select(bld, no_ov, a, b);
}
}
* @sa Michael Herf, The "double blend trick", May 2000,
* http://www.stereopsis.com/doubleblend.html
*/
-static LLVMValueRef
+LLVMValueRef
lp_build_mul_norm(struct gallivm_state *gallivm,
struct lp_type wide_type,
LLVMValueRef a, LLVMValueRef b)
if(b == 2 && bld->type.floating)
return lp_build_add(bld, a, a);
- if(util_is_power_of_two(b)) {
+ if(util_is_power_of_two_or_zero(b)) {
unsigned shift = ffs(b) - 1;
if(bld->type.floating) {
else if ((util_cpu_caps.has_altivec &&
(type.width == 32 && type.length == 4)))
return TRUE;
+ else if (util_cpu_caps.has_neon)
+ return TRUE;
return FALSE;
}
LLVMValueRef a,
enum lp_build_round_mode mode)
{
- if (util_cpu_caps.has_sse4_1) {
+ if (util_cpu_caps.has_sse4_1 || util_cpu_caps.has_neon) {
LLVMBuilderRef builder = bld->gallivm->builder;
const struct lp_type type = bld->type;
const char *intrinsic_root;
else {
LLVMValueRef half;
- half = lp_build_const_vec(bld->gallivm, type, 0.5);
+ half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0));
if (type.sign) {
LLVMTypeRef vec_type = bld->vec_type;