#include <float.h>
+#include <llvm/Config/llvm-config.h>
+
#include "util/u_memory.h"
#include "util/u_debug.h"
#include "util/u_math.h"
intrinsic = "llvm.ppc.altivec.vminfp";
intr_size = 128;
}
- } else if (HAVE_LLVM < 0x0309 &&
- util_cpu_caps.has_avx2 && type.length > 4) {
- intr_size = 256;
- switch (type.width) {
- case 8:
- intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b";
- break;
- case 16:
- intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w";
- break;
- case 32:
- intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d";
- break;
- }
- } else if (HAVE_LLVM < 0x0309 &&
- util_cpu_caps.has_sse2 && type.length >= 2) {
- intr_size = 128;
- if ((type.width == 8 || type.width == 16) &&
- (type.width * type.length <= 64) &&
- (gallivm_debug & GALLIVM_DEBUG_PERF)) {
- debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
- __FUNCTION__);
- }
- if (type.width == 8 && !type.sign) {
- intrinsic = "llvm.x86.sse2.pminu.b";
- }
- else if (type.width == 16 && type.sign) {
- intrinsic = "llvm.x86.sse2.pmins.w";
- }
- if (util_cpu_caps.has_sse4_1) {
- if (type.width == 8 && type.sign) {
- intrinsic = "llvm.x86.sse41.pminsb";
- }
- if (type.width == 16 && !type.sign) {
- intrinsic = "llvm.x86.sse41.pminuw";
- }
- if (type.width == 32 && !type.sign) {
- intrinsic = "llvm.x86.sse41.pminud";
- }
- if (type.width == 32 && type.sign) {
- intrinsic = "llvm.x86.sse41.pminsd";
- }
- }
} else if (util_cpu_caps.has_altivec) {
intr_size = 128;
if (type.width == 8) {
LLVMTypeRef type = LLVMTypeOf(a);
assert(type == LLVMTypeOf(b));
assert(type == LLVMTypeOf(c));
- if (HAVE_LLVM < 0x0304) {
- /* XXX: LLVM 3.3 does not breakdown llvm.fmuladd into mul+add when FMA is
- * not supported, and instead it falls-back to a C function.
- */
- return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, "");
- }
+
char intrinsic[32];
lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
LLVMValueRef args[] = { a, b, c };
intrinsic = "llvm.ppc.altivec.vmaxfp";
intr_size = 128;
}
- } else if (HAVE_LLVM < 0x0309 &&
- util_cpu_caps.has_avx2 && type.length > 4) {
- intr_size = 256;
- switch (type.width) {
- case 8:
- intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b";
- break;
- case 16:
- intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w";
- break;
- case 32:
- intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d";
- break;
- }
- } else if (HAVE_LLVM < 0x0309 &&
- util_cpu_caps.has_sse2 && type.length >= 2) {
- intr_size = 128;
- if ((type.width == 8 || type.width == 16) &&
- (type.width * type.length <= 64) &&
- (gallivm_debug & GALLIVM_DEBUG_PERF)) {
- debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
- __FUNCTION__);
- }
- if (type.width == 8 && !type.sign) {
- intrinsic = "llvm.x86.sse2.pmaxu.b";
- intr_size = 128;
- }
- else if (type.width == 16 && type.sign) {
- intrinsic = "llvm.x86.sse2.pmaxs.w";
- }
- if (util_cpu_caps.has_sse4_1) {
- if (type.width == 8 && type.sign) {
- intrinsic = "llvm.x86.sse41.pmaxsb";
- }
- if (type.width == 16 && !type.sign) {
- intrinsic = "llvm.x86.sse41.pmaxuw";
- }
- if (type.width == 32 && !type.sign) {
- intrinsic = "llvm.x86.sse41.pmaxud";
- }
- if (type.width == 32 && type.sign) {
- intrinsic = "llvm.x86.sse41.pmaxsd";
- }
- }
} else if (util_cpu_caps.has_altivec) {
intr_size = 128;
if (type.width == 8) {
return bld->one;
if (!type.floating && !type.fixed) {
+ if (LLVM_VERSION_MAJOR >= 8) {
+ char intrin[32];
+ intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat";
+ lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
+ return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
+ }
if (type.width * type.length == 128) {
if (util_cpu_caps.has_sse2) {
if (type.width == 8)
- intrinsic = type.sign ? "llvm.x86.sse2.padds.b" :
- HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.paddus.b" : NULL;
+ intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
if (type.width == 16)
- intrinsic = type.sign ? "llvm.x86.sse2.padds.w" :
- HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.paddus.w" : NULL;
+ intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
} else if (util_cpu_caps.has_altivec) {
if (type.width == 8)
intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
if (type.width * type.length == 256) {
if (util_cpu_caps.has_avx2) {
if (type.width == 8)
- intrinsic = type.sign ? "llvm.x86.avx2.padds.b" :
- HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.paddus.b" : NULL;
+ intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
if (type.width == 16)
- intrinsic = type.sign ? "llvm.x86.avx2.padds.w" :
- HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.paddus.w" : NULL;
+ intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
}
}
}
* NOTE: cmp/select does sext/trunc of the mask. Does not seem to
* interfere with llvm's ability to recognize the pattern but seems
* a bit brittle.
+ * NOTE: llvm 8+ always uses the generic (non arch specific) intrinsic.
*/
LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
res = lp_build_select(bld, overflowed,
return bld->zero;
if (!type.floating && !type.fixed) {
+ if (LLVM_VERSION_MAJOR >= 8) {
+ char intrin[32];
+ intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat";
+ lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
+ return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
+ }
if (type.width * type.length == 128) {
if (util_cpu_caps.has_sse2) {
if (type.width == 8)
- intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" :
- HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.psubus.b" : NULL;
+ intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
if (type.width == 16)
- intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" :
- HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.psubus.w" : NULL;
+ intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
} else if (util_cpu_caps.has_altivec) {
if (type.width == 8)
intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
if (type.width * type.length == 256) {
if (util_cpu_caps.has_avx2) {
if (type.width == 8)
- intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" :
- HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.psubus.b" : NULL;
+ intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
if (type.width == 16)
- intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" :
- HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.psubus.w" : NULL;
+ intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
}
}
}
* NOTE: cmp/select does sext/trunc of the mask. Does not seem to
* interfere with llvm's ability to recognize the pattern but seems
* a bit brittle.
+ * NOTE: llvm 8+ always uses the generic (non arch specific) intrinsic.
*/
LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
a = lp_build_select(bld, no_ov, a, b);
* https://llvm.org/bugs/show_bug.cgi?id=30845
* So, whip up our own code, albeit only for length 4 and 8 (which
* should be good enough)...
+ * FIXME: For llvm >= 7.0 we should match the autoupgrade pattern
+ * (bitcast/and/mul/shuffle for unsigned, bitcast/shl/ashr/mul/shuffle
+ * for signed), which the fallback code does not; without this, llvm
+ * will likely still produce atrocious code.
*/
- if ((bld->type.length == 4 || bld->type.length == 8) &&
+ if (LLVM_VERSION_MAJOR < 7 &&
+ (bld->type.length == 4 || bld->type.length == 8) &&
((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
util_cpu_caps.has_sse4_1)) {
const char *intrinsic = NULL;
return a;
if(type.floating) {
- if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
- /* Workaround llvm.org/PR27332 */
- LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
- unsigned long long absMask = ~(1ULL << (type.width - 1));
- LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
- a = LLVMBuildBitCast(builder, a, int_vec_type, "");
- a = LLVMBuildAnd(builder, a, mask, "");
- a = LLVMBuildBitCast(builder, a, vec_type, "");
- return a;
- } else {
- char intrinsic[32];
- lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
- return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
- }
+ char intrinsic[32];
+ lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
+ return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
}
- if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && HAVE_LLVM < 0x0600) {
+ if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && LLVM_VERSION_MAJOR < 6) {
switch(type.width) {
case 8:
return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
}
}
- else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && HAVE_LLVM < 0x0600) {
+ else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && LLVM_VERSION_MAJOR < 6) {
switch(type.width) {
case 8:
return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
LLVMTypeRef int_vec_type = bld->int_vec_type;
LLVMTypeRef vec_type = bld->vec_type;
- assert(type.width == 32); /* might want to handle doubles at some point */
-
inttype = type;
inttype.floating = 0;
lp_build_context_init(&intbld, bld->gallivm, inttype);
LLVMTypeRef int_vec_type = bld->int_vec_type;
LLVMTypeRef vec_type = bld->vec_type;
- assert(type.width == 32); /* might want to handle doubles at some point */
-
inttype = type;
inttype.floating = 0;
lp_build_context_init(&intbld, bld->gallivm, inttype);
/**
* Do one Newton-Raphson step to improve reciprocate precision:
*
- * x_{i+1} = x_i * (2 - a * x_i)
+ * x_{i+1} = x_i + x_i * (1 - a * x_i)
*
* XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
* +/-Inf, giving NaN instead. Certain applications rely on this behavior,
- * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's
+ * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
* halo. It would be necessary to clamp the argument to prevent this.
*
* See also:
LLVMValueRef rcp_a)
{
LLVMBuilderRef builder = bld->gallivm->builder;
- LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
+ LLVMValueRef neg_a;
LLVMValueRef res;
- res = LLVMBuildFMul(builder, a, rcp_a, "");
- res = LLVMBuildFSub(builder, two, res, "");
- res = LLVMBuildFMul(builder, rcp_a, res, "");
+ neg_a = LLVMBuildFNeg(builder, a, "");
+ res = lp_build_fmuladd(builder, neg_a, rcp_a, bld->one);
+ res = lp_build_fmuladd(builder, res, rcp_a, rcp_a);
return res;
}