LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
LLVMValueRef h;
- if (util_cpu_caps.has_f16c && HAVE_LLVM >= 0x0301 &&
+ if (util_cpu_caps.has_f16c &&
(src_length == 4 || src_length == 8)) {
const char *intrinsic = NULL;
if (src_length == 4) {
*
* Convert float32 to half floats, preserving Infs and NaNs,
* with rounding towards zero (trunc).
+ * XXX: For GL, would prefer rounding towards nearest(-even).
*/
LLVMValueRef
lp_build_float_to_half(struct gallivm_state *gallivm,
struct lp_type i16_type = lp_type_int_vec(16, 16 * length);
LLVMValueRef result;
- if (util_cpu_caps.has_f16c && HAVE_LLVM >= 0x0301 &&
+ /*
+ * Note: Newer llvm versions (3.6 or so) support fptrunc to 16 bits
+ * directly, without any (x86 or generic) intrinsics.
+ * However, the rounding mode cannot be specified (and is undefined;
+ * in practice on x86 it appears to round to nearest-even, though that
+ * may depend on instruction set support), so it is essentially
+ * useless.
+ */
+
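+ /*
+  * Illustrative sketch only (not the path used here) of that direct
+  * fptrunc approach for llvm >= 3.6, assuming the surrounding function's
+  * builder, src, length and i16_type locals; the rounding mode would be
+  * whatever llvm/the backend picks:
+  */
+#if 0
+ {
+    LLVMTypeRef f16_vec_type =
+       LLVMVectorType(LLVMHalfTypeInContext(gallivm->context), length);
+    LLVMValueRef f16s = LLVMBuildFPTrunc(builder, src, f16_vec_type, "");
+    return LLVMBuildBitCast(builder, f16s,
+                            lp_build_vec_type(gallivm, i16_type), "");
+ }
+#endif
+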
+ if (util_cpu_caps.has_f16c &&
(length == 4 || length == 8)) {
struct lp_type i168_type = lp_type_int_vec(16, 16 * 8);
unsigned mode = 3; /* same as LP_BUILD_ROUND_TRUNCATE */
LLVMValueRef index = LLVMConstInt(i32t, i, 0);
LLVMValueRef f32 = LLVMBuildExtractElement(builder, src, index, "");
#if 0
- /* XXX: not really supported by backends */
+ /*
+ * XXX: not really supported by backends.
+ * Even if they did now, the rounding mode cannot be specified and
+ * is undefined.
+ */
LLVMValueRef f16 = lp_build_intrinsic_unary(builder, "llvm.convert.to.fp16", i16t, f32);
#else
LLVMValueRef f16 = LLVMBuildCall(builder, func, &f32, 1, "");
else if (dst_width == (mantissa + 1)) {
/*
* The destination width matches exactly what can be represented in
- * floating point (i.e., mantissa + 1 bits). So do a straight
- * multiplication followed by casting. No further rounding is necessary.
+ * floating point (i.e., mantissa + 1 bits). Even so, correct rounding
+ * still needs to be applied (only for numbers in [0.5, 1.0] would
+ * conversion using truncation after scaling be sufficient).
*/
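+ /*
+  * Example, assuming float src (23 mantissa bits, so dst_width == 24):
+  * the scale is 2^24 - 1 = 16777215.0, and the exact product
+  * src * scale is not an integer in general (e.g. 0.25 * 16777215 =
+  * 4194303.75), so a plain fptosi truncation could end up one below the
+  * correctly rounded value; lp_build_iround() rounds to nearest instead.
+  */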
-
double scale;
+ struct lp_build_context uf32_bld;
+ lp_build_context_init(&uf32_bld, gallivm, src_type);
scale = (double)((1ULL << dst_width) - 1);
res = LLVMBuildFMul(builder, src,
lp_build_const_vec(gallivm, src_type, scale), "");
- res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
+ res = lp_build_iround(&uf32_bld, res);
}
else {
/*
* important, we also get exact results for 0.0 and 1.0.
*/
- unsigned n = MIN2(src_type.width - 1, dst_width);
+ unsigned n = MIN2(src_type.width - 1u, dst_width);
double scale = (double)(1ULL << n);
unsigned lshift = dst_width - n;
unsigned num_srcs,
LLVMValueRef *dst)
{
- int i;
+ unsigned i;
int num_dsts = num_srcs;
if (src_type.floating == dst_type->floating &&
src_type.sign == dst_type->sign)
return num_dsts;
- /* Special case 4x4f -> 1x16ub or 2x8f -> 1x16ub
+ /* Special case 4x4x32 -> 1x16x8 or 2x8x32 -> 1x16x8
*/
- if (src_type.floating == 1 &&
- src_type.fixed == 0 &&
- src_type.sign == 1 &&
- src_type.norm == 0 &&
+ if (src_type.norm == 0 &&
src_type.width == 32 &&
+ src_type.fixed == 0 &&
dst_type->floating == 0 &&
dst_type->fixed == 0 &&
- dst_type->sign == 0 &&
- dst_type->norm == 1 &&
- dst_type->width == 8)
- {
- /* Special case 4x4f --> 1x16ub */
+ dst_type->width == 8 &&
+
+ ((src_type.floating == 1 && src_type.sign == 1 && dst_type->norm == 1) ||
+ (src_type.floating == 0 && dst_type->floating == 0 &&
+ src_type.sign == dst_type->sign && dst_type->norm == 0))) {
+
+ /* Special case 4x4x32 --> 1x16x8 */
if (src_type.length == 4 &&
- util_cpu_caps.has_sse2)
+ (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec))
{
num_dsts = (num_srcs + 3) / 4;
dst_type->length = num_srcs * 4 >= 16 ? 16 : num_srcs * 4;
return num_dsts;
}
- /* Special case 2x8f --> 1x16ub */
+ /* Special case 2x8x32 --> 1x16x8 */
if (src_type.length == 8 &&
util_cpu_caps.has_avx)
{
if (src_type.width == dst_type->width) {
lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
} else {
- for (i = 0; i < num_srcs; ++i) {
- lp_build_conv(gallivm, src_type, *dst_type, &src[i], 1, &dst[i], 1);
+ /*
+ * If dst_width is 16 bits, src_width is 32, and the dst vector size is
+ * 64bit, try feeding 2 vectors at once so pack intrinsics can be used.
+ * (For AVX this isn't needed, since we usually get 256bit src and
+ * 128bit dst vectors, which works ok. If we do AVX2 pack, this should
+ * be extended, but we'd need to be able to tell the conversion code
+ * about pack ordering first.)
+ */
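+ /*
+  * E.g. (hypothetical case): int32x4 srcs -> int16x4 dsts. With
+  * ratio == 2, each lp_build_conv() call below sees two int32x4 srcs
+  * and one int16x8 dst, so a single pack (e.g. sse2 packssdw) can be
+  * used instead of converting the vectors one at a time.
+  */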
+ unsigned ratio = 1;
+ if (src_type.width == 2 * dst_type->width &&
+ src_type.length == dst_type->length &&
+ dst_type->floating == 0 && (num_srcs % 2 == 0) &&
+ dst_type->width * dst_type->length == 64) {
+ ratio = 2;
+ num_dsts /= 2;
+ dst_type->length *= 2;
+ }
+ for (i = 0; i < num_dsts; i++) {
+ lp_build_conv(gallivm, src_type, *dst_type, &src[i*ratio], ratio, &dst[i], 1);
}
}
num_tmps = num_srcs;
- /* Special case 4x4f --> 1x16ub, 2x4f -> 1x8ub, 1x4f -> 1x4ub
+ /*
+ * Special case 4x4x32 --> 1x16x8, 2x4x32 -> 1x8x8, 1x4x32 -> 1x4x8
+ * Only float -> s/unorm8 and (u)int32 -> (u)int8.
+ * XXX: This should cover all interesting backend cases for 8 bit,
+ * but the same strategy should be used if dst is 16 bit.
*/
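+ /*
+  * E.g. for 4x float32x4 -> 1x unorm8x16 on sse2 this path boils down
+  * to roughly: scale by 255, 4x cvtps2dq (float -> int32),
+  * 2x packssdw (int32 -> int16, signed saturation),
+  * 1x packuswb (int16 -> uint8, unsigned saturation).
+  */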
- if (src_type.floating == 1 &&
- src_type.fixed == 0 &&
- src_type.sign == 1 &&
- src_type.norm == 0 &&
+ if (src_type.norm == 0 &&
src_type.width == 32 &&
src_type.length == 4 &&
+ src_type.fixed == 0 &&
dst_type.floating == 0 &&
dst_type.fixed == 0 &&
- dst_type.sign == 0 &&
- dst_type.norm == 1 &&
dst_type.width == 8 &&
+ ((src_type.floating == 1 && src_type.sign == 1 && dst_type.norm == 1) ||
+ (src_type.floating == 0 && dst_type.floating == 0 &&
+ src_type.sign == dst_type.sign && dst_type.norm == 0)) &&
+
((dst_type.length == 16 && 4 * num_dsts == num_srcs) ||
(num_dsts == 1 && dst_type.length * num_srcs == 16 && num_srcs != 3)) &&
- util_cpu_caps.has_sse2)
+ (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec))
{
struct lp_build_context bld;
struct lp_type int16_type, int32_type;
struct lp_type dst_type_ext = dst_type;
- LLVMValueRef const_255f;
+ LLVMValueRef const_scale;
unsigned i, j;
lp_build_context_init(&bld, gallivm, src_type);
int32_type.length /= 4;
int32_type.sign = 1;
- const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);
+ const_scale = lp_build_const_vec(gallivm, src_type, lp_const_scale(dst_type));
for (i = 0; i < num_dsts; ++i, src += 4) {
LLVMValueRef lo, hi;
- for (j = 0; j < dst_type.length / 4; ++j) {
- tmp[j] = LLVMBuildFMul(builder, src[j], const_255f, "");
- tmp[j] = lp_build_iround(&bld, tmp[j]);
+ if (src_type.floating) {
+ for (j = 0; j < dst_type.length / 4; ++j) {
+ /*
+ * XXX This is not actually fully correct. The float to int
+ * conversion will produce the value 0x80000000 for everything
+ * out of range and for NaNs (on x86, llvm.x86.sse2.cvtps2dq).
+ * Hence, NaNs and negatives will get clamped just fine to zero
+ * (relying on the clamping pack behavior) when converting to unorm,
+ * however too-large values (both finite and infinite) will also
+ * end up as zero, not 255.
+ * For snorm, for now we'll keep bug compatibility with the generic
+ * conversion path (meaning too-large values are fine, but
+ * NaNs get converted to -128 (purely by luck, as we don't
+ * specify NaN behavior for the max there) instead of 0).
+ */
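+ /*
+  * Illustrative unorm8 trace of the issue described above: src = +Inf
+  * -> FMul by 255 stays +Inf -> cvtps2dq yields 0x80000000 (INT_MIN)
+  * -> packssdw saturates to -32768 -> packuswb saturates to 0,
+  * so the result is 0 rather than the expected 255.
+  */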
+ if (dst_type.sign) {
+ tmp[j] = lp_build_min(&bld, bld.one, src[j]);
+ }
+ else {
+ if (0) {
+ tmp[j] = lp_build_min_ext(&bld, bld.one, src[j],
+ GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
+ }
+ tmp[j] = src[j];
+ }
+ tmp[j] = LLVMBuildFMul(builder, tmp[j], const_scale, "");
+ tmp[j] = lp_build_iround(&bld, tmp[j]);
+ }
+ } else {
+ for (j = 0; j < dst_type.length / 4; ++j) {
+ if (!dst_type.sign) {
+ /*
+ * Pack clamp is always signed->unsigned (or signed->signed).
+ * Hence need min.
+ */
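+ /*
+  * E.g. a uint32 src value of 0x80000001 would otherwise be seen
+  * as negative by the signed packs and come out as 0 instead of 255.
+  */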
+ LLVMValueRef const_max;
+ const_max = lp_build_const_int_vec(gallivm, src_type, 255);
+ tmp[j] = lp_build_min(&bld, src[j], const_max);
+ } else {
+ tmp[j] = src[j];
+ }
+ }
}
if (num_srcs == 1) {
return;
}
- /* Special case 2x8f --> 1x16ub, 1x8f ->1x8ub
+ /* Special case 2x8x32 --> 1x16x8, 1x8x32 -> 1x8x8
*/
- else if (src_type.floating == 1 &&
- src_type.fixed == 0 &&
- src_type.sign == 1 &&
- src_type.norm == 0 &&
- src_type.width == 32 &&
- src_type.length == 8 &&
-
- dst_type.floating == 0 &&
- dst_type.fixed == 0 &&
- dst_type.sign == 0 &&
- dst_type.norm == 1 &&
- dst_type.width == 8 &&
+ else if (src_type.norm == 0 &&
+ src_type.width == 32 &&
+ src_type.length == 8 &&
+ src_type.fixed == 0 &&
+
+ dst_type.floating == 0 &&
+ dst_type.fixed == 0 &&
+ dst_type.width == 8 &&
+
+ ((src_type.floating == 1 && src_type.sign == 1 && dst_type.norm == 1) ||
+ (src_type.floating == 0 && dst_type.floating == 0 &&
+ src_type.sign == dst_type.sign && dst_type.norm == 0)) &&
((dst_type.length == 16 && 2 * num_dsts == num_srcs) ||
(num_dsts == 1 && dst_type.length * num_srcs == 8)) &&
struct lp_build_context bld;
struct lp_type int16_type, int32_type;
struct lp_type dst_type_ext = dst_type;
- LLVMValueRef const_255f;
+ LLVMValueRef const_scale;
unsigned i;
lp_build_context_init(&bld, gallivm, src_type);
int32_type.length /= 4;
int32_type.sign = 1;
- const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);
+ const_scale = lp_build_const_vec(gallivm, src_type, lp_const_scale(dst_type));
for (i = 0; i < num_dsts; ++i, src += 2) {
- LLVMValueRef lo, hi, a, b;
-
- a = LLVMBuildFMul(builder, src[0], const_255f, "");
- a = lp_build_iround(&bld, a);
- tmp[0] = lp_build_extract_range(gallivm, a, 0, 4);
- tmp[1] = lp_build_extract_range(gallivm, a, 4, 4);
- /* relying on clamping behavior of sse2 intrinsics here */
- lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
-
- if (num_srcs == 1) {
- hi = lo;
+ unsigned j;
+ for (j = 0; j < (num_srcs == 1 ? 1 : 2); j++) {
+ LLVMValueRef lo, hi, a;
+
+ a = src[j];
+ if (src_type.floating) {
+ if (dst_type.sign) {
+ a = lp_build_min(&bld, bld.one, a);
+ }
+ else {
+ if (0) {
+ a = lp_build_min_ext(&bld, bld.one, a,
+ GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
+ }
+ }
+ a = LLVMBuildFMul(builder, a, const_scale, "");
+ a = lp_build_iround(&bld, a);
+ } else {
+ if (!dst_type.sign) {
+ LLVMValueRef const_max;
+ const_max = lp_build_const_int_vec(gallivm, src_type, 255);
+ a = lp_build_min(&bld, a, const_max);
+ }
+ }
+ lo = lp_build_extract_range(gallivm, a, 0, 4);
+ hi = lp_build_extract_range(gallivm, a, 4, 4);
+ /* relying on clamping behavior of sse2 intrinsics here */
+ tmp[j] = lp_build_pack2(gallivm, int32_type, int16_type, lo, hi);
}
- else {
- b = LLVMBuildFMul(builder, src[1], const_255f, "");
- b = lp_build_iround(&bld, b);
- tmp[2] = lp_build_extract_range(gallivm, b, 0, 4);
- tmp[3] = lp_build_extract_range(gallivm, b, 4, 4);
- hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
+ if (num_srcs == 1) {
+ tmp[1] = tmp[0];
}
- dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, lo, hi);
+ dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, tmp[0], tmp[1]);
}
if (num_srcs == 1) {
/*
* these functions will use fptosi in some form which won't work
- * with 32bit uint dst.
+ * with 32bit uint dst. (Asserting on this causes lp_test_conv failures
+ * though, hence the assert below is disabled.)
*/
- assert(dst_type.sign || dst_type.width < 32);
+ if (0)
+ assert(dst_type.sign || dst_type.width < 32);
if (dst_type.sign && dst_type.norm && !dst_type.fixed) {
struct lp_build_context bld;
unsigned dst_shift = lp_const_shift(dst_type);
unsigned src_offset = lp_const_offset(src_type);
unsigned dst_offset = lp_const_offset(dst_type);
+ struct lp_build_context bld;
+ lp_build_context_init(&bld, gallivm, tmp_type);
/* Compensate for different offsets */
- if (dst_offset > src_offset && src_type.width > dst_type.width) {
+ /* sscaled -> unorm and similar would cause negative shift count, skip */
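+ /*
+  * (For scaled, non-normalized src types lp_const_shift() is 0, so the
+  * src_shift - 1 used below would wrap around.)
+  */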
+ if (dst_offset > src_offset && src_type.width > dst_type.width && src_shift > 0) {
for (i = 0; i < num_tmps; ++i) {
LLVMValueRef shifted;
- LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, src_shift - 1);
- if(src_type.sign)
- shifted = LLVMBuildAShr(builder, tmp[i], shift, "");
- else
- shifted = LLVMBuildLShr(builder, tmp[i], shift, "");
+ shifted = lp_build_shr_imm(&bld, tmp[i], src_shift - 1);
tmp[i] = LLVMBuildSub(builder, tmp[i], shifted, "");
}
}
if(src_shift > dst_shift) {
- LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type,
- src_shift - dst_shift);
for(i = 0; i < num_tmps; ++i)
- if(src_type.sign)
- tmp[i] = LLVMBuildAShr(builder, tmp[i], shift, "");
- else
- tmp[i] = LLVMBuildLShr(builder, tmp[i], shift, "");
+ tmp[i] = lp_build_shr_imm(&bld, tmp[i], src_shift - dst_shift);
}
}
new_type.width = dst_type.width;
new_type.length = dst_type.length;
+ /*
+ * Note that resizing via packs can sometimes give us min/max
+ * clamping for free. We should be able to exploit this...
+ */
lp_build_resize(gallivm, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts);
tmp_type = new_type;
unsigned dst_shift = lp_const_shift(dst_type);
unsigned src_offset = lp_const_offset(src_type);
unsigned dst_offset = lp_const_offset(dst_type);
+ struct lp_build_context bld;
+ lp_build_context_init(&bld, gallivm, tmp_type);
if (src_shift < dst_shift) {
LLVMValueRef pre_shift[LP_MAX_VECTOR_LENGTH];
- LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, dst_shift - src_shift);
- for (i = 0; i < num_tmps; ++i) {
- pre_shift[i] = tmp[i];
- tmp[i] = LLVMBuildShl(builder, tmp[i], shift, "");
+ if (dst_shift - src_shift < dst_type.width) {
+ for (i = 0; i < num_tmps; ++i) {
+ pre_shift[i] = tmp[i];
+ tmp[i] = lp_build_shl_imm(&bld, tmp[i], dst_shift - src_shift);
+ }
+ }
+ else {
+ /*
+ * This happens for things like sscaled -> unorm conversions. Shift
+ * counts equal to the bit width cause undefined results, so hack around it.
+ */
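+ /*
+  * E.g. sscaled32 -> unorm8: src_shift == 0, dst_shift == 8, and the
+  * values have already been resized to the 8 bit dst width, so a shl
+  * by 8 would shift by the full bit width (undefined in LLVM IR);
+  * all bits would be shifted out anyway, hence just substitute zero.
+  */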
+ for (i = 0; i < num_tmps; ++i) {
+ pre_shift[i] = tmp[i];
+ tmp[i] = lp_build_zero(gallivm, dst_type);
+ }
}
/* Compensate for different offsets */