diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index 127b13bc286..14244470c90 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -63,13 +63,146 @@
 #include "util/u_debug.h"
 #include "util/u_math.h"
+#include "util/u_half.h"
 #include "util/u_cpu_detect.h"
 
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
 #include "lp_bld_arit.h"
+#include "lp_bld_bitarit.h"
 #include "lp_bld_pack.h"
 #include "lp_bld_conv.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_intr.h"
+#include "lp_bld_printf.h"
+#include "lp_bld_format.h"
+
+
+
+/**
+ * Converts int16 half-float to float32
+ * Note this can be performed in 1 instruction if vcvtph2ps exists (f16c/cvt16)
+ * [llvm.x86.vcvtph2ps / _mm_cvtph_ps]
+ *
+ * @param src   value to convert
+ *
+ */
+LLVMValueRef
+lp_build_half_to_float(struct gallivm_state *gallivm,
+                       LLVMValueRef src)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMTypeRef src_type = LLVMTypeOf(src);
+   unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
+                            LLVMGetVectorSize(src_type) : 1;
+
+   struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
+   struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
+   LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
+   LLVMValueRef h;
+
+   if (util_cpu_caps.has_f16c &&
+       (src_length == 4 || src_length == 8)) {
+      const char *intrinsic = NULL;
+      if (src_length == 4) {
+         src = lp_build_pad_vector(gallivm, src, 8);
+         intrinsic = "llvm.x86.vcvtph2ps.128";
+      }
+      else {
+         intrinsic = "llvm.x86.vcvtph2ps.256";
+      }
+      return lp_build_intrinsic_unary(builder, intrinsic,
+                                      lp_build_vec_type(gallivm, f32_type), src);
+   }
+
+   /* Convert int16 vector to int32 vector by zero ext (might generate bad code) */
+   h = LLVMBuildZExt(builder, src, int_vec_type, "");
+   return lp_build_smallfloat_to_float(gallivm, f32_type, h, 10, 5, 0, true);
+}
+
+
+/**
+ * Converts float32 to int16 half-float
+ * Note this can be performed in 1 instruction if vcvtps2ph exists (f16c/cvt16)
+ * [llvm.x86.vcvtps2ph / _mm_cvtps_ph]
+ *
+ * @param src   value to convert
+ *
+ * Convert float32 to half floats, preserving Infs and NaNs,
+ * with rounding towards zero (trunc).
+ */
+LLVMValueRef
+lp_build_float_to_half(struct gallivm_state *gallivm,
+                       LLVMValueRef src)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMTypeRef f32_vec_type = LLVMTypeOf(src);
+   unsigned length = LLVMGetTypeKind(f32_vec_type) == LLVMVectorTypeKind
+                   ? 
LLVMGetVectorSize(f32_vec_type) : 1;
+   struct lp_type i32_type = lp_type_int_vec(32, 32 * length);
+   struct lp_type i16_type = lp_type_int_vec(16, 16 * length);
+   LLVMValueRef result;
+
+   if (util_cpu_caps.has_f16c &&
+       (length == 4 || length == 8)) {
+      struct lp_type i168_type = lp_type_int_vec(16, 16 * 8);
+      unsigned mode = 3; /* same as LP_BUILD_ROUND_TRUNCATE */
+      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
+      const char *intrinsic = NULL;
+      if (length == 4) {
+         intrinsic = "llvm.x86.vcvtps2ph.128";
+      }
+      else {
+         intrinsic = "llvm.x86.vcvtps2ph.256";
+      }
+      result = lp_build_intrinsic_binary(builder, intrinsic,
+                                         lp_build_vec_type(gallivm, i168_type),
+                                         src, LLVMConstInt(i32t, mode, 0));
+      if (length == 4) {
+         result = lp_build_extract_range(gallivm, result, 0, 4);
+      }
+   }
+
+   else {
+      result = lp_build_float_to_smallfloat(gallivm, i32_type, src, 10, 5, 0, true);
+      /* Convert int32 vector to int16 vector by trunc (might generate bad code) */
+      result = LLVMBuildTrunc(builder, result, lp_build_vec_type(gallivm, i16_type), "");
+   }
+
+   /*
+    * Debugging code.
+    */
+   if (0) {
+      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
+      LLVMTypeRef i16t = LLVMInt16TypeInContext(gallivm->context);
+      LLVMTypeRef f32t = LLVMFloatTypeInContext(gallivm->context);
+      LLVMValueRef ref_result = LLVMGetUndef(LLVMVectorType(i16t, length));
+      unsigned i;
+
+      LLVMTypeRef func_type = LLVMFunctionType(i16t, &f32t, 1, 0);
+      LLVMValueRef func = lp_build_const_int_pointer(gallivm, func_to_pointer((func_pointer)util_float_to_half));
+      func = LLVMBuildBitCast(builder, func, LLVMPointerType(func_type, 0), "util_float_to_half");
+
+      for (i = 0; i < length; ++i) {
+         LLVMValueRef index = LLVMConstInt(i32t, i, 0);
+         LLVMValueRef f32 = LLVMBuildExtractElement(builder, src, index, "");
+#if 0
+         /* XXX: not really supported by backends */
+         LLVMValueRef f16 = lp_build_intrinsic_unary(builder, "llvm.convert.to.fp16", i16t, f32);
+#else
+         LLVMValueRef f16 = LLVMBuildCall(builder, func, &f32, 1, "");
+#endif
+         ref_result = LLVMBuildInsertElement(builder, ref_result, f16, index, "");
+      }
+
+      lp_build_print_value(gallivm, "src = ", src);
+      lp_build_print_value(gallivm, "llvm = ", result);
+      lp_build_print_value(gallivm, "util = ", ref_result);
+      lp_build_printf(gallivm, "\n");
+   }
+
+   return result;
+}
 
 
 /**
@@ -89,66 +222,121 @@
 * return { i32, i32, i32, i32 } where each value is in [0, 2^dst_width-1].
 */
 LLVMValueRef
-lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
+lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
                                         struct lp_type src_type,
                                         unsigned dst_width,
                                         LLVMValueRef src)
 {
-   LLVMTypeRef int_vec_type = lp_build_int_vec_type(src_type);
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, src_type);
    LLVMValueRef res;
    unsigned mantissa;
-   unsigned n;
-   unsigned long long ubound;
-   unsigned long long mask;
-   double scale;
-   double bias;
 
    assert(src_type.floating);
+   assert(dst_width <= src_type.width);
+   src_type.sign = FALSE;
 
    mantissa = lp_mantissa(src_type);
 
-   /* We cannot carry more bits than the mantissa */
-   n = MIN2(mantissa, dst_width);
+   if (dst_width <= mantissa) {
+      /*
+       * Apply magic coefficients that make the desired result appear in the
+       * least significant bits of the mantissa, with correct rounding.
+       *
+       * This only works if the destination width fits in the mantissa.
+       */
 
-   /* This magic coefficients will make the desired result to appear in the
-    * lowest significant bits of the mantissa. 
-    */
-   ubound = ((unsigned long long)1 << n);
-   mask = ubound - 1;
-   scale = (double)mask/ubound;
-   bias = (double)((unsigned long long)1 << (mantissa - n));
-
-   res = LLVMBuildFMul(builder, src, lp_build_const_vec(src_type, scale), "");
-   res = LLVMBuildFAdd(builder, res, lp_build_const_vec(src_type, bias), "");
-   res = LLVMBuildBitCast(builder, res, int_vec_type, "");
-
-   if(dst_width > n) {
-      int shift = dst_width - n;
-      res = LLVMBuildShl(builder, res, lp_build_const_int_vec(src_type, shift), "");
-
-      /* TODO: Fill in the empty lower bits for additional precision? */
-      /* YES: this fixes progs/trivial/tri-z-eq.c.
-       * Otherwise vertex Z=1.0 values get converted to something like
-       * 0xfffffb00 and the test for equality with 0xffffffff fails.
+      unsigned long long ubound;
+      unsigned long long mask;
+      double scale;
+      double bias;
+
+      ubound = (1ULL << dst_width);
+      mask = ubound - 1;
+      scale = (double)mask/ubound;
+      bias = (double)(1ULL << (mantissa - dst_width));
+
+      res = LLVMBuildFMul(builder, src, lp_build_const_vec(gallivm, src_type, scale), "");
+      /* instead of fadd/and could (with sse2) just use lp_build_iround */
+      res = LLVMBuildFAdd(builder, res, lp_build_const_vec(gallivm, src_type, bias), "");
+      res = LLVMBuildBitCast(builder, res, int_vec_type, "");
+      res = LLVMBuildAnd(builder, res,
+                         lp_build_const_int_vec(gallivm, src_type, mask), "");
+   }
+   else if (dst_width == (mantissa + 1)) {
+      /*
+       * The destination width matches exactly what can be represented in
+       * floating point (i.e., mantissa + 1 bits). Even so, correct rounding
+       * still needs to be applied (only for numbers in [0.5, 1.0] would
+       * truncation after scaling be sufficient).
        */
-#if 0
-      {
-         LLVMValueRef msb;
-         msb = LLVMBuildLShr(builder, res, lp_build_const_int_vec(src_type, dst_width - 1), "");
-         msb = LLVMBuildShl(builder, msb, lp_build_const_int_vec(src_type, shift), "");
-         msb = LLVMBuildSub(builder, msb, lp_build_const_int_vec(src_type, 1), "");
-         res = LLVMBuildOr(builder, res, msb, "");
-      }
-#elif 0
-      while(shift > 0) {
-         res = LLVMBuildOr(builder, res, LLVMBuildLShr(builder, res, lp_build_const_int_vec(src_type, n), ""), "");
-         shift -= n;
-         n *= 2;
+      double scale;
+      struct lp_build_context uf32_bld;
+
+      lp_build_context_init(&uf32_bld, gallivm, src_type);
+      scale = (double)((1ULL << dst_width) - 1);
+
+      res = LLVMBuildFMul(builder, src,
+                          lp_build_const_vec(gallivm, src_type, scale), "");
+      res = lp_build_iround(&uf32_bld, res);
+   }
+   else {
+      /*
+       * The destination exceeds what can be represented in floating point.
+       * So multiply by the largest power of two we can get away with, and
+       * then subtract the most significant bit to rescale to normalized values.
+       *
+       * The largest power of two factor we can get away with is
+       * (1 << (src_type.width - 1)), because we need to use signed integers.
+       * In theory it should be (1 << (src_type.width - 2)), but IEEE 754 rules
+       * state INT_MIN should be returned in FPToSI, which is the correct
+       * result for values near 1.0!
+       *
+       * This means we get (src_type.width - 1) correct bits for values near 0.0,
+       * and (mantissa + 1) correct bits for values near 1.0. Equally or more
+       * importantly, we also get exact results for 0.0 and 1.0. 
+       */
+
+      unsigned n = MIN2(src_type.width - 1, dst_width);
+
+      double scale = (double)(1ULL << n);
+      unsigned lshift = dst_width - n;
+      unsigned rshift = n;
+      LLVMValueRef lshifted;
+      LLVMValueRef rshifted;
+
+      res = LLVMBuildFMul(builder, src,
+                          lp_build_const_vec(gallivm, src_type, scale), "");
+      res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
+
+      /*
+       * Align the most significant bit to its final place.
+       *
+       * This will cause 1.0 to overflow to 0, but the later adjustment will
+       * get it right.
+       */
+      if (lshift) {
+         lshifted = LLVMBuildShl(builder, res,
+                                 lp_build_const_int_vec(gallivm, src_type,
+                                                        lshift), "");
+      } else {
+         lshifted = res;
       }
-#endif
+
+      /*
+       * Align the most significant bit to the right.
+       */
+      rshifted = LLVMBuildLShr(builder, res,
+                               lp_build_const_int_vec(gallivm, src_type, rshift),
+                               "");
+
+      /*
+       * Subtract the MSB (moved down to the LSB), thereby rescaling from
+       * (1 << dst_width) to ((1 << dst_width) - 1).
+       */
+
+      res = LLVMBuildSub(builder, lshifted, rshifted, "");
    }
-   else
-      res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(src_type, mask), "");
 
    return res;
 }
@@ -160,13 +348,14 @@ lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
 * return {float, float, float, float} with values in range [0, 1].
 */
 LLVMValueRef
-lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
+lp_build_unsigned_norm_to_float(struct gallivm_state *gallivm,
                                 unsigned src_width,
                                 struct lp_type dst_type,
                                 LLVMValueRef src)
 {
-   LLVMTypeRef vec_type = lp_build_vec_type(dst_type);
-   LLVMTypeRef int_vec_type = lp_build_int_vec_type(dst_type);
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMTypeRef vec_type = lp_build_vec_type(gallivm, dst_type);
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, dst_type);
    LLVMValueRef bias_;
    LLVMValueRef res;
    unsigned mantissa;
@@ -180,35 +369,129 @@ lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
 
    mantissa = lp_mantissa(dst_type);
 
-   n = MIN2(mantissa, src_width);
+   if (src_width <= (mantissa + 1)) {
+      /*
+       * The source width fits what can be represented in floating
+       * point (i.e., mantissa + 1 bits). So do a straight multiplication
+       * followed by casting. No further rounding is necessary.
+       */
+
+      scale = 1.0/(double)((1ULL << src_width) - 1);
+      res = LLVMBuildSIToFP(builder, src, vec_type, "");
+      res = LLVMBuildFMul(builder, res,
+                          lp_build_const_vec(gallivm, dst_type, scale), "");
+      return res;
+   }
+   else {
+      /*
+       * The source width exceeds what can be represented in floating
+       * point. So truncate the incoming values. 
+ */ + + n = MIN2(mantissa, src_width); - ubound = ((unsigned long long)1 << n); - mask = ubound - 1; - scale = (double)ubound/mask; - bias = (double)((unsigned long long)1 << (mantissa - n)); + ubound = ((unsigned long long)1 << n); + mask = ubound - 1; + scale = (double)ubound/mask; + bias = (double)((unsigned long long)1 << (mantissa - n)); - res = src; + res = src; - if(src_width > mantissa) { - int shift = src_width - mantissa; - res = LLVMBuildLShr(builder, res, lp_build_const_int_vec(dst_type, shift), ""); - } + if (src_width > mantissa) { + int shift = src_width - mantissa; + res = LLVMBuildLShr(builder, res, + lp_build_const_int_vec(gallivm, dst_type, shift), ""); + } - bias_ = lp_build_const_vec(dst_type, bias); + bias_ = lp_build_const_vec(gallivm, dst_type, bias); - res = LLVMBuildOr(builder, - res, - LLVMBuildBitCast(builder, bias_, int_vec_type, ""), ""); + res = LLVMBuildOr(builder, + res, + LLVMBuildBitCast(builder, bias_, int_vec_type, ""), ""); - res = LLVMBuildBitCast(builder, res, vec_type, ""); + res = LLVMBuildBitCast(builder, res, vec_type, ""); - res = LLVMBuildFSub(builder, res, bias_, ""); - res = LLVMBuildFMul(builder, res, lp_build_const_vec(dst_type, scale), ""); + res = LLVMBuildFSub(builder, res, bias_, ""); + res = LLVMBuildFMul(builder, res, lp_build_const_vec(gallivm, dst_type, scale), ""); + } return res; } +/** + * Pick a suitable num_dsts for lp_build_conv to ensure optimal cases are used. + * + * Returns the number of dsts created from src + */ +int lp_build_conv_auto(struct gallivm_state *gallivm, + struct lp_type src_type, + struct lp_type* dst_type, + const LLVMValueRef *src, + unsigned num_srcs, + LLVMValueRef *dst) +{ + int i; + int num_dsts = num_srcs; + + if (src_type.floating == dst_type->floating && + src_type.width == dst_type->width && + src_type.length == dst_type->length && + src_type.fixed == dst_type->fixed && + src_type.norm == dst_type->norm && + src_type.sign == dst_type->sign) + return num_dsts; + + /* Special case 4x4f -> 1x16ub or 2x8f -> 1x16ub + */ + if (src_type.floating == 1 && + src_type.fixed == 0 && + src_type.sign == 1 && + src_type.norm == 0 && + src_type.width == 32 && + + dst_type->floating == 0 && + dst_type->fixed == 0 && + dst_type->sign == 0 && + dst_type->norm == 1 && + dst_type->width == 8) + { + /* Special case 4x4f --> 1x16ub */ + if (src_type.length == 4 && + util_cpu_caps.has_sse2) + { + num_dsts = (num_srcs + 3) / 4; + dst_type->length = num_srcs * 4 >= 16 ? 16 : num_srcs * 4; + + lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts); + return num_dsts; + } + + /* Special case 2x8f --> 1x16ub */ + if (src_type.length == 8 && + util_cpu_caps.has_avx) + { + num_dsts = (num_srcs + 1) / 2; + dst_type->length = num_srcs * 8 >= 16 ? 16 : num_srcs * 8; + + lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts); + return num_dsts; + } + } + + /* lp_build_resize does not support M:N */ + if (src_type.width == dst_type->width) { + lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts); + } else { + for (i = 0; i < num_srcs; ++i) { + lp_build_conv(gallivm, src_type, *dst_type, &src[i], 1, &dst[i], 1); + } + } + + return num_dsts; +} + + /** * Generic type conversion. * @@ -216,12 +499,13 @@ lp_build_unsigned_norm_to_float(LLVMBuilderRef builder, * to the lp_type union. 
*/ void -lp_build_conv(LLVMBuilderRef builder, +lp_build_conv(struct gallivm_state *gallivm, struct lp_type src_type, struct lp_type dst_type, const LLVMValueRef *src, unsigned num_srcs, LLVMValueRef *dst, unsigned num_dsts) { + LLVMBuilderRef builder = gallivm->builder; struct lp_type tmp_type; LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH]; unsigned num_tmps; @@ -243,7 +527,7 @@ lp_build_conv(LLVMBuilderRef builder, num_tmps = num_srcs; - /* Special case 4x4f --> 1x16ub + /* Special case 4x4f --> 1x16ub, 2x4f -> 1x8ub, 1x4f -> 1x4ub */ if (src_type.floating == 1 && src_type.fixed == 0 && @@ -257,84 +541,156 @@ lp_build_conv(LLVMBuilderRef builder, dst_type.sign == 0 && dst_type.norm == 1 && dst_type.width == 8 && - dst_type.length == 16) + + ((dst_type.length == 16 && 4 * num_dsts == num_srcs) || + (num_dsts == 1 && dst_type.length * num_srcs == 16 && num_srcs != 3)) && + + util_cpu_caps.has_sse2) { - int i; + struct lp_build_context bld; + struct lp_type int16_type, int32_type; + struct lp_type dst_type_ext = dst_type; + LLVMValueRef const_255f; + unsigned i, j; + + lp_build_context_init(&bld, gallivm, src_type); + + dst_type_ext.length = 16; + int16_type = int32_type = dst_type_ext; + + int16_type.width *= 2; + int16_type.length /= 2; + int16_type.sign = 1; + + int32_type.width *= 4; + int32_type.length /= 4; + int32_type.sign = 1; + + const_255f = lp_build_const_vec(gallivm, src_type, 255.0f); - for (i = 0; i < num_dsts; i++, src += 4) { - struct lp_type int16_type = dst_type; - struct lp_type int32_type = dst_type; + for (i = 0; i < num_dsts; ++i, src += 4) { LLVMValueRef lo, hi; - LLVMValueRef src_int0; - LLVMValueRef src_int1; - LLVMValueRef src_int2; - LLVMValueRef src_int3; - LLVMTypeRef int16_vec_type; - LLVMTypeRef int32_vec_type; - LLVMTypeRef src_vec_type; - LLVMTypeRef dst_vec_type; - LLVMValueRef const_255f; - LLVMValueRef a, b, c, d; - - int16_type.width *= 2; - int16_type.length /= 2; - int16_type.sign = 1; - - int32_type.width *= 4; - int32_type.length /= 4; - int32_type.sign = 1; - - src_vec_type = lp_build_vec_type(src_type); - dst_vec_type = lp_build_vec_type(dst_type); - int16_vec_type = lp_build_vec_type(int16_type); - int32_vec_type = lp_build_vec_type(int32_type); - - const_255f = lp_build_const_vec(src_type, 255.0f); - a = LLVMBuildFMul(builder, src[0], const_255f, ""); - b = LLVMBuildFMul(builder, src[1], const_255f, ""); - c = LLVMBuildFMul(builder, src[2], const_255f, ""); - d = LLVMBuildFMul(builder, src[3], const_255f, ""); + for (j = 0; j < dst_type.length / 4; ++j) { + tmp[j] = LLVMBuildFMul(builder, src[j], const_255f, ""); + tmp[j] = lp_build_iround(&bld, tmp[j]); + } - /* lp_build_round generates excessively general code without - * sse4, so do rounding manually. 
- */ - if (!util_cpu_caps.has_sse4_1) { - LLVMValueRef const_half = lp_build_const_vec(src_type, 0.5f); - - a = LLVMBuildFAdd(builder, a, const_half, ""); - b = LLVMBuildFAdd(builder, b, const_half, ""); - c = LLVMBuildFAdd(builder, c, const_half, ""); - d = LLVMBuildFAdd(builder, d, const_half, ""); - - src_int0 = LLVMBuildFPToSI(builder, a, int32_vec_type, ""); - src_int1 = LLVMBuildFPToSI(builder, b, int32_vec_type, ""); - src_int2 = LLVMBuildFPToSI(builder, c, int32_vec_type, ""); - src_int3 = LLVMBuildFPToSI(builder, d, int32_vec_type, ""); + if (num_srcs == 1) { + tmp[1] = tmp[0]; + } + + /* relying on clamping behavior of sse2 intrinsics here */ + lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]); + + if (num_srcs < 4) { + hi = lo; } else { - struct lp_build_context bld; + hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]); + } + dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, lo, hi); + } + if (num_srcs < 4) { + dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length); + } + + return; + } + + /* Special case 2x8f --> 1x16ub, 1x8f ->1x8ub + */ + else if (src_type.floating == 1 && + src_type.fixed == 0 && + src_type.sign == 1 && + src_type.norm == 0 && + src_type.width == 32 && + src_type.length == 8 && + + dst_type.floating == 0 && + dst_type.fixed == 0 && + dst_type.sign == 0 && + dst_type.norm == 1 && + dst_type.width == 8 && + + ((dst_type.length == 16 && 2 * num_dsts == num_srcs) || + (num_dsts == 1 && dst_type.length * num_srcs == 8)) && + + util_cpu_caps.has_avx) { - bld.builder = builder; - bld.type = src_type; - bld.vec_type = src_vec_type; - bld.int_elem_type = lp_build_elem_type(int32_type); - bld.int_vec_type = int32_vec_type; - bld.undef = lp_build_undef(src_type); - bld.zero = lp_build_zero(src_type); - bld.one = lp_build_one(src_type); - - src_int0 = lp_build_iround(&bld, a); - src_int1 = lp_build_iround(&bld, b); - src_int2 = lp_build_iround(&bld, c); - src_int3 = lp_build_iround(&bld, d); + struct lp_build_context bld; + struct lp_type int16_type, int32_type; + struct lp_type dst_type_ext = dst_type; + LLVMValueRef const_255f; + unsigned i; + + lp_build_context_init(&bld, gallivm, src_type); + + dst_type_ext.length = 16; + int16_type = int32_type = dst_type_ext; + + int16_type.width *= 2; + int16_type.length /= 2; + int16_type.sign = 1; + + int32_type.width *= 4; + int32_type.length /= 4; + int32_type.sign = 1; + + const_255f = lp_build_const_vec(gallivm, src_type, 255.0f); + + for (i = 0; i < num_dsts; ++i, src += 2) { + LLVMValueRef lo, hi, a, b; + + a = LLVMBuildFMul(builder, src[0], const_255f, ""); + a = lp_build_iround(&bld, a); + tmp[0] = lp_build_extract_range(gallivm, a, 0, 4); + tmp[1] = lp_build_extract_range(gallivm, a, 4, 4); + /* relying on clamping behavior of sse2 intrinsics here */ + lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]); + + if (num_srcs == 1) { + hi = lo; } + else { + b = LLVMBuildFMul(builder, src[1], const_255f, ""); + b = lp_build_iround(&bld, b); + tmp[2] = lp_build_extract_range(gallivm, b, 0, 4); + tmp[3] = lp_build_extract_range(gallivm, b, 4, 4); + hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]); - lo = lp_build_pack2(builder, int32_type, int16_type, src_int0, src_int1); - hi = lp_build_pack2(builder, int32_type, int16_type, src_int2, src_int3); - dst[i] = lp_build_pack2(builder, int16_type, dst_type, lo, hi); + } + dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, lo, hi); } - return; + + if (num_srcs == 1) { + dst[0] = 
lp_build_extract_range(gallivm, dst[0], 0, dst_type.length);
+      }
+
+      return;
+   }
+
+   /* Special case -> 16bit half-float
+    */
+   else if (dst_type.floating && dst_type.width == 16)
+   {
+      /* Only support src as 32bit float currently */
+      assert(src_type.floating && src_type.width == 32);
+
+      for(i = 0; i < num_tmps; ++i)
+         dst[i] = lp_build_float_to_half(gallivm, tmp[i]);
+
+      return;
+   }
+
+   /* Pre-convert half-floats to floats
+    */
+   else if (src_type.floating && src_type.width == 16)
+   {
+      for(i = 0; i < num_tmps; ++i)
+         tmp[i] = lp_build_half_to_float(gallivm, tmp[i]);
+
+      tmp_type.width = 32;
   }
 
    /*
@@ -349,13 +705,13 @@ lp_build_conv(LLVMBuilderRef builder,
      double dst_max = lp_const_max(dst_type);
      LLVMValueRef thres;
 
-      lp_build_context_init(&bld, builder, tmp_type);
+      lp_build_context_init(&bld, gallivm, tmp_type);
 
      if(src_min < dst_min) {
        if(dst_min == 0.0)
           thres = bld.zero;
        else
-           thres = lp_build_const_vec(src_type, dst_min);
+           thres = lp_build_const_vec(gallivm, src_type, dst_min);
        for(i = 0; i < num_tmps; ++i)
           tmp[i] = lp_build_max(&bld, tmp[i], thres);
     }
@@ -364,7 +720,7 @@ lp_build_conv(LLVMBuilderRef builder,
        if(dst_max == 1.0)
           thres = bld.one;
        else
-           thres = lp_build_const_vec(src_type, dst_max);
+           thres = lp_build_const_vec(gallivm, src_type, dst_max);
        for(i = 0; i < num_tmps; ++i)
           tmp[i] = lp_build_min(&bld, tmp[i], thres);
     }
@@ -380,7 +736,7 @@ lp_build_conv(LLVMBuilderRef builder,
   else if(tmp_type.floating) {
      if(!dst_type.fixed && !dst_type.sign && dst_type.norm) {
         for(i = 0; i < num_tmps; ++i) {
-            tmp[i] = lp_build_clamped_float_to_unsigned_norm(builder,
+            tmp[i] = lp_build_clamped_float_to_unsigned_norm(gallivm,
                                                             tmp_type,
                                                             dst_type.width,
                                                             tmp[i]);
@@ -389,42 +745,70 @@ lp_build_conv(LLVMBuilderRef builder,
      }
      else {
        double dst_scale = lp_const_scale(dst_type);
-        LLVMTypeRef tmp_vec_type;
 
        if (dst_scale != 1.0) {
-            LLVMValueRef scale = lp_build_const_vec(tmp_type, dst_scale);
+            LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, dst_scale);
           for(i = 0; i < num_tmps; ++i)
              tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
        }
 
-        /* Use an equally sized integer for intermediate computations */
-        tmp_type.floating = FALSE;
-        tmp_vec_type = lp_build_vec_type(tmp_type);
-        for(i = 0; i < num_tmps; ++i) {
+         /*
+          * These functions will use fptosi in some form, which won't work
+          * with a 32-bit uint dst; enabling the assert below causes
+          * lp_test_conv failures though. 
+ */ + if (0) + assert(dst_type.sign || dst_type.width < 32); + + if (dst_type.sign && dst_type.norm && !dst_type.fixed) { + struct lp_build_context bld; + + lp_build_context_init(&bld, gallivm, tmp_type); + for(i = 0; i < num_tmps; ++i) { + tmp[i] = lp_build_iround(&bld, tmp[i]); + } + tmp_type.floating = FALSE; + } + else { + LLVMTypeRef tmp_vec_type; + + tmp_type.floating = FALSE; + tmp_vec_type = lp_build_vec_type(gallivm, tmp_type); + for(i = 0; i < num_tmps; ++i) { #if 0 - if(dst_type.sign) - tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, ""); - else - tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, ""); + if(dst_type.sign) + tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, ""); + else + tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, ""); #else - /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */ - tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, ""); + /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */ + tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, ""); #endif + } } } } else { unsigned src_shift = lp_const_shift(src_type); unsigned dst_shift = lp_const_shift(dst_type); + unsigned src_offset = lp_const_offset(src_type); + unsigned dst_offset = lp_const_offset(dst_type); + struct lp_build_context bld; + lp_build_context_init(&bld, gallivm, tmp_type); + + /* Compensate for different offsets */ + /* sscaled -> unorm and similar would cause negative shift count, skip */ + if (dst_offset > src_offset && src_type.width > dst_type.width && src_shift > 0) { + for (i = 0; i < num_tmps; ++i) { + LLVMValueRef shifted; + + shifted = lp_build_shr_imm(&bld, tmp[i], src_shift - 1); + tmp[i] = LLVMBuildSub(builder, tmp[i], shifted, ""); + } + } - /* FIXME: compensate different offsets too */ if(src_shift > dst_shift) { - LLVMValueRef shift = lp_build_const_int_vec(tmp_type, src_shift - dst_shift); for(i = 0; i < num_tmps; ++i) - if(src_type.sign) - tmp[i] = LLVMBuildAShr(builder, tmp[i], shift, ""); - else - tmp[i] = LLVMBuildLShr(builder, tmp[i], shift, ""); + tmp[i] = lp_build_shr_imm(&bld, tmp[i], src_shift - dst_shift); } } @@ -443,7 +827,7 @@ lp_build_conv(LLVMBuilderRef builder, new_type.width = dst_type.width; new_type.length = dst_type.length; - lp_build_resize(builder, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts); + lp_build_resize(gallivm, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts); tmp_type = new_type; num_tmps = num_dsts; @@ -459,7 +843,7 @@ lp_build_conv(LLVMBuilderRef builder, else if(!src_type.floating && dst_type.floating) { if(!src_type.fixed && !src_type.sign && src_type.norm) { for(i = 0; i < num_tmps; ++i) { - tmp[i] = lp_build_unsigned_norm_to_float(builder, + tmp[i] = lp_build_unsigned_norm_to_float(gallivm, src_type.width, dst_type, tmp[i]); @@ -473,7 +857,7 @@ lp_build_conv(LLVMBuilderRef builder, /* Use an equally sized integer for intermediate computations */ tmp_type.floating = TRUE; tmp_type.sign = TRUE; - tmp_vec_type = lp_build_vec_type(tmp_type); + tmp_vec_type = lp_build_vec_type(gallivm, tmp_type); for(i = 0; i < num_tmps; ++i) { #if 0 if(dst_type.sign) @@ -487,21 +871,58 @@ lp_build_conv(LLVMBuilderRef builder, } if (src_scale != 1.0) { - LLVMValueRef scale = lp_build_const_vec(tmp_type, 1.0/src_scale); + LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, 1.0/src_scale); for(i = 0; i < num_tmps; ++i) tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, ""); } + + /* the formula above will produce value below -1.0 for most negative + * value but everything seems happy with that hence 
disabled for now */
+        if (0 && !src_type.fixed && src_type.norm && src_type.sign) {
+           struct lp_build_context bld;
+
+           lp_build_context_init(&bld, gallivm, dst_type);
+           for(i = 0; i < num_tmps; ++i) {
+              tmp[i] = lp_build_max(&bld, tmp[i],
+                                    lp_build_const_vec(gallivm, dst_type, -1.0f));
+           }
+        }
     }
  }
  else {
     unsigned src_shift = lp_const_shift(src_type);
     unsigned dst_shift = lp_const_shift(dst_type);
+      unsigned src_offset = lp_const_offset(src_type);
+      unsigned dst_offset = lp_const_offset(dst_type);
+      struct lp_build_context bld;
+      lp_build_context_init(&bld, gallivm, tmp_type);
+
+      if (src_shift < dst_shift) {
+         LLVMValueRef pre_shift[LP_MAX_VECTOR_LENGTH];
+
+         if (dst_shift - src_shift < dst_type.width) {
+            for (i = 0; i < num_tmps; ++i) {
+               pre_shift[i] = tmp[i];
+               tmp[i] = lp_build_shl_imm(&bld, tmp[i], dst_shift - src_shift);
+            }
+         }
+         else {
+            /*
+             * This happens for things like sscaled -> unorm conversions. Shift
+             * counts equal to bit width cause undefined results, so hack around it.
+             */
+            for (i = 0; i < num_tmps; ++i) {
+               pre_shift[i] = tmp[i];
+               tmp[i] = lp_build_zero(gallivm, dst_type);
+            }
+         }
 
-   /* FIXME: compensate different offsets too */
-   if(src_shift < dst_shift) {
-      LLVMValueRef shift = lp_build_const_int_vec(tmp_type, dst_shift - src_shift);
-      for(i = 0; i < num_tmps; ++i)
-         tmp[i] = LLVMBuildShl(builder, tmp[i], shift, "");
+         /* Compensate for different offsets */
+         if (dst_offset > src_offset) {
+            for (i = 0; i < num_tmps; ++i) {
+               tmp[i] = LLVMBuildSub(builder, tmp[i], pre_shift[i], "");
+            }
+         }
     }
  }
 
@@ -518,19 +939,17 @@ lp_build_conv(LLVMBuilderRef builder,
 * This will convert the integer masks that match the given types.
 *
 * The mask values should be 0 or -1, i.e., all bits either set to zero or one.
- * Any other value will likely cause in unpredictable results.
+ * Any other value will likely cause unpredictable results.
 *
 * This is basically a very trimmed down version of lp_build_conv.
 */
 void
-lp_build_conv_mask(LLVMBuilderRef builder,
+lp_build_conv_mask(struct gallivm_state *gallivm,
                   struct lp_type src_type,
                   struct lp_type dst_type,
                   const LLVMValueRef *src, unsigned num_srcs,
                   LLVMValueRef *dst, unsigned num_dsts)
 {
-   /* Register width must remain constant */
-   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
 
   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);
@@ -555,16 +974,5 @@ lp_build_conv_mask(LLVMBuilderRef builder,
    * Truncate or expand bit width
    */
 
-   if(src_type.width > dst_type.width) {
-      assert(num_dsts == 1);
-      dst[0] = lp_build_pack(builder, src_type, dst_type, TRUE, src, num_srcs);
-   }
-   else if(src_type.width < dst_type.width) {
-      assert(num_srcs == 1);
-      lp_build_unpack(builder, src_type, dst_type, src[0], dst, num_dsts);
-   }
-   else {
-      assert(num_srcs == num_dsts);
-      memcpy(dst, src, num_dsts * sizeof *dst);
-   }
+   lp_build_resize(gallivm, src_type, dst_type, src, num_srcs, dst, num_dsts);
 }
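
For reference, a minimal scalar sketch of the magic-bias trick that the new
dst_width <= mantissa path of lp_build_clamped_float_to_unsigned_norm()
implements; the float_to_unorm helper and its test harness below are
illustrative only, not code from the patch:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * Illustrative scalar model (not part of the patch) of the vectorized
 * FMul/FAdd/BitCast/And sequence: scale the input so the result lands in the
 * low dst_width bits of the mantissa, add a large bias so the exponent is
 * pinned (the fadd performs the rounding), then read the low bits straight
 * out of the float's bit pattern.
 */
static uint32_t
float_to_unorm(float x, unsigned dst_width)
{
   const unsigned mantissa = 23;    /* IEEE 754 single precision */
   uint64_t ubound = 1ULL << dst_width;
   uint64_t mask = ubound - 1;
   float scale = (float)((double)mask / (double)ubound);
   float bias = (float)(1ULL << (mantissa - dst_width));
   uint32_t bits;

   assert(dst_width <= mantissa);   /* the trick only works in this case */

   x = x * scale + bias;            /* FMul + FAdd: rounding happens here */
   memcpy(&bits, &x, sizeof bits);  /* BitCast */
   return bits & (uint32_t)mask;    /* And */
}

int main(void)
{
   /* expected output: 0 128 255 (clamped input in [0,1], dst_width = 8) */
   printf("%u %u %u\n", float_to_unorm(0.0f, 8),
          float_to_unorm(0.5f, 8), float_to_unorm(1.0f, 8));
   return 0;
}

The final And both strips the bias/exponent bits and replaces the shift-based
tail of the old code; the patch's other two branches handle
dst_width == mantissa + 1 (via lp_build_iround) and wider destinations (via
the FPToSI plus MSB-subtract rescaling described in the comments).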