From 450950c57ac53d80a1c57f8ef5eb365db300fec2 Mon Sep 17 00:00:00 2001 From: Roland Scheidegger Date: Tue, 2 Apr 2013 17:47:30 +0200 Subject: [PATCH] gallivm: bring back optimized but incorrect float to smallfloat optimizations Conceptually the same as previously done in float_to_half. Should cut down number of instructions from 14 to 10 or so, but will promote some NaNs to Infs, so it's disabled. It gets a bit tricky though handling all the cases correctly... Passes basic tests either way (though there are no tests testing special cases, but some manual tests injecting them seemed promising). v2: style and comment fixes suggested by Jose Reviewed-by: Jose Fonseca --- .../auxiliary/gallivm/lp_bld_format_float.c | 116 ++++++++++++------ 1 file changed, 78 insertions(+), 38 deletions(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_float.c b/src/gallium/auxiliary/gallivm/lp_bld_format_float.c index a8cfe02f0d8..f68a617b810 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_float.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_float.c @@ -79,13 +79,15 @@ lp_build_float_to_smallfloat(struct gallivm_state *gallivm, { LLVMBuilderRef builder = gallivm->builder; LLVMValueRef i32_floatexpmask, i32_smallexpmask, magic, normal; - LLVMValueRef rescale_src, tmp, i32_roundmask, small_max; - LLVMValueRef is_nan, i32_qnanbit, src_abs, shift, infcheck_src, res; - LLVMValueRef is_inf, is_nan_or_inf, nan_or_inf, mask; + LLVMValueRef rescale_src, i32_roundmask, small_max; + LLVMValueRef i32_qnanbit, shift, res; + LLVMValueRef is_nan_or_inf, nan_or_inf, mask, i32_src; struct lp_type f32_type = lp_type_float_vec(32, 32 * i32_type.length); struct lp_build_context f32_bld, i32_bld; LLVMValueRef zero = lp_build_const_vec(gallivm, f32_type, 0.0f); unsigned exponent_start = mantissa_start + mantissa_bits; + boolean always_preserve_nans = true; + boolean maybe_correct_denorm_rounding = true; lp_build_context_init(&f32_bld, gallivm, f32_type); 
lp_build_context_init(&i32_bld, gallivm, i32_type); @@ -94,35 +96,41 @@ lp_build_float_to_smallfloat(struct gallivm_state *gallivm, ((1 << exponent_bits) - 1) << 23); i32_floatexpmask = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23); - src_abs = lp_build_abs(&f32_bld, src); - src_abs = LLVMBuildBitCast(builder, src_abs, i32_bld.vec_type, ""); + i32_src = LLVMBuildBitCast(builder, src, i32_bld.vec_type, ""); if (has_sign) { - rescale_src = src_abs; - infcheck_src = src_abs; - src = LLVMBuildBitCast(builder, src, i32_bld.vec_type, ""); + rescale_src = src; } else { /* clamp to pos range (can still have sign bit if NaN or negative zero) */ - rescale_src = lp_build_max(&f32_bld, src, zero); - rescale_src = LLVMBuildBitCast(builder, rescale_src, i32_bld.vec_type, ""); - src = LLVMBuildBitCast(builder, src, i32_bld.vec_type, ""); - infcheck_src = src; + rescale_src = lp_build_max(&f32_bld, zero, src); } + rescale_src = LLVMBuildBitCast(builder, rescale_src, i32_bld.vec_type, ""); /* "ordinary" number */ - /* get rid of excess mantissa bits, and while here also potential sign bit */ - i32_roundmask = lp_build_const_int_vec(gallivm, i32_type, - ~((1 << (23 - mantissa_bits)) - 1) & - 0x7fffffff); + /* + * get rid of excess mantissa bits and sign bit + * This is only really needed for correct rounding of denorms I think + * but only if we use the preserve NaN path does using + * src_abs instead save us any instruction. 
+ */ + if (maybe_correct_denorm_rounding || !always_preserve_nans) { + i32_roundmask = lp_build_const_int_vec(gallivm, i32_type, + ~((1 << (23 - mantissa_bits)) - 1) & + 0x7fffffff); + rescale_src = LLVMBuildBitCast(builder, rescale_src, i32_bld.vec_type, ""); + rescale_src = lp_build_and(&i32_bld, rescale_src, i32_roundmask); + rescale_src = LLVMBuildBitCast(builder, rescale_src, f32_bld.vec_type, ""); + } + else { + rescale_src = lp_build_abs(&f32_bld, src); + } - tmp = lp_build_and(&i32_bld, rescale_src, i32_roundmask); - tmp = LLVMBuildBitCast(builder, tmp, f32_bld.vec_type, ""); /* bias exponent (and denormalize if necessary) */ magic = lp_build_const_int_vec(gallivm, i32_type, ((1 << (exponent_bits - 1)) - 1) << 23); magic = LLVMBuildBitCast(builder, magic, f32_bld.vec_type, ""); - normal = lp_build_mul(&f32_bld, tmp, magic); + normal = lp_build_mul(&f32_bld, rescale_src, magic); /* clamp to max value - largest non-infinity number */ small_max = lp_build_const_int_vec(gallivm, i32_type, @@ -141,19 +149,58 @@ lp_build_float_to_smallfloat(struct gallivm_state *gallivm, * (Cannot actually save the comparison since we need to distinguish * Inf and NaN cases anyway, but it would be better for AVX.) 
*/ - is_nan = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GREATER, - src_abs, i32_floatexpmask); - is_inf = lp_build_compare(gallivm, i32_type, PIPE_FUNC_EQUAL, - infcheck_src, i32_floatexpmask); - is_nan_or_inf = lp_build_or(&i32_bld, is_nan, is_inf); - /* could also set more mantissa bits but need at least the highest mantissa bit */ - i32_qnanbit = lp_build_const_vec(gallivm, i32_type, 1 << 22); - /* combine maxexp with qnanbit */ - nan_or_inf = lp_build_or(&i32_bld, i32_smallexpmask, - lp_build_and(&i32_bld, is_nan, i32_qnanbit)); - + if (always_preserve_nans) { + LLVMValueRef infcheck_src, is_inf, is_nan; + LLVMValueRef src_abs = lp_build_abs(&f32_bld, src); + src_abs = LLVMBuildBitCast(builder, src_abs, i32_bld.vec_type, ""); + + if (has_sign) { + infcheck_src = src_abs; + } + else { + infcheck_src = i32_src; + } + is_nan = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GREATER, + src_abs, i32_floatexpmask); + is_inf = lp_build_compare(gallivm, i32_type, PIPE_FUNC_EQUAL, + infcheck_src, i32_floatexpmask); + is_nan_or_inf = lp_build_or(&i32_bld, is_nan, is_inf); + /* could also set more mantissa bits but need at least the highest mantissa bit */ + i32_qnanbit = lp_build_const_vec(gallivm, i32_type, 1 << 22); + /* combine maxexp with qnanbit */ + nan_or_inf = lp_build_or(&i32_bld, i32_smallexpmask, + lp_build_and(&i32_bld, is_nan, i32_qnanbit)); + } + else { + /* + * A couple simplifications, with mostly 2 drawbacks (so disabled): + * - it will promote some SNaNs (those which only had bits set + * in the mantissa part which got chopped off) to +-Infinity. + * (Those bits get chopped off anyway later so can as well use + * rescale_src instead of src_abs here saving the calculation of that.) 
+ * - for no sign case, it relies on the max() being used for rescale_src + * to give back the NaN (which is NOT ieee754r behavior, but should work + * with sse2 on a full moon (or rather, if I got the operand order right)) - + * we _don't_ have well-defined behavior specified with min/max wrt NaNs, + * however, and if it gets converted to cmp/select it may not work (we + * don't really have specified behavior for cmp wrt NaNs either). + */ + rescale_src = LLVMBuildBitCast(builder, rescale_src, i32_bld.vec_type, ""); + is_nan_or_inf = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GEQUAL, + rescale_src, i32_floatexpmask); + /* note this will introduce excess exponent bits */ + nan_or_inf = rescale_src; + } res = lp_build_select(&i32_bld, is_nan_or_inf, nan_or_inf, normal); + if (mantissa_start > 0 || !always_preserve_nans) { + /* mask off excess bits */ + unsigned maskbits = (1 << (mantissa_bits + exponent_bits)) - 1; + mask = lp_build_const_int_vec(gallivm, i32_type, + maskbits << (23 - mantissa_bits)); + res = lp_build_and(&i32_bld, res, mask); + } + /* add back sign bit at right position */ if (has_sign) { LLVMValueRef sign; @@ -163,7 +210,7 @@ lp_build_float_to_smallfloat(struct gallivm_state *gallivm, mask = lp_build_const_int_vec(gallivm, i32_type, 0x80000000); shift = lp_build_const_int_vec(gallivm, i32_type, 8 - exponent_bits); - sign = lp_build_and(&i32_bld, mask, src); + sign = lp_build_and(&i32_bld, mask, i32_src); sign = lp_build_shr(&u32_bld, sign, shift); res = lp_build_or(&i32_bld, sign, res); } @@ -177,13 +224,6 @@ lp_build_float_to_smallfloat(struct gallivm_state *gallivm, shift = lp_build_const_int_vec(gallivm, i32_type, exponent_start - 23); res = lp_build_shl(&i32_bld, res, shift); } - if (mantissa_start > 0) { - /* generally shouldn't get bits to mask off but can happen with denormals */ - unsigned maskbits = (1 << (mantissa_bits + exponent_bits + has_sign)) - 1; - mask = lp_build_const_int_vec(gallivm, i32_type, - maskbits << mantissa_start); - res
= lp_build_and(&i32_bld, res, mask); - } return res; } -- 2.30.2