From b2b2a2c06c20f3ca592af6e96222deab67ea239c Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Thu, 20 Feb 2014 03:09:17 +0100
Subject: [PATCH] gallivm: add smallfloat to float conversion not relying on
 cpu denorm handling

The previous code relied on cpu denorm support for converting small float
formats (such as r11g11b10_float and r16_float) to floats, otherwise denorms
are flushed to zero. We worked around that in llvmpipe blend code by
reenabling denorms, but this did nothing for texture sampling. Now it would
be possible to reenable it there too but I'm not really a fan of messing
with fpu flags (and it seems we can't actually do it reliably with llvm in
any case looking at some bug reports). (Not to mention if you actually have
a lot of denorms in there, you can expect some order-of-magnitude slowdown
with x86 cpus.)
So instead use code which adjusts exponents etc. directly hence not relying
on cpu denorm support for the rescaling mul.
(We still need the fpu flag handling as we can't do float-to-smallfloat
without using cpu denorms, at least for now - I actually wanted to keep
both the old and new code and use one or the other depending on where
it's called from, but that didn't work out as the parameter would have to
be passed through more layers than I'd like.)

Reviewed-by: Zack Rusin <zackr@vmware.com>
Reviewed-by: Si Chen <sichen@vmware.com>
---
 .../auxiliary/gallivm/lp_bld_format_float.c   | 85 ++++++++++++++-----
 1 file changed, 65 insertions(+), 20 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_float.c b/src/gallium/auxiliary/gallivm/lp_bld_format_float.c
index f68a617b810..b87174e4a20 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_float.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_float.c
@@ -309,33 +309,78 @@ lp_build_smallfloat_to_float(struct gallivm_state *gallivm,
                                     ((1 << (mantissa_bits + exponent_bits)) - 1)
                                     << (23 - mantissa_bits));
    srcabs = lp_build_and(&i32_bld, src, maskabs);
-   srcabs = LLVMBuildBitCast(builder, srcabs, f32_bld.vec_type, "");
 
    /* now do the actual scaling */
    smallexpmask = lp_build_const_int_vec(gallivm, i32_type,
                                          ((1 << exponent_bits) - 1) << 23);
    i32_floatexpmask = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
-   /*
-    * magic number has exponent new exp bias + (new exp bias - old exp bias),
-    * mantissa is 0.
-    */
-   magic = lp_build_const_int_vec(gallivm, i32_type,
-                                  (255 - (1 << (exponent_bits - 1))) << 23);
-   magic = LLVMBuildBitCast(builder, magic, f32_bld.vec_type, "");
 
-   /* adjust exponent and fix denorms */
-   res = lp_build_mul(&f32_bld, srcabs, magic);
+   if (0) {
+     /*
+      * Note that this code path, while simpler, will convert small
+      * float denorms to floats according to current cpu denorm mode, if
+      * denorms are disabled it will flush them to zero!
+      * If cpu denorms are enabled, it should be faster though as long as
+      * there's no denorms in the inputs, but if there are actually denorms
+      * it's likely to be an order of magnitude slower (on x86 cpus).
+      */
 
-   /*
-    * if exp was max (== NaN or Inf) set new exp to max (keep mantissa),
-    * so a simple "or" will do (because exp adjust will leave mantissa intact)
-    */
-   /* use float compare (better for AVX 8-wide / no AVX2 but else should use int) */
-   smallexpmask = LLVMBuildBitCast(builder, smallexpmask, f32_bld.vec_type, "");
-   wasinfnan = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GEQUAL, srcabs, smallexpmask);
-   res = LLVMBuildBitCast(builder, res, i32_bld.vec_type, "");
-   tmp = lp_build_and(&i32_bld, i32_floatexpmask, wasinfnan);
-   res = lp_build_or(&i32_bld, tmp, res);
+      srcabs = LLVMBuildBitCast(builder, srcabs, f32_bld.vec_type, "");
+
+      /*
+       * magic number has exponent new exp bias + (new exp bias - old exp bias),
+       * mantissa is 0.
+       */
+      magic = lp_build_const_int_vec(gallivm, i32_type,
+                                     (255 - (1 << (exponent_bits - 1))) << 23);
+      magic = LLVMBuildBitCast(builder, magic, f32_bld.vec_type, "");
+
+      /* adjust exponent and fix denorms */
+      res = lp_build_mul(&f32_bld, srcabs, magic);
+
+      /*
+       * if exp was max (== NaN or Inf) set new exp to max (keep mantissa),
+       * so a simple "or" will do (because exp adjust will leave mantissa intact)
+       */
+      /* use float compare (better for AVX 8-wide / no AVX2 but else should use int) */
+      smallexpmask = LLVMBuildBitCast(builder, smallexpmask, f32_bld.vec_type, "");
+      wasinfnan = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GEQUAL, srcabs, smallexpmask);
+      res = LLVMBuildBitCast(builder, res, i32_bld.vec_type, "");
+      tmp = lp_build_and(&i32_bld, i32_floatexpmask, wasinfnan);
+      res = lp_build_or(&i32_bld, tmp, res);
+   }
+
+   else {
+      LLVMValueRef exp_one, isdenorm, denorm, normal, exp_adj;
+
+      /* denorm (or zero) if exponent is zero */
+      exp_one = lp_build_const_int_vec(gallivm, i32_type, 1 << 23);
+      isdenorm = lp_build_cmp(&i32_bld, PIPE_FUNC_LESS, srcabs, exp_one);
+
+      /* inf or nan if exponent is max */
+      wasinfnan = lp_build_cmp(&i32_bld, PIPE_FUNC_GEQUAL, srcabs, smallexpmask);
+
+      /* for denormal (or zero), add (== or) magic exp to mantissa (== srcabs) (as int)
+       * then subtract it (as float).
+       * Another option would be to just do inttofp then do a rescale mul.
+       */
+      magic = lp_build_const_int_vec(gallivm, i32_type,
+                                     (127 - ((1 << (exponent_bits - 1)) - 2)) << 23);
+      denorm = lp_build_or(&i32_bld, srcabs, magic);
+      denorm = LLVMBuildBitCast(builder, denorm, f32_bld.vec_type, "");
+      denorm = lp_build_sub(&f32_bld, denorm,
+                            LLVMBuildBitCast(builder, magic, f32_bld.vec_type, ""));
+      denorm = LLVMBuildBitCast(builder, denorm, i32_bld.vec_type, "");
+
+      /* for normals, Infs, Nans fix up exponent */
+      exp_adj = lp_build_const_int_vec(gallivm, i32_type,
+                                      (127 - ((1 << (exponent_bits - 1)) - 1)) << 23);
+      normal = lp_build_add(&i32_bld, srcabs, exp_adj);
+      tmp = lp_build_and(&i32_bld, wasinfnan, i32_floatexpmask);
+      normal = lp_build_or(&i32_bld, tmp, normal);
+
+      res = lp_build_select(&i32_bld, isdenorm, denorm, normal);
+   }
 
    if (has_sign) {
       LLVMValueRef signmask = lp_build_const_int_vec(gallivm, i32_type, 0x80000000);
-- 
2.30.2