From: Dylan Baker
Date: Fri, 17 Aug 2018 00:20:38 +0000 (-0700)
Subject: move u_math to src/util
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=80825abb5d1a7491035880253ffd531c55acae6b;p=mesa.git

move u_math to src/util

Currently we have two sets of functions for bit counts: one in gallium
and one in core mesa. The core mesa ones are header-only in many cases,
since they reduce to "#define _mesa_bitcount popcount", but core mesa
also provides a fallback implementation. That fallback matters because
32-bit MSVC has popcount but not popcountll, yet when nir (for example)
includes the core mesa header it doesn't (and shouldn't) link with core
mesa, so the fallback isn't available to it. To fix this we'll promote
the version out of gallium util, then replace the core mesa uses with
the util version, since nir (and other non-core-mesa users) can and do
link with mesautil.

Acked-by: Eric Engestrom
Reviewed-by: Ian Romanick
---

diff --git a/src/broadcom/cle/v3d_packet_helpers.h b/src/broadcom/cle/v3d_packet_helpers.h
index f340b790697..c46089a0e60 100644
--- a/src/broadcom/cle/v3d_packet_helpers.h
+++ b/src/broadcom/cle/v3d_packet_helpers.h
@@ -26,7 +26,7 @@
 #include
 #include
 #include
-#include
+#include "util/u_math.h"

 #ifdef HAVE_VALGRIND
 #include
diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources
index d66fa85f798..33d58dedf28 100644
--- a/src/gallium/auxiliary/Makefile.sources
+++ b/src/gallium/auxiliary/Makefile.sources
@@ -294,8 +294,6 @@ C_SOURCES := \
 	util/u_linear.h \
 	util/u_log.c \
 	util/u_log.h \
-	util/u_math.c \
-	util/u_math.h \
 	util/u_memory.h \
 	util/u_mm.c \
 	util/u_mm.h \
diff --git a/src/gallium/auxiliary/meson.build b/src/gallium/auxiliary/meson.build
index be558e07b27..9e3673a53c0 100644
--- a/src/gallium/auxiliary/meson.build
+++ b/src/gallium/auxiliary/meson.build
@@ -314,8 +314,6 @@ files_libgallium = files(
   'util/u_linear.h',
   'util/u_log.c',
   'util/u_log.h',
-  'util/u_math.c',
-  'util/u_math.h',
   'util/u_memory.h',
   'util/u_mm.c',
   'util/u_mm.h',
diff --git a/src/gallium/auxiliary/util/u_format.c b/src/gallium/auxiliary/util/u_format.c
index 1dd724d9b84..6445f2647cf 100644
--- a/src/gallium/auxiliary/util/u_format.c
+++ b/src/gallium/auxiliary/util/u_format.c
@@ -32,11 +32,11 @@
  * @author Jose Fonseca
  */

-#include "u_math.h"
 #include "u_memory.h"
 #include "u_format.h"
 #include "u_format_s3tc.h"
 #include "u_surface.h"
+#include "util/u_math.h"

 #include "pipe/p_defines.h"

diff --git a/src/gallium/auxiliary/util/u_format_bptc.c b/src/gallium/auxiliary/util/u_format_bptc.c
index 87ec4139e09..519a541e380 100644
--- a/src/gallium/auxiliary/util/u_format_bptc.c
+++ b/src/gallium/auxiliary/util/u_format_bptc.c
@@ -23,10 +23,10 @@
  *
  **************************************************************************/

-#include "u_math.h"
 #include "u_format.h"
 #include "u_format_bptc.h"
 #include "util/format_srgb.h"
+#include "util/u_math.h"

 #define BPTC_BLOCK_DECODE
 #include "../../../mesa/main/texcompress_bptc_tmp.h"
diff --git a/src/gallium/auxiliary/util/u_format_latc.c b/src/gallium/auxiliary/util/u_format_latc.c
index 7b2bb00693f..f145081d56d 100644
--- a/src/gallium/auxiliary/util/u_format_latc.c
+++ b/src/gallium/auxiliary/util/u_format_latc.c
@@ -23,11 +23,11 @@
  **************************************************************************/

 #include
-#include "u_math.h"
 #include "u_format.h"
 #include "u_format_rgtc.h"
 #include "u_format_latc.h"
 #include "util/rgtc.h"
+#include "util/u_math.h"

 void
 util_format_latc1_unorm_fetch_rgba_8unorm(uint8_t *dst, const uint8_t *src, unsigned i,
unsigned j) diff --git a/src/gallium/auxiliary/util/u_format_other.c b/src/gallium/auxiliary/util/u_format_other.c index 025e096a95c..213cb9fd941 100644 --- a/src/gallium/auxiliary/util/u_format_other.c +++ b/src/gallium/auxiliary/util/u_format_other.c @@ -26,8 +26,8 @@ **************************************************************************/ -#include "u_math.h" #include "u_format_other.h" +#include "util/u_math.h" #include "util/format_rgb9e5.h" #include "util/format_r11g11b10f.h" diff --git a/src/gallium/auxiliary/util/u_format_pack.py b/src/gallium/auxiliary/util/u_format_pack.py index c1307d30c2f..62e531716ca 100644 --- a/src/gallium/auxiliary/util/u_format_pack.py +++ b/src/gallium/auxiliary/util/u_format_pack.py @@ -702,7 +702,7 @@ def is_format_hand_written(format): def generate(formats): print() print('#include "pipe/p_compiler.h"') - print('#include "u_math.h"') + print('#include "util/u_math.h"') print('#include "u_half.h"') print('#include "u_format.h"') print('#include "u_format_other.h"') diff --git a/src/gallium/auxiliary/util/u_format_rgtc.c b/src/gallium/auxiliary/util/u_format_rgtc.c index da64b63f777..0b98c01a922 100644 --- a/src/gallium/auxiliary/util/u_format_rgtc.c +++ b/src/gallium/auxiliary/util/u_format_rgtc.c @@ -23,9 +23,9 @@ **************************************************************************/ #include -#include "u_math.h" #include "u_format.h" #include "u_format_rgtc.h" +#include "util/u_math.h" #include "util/rgtc.h" void diff --git a/src/gallium/auxiliary/util/u_format_s3tc.c b/src/gallium/auxiliary/util/u_format_s3tc.c index 3f755e53636..69ca62a961c 100644 --- a/src/gallium/auxiliary/util/u_format_s3tc.c +++ b/src/gallium/auxiliary/util/u_format_s3tc.c @@ -24,10 +24,10 @@ **************************************************************************/ #include "u_dl.h" -#include "u_math.h" #include "u_format.h" #include "u_format_s3tc.h" #include "util/format_srgb.h" +#include "util/u_math.h" #include "../../../mesa/main/texcompress_s3tc_tmp.h" diff --git a/src/gallium/auxiliary/util/u_format_yuv.h b/src/gallium/auxiliary/util/u_format_yuv.h index d8e67b45940..171cbc928d7 100644 --- a/src/gallium/auxiliary/util/u_format_yuv.h +++ b/src/gallium/auxiliary/util/u_format_yuv.h @@ -46,7 +46,7 @@ #include "pipe/p_compiler.h" -#include "u_math.h" +#include "util/u_math.h" /* diff --git a/src/gallium/auxiliary/util/u_format_zs.c b/src/gallium/auxiliary/util/u_format_zs.c index 69f2f2971f7..4ad3a0c6477 100644 --- a/src/gallium/auxiliary/util/u_format_zs.c +++ b/src/gallium/auxiliary/util/u_format_zs.c @@ -27,8 +27,8 @@ #include "u_debug.h" -#include "u_math.h" #include "u_format_zs.h" +#include "util/u_math.h" /* diff --git a/src/gallium/auxiliary/util/u_math.c b/src/gallium/auxiliary/util/u_math.c deleted file mode 100644 index c58af911be7..00000000000 --- a/src/gallium/auxiliary/util/u_math.c +++ /dev/null @@ -1,137 +0,0 @@ -/************************************************************************** - * - * Copyright 2008 VMware, Inc. - * All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - - - -#include "pipe/p_config.h" -#include "util/u_math.h" -#include "util/u_cpu_detect.h" - -#if defined(PIPE_ARCH_SSE) -#include -/* This is defined in pmmintrin.h, but it can only be included when -msse3 is - * used, so just define it here to avoid further. */ -#define _MM_DENORMALS_ZERO_MASK 0x0040 -#endif - - -/** 2^x, for x in [-1.0, 1.0) */ -float pow2_table[POW2_TABLE_SIZE]; - - -static void -init_pow2_table(void) -{ - int i; - for (i = 0; i < POW2_TABLE_SIZE; i++) - pow2_table[i] = exp2f((i - POW2_TABLE_OFFSET) / POW2_TABLE_SCALE); -} - - -/** log2(x), for x in [1.0, 2.0) */ -float log2_table[LOG2_TABLE_SIZE]; - - -static void -init_log2_table(void) -{ - unsigned i; - for (i = 0; i < LOG2_TABLE_SIZE; i++) - log2_table[i] = (float) log2(1.0 + i * (1.0 / LOG2_TABLE_SCALE)); -} - - -/** - * One time init for math utilities. - */ -void -util_init_math(void) -{ - static boolean initialized = FALSE; - if (!initialized) { - init_pow2_table(); - init_log2_table(); - initialized = TRUE; - } -} - -/** - * Fetches the contents of the fpstate (mxcsr on x86) register. - * - * On platforms without support for it just returns 0. - */ -unsigned -util_fpstate_get(void) -{ - unsigned mxcsr = 0; - -#if defined(PIPE_ARCH_SSE) - if (util_cpu_caps.has_sse) { - mxcsr = _mm_getcsr(); - } -#endif - - return mxcsr; -} - -/** - * Make sure that the fp treats the denormalized floating - * point numbers as zero. - * - * This is the behavior required by D3D10. OpenGL doesn't care. - */ -unsigned -util_fpstate_set_denorms_to_zero(unsigned current_mxcsr) -{ -#if defined(PIPE_ARCH_SSE) - if (util_cpu_caps.has_sse) { - /* Enable flush to zero mode */ - current_mxcsr |= _MM_FLUSH_ZERO_MASK; - if (util_cpu_caps.has_daz) { - /* Enable denormals are zero mode */ - current_mxcsr |= _MM_DENORMALS_ZERO_MASK; - } - util_fpstate_set(current_mxcsr); - } -#endif - return current_mxcsr; -} - -/** - * Set the state of the fpstate (mxcsr on x86) register. - * - * On platforms without support for it's a noop. 
- */ -void -util_fpstate_set(unsigned mxcsr) -{ -#if defined(PIPE_ARCH_SSE) - if (util_cpu_caps.has_sse) { - _mm_setcsr(mxcsr); - } -#endif -} diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h deleted file mode 100644 index 712305c98a4..00000000000 --- a/src/gallium/auxiliary/util/u_math.h +++ /dev/null @@ -1,752 +0,0 @@ -/************************************************************************** - * - * Copyright 2008 VMware, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - - -/** - * Math utilities and approximations for common math functions. - * Reduced precision is usually acceptable in shaders... - * - * "fast" is used in the names of functions which are low-precision, - * or at least lower-precision than the normal C lib functions. - */ - - -#ifndef U_MATH_H -#define U_MATH_H - - -#include "pipe/p_compiler.h" - -#include "c99_math.h" -#include -#include -#include - -#include "util/bitscan.h" - -#ifdef __cplusplus -extern "C" { -#endif - - -#ifndef M_SQRT2 -#define M_SQRT2 1.41421356237309504880 -#endif - -#define POW2_TABLE_SIZE_LOG2 9 -#define POW2_TABLE_SIZE (1 << POW2_TABLE_SIZE_LOG2) -#define POW2_TABLE_OFFSET (POW2_TABLE_SIZE/2) -#define POW2_TABLE_SCALE ((float)(POW2_TABLE_SIZE/2)) -extern float pow2_table[POW2_TABLE_SIZE]; - - -/** - * Initialize math module. This should be called before using any - * other functions in this module. - */ -extern void -util_init_math(void); - - -union fi { - float f; - int32_t i; - uint32_t ui; -}; - - -union di { - double d; - int64_t i; - uint64_t ui; -}; - - -/** - * Extract the IEEE float32 exponent. - */ -static inline signed -util_get_float32_exponent(float x) -{ - union fi f; - - f.f = x; - - return ((f.ui >> 23) & 0xff) - 127; -} - - -/** - * Fast version of 2^x - * Identity: exp2(a + b) = exp2(a) * exp2(b) - * Let ipart = int(x) - * Let fpart = x - ipart; - * So, exp2(x) = exp2(ipart) * exp2(fpart) - * Compute exp2(ipart) with i << ipart - * Compute exp2(fpart) with lookup table. 
- */ -static inline float -util_fast_exp2(float x) -{ - int32_t ipart; - float fpart, mpart; - union fi epart; - - if(x > 129.00000f) - return 3.402823466e+38f; - - if (x < -126.99999f) - return 0.0f; - - ipart = (int32_t) x; - fpart = x - (float) ipart; - - /* same as - * epart.f = (float) (1 << ipart) - * but faster and without integer overflow for ipart > 31 - */ - epart.i = (ipart + 127 ) << 23; - - mpart = pow2_table[POW2_TABLE_OFFSET + (int)(fpart * POW2_TABLE_SCALE)]; - - return epart.f * mpart; -} - - -/** - * Fast approximation to exp(x). - */ -static inline float -util_fast_exp(float x) -{ - const float k = 1.44269f; /* = log2(e) */ - return util_fast_exp2(k * x); -} - - -#define LOG2_TABLE_SIZE_LOG2 16 -#define LOG2_TABLE_SCALE (1 << LOG2_TABLE_SIZE_LOG2) -#define LOG2_TABLE_SIZE (LOG2_TABLE_SCALE + 1) -extern float log2_table[LOG2_TABLE_SIZE]; - - -/** - * Fast approximation to log2(x). - */ -static inline float -util_fast_log2(float x) -{ - union fi num; - float epart, mpart; - num.f = x; - epart = (float)(((num.i & 0x7f800000) >> 23) - 127); - /* mpart = log2_table[mantissa*LOG2_TABLE_SCALE + 0.5] */ - mpart = log2_table[((num.i & 0x007fffff) + (1 << (22 - LOG2_TABLE_SIZE_LOG2))) >> (23 - LOG2_TABLE_SIZE_LOG2)]; - return epart + mpart; -} - - -/** - * Fast approximation to x^y. - */ -static inline float -util_fast_pow(float x, float y) -{ - return util_fast_exp2(util_fast_log2(x) * y); -} - - -/** - * Floor(x), returned as int. - */ -static inline int -util_ifloor(float f) -{ - int ai, bi; - double af, bf; - union fi u; - af = (3 << 22) + 0.5 + (double) f; - bf = (3 << 22) + 0.5 - (double) f; - u.f = (float) af; ai = u.i; - u.f = (float) bf; bi = u.i; - return (ai - bi) >> 1; -} - - -/** - * Round float to nearest int. - */ -static inline int -util_iround(float f) -{ -#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86) - int r; - __asm__ ("fistpl %0" : "=m" (r) : "t" (f) : "st"); - return r; -#elif defined(PIPE_CC_MSVC) && defined(PIPE_ARCH_X86) - int r; - _asm { - fld f - fistp r - } - return r; -#else - if (f >= 0.0f) - return (int) (f + 0.5f); - else - return (int) (f - 0.5f); -#endif -} - - -/** - * Approximate floating point comparison - */ -static inline boolean -util_is_approx(float a, float b, float tol) -{ - return fabsf(b - a) <= tol; -} - - -/** - * util_is_X_inf_or_nan = test if x is NaN or +/- Inf - * util_is_X_nan = test if x is NaN - * util_X_inf_sign = return +1 for +Inf, -1 for -Inf, or 0 for not Inf - * - * NaN can be checked with x != x, however this fails with the fast math flag - **/ - - -/** - * Single-float - */ -static inline boolean -util_is_inf_or_nan(float x) -{ - union fi tmp; - tmp.f = x; - return (tmp.ui & 0x7f800000) == 0x7f800000; -} - - -static inline boolean -util_is_nan(float x) -{ - union fi tmp; - tmp.f = x; - return (tmp.ui & 0x7fffffff) > 0x7f800000; -} - - -static inline int -util_inf_sign(float x) -{ - union fi tmp; - tmp.f = x; - if ((tmp.ui & 0x7fffffff) != 0x7f800000) { - return 0; - } - - return (x < 0) ? 
-1 : 1; -} - - -/** - * Double-float - */ -static inline boolean -util_is_double_inf_or_nan(double x) -{ - union di tmp; - tmp.d = x; - return (tmp.ui & 0x7ff0000000000000ULL) == 0x7ff0000000000000ULL; -} - - -static inline boolean -util_is_double_nan(double x) -{ - union di tmp; - tmp.d = x; - return (tmp.ui & 0x7fffffffffffffffULL) > 0x7ff0000000000000ULL; -} - - -static inline int -util_double_inf_sign(double x) -{ - union di tmp; - tmp.d = x; - if ((tmp.ui & 0x7fffffffffffffffULL) != 0x7ff0000000000000ULL) { - return 0; - } - - return (x < 0) ? -1 : 1; -} - - -/** - * Half-float - */ -static inline boolean -util_is_half_inf_or_nan(int16_t x) -{ - return (x & 0x7c00) == 0x7c00; -} - - -static inline boolean -util_is_half_nan(int16_t x) -{ - return (x & 0x7fff) > 0x7c00; -} - - -static inline int -util_half_inf_sign(int16_t x) -{ - if ((x & 0x7fff) != 0x7c00) { - return 0; - } - - return (x < 0) ? -1 : 1; -} - - -/** - * Return float bits. - */ -static inline unsigned -fui( float f ) -{ - union fi fi; - fi.f = f; - return fi.ui; -} - -static inline float -uif(uint32_t ui) -{ - union fi fi; - fi.ui = ui; - return fi.f; -} - - -/** - * Convert ubyte to float in [0, 1]. - */ -static inline float -ubyte_to_float(ubyte ub) -{ - return (float) ub * (1.0f / 255.0f); -} - - -/** - * Convert float in [0,1] to ubyte in [0,255] with clamping. - */ -static inline ubyte -float_to_ubyte(float f) -{ - /* return 0 for NaN too */ - if (!(f > 0.0f)) { - return (ubyte) 0; - } - else if (f >= 1.0f) { - return (ubyte) 255; - } - else { - union fi tmp; - tmp.f = f; - tmp.f = tmp.f * (255.0f/256.0f) + 32768.0f; - return (ubyte) tmp.i; - } -} - -static inline float -byte_to_float_tex(int8_t b) -{ - return (b == -128) ? -1.0F : b * 1.0F / 127.0F; -} - -static inline int8_t -float_to_byte_tex(float f) -{ - return (int8_t) (127.0F * f); -} - -/** - * Calc log base 2 - */ -static inline unsigned -util_logbase2(unsigned n) -{ -#if defined(HAVE___BUILTIN_CLZ) - return ((sizeof(unsigned) * 8 - 1) - __builtin_clz(n | 1)); -#else - unsigned pos = 0; - if (n >= 1<<16) { n >>= 16; pos += 16; } - if (n >= 1<< 8) { n >>= 8; pos += 8; } - if (n >= 1<< 4) { n >>= 4; pos += 4; } - if (n >= 1<< 2) { n >>= 2; pos += 2; } - if (n >= 1<< 1) { pos += 1; } - return pos; -#endif -} - -static inline uint64_t -util_logbase2_64(uint64_t n) -{ -#if defined(HAVE___BUILTIN_CLZLL) - return ((sizeof(uint64_t) * 8 - 1) - __builtin_clzll(n | 1)); -#else - uint64_t pos = 0ull; - if (n >= 1ull<<32) { n >>= 32; pos += 32; } - if (n >= 1ull<<16) { n >>= 16; pos += 16; } - if (n >= 1ull<< 8) { n >>= 8; pos += 8; } - if (n >= 1ull<< 4) { n >>= 4; pos += 4; } - if (n >= 1ull<< 2) { n >>= 2; pos += 2; } - if (n >= 1ull<< 1) { pos += 1; } - return pos; -#endif -} - -/** - * Returns the ceiling of log n base 2, and 0 when n == 0. Equivalently, - * returns the smallest x such that n <= 2**x. 
- */ -static inline unsigned -util_logbase2_ceil(unsigned n) -{ - if (n <= 1) - return 0; - - return 1 + util_logbase2(n - 1); -} - -static inline uint64_t -util_logbase2_ceil64(uint64_t n) -{ - if (n <= 1) - return 0; - - return 1ull + util_logbase2_64(n - 1); -} - -/** - * Returns the smallest power of two >= x - */ -static inline unsigned -util_next_power_of_two(unsigned x) -{ -#if defined(HAVE___BUILTIN_CLZ) - if (x <= 1) - return 1; - - return (1 << ((sizeof(unsigned) * 8) - __builtin_clz(x - 1))); -#else - unsigned val = x; - - if (x <= 1) - return 1; - - if (util_is_power_of_two_or_zero(x)) - return x; - - val--; - val = (val >> 1) | val; - val = (val >> 2) | val; - val = (val >> 4) | val; - val = (val >> 8) | val; - val = (val >> 16) | val; - val++; - return val; -#endif -} - -static inline uint64_t -util_next_power_of_two64(uint64_t x) -{ -#if defined(HAVE___BUILTIN_CLZLL) - if (x <= 1) - return 1; - - return (1ull << ((sizeof(uint64_t) * 8) - __builtin_clzll(x - 1))); -#else - uint64_t val = x; - - if (x <= 1) - return 1; - - if (util_is_power_of_two_or_zero64(x)) - return x; - - val--; - val = (val >> 1) | val; - val = (val >> 2) | val; - val = (val >> 4) | val; - val = (val >> 8) | val; - val = (val >> 16) | val; - val = (val >> 32) | val; - val++; - return val; -#endif -} - - -/** - * Return number of bits set in n. - */ -static inline unsigned -util_bitcount(unsigned n) -{ -#if defined(HAVE___BUILTIN_POPCOUNT) - return __builtin_popcount(n); -#else - /* K&R classic bitcount. - * - * For each iteration, clear the LSB from the bitfield. - * Requires only one iteration per set bit, instead of - * one iteration per bit less than highest set bit. - */ - unsigned bits; - for (bits = 0; n; bits++) { - n &= n - 1; - } - return bits; -#endif -} - - -static inline unsigned -util_bitcount64(uint64_t n) -{ -#ifdef HAVE___BUILTIN_POPCOUNTLL - return __builtin_popcountll(n); -#else - return util_bitcount(n) + util_bitcount(n >> 32); -#endif -} - - -/** - * Reverse bits in n - * Algorithm taken from: - * http://stackoverflow.com/questions/9144800/c-reverse-bits-in-unsigned-integer - */ -static inline unsigned -util_bitreverse(unsigned n) -{ - n = ((n >> 1) & 0x55555555u) | ((n & 0x55555555u) << 1); - n = ((n >> 2) & 0x33333333u) | ((n & 0x33333333u) << 2); - n = ((n >> 4) & 0x0f0f0f0fu) | ((n & 0x0f0f0f0fu) << 4); - n = ((n >> 8) & 0x00ff00ffu) | ((n & 0x00ff00ffu) << 8); - n = ((n >> 16) & 0xffffu) | ((n & 0xffffu) << 16); - return n; -} - -/** - * Convert from little endian to CPU byte order. - */ - -#ifdef PIPE_ARCH_BIG_ENDIAN -#define util_le64_to_cpu(x) util_bswap64(x) -#define util_le32_to_cpu(x) util_bswap32(x) -#define util_le16_to_cpu(x) util_bswap16(x) -#else -#define util_le64_to_cpu(x) (x) -#define util_le32_to_cpu(x) (x) -#define util_le16_to_cpu(x) (x) -#endif - -#define util_cpu_to_le64(x) util_le64_to_cpu(x) -#define util_cpu_to_le32(x) util_le32_to_cpu(x) -#define util_cpu_to_le16(x) util_le16_to_cpu(x) - -/** - * Reverse byte order of a 32 bit word. - */ -static inline uint32_t -util_bswap32(uint32_t n) -{ -#if defined(HAVE___BUILTIN_BSWAP32) - return __builtin_bswap32(n); -#else - return (n >> 24) | - ((n >> 8) & 0x0000ff00) | - ((n << 8) & 0x00ff0000) | - (n << 24); -#endif -} - -/** - * Reverse byte order of a 64bit word. 
- */ -static inline uint64_t -util_bswap64(uint64_t n) -{ -#if defined(HAVE___BUILTIN_BSWAP64) - return __builtin_bswap64(n); -#else - return ((uint64_t)util_bswap32((uint32_t)n) << 32) | - util_bswap32((n >> 32)); -#endif -} - - -/** - * Reverse byte order of a 16 bit word. - */ -static inline uint16_t -util_bswap16(uint16_t n) -{ - return (n >> 8) | - (n << 8); -} - -static inline void* -util_memcpy_cpu_to_le32(void * restrict dest, const void * restrict src, size_t n) -{ -#ifdef PIPE_ARCH_BIG_ENDIAN - size_t i, e; - assert(n % 4 == 0); - - for (i = 0, e = n / 4; i < e; i++) { - uint32_t * restrict d = (uint32_t* restrict)dest; - const uint32_t * restrict s = (const uint32_t* restrict)src; - d[i] = util_bswap32(s[i]); - } - return dest; -#else - return memcpy(dest, src, n); -#endif -} - -/** - * Clamp X to [MIN, MAX]. - * This is a macro to allow float, int, uint, etc. types. - * We arbitrarily turn NaN into MIN. - */ -#define CLAMP( X, MIN, MAX ) ( (X)>(MIN) ? ((X)>(MAX) ? (MAX) : (X)) : (MIN) ) - -#define MIN2( A, B ) ( (A)<(B) ? (A) : (B) ) -#define MAX2( A, B ) ( (A)>(B) ? (A) : (B) ) - -#define MIN3( A, B, C ) ((A) < (B) ? MIN2(A, C) : MIN2(B, C)) -#define MAX3( A, B, C ) ((A) > (B) ? MAX2(A, C) : MAX2(B, C)) - -#define MIN4( A, B, C, D ) ((A) < (B) ? MIN3(A, C, D) : MIN3(B, C, D)) -#define MAX4( A, B, C, D ) ((A) > (B) ? MAX3(A, C, D) : MAX3(B, C, D)) - - -/** - * Align a value, only works pot alignemnts. - */ -static inline int -align(int value, int alignment) -{ - return (value + alignment - 1) & ~(alignment - 1); -} - -static inline uint64_t -align64(uint64_t value, unsigned alignment) -{ - return (value + alignment - 1) & ~((uint64_t)alignment - 1); -} - -/** - * Works like align but on npot alignments. - */ -static inline size_t -util_align_npot(size_t value, size_t alignment) -{ - if (value % alignment) - return value + (alignment - (value % alignment)); - return value; -} - -static inline unsigned -u_minify(unsigned value, unsigned levels) -{ - return MAX2(1, value >> levels); -} - -#ifndef COPY_4V -#define COPY_4V( DST, SRC ) \ -do { \ - (DST)[0] = (SRC)[0]; \ - (DST)[1] = (SRC)[1]; \ - (DST)[2] = (SRC)[2]; \ - (DST)[3] = (SRC)[3]; \ -} while (0) -#endif - - -#ifndef COPY_4FV -#define COPY_4FV( DST, SRC ) COPY_4V(DST, SRC) -#endif - - -#ifndef ASSIGN_4V -#define ASSIGN_4V( DST, V0, V1, V2, V3 ) \ -do { \ - (DST)[0] = (V0); \ - (DST)[1] = (V1); \ - (DST)[2] = (V2); \ - (DST)[3] = (V3); \ -} while (0) -#endif - - -static inline uint32_t -util_unsigned_fixed(float value, unsigned frac_bits) -{ - return value < 0 ? 0 : (uint32_t)(value * (1< +/* This is defined in pmmintrin.h, but it can only be included when -msse3 is + * used, so just define it here to avoid further. */ +#define _MM_DENORMALS_ZERO_MASK 0x0040 +#endif + + +/** 2^x, for x in [-1.0, 1.0) */ +float pow2_table[POW2_TABLE_SIZE]; + + +static void +init_pow2_table(void) +{ + int i; + for (i = 0; i < POW2_TABLE_SIZE; i++) + pow2_table[i] = exp2f((i - POW2_TABLE_OFFSET) / POW2_TABLE_SCALE); +} + + +/** log2(x), for x in [1.0, 2.0) */ +float log2_table[LOG2_TABLE_SIZE]; + + +static void +init_log2_table(void) +{ + unsigned i; + for (i = 0; i < LOG2_TABLE_SIZE; i++) + log2_table[i] = (float) log2(1.0 + i * (1.0 / LOG2_TABLE_SCALE)); +} + + +/** + * One time init for math utilities. + */ +void +util_init_math(void) +{ + static boolean initialized = FALSE; + if (!initialized) { + init_pow2_table(); + init_log2_table(); + initialized = TRUE; + } +} + +/** + * Fetches the contents of the fpstate (mxcsr on x86) register. 
+ * + * On platforms without support for it just returns 0. + */ +unsigned +util_fpstate_get(void) +{ + unsigned mxcsr = 0; + +#if defined(PIPE_ARCH_SSE) + if (util_cpu_caps.has_sse) { + mxcsr = _mm_getcsr(); + } +#endif + + return mxcsr; +} + +/** + * Make sure that the fp treats the denormalized floating + * point numbers as zero. + * + * This is the behavior required by D3D10. OpenGL doesn't care. + */ +unsigned +util_fpstate_set_denorms_to_zero(unsigned current_mxcsr) +{ +#if defined(PIPE_ARCH_SSE) + if (util_cpu_caps.has_sse) { + /* Enable flush to zero mode */ + current_mxcsr |= _MM_FLUSH_ZERO_MASK; + if (util_cpu_caps.has_daz) { + /* Enable denormals are zero mode */ + current_mxcsr |= _MM_DENORMALS_ZERO_MASK; + } + util_fpstate_set(current_mxcsr); + } +#endif + return current_mxcsr; +} + +/** + * Set the state of the fpstate (mxcsr on x86) register. + * + * On platforms without support for it's a noop. + */ +void +util_fpstate_set(unsigned mxcsr) +{ +#if defined(PIPE_ARCH_SSE) + if (util_cpu_caps.has_sse) { + _mm_setcsr(mxcsr); + } +#endif +} diff --git a/src/util/u_math.h b/src/util/u_math.h new file mode 100644 index 00000000000..e7dbbe5ca22 --- /dev/null +++ b/src/util/u_math.h @@ -0,0 +1,752 @@ +/************************************************************************** + * + * Copyright 2008 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +/** + * Math utilities and approximations for common math functions. + * Reduced precision is usually acceptable in shaders... + * + * "fast" is used in the names of functions which are low-precision, + * or at least lower-precision than the normal C lib functions. + */ + + +#ifndef U_MATH_H +#define U_MATH_H + + +#include "pipe/p_compiler.h" + +#include "c99_math.h" +#include +#include +#include + +#include "bitscan.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +#ifndef M_SQRT2 +#define M_SQRT2 1.41421356237309504880 +#endif + +#define POW2_TABLE_SIZE_LOG2 9 +#define POW2_TABLE_SIZE (1 << POW2_TABLE_SIZE_LOG2) +#define POW2_TABLE_OFFSET (POW2_TABLE_SIZE/2) +#define POW2_TABLE_SCALE ((float)(POW2_TABLE_SIZE/2)) +extern float pow2_table[POW2_TABLE_SIZE]; + + +/** + * Initialize math module. This should be called before using any + * other functions in this module. 
+ */ +extern void +util_init_math(void); + + +union fi { + float f; + int32_t i; + uint32_t ui; +}; + + +union di { + double d; + int64_t i; + uint64_t ui; +}; + + +/** + * Extract the IEEE float32 exponent. + */ +static inline signed +util_get_float32_exponent(float x) +{ + union fi f; + + f.f = x; + + return ((f.ui >> 23) & 0xff) - 127; +} + + +/** + * Fast version of 2^x + * Identity: exp2(a + b) = exp2(a) * exp2(b) + * Let ipart = int(x) + * Let fpart = x - ipart; + * So, exp2(x) = exp2(ipart) * exp2(fpart) + * Compute exp2(ipart) with i << ipart + * Compute exp2(fpart) with lookup table. + */ +static inline float +util_fast_exp2(float x) +{ + int32_t ipart; + float fpart, mpart; + union fi epart; + + if(x > 129.00000f) + return 3.402823466e+38f; + + if (x < -126.99999f) + return 0.0f; + + ipart = (int32_t) x; + fpart = x - (float) ipart; + + /* same as + * epart.f = (float) (1 << ipart) + * but faster and without integer overflow for ipart > 31 + */ + epart.i = (ipart + 127 ) << 23; + + mpart = pow2_table[POW2_TABLE_OFFSET + (int)(fpart * POW2_TABLE_SCALE)]; + + return epart.f * mpart; +} + + +/** + * Fast approximation to exp(x). + */ +static inline float +util_fast_exp(float x) +{ + const float k = 1.44269f; /* = log2(e) */ + return util_fast_exp2(k * x); +} + + +#define LOG2_TABLE_SIZE_LOG2 16 +#define LOG2_TABLE_SCALE (1 << LOG2_TABLE_SIZE_LOG2) +#define LOG2_TABLE_SIZE (LOG2_TABLE_SCALE + 1) +extern float log2_table[LOG2_TABLE_SIZE]; + + +/** + * Fast approximation to log2(x). + */ +static inline float +util_fast_log2(float x) +{ + union fi num; + float epart, mpart; + num.f = x; + epart = (float)(((num.i & 0x7f800000) >> 23) - 127); + /* mpart = log2_table[mantissa*LOG2_TABLE_SCALE + 0.5] */ + mpart = log2_table[((num.i & 0x007fffff) + (1 << (22 - LOG2_TABLE_SIZE_LOG2))) >> (23 - LOG2_TABLE_SIZE_LOG2)]; + return epart + mpart; +} + + +/** + * Fast approximation to x^y. + */ +static inline float +util_fast_pow(float x, float y) +{ + return util_fast_exp2(util_fast_log2(x) * y); +} + + +/** + * Floor(x), returned as int. + */ +static inline int +util_ifloor(float f) +{ + int ai, bi; + double af, bf; + union fi u; + af = (3 << 22) + 0.5 + (double) f; + bf = (3 << 22) + 0.5 - (double) f; + u.f = (float) af; ai = u.i; + u.f = (float) bf; bi = u.i; + return (ai - bi) >> 1; +} + + +/** + * Round float to nearest int. 
+ */ +static inline int +util_iround(float f) +{ +#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86) + int r; + __asm__ ("fistpl %0" : "=m" (r) : "t" (f) : "st"); + return r; +#elif defined(PIPE_CC_MSVC) && defined(PIPE_ARCH_X86) + int r; + _asm { + fld f + fistp r + } + return r; +#else + if (f >= 0.0f) + return (int) (f + 0.5f); + else + return (int) (f - 0.5f); +#endif +} + + +/** + * Approximate floating point comparison + */ +static inline boolean +util_is_approx(float a, float b, float tol) +{ + return fabsf(b - a) <= tol; +} + + +/** + * util_is_X_inf_or_nan = test if x is NaN or +/- Inf + * util_is_X_nan = test if x is NaN + * util_X_inf_sign = return +1 for +Inf, -1 for -Inf, or 0 for not Inf + * + * NaN can be checked with x != x, however this fails with the fast math flag + **/ + + +/** + * Single-float + */ +static inline boolean +util_is_inf_or_nan(float x) +{ + union fi tmp; + tmp.f = x; + return (tmp.ui & 0x7f800000) == 0x7f800000; +} + + +static inline boolean +util_is_nan(float x) +{ + union fi tmp; + tmp.f = x; + return (tmp.ui & 0x7fffffff) > 0x7f800000; +} + + +static inline int +util_inf_sign(float x) +{ + union fi tmp; + tmp.f = x; + if ((tmp.ui & 0x7fffffff) != 0x7f800000) { + return 0; + } + + return (x < 0) ? -1 : 1; +} + + +/** + * Double-float + */ +static inline boolean +util_is_double_inf_or_nan(double x) +{ + union di tmp; + tmp.d = x; + return (tmp.ui & 0x7ff0000000000000ULL) == 0x7ff0000000000000ULL; +} + + +static inline boolean +util_is_double_nan(double x) +{ + union di tmp; + tmp.d = x; + return (tmp.ui & 0x7fffffffffffffffULL) > 0x7ff0000000000000ULL; +} + + +static inline int +util_double_inf_sign(double x) +{ + union di tmp; + tmp.d = x; + if ((tmp.ui & 0x7fffffffffffffffULL) != 0x7ff0000000000000ULL) { + return 0; + } + + return (x < 0) ? -1 : 1; +} + + +/** + * Half-float + */ +static inline boolean +util_is_half_inf_or_nan(int16_t x) +{ + return (x & 0x7c00) == 0x7c00; +} + + +static inline boolean +util_is_half_nan(int16_t x) +{ + return (x & 0x7fff) > 0x7c00; +} + + +static inline int +util_half_inf_sign(int16_t x) +{ + if ((x & 0x7fff) != 0x7c00) { + return 0; + } + + return (x < 0) ? -1 : 1; +} + + +/** + * Return float bits. + */ +static inline unsigned +fui( float f ) +{ + union fi fi; + fi.f = f; + return fi.ui; +} + +static inline float +uif(uint32_t ui) +{ + union fi fi; + fi.ui = ui; + return fi.f; +} + + +/** + * Convert ubyte to float in [0, 1]. + */ +static inline float +ubyte_to_float(ubyte ub) +{ + return (float) ub * (1.0f / 255.0f); +} + + +/** + * Convert float in [0,1] to ubyte in [0,255] with clamping. + */ +static inline ubyte +float_to_ubyte(float f) +{ + /* return 0 for NaN too */ + if (!(f > 0.0f)) { + return (ubyte) 0; + } + else if (f >= 1.0f) { + return (ubyte) 255; + } + else { + union fi tmp; + tmp.f = f; + tmp.f = tmp.f * (255.0f/256.0f) + 32768.0f; + return (ubyte) tmp.i; + } +} + +static inline float +byte_to_float_tex(int8_t b) +{ + return (b == -128) ? 
-1.0F : b * 1.0F / 127.0F; +} + +static inline int8_t +float_to_byte_tex(float f) +{ + return (int8_t) (127.0F * f); +} + +/** + * Calc log base 2 + */ +static inline unsigned +util_logbase2(unsigned n) +{ +#if defined(HAVE___BUILTIN_CLZ) + return ((sizeof(unsigned) * 8 - 1) - __builtin_clz(n | 1)); +#else + unsigned pos = 0; + if (n >= 1<<16) { n >>= 16; pos += 16; } + if (n >= 1<< 8) { n >>= 8; pos += 8; } + if (n >= 1<< 4) { n >>= 4; pos += 4; } + if (n >= 1<< 2) { n >>= 2; pos += 2; } + if (n >= 1<< 1) { pos += 1; } + return pos; +#endif +} + +static inline uint64_t +util_logbase2_64(uint64_t n) +{ +#if defined(HAVE___BUILTIN_CLZLL) + return ((sizeof(uint64_t) * 8 - 1) - __builtin_clzll(n | 1)); +#else + uint64_t pos = 0ull; + if (n >= 1ull<<32) { n >>= 32; pos += 32; } + if (n >= 1ull<<16) { n >>= 16; pos += 16; } + if (n >= 1ull<< 8) { n >>= 8; pos += 8; } + if (n >= 1ull<< 4) { n >>= 4; pos += 4; } + if (n >= 1ull<< 2) { n >>= 2; pos += 2; } + if (n >= 1ull<< 1) { pos += 1; } + return pos; +#endif +} + +/** + * Returns the ceiling of log n base 2, and 0 when n == 0. Equivalently, + * returns the smallest x such that n <= 2**x. + */ +static inline unsigned +util_logbase2_ceil(unsigned n) +{ + if (n <= 1) + return 0; + + return 1 + util_logbase2(n - 1); +} + +static inline uint64_t +util_logbase2_ceil64(uint64_t n) +{ + if (n <= 1) + return 0; + + return 1ull + util_logbase2_64(n - 1); +} + +/** + * Returns the smallest power of two >= x + */ +static inline unsigned +util_next_power_of_two(unsigned x) +{ +#if defined(HAVE___BUILTIN_CLZ) + if (x <= 1) + return 1; + + return (1 << ((sizeof(unsigned) * 8) - __builtin_clz(x - 1))); +#else + unsigned val = x; + + if (x <= 1) + return 1; + + if (util_is_power_of_two_or_zero(x)) + return x; + + val--; + val = (val >> 1) | val; + val = (val >> 2) | val; + val = (val >> 4) | val; + val = (val >> 8) | val; + val = (val >> 16) | val; + val++; + return val; +#endif +} + +static inline uint64_t +util_next_power_of_two64(uint64_t x) +{ +#if defined(HAVE___BUILTIN_CLZLL) + if (x <= 1) + return 1; + + return (1ull << ((sizeof(uint64_t) * 8) - __builtin_clzll(x - 1))); +#else + uint64_t val = x; + + if (x <= 1) + return 1; + + if (util_is_power_of_two_or_zero64(x)) + return x; + + val--; + val = (val >> 1) | val; + val = (val >> 2) | val; + val = (val >> 4) | val; + val = (val >> 8) | val; + val = (val >> 16) | val; + val = (val >> 32) | val; + val++; + return val; +#endif +} + + +/** + * Return number of bits set in n. + */ +static inline unsigned +util_bitcount(unsigned n) +{ +#if defined(HAVE___BUILTIN_POPCOUNT) + return __builtin_popcount(n); +#else + /* K&R classic bitcount. + * + * For each iteration, clear the LSB from the bitfield. + * Requires only one iteration per set bit, instead of + * one iteration per bit less than highest set bit. 
+ */ + unsigned bits; + for (bits = 0; n; bits++) { + n &= n - 1; + } + return bits; +#endif +} + + +static inline unsigned +util_bitcount64(uint64_t n) +{ +#ifdef HAVE___BUILTIN_POPCOUNTLL + return __builtin_popcountll(n); +#else + return util_bitcount(n) + util_bitcount(n >> 32); +#endif +} + + +/** + * Reverse bits in n + * Algorithm taken from: + * http://stackoverflow.com/questions/9144800/c-reverse-bits-in-unsigned-integer + */ +static inline unsigned +util_bitreverse(unsigned n) +{ + n = ((n >> 1) & 0x55555555u) | ((n & 0x55555555u) << 1); + n = ((n >> 2) & 0x33333333u) | ((n & 0x33333333u) << 2); + n = ((n >> 4) & 0x0f0f0f0fu) | ((n & 0x0f0f0f0fu) << 4); + n = ((n >> 8) & 0x00ff00ffu) | ((n & 0x00ff00ffu) << 8); + n = ((n >> 16) & 0xffffu) | ((n & 0xffffu) << 16); + return n; +} + +/** + * Convert from little endian to CPU byte order. + */ + +#ifdef PIPE_ARCH_BIG_ENDIAN +#define util_le64_to_cpu(x) util_bswap64(x) +#define util_le32_to_cpu(x) util_bswap32(x) +#define util_le16_to_cpu(x) util_bswap16(x) +#else +#define util_le64_to_cpu(x) (x) +#define util_le32_to_cpu(x) (x) +#define util_le16_to_cpu(x) (x) +#endif + +#define util_cpu_to_le64(x) util_le64_to_cpu(x) +#define util_cpu_to_le32(x) util_le32_to_cpu(x) +#define util_cpu_to_le16(x) util_le16_to_cpu(x) + +/** + * Reverse byte order of a 32 bit word. + */ +static inline uint32_t +util_bswap32(uint32_t n) +{ +#if defined(HAVE___BUILTIN_BSWAP32) + return __builtin_bswap32(n); +#else + return (n >> 24) | + ((n >> 8) & 0x0000ff00) | + ((n << 8) & 0x00ff0000) | + (n << 24); +#endif +} + +/** + * Reverse byte order of a 64bit word. + */ +static inline uint64_t +util_bswap64(uint64_t n) +{ +#if defined(HAVE___BUILTIN_BSWAP64) + return __builtin_bswap64(n); +#else + return ((uint64_t)util_bswap32((uint32_t)n) << 32) | + util_bswap32((n >> 32)); +#endif +} + + +/** + * Reverse byte order of a 16 bit word. + */ +static inline uint16_t +util_bswap16(uint16_t n) +{ + return (n >> 8) | + (n << 8); +} + +static inline void* +util_memcpy_cpu_to_le32(void * restrict dest, const void * restrict src, size_t n) +{ +#ifdef PIPE_ARCH_BIG_ENDIAN + size_t i, e; + assert(n % 4 == 0); + + for (i = 0, e = n / 4; i < e; i++) { + uint32_t * restrict d = (uint32_t* restrict)dest; + const uint32_t * restrict s = (const uint32_t* restrict)src; + d[i] = util_bswap32(s[i]); + } + return dest; +#else + return memcpy(dest, src, n); +#endif +} + +/** + * Clamp X to [MIN, MAX]. + * This is a macro to allow float, int, uint, etc. types. + * We arbitrarily turn NaN into MIN. + */ +#define CLAMP( X, MIN, MAX ) ( (X)>(MIN) ? ((X)>(MAX) ? (MAX) : (X)) : (MIN) ) + +#define MIN2( A, B ) ( (A)<(B) ? (A) : (B) ) +#define MAX2( A, B ) ( (A)>(B) ? (A) : (B) ) + +#define MIN3( A, B, C ) ((A) < (B) ? MIN2(A, C) : MIN2(B, C)) +#define MAX3( A, B, C ) ((A) > (B) ? MAX2(A, C) : MAX2(B, C)) + +#define MIN4( A, B, C, D ) ((A) < (B) ? MIN3(A, C, D) : MIN3(B, C, D)) +#define MAX4( A, B, C, D ) ((A) > (B) ? MAX3(A, C, D) : MAX3(B, C, D)) + + +/** + * Align a value, only works pot alignemnts. + */ +static inline int +align(int value, int alignment) +{ + return (value + alignment - 1) & ~(alignment - 1); +} + +static inline uint64_t +align64(uint64_t value, unsigned alignment) +{ + return (value + alignment - 1) & ~((uint64_t)alignment - 1); +} + +/** + * Works like align but on npot alignments. 
+ */ +static inline size_t +util_align_npot(size_t value, size_t alignment) +{ + if (value % alignment) + return value + (alignment - (value % alignment)); + return value; +} + +static inline unsigned +u_minify(unsigned value, unsigned levels) +{ + return MAX2(1, value >> levels); +} + +#ifndef COPY_4V +#define COPY_4V( DST, SRC ) \ +do { \ + (DST)[0] = (SRC)[0]; \ + (DST)[1] = (SRC)[1]; \ + (DST)[2] = (SRC)[2]; \ + (DST)[3] = (SRC)[3]; \ +} while (0) +#endif + + +#ifndef COPY_4FV +#define COPY_4FV( DST, SRC ) COPY_4V(DST, SRC) +#endif + + +#ifndef ASSIGN_4V +#define ASSIGN_4V( DST, V0, V1, V2, V3 ) \ +do { \ + (DST)[0] = (V0); \ + (DST)[1] = (V1); \ + (DST)[2] = (V2); \ + (DST)[3] = (V3); \ +} while (0) +#endif + + +static inline uint32_t +util_unsigned_fixed(float value, unsigned frac_bits) +{ + return value < 0 ? 0 : (uint32_t)(value * (1<
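As an illustration of the portability argument in the commit message above, here is a minimal, self-contained C sketch (not part of the patch) of the dispatch pattern that util_bitcount64() in the moved util/u_math.h uses: prefer the compiler builtin when the build defines HAVE___BUILTIN_POPCOUNTLL, otherwise fall back to counting each 32-bit half, which is the path a 32-bit MSVC build ends up on. The file name and the bitcount32/bitcount64 helpers below are invented for the sketch only.

/* bitcount_sketch.c -- standalone illustration of the popcount fallback
 * pattern used by util_bitcount()/util_bitcount64() in util/u_math.h. */
#include <stdint.h>
#include <stdio.h>

static unsigned
bitcount32(uint32_t n)
{
   /* K&R bit count: each iteration clears the lowest set bit, so the
    * loop runs once per set bit. */
   unsigned bits;
   for (bits = 0; n; bits++)
      n &= n - 1;
   return bits;
}

static unsigned
bitcount64(uint64_t n)
{
#ifdef HAVE___BUILTIN_POPCOUNTLL
   /* Fast path when the compiler provides the builtin (gcc/clang). */
   return __builtin_popcountll(n);
#else
   /* Portable fallback, e.g. for 32-bit MSVC, which has no 64-bit
    * popcount builtin: count each 32-bit half separately. */
   return bitcount32((uint32_t)n) + bitcount32((uint32_t)(n >> 32));
#endif
}

int
main(void)
{
   printf("%u\n", bitcount64(0xf0f0f0f0f0f0f0f0ull));   /* prints 32 */
   return 0;
}

With the header now living in src/util, a consumer such as nir includes "util/u_math.h" and picks these helpers up from the util library rather than from core mesa, which is exactly the dependency the commit message calls for.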