4e58e505572e6d9280d3993d4225c373b6ab7cbf
1 /**************************************************************************
3 * Copyright 2008 VMware, Inc.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
30 * Math utilities and approximations for common math functions.
31 * Reduced precision is usually acceptable in shaders...
33 * "fast" is used in the names of functions which are low-precision,
34 * or at least lower-precision than the normal C lib functions.
42 #include "pipe/p_compiler.h"
50 #include <strings.h> /* for ffs */
#ifndef M_SQRT2
/* math.h may already provide this (POSIX); guard to avoid redefinition. */
#define M_SQRT2 1.41421356237309504880
#endif

/* Lookup table for util_fast_exp2(): covers the fractional part of the
 * exponent in [-0.5, 0.5), indexed around POW2_TABLE_OFFSET.
 * The table itself is defined in the corresponding .c file.
 */
#define POW2_TABLE_SIZE_LOG2 9
#define POW2_TABLE_SIZE (1 << POW2_TABLE_SIZE_LOG2)
#define POW2_TABLE_OFFSET (POW2_TABLE_SIZE/2)
#define POW2_TABLE_SCALE ((float)(POW2_TABLE_SIZE/2))
extern float pow2_table[POW2_TABLE_SIZE];
75 * Initialize math module. This should be called before using any
76 * other functions in this module.
/**
 * Extract the IEEE float32 exponent (unbiased).
 * E.g. returns 0 for 1.0f, 1 for 2.0f, -1 for 0.5f.
 * Uses memcpy type-punning, which is well-defined in C (no strict-aliasing
 * issues), so this block is self-contained.
 */
static inline int
util_get_float32_exponent(float x)
{
   uint32_t bits;
   memcpy(&bits, &x, sizeof bits);
   /* exponent field is bits 23..30; subtract the IEEE-754 bias of 127 */
   return (int)((bits >> 23) & 0xff) - 127;
}
111 * Fast version of 2^x
112 * Identity: exp2(a + b) = exp2(a) * exp2(b)
114 * Let fpart = x - ipart;
115 * So, exp2(x) = exp2(ipart) * exp2(fpart)
116 * Compute exp2(ipart) with i << ipart
117 * Compute exp2(fpart) with lookup table.
120 util_fast_exp2(float x
)
127 return 3.402823466e+38f
;
133 fpart
= x
- (float) ipart
;
136 * epart.f = (float) (1 << ipart)
137 * but faster and without integer overflow for ipart > 31
139 epart
.i
= (ipart
+ 127 ) << 23;
141 mpart
= pow2_table
[POW2_TABLE_OFFSET
+ (int)(fpart
* POW2_TABLE_SCALE
)];
143 return epart
.f
* mpart
;
/**
 * Fast approximation to exp(x).
 * Uses the identity exp(x) = exp2(x * log2(e)).
 */
static inline float
util_fast_exp(float x)
{
   const float k = 1.44269f; /* = log2(e) */
   return util_fast_exp2(k * x);
}
/* Lookup table for util_fast_log2(): indexed by the top 16 mantissa bits,
 * with one extra entry so the rounding add below cannot index past the end.
 * The table itself is defined in the corresponding .c file.
 */
#define LOG2_TABLE_SIZE_LOG2 16
#define LOG2_TABLE_SCALE (1 << LOG2_TABLE_SIZE_LOG2)
#define LOG2_TABLE_SIZE (LOG2_TABLE_SCALE + 1)
extern float log2_table[LOG2_TABLE_SIZE];
165 * Fast approximation to log2(x).
168 util_fast_log2(float x
)
173 epart
= (float)(((num
.i
& 0x7f800000) >> 23) - 127);
174 /* mpart = log2_table[mantissa*LOG2_TABLE_SCALE + 0.5] */
175 mpart
= log2_table
[((num
.i
& 0x007fffff) + (1 << (22 - LOG2_TABLE_SIZE_LOG2
))) >> (23 - LOG2_TABLE_SIZE_LOG2
)];
176 return epart
+ mpart
;
/**
 * Fast approximation to x^y.
 * Uses the identity x^y = exp2(log2(x) * y); inherits the reduced
 * precision of util_fast_exp2/util_fast_log2.
 */
static inline float
util_fast_pow(float x, float y)
{
   return util_fast_exp2(util_fast_log2(x) * y);
}
189 /* Note that this counts zero as a power of two.
191 static inline boolean
192 util_is_power_of_two( unsigned v
)
194 return (v
& (v
-1)) == 0;
/**
 * Floor(x), returned as int.
 *
 * Bit trick: adding/subtracting f to/from the magic constant 3*2^22 + 0.5
 * places the result in the float range [2^23, 2^24), where one ulp equals
 * exactly 1.0, so the difference of the two bit patterns is 2*floor(f).
 * Uses memcpy type-punning (well-defined in C) instead of a union, so the
 * function is self-contained.
 */
static inline int
util_ifloor(float f)
{
   int ai, bi;
   double af, bf;
   float tmp;

   af = (3 << 22) + 0.5 + (double) f;
   bf = (3 << 22) + 0.5 - (double) f;
   tmp = (float) af;
   memcpy(&ai, &tmp, sizeof ai);
   tmp = (float) bf;
   memcpy(&bi, &tmp, sizeof bi);
   return (ai - bi) >> 1;
}
/**
 * Round float to nearest int.
 * On x86 the FPU's current rounding mode is used (normally round-to-nearest-
 * even); the portable fallback rounds halfway cases away from zero, so the
 * two paths can differ on exact .5 inputs.
 */
static inline int
util_iround(float f)
{
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86)
   int r;
   __asm__ ("fistpl %0" : "=m" (r) : "t" (f) : "st");
   return r;
#elif defined(PIPE_CC_MSVC) && defined(PIPE_ARCH_X86)
   int r;
   _asm {
      fld f
      fistp r
   }
   return r;
#else
   if (f >= 0.0f)
      return (int) (f + 0.5f);
   else
      return (int) (f - 0.5f);
#endif
}
242 * Approximate floating point comparison
244 static inline boolean
245 util_is_approx(float a
, float b
, float tol
)
247 return fabsf(b
- a
) <= tol
;
252 * util_is_X_inf_or_nan = test if x is NaN or +/- Inf
253 * util_is_X_nan = test if x is NaN
254 * util_X_inf_sign = return +1 for +Inf, -1 for -Inf, or 0 for not Inf
256 * NaN can be checked with x != x, however this fails with the fast math flag
263 static inline boolean
264 util_is_inf_or_nan(float x
)
268 return (tmp
.ui
& 0x7f800000) == 0x7f800000;
272 static inline boolean
277 return (tmp
.ui
& 0x7fffffff) > 0x7f800000;
282 util_inf_sign(float x
)
286 if ((tmp
.ui
& 0x7fffffff) != 0x7f800000) {
290 return (x
< 0) ? -1 : 1;
297 static inline boolean
298 util_is_double_inf_or_nan(double x
)
302 return (tmp
.ui
& 0x7ff0000000000000ULL
) == 0x7ff0000000000000ULL
;
306 static inline boolean
307 util_is_double_nan(double x
)
311 return (tmp
.ui
& 0x7fffffffffffffffULL
) > 0x7ff0000000000000ULL
;
316 util_double_inf_sign(double x
)
320 if ((tmp
.ui
& 0x7fffffffffffffffULL
) != 0x7ff0000000000000ULL
) {
324 return (x
< 0) ? -1 : 1;
331 static inline boolean
332 util_is_half_inf_or_nan(int16_t x
)
334 return (x
& 0x7c00) == 0x7c00;
338 static inline boolean
339 util_is_half_nan(int16_t x
)
341 return (x
& 0x7fff) > 0x7c00;
/**
 * Return +1 for half +Inf, -1 for half -Inf, or 0 for anything else
 * (x is the raw 16-bit half-float pattern).
 */
static inline int
util_half_inf_sign(int16_t x)
{
   if ((x & 0x7fff) != 0x7c00) {
      return 0;
   }
   return (x < 0) ? -1 : 1;
}
/**
 * Find first bit set in word.  Least significant bit is 1.
 * Return 0 if no bits set.
 */
#ifndef FFS_DEFINED
#define FFS_DEFINED 1

#if defined(_MSC_VER) && (_M_IX86 || _M_AMD64 || _M_IA64)
static inline
unsigned long ffs( unsigned long u )
{
   unsigned long i;
   if (_BitScanForward(&i, u))
      return i + 1;
   else
      return 0;
}
#elif defined(PIPE_CC_MSVC) && defined(PIPE_ARCH_X86)
static inline
unsigned ffs( unsigned u )
{
   unsigned i;

   if (u == 0) {
      return 0;
   }

   __asm bsf eax, [u]
   __asm inc eax
   __asm mov [i], eax

   return i;
}
#elif defined(__MINGW32__) || defined(PIPE_OS_ANDROID) || \
    defined(HAVE___BUILTIN_FFS)
#define ffs __builtin_ffs
#endif

#ifdef HAVE___BUILTIN_FFSLL
#define ffsll __builtin_ffsll
#else
/* Portable ffsll fallback built from two 32-bit ffs() calls. */
static inline int
ffsll(long long int val)
{
   int bit;

   bit = ffs((unsigned) (val & 0xffffffff));
   if (bit != 0)
      return bit;

   bit = ffs((unsigned) (val >> 32));
   if (bit != 0)
      return 32 + bit;

   return 0;
}
#endif

#endif /* FFS_DEFINED */
/**
 * Find first bit set in long long.  Least significant bit is 1.
 * Return 0 if no bits set.
 */
#ifndef FFSLL_DEFINED
#define FFSLL_DEFINED 1

#if defined(__MINGW32__) || defined(PIPE_OS_ANDROID) || \
    defined(HAVE___BUILTIN_FFSLL)
#define ffsll __builtin_ffsll
#endif

#endif /* FFSLL_DEFINED */
/**
 * Find last bit set in a word.  The least significant bit is 1.
 * Return 0 if no bits are set.
 */
static inline unsigned
util_last_bit(unsigned u)
{
#if defined(HAVE___BUILTIN_CLZ)
   return u == 0 ? 0 : 32 - __builtin_clz(u);
#else
   unsigned r = 0;
   while (u) {
      r++;
      u >>= 1;
   }
   return r;
#endif
}
/**
 * Find last bit set in a 64-bit word.  The least significant bit is 1.
 * Return 0 if no bits are set.
 */
static inline unsigned
util_last_bit64(uint64_t u)
{
#if defined(HAVE___BUILTIN_CLZLL)
   return u == 0 ? 0 : 64 - __builtin_clzll(u);
#else
   unsigned r = 0;
   while (u) {
      r++;
      u >>= 1;
   }
   return r;
#endif
}
/**
 * Find last bit in a word that does not match the sign bit. The least
 * significant bit is 1.
 * Return 0 if no bits are set.
 */
static inline unsigned
util_last_bit_signed(int i)
{
   if (i >= 0)
      return util_last_bit(i);
   else
      /* for negatives, complement so we search for the last 0 bit */
      return util_last_bit(~(unsigned)i);
}
/* Destructively loop over all of the bits in a mask as in:
 *
 * while (mymask) {
 *   int i = u_bit_scan(&mymask);
 *   ... process element i
 * }
 */
static inline int
u_bit_scan(unsigned *mask)
{
   int i = ffs(*mask) - 1;
   /* clear the bit we just found; caller must ensure *mask != 0 */
   *mask &= ~(1u << i);
   return i;
}
/**
 * 64-bit variant of u_bit_scan(); caller must ensure *mask != 0.
 */
static inline int
u_bit_scan64(uint64_t *mask)
{
   int i = ffsll(*mask) - 1;
   *mask &= ~(1llu << i);
   return i;
}
/* For looping over a bitmask when you want to loop over consecutive bits
 * manually, for example:
 *
 * while (mask) {
 *    int start, count, i;
 *
 *    u_bit_scan_consecutive_range(&mask, &start, &count);
 *
 *    for (i = 0; i < count; i++)
 *       ... process element (start+i)
 * }
 */
static inline void
u_bit_scan_consecutive_range(unsigned *mask, int *start, int *count)
{
   /* special-case all-ones: the generic path below would shift by 32 (UB) */
   if (*mask == 0xffffffff) {
      *start = 0;
      *count = 32;
      *mask = 0;
      return;
   }
   *start = ffs(*mask) - 1;
   *count = ffs(~(*mask >> *start)) - 1;
   *mask &= ~(((1u << *count) - 1) << *start);
}
/**
 * 64-bit variant of u_bit_scan_consecutive_range().
 */
static inline void
u_bit_scan_consecutive_range64(uint64_t *mask, int *start, int *count)
{
   /* special-case all-ones: the generic path below would shift by 64 (UB) */
   if (*mask == ~0llu) {
      *start = 0;
      *count = 64;
      *mask = 0;
      return;
   }
   *start = ffsll(*mask) - 1;
   *count = ffsll(~(*mask >> *start)) - 1;
   *mask &= ~(((1llu << *count) - 1) << *start);
}
551 static inline unsigned
569 * Convert ubyte to float in [0, 1].
570 * XXX a 256-entry lookup table would be slightly faster.
573 ubyte_to_float(ubyte ub
)
575 return (float) ub
* (1.0f
/ 255.0f
);
580 * Convert float in [0,1] to ubyte in [0,255] with clamping.
583 float_to_ubyte(float f
)
591 else if (tmp
.i
>= 0x3f800000 /* 1.0f */) {
595 tmp
.f
= tmp
.f
* (255.0f
/256.0f
) + 32768.0f
;
596 return (ubyte
) tmp
.i
;
/**
 * Convert a signed byte to a float in [-1, 1] using the GL texture
 * convention: -128 maps to -1.0 (so that both -128 and -127 mean -1.0).
 */
static inline float
byte_to_float_tex(int8_t b)
{
   return (b == -128) ? -1.0F : b * 1.0F / 127.0F;
}
/**
 * Convert a float in [-1, 1] to a signed byte in [-127, 127]
 * (GL texture convention; truncates toward zero).
 */
static inline int8_t
float_to_byte_tex(float f)
{
   return (int8_t) (127.0F * f);
}
/**
 * Return floor(log2(n)) for n >= 1; returns 0 for n == 0.
 */
static inline unsigned
util_logbase2(unsigned n)
{
#if defined(HAVE___BUILTIN_CLZ)
   /* n|1 avoids __builtin_clz(0), which is undefined */
   return ((sizeof(unsigned) * 8 - 1) - __builtin_clz(n | 1));
#else
   unsigned pos = 0;
   /* binary-search the highest set bit */
   if (n >= 1<<16) { n >>= 16; pos += 16; }
   if (n >= 1<< 8) { n >>=  8; pos +=  8; }
   if (n >= 1<< 4) { n >>=  4; pos +=  4; }
   if (n >= 1<< 2) { n >>=  2; pos +=  2; }
   if (n >= 1<< 1) { pos +=  1; }
   return pos;
#endif
}
/**
 * Returns the smallest power of two >= x
 */
static inline unsigned
util_next_power_of_two(unsigned x)
{
#if defined(HAVE___BUILTIN_CLZ)
   if (x <= 1)
      return 1;

   return (1 << ((sizeof(unsigned) * 8) - __builtin_clz(x - 1)));
#else
   unsigned val = x;

   if (x <= 1)
      return 1;

   if (util_is_power_of_two(x))
      return x;

   /* smear the highest set bit down, then add one */
   val--;
   val = (val >> 1) | val;
   val = (val >> 2) | val;
   val = (val >> 4) | val;
   val = (val >> 8) | val;
   val = (val >> 16) | val;
   val++;
   return val;
#endif
}
/**
 * Return number of bits set in n.
 */
static inline unsigned
util_bitcount(unsigned n)
{
#if defined(HAVE___BUILTIN_POPCOUNT)
   return __builtin_popcount(n);
#else
   /* K&R classic bitcount.
    *
    * For each iteration, clear the LSB from the bitfield.
    * Requires only one iteration per set bit, instead of
    * one iteration per bit less than highest set bit.
    */
   unsigned bits;
   for (bits = 0; n; bits++) {
      n &= n - 1;
   }
   return bits;
#endif
}
/**
 * Return number of bits set in a 64-bit word.
 */
static inline unsigned
util_bitcount64(uint64_t n)
{
#ifdef HAVE___BUILTIN_POPCOUNTLL
   return __builtin_popcountll(n);
#else
   /* count the two 32-bit halves separately */
   return util_bitcount(n) + util_bitcount(n >> 32);
#endif
}
/**
 * Reverse the order of the 32 bits in n.
 * Algorithm taken from:
 * http://stackoverflow.com/questions/9144800/c-reverse-bits-in-unsigned-integer
 */
static inline unsigned
util_bitreverse(unsigned n)
{
   /* swap adjacent bits, then pairs, nibbles, bytes, halfwords */
   n = ((n >> 1) & 0x55555555u) | ((n & 0x55555555u) << 1);
   n = ((n >> 2) & 0x33333333u) | ((n & 0x33333333u) << 2);
   n = ((n >> 4) & 0x0f0f0f0fu) | ((n & 0x0f0f0f0fu) << 4);
   n = ((n >> 8) & 0x00ff00ffu) | ((n & 0x00ff00ffu) << 8);
   n = ((n >> 16) & 0xffffu) | ((n & 0xffffu) << 16);
   return n;
}
/**
 * Convert from little endian to CPU byte order.
 * On little-endian hosts these are identity macros; on big-endian hosts
 * they byte-swap.  The cpu_to_le direction is the same transformation.
 */
#ifdef PIPE_ARCH_BIG_ENDIAN
#define util_le64_to_cpu(x) util_bswap64(x)
#define util_le32_to_cpu(x) util_bswap32(x)
#define util_le16_to_cpu(x) util_bswap16(x)
#else
#define util_le64_to_cpu(x) (x)
#define util_le32_to_cpu(x) (x)
#define util_le16_to_cpu(x) (x)
#endif

#define util_cpu_to_le64(x) util_le64_to_cpu(x)
#define util_cpu_to_le32(x) util_le32_to_cpu(x)
#define util_cpu_to_le16(x) util_le16_to_cpu(x)
/**
 * Reverse byte order of a 32 bit word.
 */
static inline uint32_t
util_bswap32(uint32_t n)
{
#if defined(HAVE___BUILTIN_BSWAP32)
   return __builtin_bswap32(n);
#else
   return (n >> 24) |
          ((n >> 8) & 0x0000ff00) |
          ((n << 8) & 0x00ff0000) |
          (n << 24);
#endif
}
/**
 * Reverse byte order of a 64bit word.
 */
static inline uint64_t
util_bswap64(uint64_t n)
{
#if defined(HAVE___BUILTIN_BSWAP64)
   return __builtin_bswap64(n);
#else
   /* swap the two 32-bit halves and byte-swap each */
   return ((uint64_t)util_bswap32((uint32_t)n) << 32) |
          util_bswap32((n >> 32));
#endif
}
/**
 * Reverse byte order of a 16 bit word.
 */
static inline uint16_t
util_bswap16(uint16_t n)
{
   return (n >> 8) |
          (n << 8);
}
/**
 * Copy n bytes from src to dest, converting from CPU to little-endian
 * byte order 32 bits at a time.  On little-endian hosts this is a plain
 * memcpy.  n must be a multiple of 4; the regions must not overlap.
 * Returns dest (like memcpy).
 */
static inline void*
util_memcpy_cpu_to_le32(void * restrict dest, const void * restrict src, size_t n)
{
#ifdef PIPE_ARCH_BIG_ENDIAN
   size_t i, e;
   assert(n % 4 == 0);

   for (i = 0, e = n / 4; i < e; i++) {
      uint32_t * restrict d = (uint32_t* restrict)dest;
      const uint32_t * restrict s = (const uint32_t* restrict)src;
      d[i] = util_bswap32(s[i]);
   }
   return dest;
#else
   return memcpy(dest, src, n);
#endif
}
/**
 * Clamp X to [MIN, MAX].
 * This is a macro to allow float, int, uint, etc. types.
 * NOTE: like MIN2/MAX2 below, arguments may be evaluated more than once —
 * do not pass expressions with side effects.
 */
#define CLAMP( X, MIN, MAX )  ( (X)<(MIN) ? (MIN) : ((X)>(MAX) ? (MAX) : (X)) )

#define MIN2( A, B )   ( (A)<(B) ? (A) : (B) )
#define MAX2( A, B )   ( (A)>(B) ? (A) : (B) )

#define MIN3( A, B, C ) ((A) < (B) ? MIN2(A, C) : MIN2(B, C))
#define MAX3( A, B, C ) ((A) > (B) ? MAX2(A, C) : MAX2(B, C))

#define MIN4( A, B, C, D ) ((A) < (B) ? MIN3(A, C, D) : MIN3(B, C, D))
#define MAX4( A, B, C, D ) ((A) > (B) ? MAX3(A, C, D) : MAX3(B, C, D))
/**
 * Align a value up to the next multiple of alignment.
 * Only works for power-of-two alignments.
 */
static inline int
align(int value, int alignment)
{
   return (value + alignment - 1) & ~(alignment - 1);
}
/**
 * 64-bit version of align(); only works for power-of-two alignments.
 *
 * Bug fix: ~(alignment - 1) was computed in 32-bit `unsigned` and then
 * zero-extended to 64 bits, which cleared the upper 32 bits of the result
 * for values >= 4 GiB.  Cast to uint64_t before complementing.
 */
static inline uint64_t
align64(uint64_t value, unsigned alignment)
{
   return (value + alignment - 1) & ~((uint64_t)alignment - 1);
}
/**
 * Works like align but on npot (non-power-of-two) alignments.
 */
static inline size_t
util_align_npot(size_t value, size_t alignment)
{
   if (value % alignment)
      return value + (alignment - (value % alignment));
   return value;
}
/**
 * Compute the size of a mipmap level given the base size and level count,
 * clamped so a level is never smaller than 1.
 */
static inline unsigned
u_minify(unsigned value, unsigned levels)
{
   return MAX2(1, value >> levels);
}
/**
 * Copy the four elements of SRC into DST.
 * do/while(0) wrapper makes the macro a single statement.
 */
#define COPY_4V( DST, SRC )         \
do {                                \
   (DST)[0] = (SRC)[0];             \
   (DST)[1] = (SRC)[1];             \
   (DST)[2] = (SRC)[2];             \
   (DST)[3] = (SRC)[3];             \
} while (0)

#define COPY_4FV( DST, SRC )  COPY_4V(DST, SRC)
857 #define ASSIGN_4V( DST, V0, V1, V2, V3 ) \
/**
 * Convert a float to unsigned fixed point with frac_bits fractional bits.
 * Negative inputs clamp to 0.
 */
static inline uint32_t
util_unsigned_fixed(float value, unsigned frac_bits)
{
   return value < 0 ? 0 : (uint32_t)(value * (1<<frac_bits));
}
/**
 * Convert a float to signed fixed point with frac_bits fractional bits
 * (truncates toward zero).
 */
static inline int32_t
util_signed_fixed(float value, unsigned frac_bits)
{
   return (int32_t)(value * (1<<frac_bits));
}
880 util_fpstate_get(void);
882 util_fpstate_set_denorms_to_zero(unsigned current_fpstate
);
884 util_fpstate_set(unsigned fpstate
);
892 #endif /* U_MATH_H */