From: Sebastien Bourdeauducq Date: Thu, 24 May 2012 21:21:18 +0000 (+0200) Subject: software/libbase: upgrade softfloat to version 2b + add support for more precision X-Git-Tag: 24jan2021_ls180~3172 X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=97b77945e5dc0c7cb1760ad5d665b3e664253f25;p=litex.git software/libbase: upgrade softfloat to version 2b + add support for more precision --- diff --git a/software/libbase/milieu.h b/software/libbase/milieu.h index 58d688d0..fd5d8145 100644 --- a/software/libbase/milieu.h +++ b/software/libbase/milieu.h @@ -1,9 +1,8 @@ -/* -=============================================================================== +/*============================================================================ -This C header file is part of the SoftFloat IEC/IEEE Floating-point -Arithmetic Package, Release 2. +This C header file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic +Package, Release 2b. Written by John R. Hauser. This work was made possible in part by the International Computer Science Institute, located at Suite 600, 1947 Center @@ -12,54 +11,48 @@ National Science Foundation under grant MIP-9311980. The original version of this code was written as part of a project to build a fixed-point vector processor in collaboration with the University of California at Berkeley, overseen by Profs. Nelson Morgan and John Wawrzynek. More information -is available through the Web page `http://http.cs.berkeley.edu/~jhauser/ -arithmetic/softfloat.html'. +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. -THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort -has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT -TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO -PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY -AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. Derivative works are acceptable, even for commercial purposes, so long as -(1) they include prominent notice that the work is derivative, and (2) they -include prominent notice akin to these three paragraphs for those parts of -this code that are retained. - -=============================================================================== -*/ - -/* -------------------------------------------------------------------------------- -Common integer types and flags. -------------------------------------------------------------------------------- -*/ - -/* -------------------------------------------------------------------------------- -One of the macros `BIGENDIAN' or `LITTLEENDIAN' must be defined. -------------------------------------------------------------------------------- -*/ +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. + +=============================================================================*/ + +/*---------------------------------------------------------------------------- +| Include common integer types and flags. +*----------------------------------------------------------------------------*/ + +/*---------------------------------------------------------------------------- +| One of the macros `BIGENDIAN' or `LITTLEENDIAN' must be defined. +*----------------------------------------------------------------------------*/ #define BIGENDIAN -/* -------------------------------------------------------------------------------- -The macro `BITS64' can be defined to indicate that 64-bit integer types are -supported by the compiler. -------------------------------------------------------------------------------- -*/ +/*---------------------------------------------------------------------------- +| The macro `BITS64' can be defined to indicate that 64-bit integer types are +| supported by the compiler. +*----------------------------------------------------------------------------*/ //#define BITS64 -/* -------------------------------------------------------------------------------- -Each of the following `typedef's defines the most convenient type that holds -integers of at least as many bits as specified. For example, `uint8' should -be the most convenient type that can hold unsigned integers of as many as -8 bits. The `flag' type must be able to hold either a 0 or 1. For most -implementations of C, `flag', `uint8', and `int8' should all be `typedef'ed -to the same as `int'. -------------------------------------------------------------------------------- -*/ +/*---------------------------------------------------------------------------- +| Each of the following `typedef's defines the most convenient type that holds +| integers of at least as many bits as specified. For example, `uint8' should +| be the most convenient type that can hold unsigned integers of as many as +| 8 bits. The `flag' type must be able to hold either a 0 or 1. For most +| implementations of C, `flag', `uint8', and `int8' should all be `typedef'ed +| to the same as `int'. +*----------------------------------------------------------------------------*/ typedef int flag; typedef int uint8; typedef int int8; @@ -68,18 +61,16 @@ typedef int int16; typedef unsigned int uint32; typedef signed int int32; #ifdef BITS64 -typedef unsigned long long int bits64; -typedef signed long long int sbits64; +typedef unsigned long long int uint64; +typedef signed long long int int64; #endif -/* -------------------------------------------------------------------------------- -Each of the following `typedef's defines a type that holds integers -of _exactly_ the number of bits specified. For instance, for most -implementation of C, `bits16' and `sbits16' should be `typedef'ed to -`unsigned short int' and `signed short int' (or `short int'), respectively. -------------------------------------------------------------------------------- -*/ +/*---------------------------------------------------------------------------- +| Each of the following `typedef's defines a type that holds integers +| of _exactly_ the number of bits specified. For instance, for most +| implementation of C, `bits16' and `sbits16' should be `typedef'ed to +| `unsigned short int' and `signed short int' (or `short int'), respectively. +*----------------------------------------------------------------------------*/ typedef unsigned char bits8; typedef signed char sbits8; typedef unsigned short int bits16; @@ -87,38 +78,33 @@ typedef signed short int sbits16; typedef unsigned int bits32; typedef signed int sbits32; #ifdef BITS64 -typedef unsigned long long int uint64; -typedef signed long long int int64; +typedef unsigned long long int bits64; +typedef signed long long int sbits64; #endif #ifdef BITS64 -/* -------------------------------------------------------------------------------- -The `LIT64' macro takes as its argument a textual integer literal and if -necessary ``marks'' the literal as having a 64-bit integer type. For -example, the Gnu C Compiler (`gcc') requires that 64-bit literals be -appended with the letters `LL' standing for `long long', which is `gcc's -name for the 64-bit integer type. Some compilers may allow `LIT64' to be -defined as the identity macro: `#define LIT64( a ) a'. -------------------------------------------------------------------------------- -*/ +/*---------------------------------------------------------------------------- +| The `LIT64' macro takes as its argument a textual integer literal and +| if necessary ``marks'' the literal as having a 64-bit integer type. +| For example, the GNU C Compiler (`gcc') requires that 64-bit literals be +| appended with the letters `LL' standing for `long long', which is `gcc's +| name for the 64-bit integer type. Some compilers may allow `LIT64' to be +| defined as the identity macro: `#define LIT64( a ) a'. +*----------------------------------------------------------------------------*/ #define LIT64( a ) a##LL #endif -/* -------------------------------------------------------------------------------- -The macro `INLINE' can be used before functions that should be inlined. If -a compiler does not support explicit inlining, this macro should be defined -to be `static'. -------------------------------------------------------------------------------- -*/ +/*---------------------------------------------------------------------------- +| The macro `INLINE' can be used before functions that should be inlined. If +| a compiler does not support explicit inlining, this macro should be defined +| to be `static'. +*----------------------------------------------------------------------------*/ #define INLINE extern inline -/* -------------------------------------------------------------------------------- -Symbolic Boolean literals. -------------------------------------------------------------------------------- -*/ + +/*---------------------------------------------------------------------------- +| Symbolic Boolean literals. +*----------------------------------------------------------------------------*/ enum { FALSE = 0, TRUE = 1 diff --git a/software/libbase/softfloat-macros.h b/software/libbase/softfloat-macros.h index 40d1182b..b4f74486 100644 --- a/software/libbase/softfloat-macros.h +++ b/software/libbase/softfloat-macros.h @@ -1,646 +1,627 @@ - -/* -=============================================================================== - -This C source fragment is part of the SoftFloat IEC/IEEE Floating-point -Arithmetic Package, Release 2. - -Written by John R. Hauser. This work was made possible in part by the -International Computer Science Institute, located at Suite 600, 1947 Center -Street, Berkeley, California 94704. Funding was partially provided by the -National Science Foundation under grant MIP-9311980. The original version -of this code was written as part of a project to build a fixed-point vector -processor in collaboration with the University of California at Berkeley, -overseen by Profs. Nelson Morgan and John Wawrzynek. More information -is available through the web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ -arithmetic/softfloat.html'. - -THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort -has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT -TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO -PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY -AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. - -Derivative works are acceptable, even for commercial purposes, so long as -(1) they include prominent notice that the work is derivative, and (2) they -include prominent notice akin to these three paragraphs for those parts of -this code that are retained. - -=============================================================================== -*/ - -/* -------------------------------------------------------------------------------- -Shifts `a' right by the number of bits given in `count'. If any nonzero -bits are shifted off, they are ``jammed'' into the least significant bit of -the result by setting the least significant bit to 1. The value of `count' -can be arbitrarily large; in particular, if `count' is greater than 32, the -result will be either 0 or 1, depending on whether `a' is zero or nonzero. -The result is stored in the location pointed to by `zPtr'. -------------------------------------------------------------------------------- -*/ -INLINE void shift32RightJamming( bits32 a, int16 count, bits32 *zPtr ) -{ - bits32 z; - - if ( count == 0 ) { - z = a; - } - else if ( count < 32 ) { - z = ( a>>count ) | ( ( a<<( ( - count ) & 31 ) ) != 0 ); - } - else { - z = ( a != 0 ); - } - *zPtr = z; - -} - -/* -------------------------------------------------------------------------------- -Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the -number of bits given in `count'. Any bits shifted off are lost. The value -of `count' can be arbitrarily large; in particular, if `count' is greater -than 64, the result will be 0. The result is broken into two 32-bit pieces -which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. -------------------------------------------------------------------------------- -*/ -INLINE void - shift64Right( - bits32 a0, bits32 a1, int16 count, bits32 *z0Ptr, bits32 *z1Ptr ) -{ - bits32 z0, z1; - int8 negCount = ( - count ) & 31; - - if ( count == 0 ) { - z1 = a1; - z0 = a0; - } - else if ( count < 32 ) { - z1 = ( a0<>count ); - z0 = a0>>count; - } - else { - z1 = ( count < 64 ) ? ( a0>>( count & 31 ) ) : 0; - z0 = 0; - } - *z1Ptr = z1; - *z0Ptr = z0; - -} - -/* -------------------------------------------------------------------------------- -Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the -number of bits given in `count'. If any nonzero bits are shifted off, they -are ``jammed'' into the least significant bit of the result by setting the -least significant bit to 1. The value of `count' can be arbitrarily large; -in particular, if `count' is greater than 64, the result will be either 0 -or 1, depending on whether the concatenation of `a0' and `a1' is zero or -nonzero. The result is broken into two 32-bit pieces which are stored at -the locations pointed to by `z0Ptr' and `z1Ptr'. -------------------------------------------------------------------------------- -*/ -INLINE void - shift64RightJamming( - bits32 a0, bits32 a1, int16 count, bits32 *z0Ptr, bits32 *z1Ptr ) -{ - bits32 z0, z1; - int8 negCount = ( - count ) & 31; - - if ( count == 0 ) { - z1 = a1; - z0 = a0; - } - else if ( count < 32 ) { - z1 = ( a0<>count ) | ( ( a1<>count; - } - else { - if ( count == 32 ) { - z1 = a0 | ( a1 != 0 ); - } - else if ( count < 64 ) { - z1 = ( a0>>( count & 31 ) ) | ( ( ( a0<>count ); - z0 = a0>>count; - } - else { - if ( count == 32 ) { - z2 = a1; - z1 = a0; - } - else { - a2 |= a1; - if ( count < 64 ) { - z2 = a0<>( count & 31 ); - } - else { - z2 = ( count == 64 ) ? a0 : ( a0 != 0 ); - z1 = 0; - } - } - z0 = 0; - } - z2 |= ( a2 != 0 ); - } - *z2Ptr = z2; - *z1Ptr = z1; - *z0Ptr = z0; - -} - -/* -------------------------------------------------------------------------------- -Shifts the 64-bit value formed by concatenating `a0' and `a1' left by the -number of bits given in `count'. Any bits shifted off are lost. The value -of `count' must be less than 32. The result is broken into two 32-bit -pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. -------------------------------------------------------------------------------- -*/ -INLINE void - shortShift64Left( - bits32 a0, bits32 a1, int16 count, bits32 *z0Ptr, bits32 *z1Ptr ) -{ - - *z1Ptr = a1<>( ( - count ) & 31 ) ); - -} - -/* -------------------------------------------------------------------------------- -Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' left by -the number of bits given in `count'. Any bits shifted off are lost. The -value of `count' must be less than 32. The result is broken into three -32-bit pieces which are stored at the locations pointed to by `z0Ptr', -`z1Ptr', and `z2Ptr'. -------------------------------------------------------------------------------- -*/ -INLINE void - shortShift96Left( - bits32 a0, - bits32 a1, - bits32 a2, - int16 count, - bits32 *z0Ptr, - bits32 *z1Ptr, - bits32 *z2Ptr - ) -{ - bits32 z0, z1, z2; - int8 negCount; - - z2 = a2<>negCount; - z0 |= a1>>negCount; - } - *z2Ptr = z2; - *z1Ptr = z1; - *z0Ptr = z0; - -} - -/* -------------------------------------------------------------------------------- -Adds the 64-bit value formed by concatenating `a0' and `a1' to the 64-bit -value formed by concatenating `b0' and `b1'. Addition is modulo 2^64, so -any carry out is lost. The result is broken into two 32-bit pieces which -are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. -------------------------------------------------------------------------------- -*/ -INLINE void - add64( - bits32 a0, bits32 a1, bits32 b0, bits32 b1, bits32 *z0Ptr, bits32 *z1Ptr ) -{ - bits32 z1; - - z1 = a1 + b1; - *z1Ptr = z1; - *z0Ptr = a0 + b0 + ( z1 < a1 ); - -} - -/* -------------------------------------------------------------------------------- -Adds the 96-bit value formed by concatenating `a0', `a1', and `a2' to the -96-bit value formed by concatenating `b0', `b1', and `b2'. Addition is -modulo 2^96, so any carry out is lost. The result is broken into three -32-bit pieces which are stored at the locations pointed to by `z0Ptr', -`z1Ptr', and `z2Ptr'. -------------------------------------------------------------------------------- -*/ -INLINE void - add96( - bits32 a0, - bits32 a1, - bits32 a2, - bits32 b0, - bits32 b1, - bits32 b2, - bits32 *z0Ptr, - bits32 *z1Ptr, - bits32 *z2Ptr - ) -{ - bits32 z0, z1, z2; - int8 carry0, carry1; - - z2 = a2 + b2; - carry1 = ( z2 < a2 ); - z1 = a1 + b1; - carry0 = ( z1 < a1 ); - z0 = a0 + b0; - z1 += carry1; - z0 += ( z1 < carry1 ); - z0 += carry0; - *z2Ptr = z2; - *z1Ptr = z1; - *z0Ptr = z0; - -} - -/* -------------------------------------------------------------------------------- -Subtracts the 64-bit value formed by concatenating `b0' and `b1' from the -64-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo -2^64, so any borrow out (carry out) is lost. The result is broken into two -32-bit pieces which are stored at the locations pointed to by `z0Ptr' and -`z1Ptr'. -------------------------------------------------------------------------------- -*/ -INLINE void - sub64( - bits32 a0, bits32 a1, bits32 b0, bits32 b1, bits32 *z0Ptr, bits32 *z1Ptr ) -{ - - *z1Ptr = a1 - b1; - *z0Ptr = a0 - b0 - ( a1 < b1 ); - -} - -/* -------------------------------------------------------------------------------- -Subtracts the 96-bit value formed by concatenating `b0', `b1', and `b2' from -the 96-bit value formed by concatenating `a0', `a1', and `a2'. Subtraction -is modulo 2^96, so any borrow out (carry out) is lost. The result is broken -into three 32-bit pieces which are stored at the locations pointed to by -`z0Ptr', `z1Ptr', and `z2Ptr'. -------------------------------------------------------------------------------- -*/ -INLINE void - sub96( - bits32 a0, - bits32 a1, - bits32 a2, - bits32 b0, - bits32 b1, - bits32 b2, - bits32 *z0Ptr, - bits32 *z1Ptr, - bits32 *z2Ptr - ) -{ - bits32 z0, z1, z2; - int8 borrow0, borrow1; - - z2 = a2 - b2; - borrow1 = ( a2 < b2 ); - z1 = a1 - b1; - borrow0 = ( a1 < b1 ); - z0 = a0 - b0; - z0 -= ( z1 < borrow1 ); - z1 -= borrow1; - z0 -= borrow0; - *z2Ptr = z2; - *z1Ptr = z1; - *z0Ptr = z0; - -} - -/* -------------------------------------------------------------------------------- -Multiplies `a' by `b' to obtain a 64-bit product. The product is broken -into two 32-bit pieces which are stored at the locations pointed to by -`z0Ptr' and `z1Ptr'. -------------------------------------------------------------------------------- -*/ -INLINE void mul32To64( bits32 a, bits32 b, bits32 *z0Ptr, bits32 *z1Ptr ) -{ - bits16 aHigh, aLow, bHigh, bLow; - bits32 z0, zMiddleA, zMiddleB, z1; - - aLow = a; - aHigh = a>>16; - bLow = b; - bHigh = b>>16; - z1 = ( (bits32) aLow ) * bLow; - zMiddleA = ( (bits32) aLow ) * bHigh; - zMiddleB = ( (bits32) aHigh ) * bLow; - z0 = ( (bits32) aHigh ) * bHigh; - zMiddleA += zMiddleB; - z0 += ( ( (bits32) ( zMiddleA < zMiddleB ) )<<16 ) + ( zMiddleA>>16 ); - zMiddleA <<= 16; - z1 += zMiddleA; - z0 += ( z1 < zMiddleA ); - *z1Ptr = z1; - *z0Ptr = z0; - -} - -/* -------------------------------------------------------------------------------- -Multiplies the 64-bit value formed by concatenating `a0' and `a1' by `b' to -obtain a 96-bit product. The product is broken into three 32-bit pieces -which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and -`z2Ptr'. -------------------------------------------------------------------------------- -*/ -INLINE void - mul64By32To96( - bits32 a0, - bits32 a1, - bits32 b, - bits32 *z0Ptr, - bits32 *z1Ptr, - bits32 *z2Ptr - ) -{ - bits32 z0, z1, z2, more1; - - mul32To64( a1, b, &z1, &z2 ); - mul32To64( a0, b, &z0, &more1 ); - add64( z0, more1, 0, z1, &z0, &z1 ); - *z2Ptr = z2; - *z1Ptr = z1; - *z0Ptr = z0; - -} - -/* -------------------------------------------------------------------------------- -Multiplies the 64-bit value formed by concatenating `a0' and `a1' to the -64-bit value formed by concatenating `b0' and `b1' to obtain a 128-bit -product. The product is broken into four 32-bit pieces which are stored at -the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'. -------------------------------------------------------------------------------- -*/ -INLINE void - mul64To128( - bits32 a0, - bits32 a1, - bits32 b0, - bits32 b1, - bits32 *z0Ptr, - bits32 *z1Ptr, - bits32 *z2Ptr, - bits32 *z3Ptr - ) -{ - bits32 z0, z1, z2, z3; - bits32 more1, more2; - - mul32To64( a1, b1, &z2, &z3 ); - mul32To64( a1, b0, &z1, &more2 ); - add64( z1, more2, 0, z2, &z1, &z2 ); - mul32To64( a0, b0, &z0, &more1 ); - add64( z0, more1, 0, z1, &z0, &z1 ); - mul32To64( a0, b1, &more1, &more2 ); - add64( more1, more2, 0, z2, &more1, &z2 ); - add64( z0, z1, 0, more1, &z0, &z1 ); - *z3Ptr = z3; - *z2Ptr = z2; - *z1Ptr = z1; - *z0Ptr = z0; - -} - -/* -------------------------------------------------------------------------------- -Returns an approximation to the 32-bit integer quotient obtained by dividing -`b' into the 64-bit value formed by concatenating `a0' and `a1'. The divisor -`b' must be at least 2^31. If q is the exact quotient truncated toward -zero, the approximation returned lies between q and q + 2 inclusive. If -the exact quotient q is larger than 32 bits, the maximum positive 32-bit -unsigned integer is returned. -------------------------------------------------------------------------------- -*/ -static bits32 estimateDiv64To32( bits32 a0, bits32 a1, bits32 b ) -{ - bits32 b0, b1; - bits32 rem0, rem1, term0, term1; - bits32 z; - - if ( b <= a0 ) return 0xFFFFFFFF; - b0 = b>>16; - z = ( b0<<16 <= a0 ) ? 0xFFFF0000 : ( a0 / b0 )<<16; - mul32To64( b, z, &term0, &term1 ); - sub64( a0, a1, term0, term1, &rem0, &rem1 ); - while ( ( (sbits32) rem0 ) < 0 ) { - z -= 0x10000; - b1 = b<<16; - add64( rem0, rem1, b0, b1, &rem0, &rem1 ); - } - rem0 = ( rem0<<16 ) | ( rem1>>16 ); - z |= ( b0<<16 <= rem0 ) ? 0xFFFF : rem0 / b0; - return z; - -} - -/* -------------------------------------------------------------------------------- -Returns an approximation to the square root of the 32-bit significand given -by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of -`aExp' (the least significant bit) is 1, the integer returned approximates -2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp' -is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either -case, the approximation returned lies strictly within +/-2 of the exact -value. -------------------------------------------------------------------------------- -*/ -static bits32 estimateSqrt32( int16 aExp, bits32 a ) -{ - static const bits16 sqrtOddAdjustments[] = { - 0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0, - 0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67 - }; - static const bits16 sqrtEvenAdjustments[] = { - 0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E, - 0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002 - }; - int8 index; - bits32 z; - - index = ( a>>27 ) & 15; - if ( aExp & 1 ) { - z = 0x4000 + ( a>>17 ) - sqrtOddAdjustments[ index ]; - z = ( ( a / z )<<14 ) + ( z<<15 ); - a >>= 1; - } - else { - z = 0x8000 + ( a>>17 ) - sqrtEvenAdjustments[ index ]; - z = a / z + z; - z = ( 0x20000 <= z ) ? 0xFFFF8000 : ( z<<15 ); - if ( z <= a ) return (bits32) ( ( (sbits32) a )>>1 ); - } - return ( ( estimateDiv64To32( a, 0, z ) )>>1 ) + ( z>>1 ); - -} - -/* -------------------------------------------------------------------------------- -Returns the number of leading 0 bits before the most-significant 1 bit -of `a'. If `a' is zero, 32 is returned. -------------------------------------------------------------------------------- -*/ -static int8 countLeadingZeros32( bits32 a ) -{ - static const int8 countLeadingZerosHigh[] = { - 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - }; - int8 shiftCount; - - shiftCount = 0; - if ( a < 0x10000 ) { - shiftCount += 16; - a <<= 16; - } - if ( a < 0x1000000 ) { - shiftCount += 8; - a <<= 8; - } - shiftCount += countLeadingZerosHigh[ a>>24 ]; - return shiftCount; - -} - -/* -------------------------------------------------------------------------------- -Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is equal -to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise, -returns 0. -------------------------------------------------------------------------------- -*/ -INLINE flag eq64( bits32 a0, bits32 a1, bits32 b0, bits32 b1 ) -{ - - return ( a0 == b0 ) && ( a1 == b1 ); - -} - -/* -------------------------------------------------------------------------------- -Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less -than or equal to the 64-bit value formed by concatenating `b0' and `b1'. -Otherwise, returns 0. -------------------------------------------------------------------------------- -*/ -INLINE flag le64( bits32 a0, bits32 a1, bits32 b0, bits32 b1 ) -{ - - return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 <= b1 ) ); - -} - -/* -------------------------------------------------------------------------------- -Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less -than the 64-bit value formed by concatenating `b0' and `b1'. Otherwise, -returns 0. -------------------------------------------------------------------------------- -*/ -INLINE flag lt64( bits32 a0, bits32 a1, bits32 b0, bits32 b1 ) -{ - - return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 < b1 ) ); - -} - -/* -------------------------------------------------------------------------------- -Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is not -equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise, -returns 0. -------------------------------------------------------------------------------- -*/ -INLINE flag ne64( bits32 a0, bits32 a1, bits32 b0, bits32 b1 ) -{ - - return ( a0 != b0 ) || ( a1 != b1 ); - -} - + +/*============================================================================ + +This C source fragment is part of the SoftFloat IEC/IEEE Floating-point +Arithmetic Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. + +=============================================================================*/ + +/*---------------------------------------------------------------------------- +| Shifts `a' right by the number of bits given in `count'. If any nonzero +| bits are shifted off, they are ``jammed'' into the least significant bit of +| the result by setting the least significant bit to 1. The value of `count' +| can be arbitrarily large; in particular, if `count' is greater than 32, the +| result will be either 0 or 1, depending on whether `a' is zero or nonzero. +| The result is stored in the location pointed to by `zPtr'. +*----------------------------------------------------------------------------*/ + +INLINE void shift32RightJamming( bits32 a, int16 count, bits32 *zPtr ) +{ + bits32 z; + + if ( count == 0 ) { + z = a; + } + else if ( count < 32 ) { + z = ( a>>count ) | ( ( a<<( ( - count ) & 31 ) ) != 0 ); + } + else { + z = ( a != 0 ); + } + *zPtr = z; + +} + +/*---------------------------------------------------------------------------- +| Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the +| number of bits given in `count'. Any bits shifted off are lost. The value +| of `count' can be arbitrarily large; in particular, if `count' is greater +| than 64, the result will be 0. The result is broken into two 32-bit pieces +| which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + shift64Right( + bits32 a0, bits32 a1, int16 count, bits32 *z0Ptr, bits32 *z1Ptr ) +{ + bits32 z0, z1; + int8 negCount = ( - count ) & 31; + + if ( count == 0 ) { + z1 = a1; + z0 = a0; + } + else if ( count < 32 ) { + z1 = ( a0<>count ); + z0 = a0>>count; + } + else { + z1 = ( count < 64 ) ? ( a0>>( count & 31 ) ) : 0; + z0 = 0; + } + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the +| number of bits given in `count'. If any nonzero bits are shifted off, they +| are ``jammed'' into the least significant bit of the result by setting the +| least significant bit to 1. The value of `count' can be arbitrarily large; +| in particular, if `count' is greater than 64, the result will be either 0 +| or 1, depending on whether the concatenation of `a0' and `a1' is zero or +| nonzero. The result is broken into two 32-bit pieces which are stored at +| the locations pointed to by `z0Ptr' and `z1Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + shift64RightJamming( + bits32 a0, bits32 a1, int16 count, bits32 *z0Ptr, bits32 *z1Ptr ) +{ + bits32 z0, z1; + int8 negCount = ( - count ) & 31; + + if ( count == 0 ) { + z1 = a1; + z0 = a0; + } + else if ( count < 32 ) { + z1 = ( a0<>count ) | ( ( a1<>count; + } + else { + if ( count == 32 ) { + z1 = a0 | ( a1 != 0 ); + } + else if ( count < 64 ) { + z1 = ( a0>>( count & 31 ) ) | ( ( ( a0<>count ); + z0 = a0>>count; + } + else { + if ( count == 32 ) { + z2 = a1; + z1 = a0; + } + else { + a2 |= a1; + if ( count < 64 ) { + z2 = a0<>( count & 31 ); + } + else { + z2 = ( count == 64 ) ? a0 : ( a0 != 0 ); + z1 = 0; + } + } + z0 = 0; + } + z2 |= ( a2 != 0 ); + } + *z2Ptr = z2; + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Shifts the 64-bit value formed by concatenating `a0' and `a1' left by the +| number of bits given in `count'. Any bits shifted off are lost. The value +| of `count' must be less than 32. The result is broken into two 32-bit +| pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + shortShift64Left( + bits32 a0, bits32 a1, int16 count, bits32 *z0Ptr, bits32 *z1Ptr ) +{ + + *z1Ptr = a1<>( ( - count ) & 31 ) ); + +} + +/*---------------------------------------------------------------------------- +| Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' left +| by the number of bits given in `count'. Any bits shifted off are lost. +| The value of `count' must be less than 32. The result is broken into three +| 32-bit pieces which are stored at the locations pointed to by `z0Ptr', +| `z1Ptr', and `z2Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + shortShift96Left( + bits32 a0, + bits32 a1, + bits32 a2, + int16 count, + bits32 *z0Ptr, + bits32 *z1Ptr, + bits32 *z2Ptr + ) +{ + bits32 z0, z1, z2; + int8 negCount; + + z2 = a2<>negCount; + z0 |= a1>>negCount; + } + *z2Ptr = z2; + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Adds the 64-bit value formed by concatenating `a0' and `a1' to the 64-bit +| value formed by concatenating `b0' and `b1'. Addition is modulo 2^64, so +| any carry out is lost. The result is broken into two 32-bit pieces which +| are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + add64( + bits32 a0, bits32 a1, bits32 b0, bits32 b1, bits32 *z0Ptr, bits32 *z1Ptr ) +{ + bits32 z1; + + z1 = a1 + b1; + *z1Ptr = z1; + *z0Ptr = a0 + b0 + ( z1 < a1 ); + +} + +/*---------------------------------------------------------------------------- +| Adds the 96-bit value formed by concatenating `a0', `a1', and `a2' to the +| 96-bit value formed by concatenating `b0', `b1', and `b2'. Addition is +| modulo 2^96, so any carry out is lost. The result is broken into three +| 32-bit pieces which are stored at the locations pointed to by `z0Ptr', +| `z1Ptr', and `z2Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + add96( + bits32 a0, + bits32 a1, + bits32 a2, + bits32 b0, + bits32 b1, + bits32 b2, + bits32 *z0Ptr, + bits32 *z1Ptr, + bits32 *z2Ptr + ) +{ + bits32 z0, z1, z2; + int8 carry0, carry1; + + z2 = a2 + b2; + carry1 = ( z2 < a2 ); + z1 = a1 + b1; + carry0 = ( z1 < a1 ); + z0 = a0 + b0; + z1 += carry1; + z0 += ( z1 < carry1 ); + z0 += carry0; + *z2Ptr = z2; + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Subtracts the 64-bit value formed by concatenating `b0' and `b1' from the +| 64-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo +| 2^64, so any borrow out (carry out) is lost. The result is broken into two +| 32-bit pieces which are stored at the locations pointed to by `z0Ptr' and +| `z1Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + sub64( + bits32 a0, bits32 a1, bits32 b0, bits32 b1, bits32 *z0Ptr, bits32 *z1Ptr ) +{ + + *z1Ptr = a1 - b1; + *z0Ptr = a0 - b0 - ( a1 < b1 ); + +} + +/*---------------------------------------------------------------------------- +| Subtracts the 96-bit value formed by concatenating `b0', `b1', and `b2' from +| the 96-bit value formed by concatenating `a0', `a1', and `a2'. Subtraction +| is modulo 2^96, so any borrow out (carry out) is lost. The result is broken +| into three 32-bit pieces which are stored at the locations pointed to by +| `z0Ptr', `z1Ptr', and `z2Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + sub96( + bits32 a0, + bits32 a1, + bits32 a2, + bits32 b0, + bits32 b1, + bits32 b2, + bits32 *z0Ptr, + bits32 *z1Ptr, + bits32 *z2Ptr + ) +{ + bits32 z0, z1, z2; + int8 borrow0, borrow1; + + z2 = a2 - b2; + borrow1 = ( a2 < b2 ); + z1 = a1 - b1; + borrow0 = ( a1 < b1 ); + z0 = a0 - b0; + z0 -= ( z1 < borrow1 ); + z1 -= borrow1; + z0 -= borrow0; + *z2Ptr = z2; + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Multiplies `a' by `b' to obtain a 64-bit product. The product is broken +| into two 32-bit pieces which are stored at the locations pointed to by +| `z0Ptr' and `z1Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void mul32To64( bits32 a, bits32 b, bits32 *z0Ptr, bits32 *z1Ptr ) +{ + bits16 aHigh, aLow, bHigh, bLow; + bits32 z0, zMiddleA, zMiddleB, z1; + + aLow = a; + aHigh = a>>16; + bLow = b; + bHigh = b>>16; + z1 = ( (bits32) aLow ) * bLow; + zMiddleA = ( (bits32) aLow ) * bHigh; + zMiddleB = ( (bits32) aHigh ) * bLow; + z0 = ( (bits32) aHigh ) * bHigh; + zMiddleA += zMiddleB; + z0 += ( ( (bits32) ( zMiddleA < zMiddleB ) )<<16 ) + ( zMiddleA>>16 ); + zMiddleA <<= 16; + z1 += zMiddleA; + z0 += ( z1 < zMiddleA ); + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Multiplies the 64-bit value formed by concatenating `a0' and `a1' by `b' +| to obtain a 96-bit product. The product is broken into three 32-bit pieces +| which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and +| `z2Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + mul64By32To96( + bits32 a0, + bits32 a1, + bits32 b, + bits32 *z0Ptr, + bits32 *z1Ptr, + bits32 *z2Ptr + ) +{ + bits32 z0, z1, z2, more1; + + mul32To64( a1, b, &z1, &z2 ); + mul32To64( a0, b, &z0, &more1 ); + add64( z0, more1, 0, z1, &z0, &z1 ); + *z2Ptr = z2; + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Multiplies the 64-bit value formed by concatenating `a0' and `a1' to the +| 64-bit value formed by concatenating `b0' and `b1' to obtain a 128-bit +| product. The product is broken into four 32-bit pieces which are stored at +| the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + mul64To128( + bits32 a0, + bits32 a1, + bits32 b0, + bits32 b1, + bits32 *z0Ptr, + bits32 *z1Ptr, + bits32 *z2Ptr, + bits32 *z3Ptr + ) +{ + bits32 z0, z1, z2, z3; + bits32 more1, more2; + + mul32To64( a1, b1, &z2, &z3 ); + mul32To64( a1, b0, &z1, &more2 ); + add64( z1, more2, 0, z2, &z1, &z2 ); + mul32To64( a0, b0, &z0, &more1 ); + add64( z0, more1, 0, z1, &z0, &z1 ); + mul32To64( a0, b1, &more1, &more2 ); + add64( more1, more2, 0, z2, &more1, &z2 ); + add64( z0, z1, 0, more1, &z0, &z1 ); + *z3Ptr = z3; + *z2Ptr = z2; + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Returns an approximation to the 32-bit integer quotient obtained by dividing +| `b' into the 64-bit value formed by concatenating `a0' and `a1'. The +| divisor `b' must be at least 2^31. If q is the exact quotient truncated +| toward zero, the approximation returned lies between q and q + 2 inclusive. +| If the exact quotient q is larger than 32 bits, the maximum positive 32-bit +| unsigned integer is returned. +*----------------------------------------------------------------------------*/ + +static bits32 estimateDiv64To32( bits32 a0, bits32 a1, bits32 b ) +{ + bits32 b0, b1; + bits32 rem0, rem1, term0, term1; + bits32 z; + + if ( b <= a0 ) return 0xFFFFFFFF; + b0 = b>>16; + z = ( b0<<16 <= a0 ) ? 0xFFFF0000 : ( a0 / b0 )<<16; + mul32To64( b, z, &term0, &term1 ); + sub64( a0, a1, term0, term1, &rem0, &rem1 ); + while ( ( (sbits32) rem0 ) < 0 ) { + z -= 0x10000; + b1 = b<<16; + add64( rem0, rem1, b0, b1, &rem0, &rem1 ); + } + rem0 = ( rem0<<16 ) | ( rem1>>16 ); + z |= ( b0<<16 <= rem0 ) ? 0xFFFF : rem0 / b0; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns an approximation to the square root of the 32-bit significand given +| by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of +| `aExp' (the least significant bit) is 1, the integer returned approximates +| 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp' +| is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either +| case, the approximation returned lies strictly within +/-2 of the exact +| value. +*----------------------------------------------------------------------------*/ + +static bits32 estimateSqrt32( int16 aExp, bits32 a ) +{ + static const bits16 sqrtOddAdjustments[] = { + 0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0, + 0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67 + }; + static const bits16 sqrtEvenAdjustments[] = { + 0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E, + 0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002 + }; + int8 index; + bits32 z; + + index = ( a>>27 ) & 15; + if ( aExp & 1 ) { + z = 0x4000 + ( a>>17 ) - sqrtOddAdjustments[ index ]; + z = ( ( a / z )<<14 ) + ( z<<15 ); + a >>= 1; + } + else { + z = 0x8000 + ( a>>17 ) - sqrtEvenAdjustments[ index ]; + z = a / z + z; + z = ( 0x20000 <= z ) ? 0xFFFF8000 : ( z<<15 ); + if ( z <= a ) return (bits32) ( ( (sbits32) a )>>1 ); + } + return ( ( estimateDiv64To32( a, 0, z ) )>>1 ) + ( z>>1 ); + +} + +/*---------------------------------------------------------------------------- +| Returns the number of leading 0 bits before the most-significant 1 bit of +| `a'. If `a' is zero, 32 is returned. +*----------------------------------------------------------------------------*/ + +static int8 countLeadingZeros32( bits32 a ) +{ + static const int8 countLeadingZerosHigh[] = { + 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; + int8 shiftCount; + + shiftCount = 0; + if ( a < 0x10000 ) { + shiftCount += 16; + a <<= 16; + } + if ( a < 0x1000000 ) { + shiftCount += 8; + a <<= 8; + } + shiftCount += countLeadingZerosHigh[ a>>24 ]; + return shiftCount; + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is +| equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise, +| returns 0. +*----------------------------------------------------------------------------*/ + +INLINE flag eq64( bits32 a0, bits32 a1, bits32 b0, bits32 b1 ) +{ + + return ( a0 == b0 ) && ( a1 == b1 ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less +| than or equal to the 64-bit value formed by concatenating `b0' and `b1'. +| Otherwise, returns 0. +*----------------------------------------------------------------------------*/ + +INLINE flag le64( bits32 a0, bits32 a1, bits32 b0, bits32 b1 ) +{ + + return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 <= b1 ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less +| than the 64-bit value formed by concatenating `b0' and `b1'. Otherwise, +| returns 0. +*----------------------------------------------------------------------------*/ + +INLINE flag lt64( bits32 a0, bits32 a1, bits32 b0, bits32 b1 ) +{ + + return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 < b1 ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is not +| equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise, +| returns 0. +*----------------------------------------------------------------------------*/ + +INLINE flag ne64( bits32 a0, bits32 a1, bits32 b0, bits32 b1 ) +{ + + return ( a0 != b0 ) || ( a1 != b1 ); + +} + diff --git a/software/libbase/softfloat-specialize.h b/software/libbase/softfloat-specialize.h index 9f3466f1..8b830a55 100644 --- a/software/libbase/softfloat-specialize.h +++ b/software/libbase/softfloat-specialize.h @@ -1,9 +1,8 @@ -/* -=============================================================================== +/*============================================================================ This C source fragment is part of the SoftFloat IEC/IEEE Floating-point -Arithmetic Package, Release 2. +Arithmetic Package, Release 2b. Written by John R. Hauser. This work was made possible in part by the International Computer Science Institute, located at Suite 600, 1947 Center @@ -12,39 +11,38 @@ National Science Foundation under grant MIP-9311980. The original version of this code was written as part of a project to build a fixed-point vector processor in collaboration with the University of California at Berkeley, overseen by Profs. Nelson Morgan and John Wawrzynek. More information -is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ -arithmetic/softfloat.html'. - -THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort -has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT -TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO -PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY -AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. Derivative works are acceptable, even for commercial purposes, so long as -(1) they include prominent notice that the work is derivative, and (2) they -include prominent notice akin to these three paragraphs for those parts of -this code that are retained. - -=============================================================================== -*/ - -/* -------------------------------------------------------------------------------- -Underflow tininess-detection mode, statically initialized to default value. -(The declaration in `softfloat.h' must match the `int8' type here.) -------------------------------------------------------------------------------- -*/ +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. + +=============================================================================*/ + +/*---------------------------------------------------------------------------- +| Underflow tininess-detection mode, statically initialized to default value. +| (The declaration in `softfloat.h' must match the `int8' type here.) +*----------------------------------------------------------------------------*/ int8 float_detect_tininess = float_tininess_after_rounding; -/* -------------------------------------------------------------------------------- -Raises the exceptions specified by `flags'. Floating-point traps can be -defined here if desired. It is currently not possible for such a trap to -substitute a result value. If traps are not implemented, this routine -should be simply `float_exception_flags |= flags;'. -------------------------------------------------------------------------------- -*/ +/*---------------------------------------------------------------------------- +| Raises the exceptions specified by `flags'. Floating-point traps can be +| defined here if desired. It is currently not possible for such a trap +| to substitute a result value. If traps are not implemented, this routine +| should be simply `float_exception_flags |= flags;'. +*----------------------------------------------------------------------------*/ + void float_raise( int8 flags ) { @@ -52,31 +50,26 @@ void float_raise( int8 flags ) } -/* -------------------------------------------------------------------------------- -Internal canonical NaN format. -------------------------------------------------------------------------------- -*/ +/*---------------------------------------------------------------------------- +| Internal canonical NaN format. +*----------------------------------------------------------------------------*/ typedef struct { flag sign; bits32 high, low; } commonNaNT; -/* -------------------------------------------------------------------------------- -The pattern for a default generated single-precision NaN. -------------------------------------------------------------------------------- -*/ +/*---------------------------------------------------------------------------- +| The pattern for a default generated single-precision NaN. +*----------------------------------------------------------------------------*/ enum { float32_default_nan = 0xFFFFFFFF }; -/* -------------------------------------------------------------------------------- -Returns 1 if the single-precision floating-point value `a' is a NaN; -otherwise returns 0. -------------------------------------------------------------------------------- -*/ +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is a NaN; +| otherwise returns 0. +*----------------------------------------------------------------------------*/ + flag float32_is_nan( float32 a ) { @@ -84,12 +77,11 @@ flag float32_is_nan( float32 a ) } -/* -------------------------------------------------------------------------------- -Returns 1 if the single-precision floating-point value `a' is a signaling -NaN; otherwise returns 0. -------------------------------------------------------------------------------- -*/ +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is a signaling +| NaN; otherwise returns 0. +*----------------------------------------------------------------------------*/ + flag float32_is_signaling_nan( float32 a ) { @@ -97,13 +89,42 @@ flag float32_is_signaling_nan( float32 a ) } -/* -------------------------------------------------------------------------------- -Takes two single-precision floating-point values `a' and `b', one of which -is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a -signaling NaN, the invalid exception is raised. -------------------------------------------------------------------------------- -*/ +/*---------------------------------------------------------------------------- +| Returns the result of converting the single-precision floating-point NaN +| `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid +| exception is raised. +*----------------------------------------------------------------------------*/ + +static commonNaNT float32ToCommonNaN( float32 a ) +{ + commonNaNT z; + + if ( float32_is_signaling_nan( a ) ) float_raise( float_flag_invalid ); + z.sign = a>>31; + z.low = 0; + z.high = a<<9; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the canonical NaN `a' to the single- +| precision floating-point format. +*----------------------------------------------------------------------------*/ + +static float32 commonNaNToFloat32( commonNaNT a ) +{ + + return ( ( (bits32) a.sign )<<31 ) | 0x7FC00000 | ( a.high>>9 ); + +} + +/*---------------------------------------------------------------------------- +| Takes two single-precision floating-point values `a' and `b', one of which +| is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a +| signaling NaN, the invalid exception is raised. +*----------------------------------------------------------------------------*/ + static float32 propagateFloat32NaN( float32 a, float32 b ) { flag aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN; @@ -123,3 +144,99 @@ static float32 propagateFloat32NaN( float32 a, float32 b ) } } + +/*---------------------------------------------------------------------------- +| The pattern for a default generated double-precision NaN. The `high' and +| `low' values hold the most- and least-significant bits, respectively. +*----------------------------------------------------------------------------*/ +enum { + float64_default_nan_high = 0xFFFFFFFF, + float64_default_nan_low = 0xFFFFFFFF +}; + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is a NaN; +| otherwise returns 0. +*----------------------------------------------------------------------------*/ + +flag float64_is_nan( float64 a ) +{ + + return + ( 0xFFE00000 <= (bits32) ( a.high<<1 ) ) + && ( a.low || ( a.high & 0x000FFFFF ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is a signaling +| NaN; otherwise returns 0. +*----------------------------------------------------------------------------*/ + +flag float64_is_signaling_nan( float64 a ) +{ + + return + ( ( ( a.high>>19 ) & 0xFFF ) == 0xFFE ) + && ( a.low || ( a.high & 0x0007FFFF ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the double-precision floating-point NaN +| `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid +| exception is raised. +*----------------------------------------------------------------------------*/ + +static commonNaNT float64ToCommonNaN( float64 a ) +{ + commonNaNT z; + + if ( float64_is_signaling_nan( a ) ) float_raise( float_flag_invalid ); + z.sign = a.high>>31; + shortShift64Left( a.high, a.low, 12, &z.high, &z.low ); + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the canonical NaN `a' to the double- +| precision floating-point format. +*----------------------------------------------------------------------------*/ + +static float64 commonNaNToFloat64( commonNaNT a ) +{ + float64 z; + + shift64Right( a.high, a.low, 12, &z.high, &z.low ); + z.high |= ( ( (bits32) a.sign )<<31 ) | 0x7FF80000; + return z; + +} + +/*---------------------------------------------------------------------------- +| Takes two double-precision floating-point values `a' and `b', one of which +| is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a +| signaling NaN, the invalid exception is raised. +*----------------------------------------------------------------------------*/ + +static float64 propagateFloat64NaN( float64 a, float64 b ) +{ + flag aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN; + + aIsNaN = float64_is_nan( a ); + aIsSignalingNaN = float64_is_signaling_nan( a ); + bIsNaN = float64_is_nan( b ); + bIsSignalingNaN = float64_is_signaling_nan( b ); + a.high |= 0x00080000; + b.high |= 0x00080000; + if ( aIsSignalingNaN | bIsSignalingNaN ) float_raise( float_flag_invalid ); + if ( aIsNaN ) { + return ( aIsSignalingNaN & bIsNaN ) ? b : a; + } + else { + return b; + } + +} + diff --git a/software/libbase/softfloat.c b/software/libbase/softfloat.c index 9ab4f0bd..30ed86d7 100644 --- a/software/libbase/softfloat.c +++ b/software/libbase/softfloat.c @@ -1,1030 +1,2270 @@ - -/* -=============================================================================== - -This C source file is part of the SoftFloat IEC/IEEE Floating-point -Arithmetic Package, Release 2. - -Written by John R. Hauser. This work was made possible in part by the -International Computer Science Institute, located at Suite 600, 1947 Center -Street, Berkeley, California 94704. Funding was partially provided by the -National Science Foundation under grant MIP-9311980. The original version -of this code was written as part of a project to build a fixed-point vector -processor in collaboration with the University of California at Berkeley, -overseen by Profs. Nelson Morgan and John Wawrzynek. More information -is available through the web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ -arithmetic/softfloat.html'. - -THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort -has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT -TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO -PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY -AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. - -Derivative works are acceptable, even for commercial purposes, so long as -(1) they include prominent notice that the work is derivative, and (2) they -include prominent notice akin to these three paragraphs for those parts of -this code that are retained. - -=============================================================================== -*/ - -#include "milieu.h" -#include "softfloat.h" - -/* -------------------------------------------------------------------------------- -Floating-point rounding mode and exception flags. -------------------------------------------------------------------------------- -*/ -int8 float_rounding_mode = float_round_nearest_even; -int8 float_exception_flags = 0; - -/* -------------------------------------------------------------------------------- -Primitive arithmetic functions, including multi-word arithmetic, and -division and square root approximations. (Can be specialized to target if -desired.) -------------------------------------------------------------------------------- -*/ -#include "softfloat-macros.h" - -/* -------------------------------------------------------------------------------- -Functions and definitions to determine: (1) whether tininess for underflow -is detected before or after rounding by default, (2) what (if anything) -happens when exceptions are raised, (3) how signaling NaNs are distinguished -from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs -are propagated from function inputs to output. These details are target- -specific. -------------------------------------------------------------------------------- -*/ -#include "softfloat-specialize.h" - -/* -------------------------------------------------------------------------------- -Returns the fraction bits of the single-precision floating-point value `a'. -------------------------------------------------------------------------------- -*/ -INLINE bits32 extractFloat32Frac( float32 a ) -{ - - return a & 0x007FFFFF; - -} - -/* -------------------------------------------------------------------------------- -Returns the exponent bits of the single-precision floating-point value `a'. -------------------------------------------------------------------------------- -*/ -INLINE int16 extractFloat32Exp( float32 a ) -{ - - return ( a>>23 ) & 0xFF; - -} - -/* -------------------------------------------------------------------------------- -Returns the sign bit of the single-precision floating-point value `a'. -------------------------------------------------------------------------------- -*/ -INLINE flag extractFloat32Sign( float32 a ) -{ - - return a>>31; - -} - -/* -------------------------------------------------------------------------------- -Normalizes the subnormal single-precision floating-point value represented -by the denormalized significand `aSig'. The normalized exponent and -significand are stored at the locations pointed to by `zExpPtr' and -`zSigPtr', respectively. -------------------------------------------------------------------------------- -*/ -static void - normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr ) -{ - int8 shiftCount; - - shiftCount = countLeadingZeros32( aSig ) - 8; - *zSigPtr = aSig<>7; - zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); - if ( zSig == 0 ) zExp = 0; - return packFloat32( zSign, zExp, zSig ); - -} - -/* -------------------------------------------------------------------------------- -Takes an abstract floating-point value having sign `zSign', exponent `zExp', -and significand `zSig', and returns the proper single-precision floating- -point value corresponding to the abstract input. This routine is just like -`roundAndPackFloat32' except that `zSig' does not have to be normalized in -any way. In all cases, `zExp' must be 1 less than the ``true'' floating- -point exponent. -------------------------------------------------------------------------------- -*/ -static float32 - normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) -{ - int8 shiftCount; - - shiftCount = countLeadingZeros32( zSig ) - 1; - return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<>( - shiftCount ); - } - if ( zExtra ) float_exception_flags |= float_flag_inexact; - roundingMode = float_rounding_mode; - if ( roundingMode == float_round_nearest_even ) { - if ( (sbits32) zExtra < 0 ) { - ++z; - if ( (bits32) ( zExtra<<1 ) == 0 ) z &= ~1; - } - if ( aSign ) z = - z; - } - else { - zExtra = ( zExtra != 0 ); - if ( aSign ) { - z += ( roundingMode == float_round_down ) & zExtra; - z = - z; - } - else { - z += ( roundingMode == float_round_up ) & zExtra; - } - } - } - return z; - -} - -/* -------------------------------------------------------------------------------- -Returns the result of converting the single-precision floating-point value -`a' to the 32-bit two's complement integer format. The conversion is -performed according to the IEC/IEEE Standard for Binary Floating-point -Arithmetic, except that the conversion is always rounded toward zero. If -`a' is a NaN, the largest positive integer is returned. Otherwise, if the -conversion overflows, the largest integer with the same sign as `a' is -returned. -------------------------------------------------------------------------------- -*/ -int32 float32_to_int32_round_to_zero( float32 a ) -{ - flag aSign; - int16 aExp, shiftCount; - bits32 aSig; - int32 z; - - aSig = extractFloat32Frac( a ); - aExp = extractFloat32Exp( a ); - aSign = extractFloat32Sign( a ); - shiftCount = aExp - 0x9E; - if ( 0 <= shiftCount ) { - if ( a == 0xCF000000 ) return 0x80000000; - float_raise( float_flag_invalid ); - if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; - return 0x80000000; - } - else if ( aExp <= 0x7E ) { - if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; - return 0; - } - aSig = ( aSig | 0x00800000 )<<8; - z = aSig>>( - shiftCount ); - if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { - float_exception_flags |= float_flag_inexact; - } - return aSign ? - z : z; - -} - -/* -------------------------------------------------------------------------------- -Rounds the single-precision floating-point value `a' to an integer, and -returns the result as a single-precision floating-point value. The -operation is performed according to the IEC/IEEE Standard for Binary -Floating-point Arithmetic. -------------------------------------------------------------------------------- -*/ -float32 float32_round_to_int( float32 a ) -{ - flag aSign; - int16 aExp; - bits32 lastBitMask, roundBitsMask; - int8 roundingMode; - float32 z; - - aExp = extractFloat32Exp( a ); - if ( 0x96 <= aExp ) { - if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { - return propagateFloat32NaN( a, a ); - } - return a; - } - if ( aExp <= 0x7E ) { - if ( (bits32) ( a<<1 ) == 0 ) return a; - float_exception_flags |= float_flag_inexact; - aSign = extractFloat32Sign( a ); - switch ( float_rounding_mode ) { - case float_round_nearest_even: - if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { - return packFloat32( aSign, 0x7F, 0 ); - } - break; - case float_round_down: - return aSign ? 0xBF800000 : 0; - case float_round_up: - return aSign ? 0x80000000 : 0x3F800000; - } - return packFloat32( aSign, 0, 0 ); - } - lastBitMask = 1; - lastBitMask <<= 0x96 - aExp; - roundBitsMask = lastBitMask - 1; - z = a; - roundingMode = float_rounding_mode; - if ( roundingMode == float_round_nearest_even ) { - z += lastBitMask>>1; - if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; - } - else if ( roundingMode != float_round_to_zero ) { - if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) { - z += roundBitsMask; - } - } - z &= ~ roundBitsMask; - if ( z != a ) float_exception_flags |= float_flag_inexact; - return z; - -} - -/* -------------------------------------------------------------------------------- -Returns the result of adding the absolute values of the single-precision -floating-point values `a' and `b'. If `zSign' is true, the sum is negated -before being returned. `zSign' is ignored if the result is a NaN. The -addition is performed according to the IEC/IEEE Standard for Binary -Floating-point Arithmetic. -------------------------------------------------------------------------------- -*/ -static float32 addFloat32Sigs( float32 a, float32 b, flag zSign ) -{ - int16 aExp, bExp, zExp; - bits32 aSig, bSig, zSig; - int16 expDiff; - - aSig = extractFloat32Frac( a ); - aExp = extractFloat32Exp( a ); - bSig = extractFloat32Frac( b ); - bExp = extractFloat32Exp( b ); - expDiff = aExp - bExp; - aSig <<= 6; - bSig <<= 6; - if ( 0 < expDiff ) { - if ( aExp == 0xFF ) { - if ( aSig ) return propagateFloat32NaN( a, b ); - return a; - } - if ( bExp == 0 ) { - --expDiff; - } - else { - bSig |= 0x20000000; - } - shift32RightJamming( bSig, expDiff, &bSig ); - zExp = aExp; - } - else if ( expDiff < 0 ) { - if ( bExp == 0xFF ) { - if ( bSig ) return propagateFloat32NaN( a, b ); - return packFloat32( zSign, 0xFF, 0 ); - } - if ( aExp == 0 ) { - ++expDiff; - } - else { - aSig |= 0x20000000; - } - shift32RightJamming( aSig, - expDiff, &aSig ); - zExp = bExp; - } - else { - if ( aExp == 0xFF ) { - if ( aSig | bSig ) return propagateFloat32NaN( a, b ); - return a; - } - if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); - zSig = 0x40000000 + aSig + bSig; - zExp = aExp; - goto roundAndPack; - } - aSig |= 0x20000000; - zSig = ( aSig + bSig )<<1; - --zExp; - if ( (sbits32) zSig < 0 ) { - zSig = aSig + bSig; - ++zExp; - } - roundAndPack: - return roundAndPackFloat32( zSign, zExp, zSig ); - -} - -/* -------------------------------------------------------------------------------- -Returns the result of subtracting the absolute values of the single- -precision floating-point values `a' and `b'. If `zSign' is true, the -difference is negated before being returned. `zSign' is ignored if the -result is a NaN. The subtraction is performed according to the IEC/IEEE -Standard for Binary Floating-point Arithmetic. -------------------------------------------------------------------------------- -*/ -static float32 subFloat32Sigs( float32 a, float32 b, flag zSign ) -{ - int16 aExp, bExp, zExp; - bits32 aSig, bSig, zSig; - int16 expDiff; - - aSig = extractFloat32Frac( a ); - aExp = extractFloat32Exp( a ); - bSig = extractFloat32Frac( b ); - bExp = extractFloat32Exp( b ); - expDiff = aExp - bExp; - aSig <<= 7; - bSig <<= 7; - if ( 0 < expDiff ) goto aExpBigger; - if ( expDiff < 0 ) goto bExpBigger; - if ( aExp == 0xFF ) { - if ( aSig | bSig ) return propagateFloat32NaN( a, b ); - float_raise( float_flag_invalid ); - return float32_default_nan; - } - if ( aExp == 0 ) { - aExp = 1; - bExp = 1; - } - if ( bSig < aSig ) goto aBigger; - if ( aSig < bSig ) goto bBigger; - return packFloat32( float_rounding_mode == float_round_down, 0, 0 ); - bExpBigger: - if ( bExp == 0xFF ) { - if ( bSig ) return propagateFloat32NaN( a, b ); - return packFloat32( zSign ^ 1, 0xFF, 0 ); - } - if ( aExp == 0 ) { - ++expDiff; - } - else { - aSig |= 0x40000000; - } - shift32RightJamming( aSig, - expDiff, &aSig ); - bSig |= 0x40000000; - bBigger: - zSig = bSig - aSig; - zExp = bExp; - zSign ^= 1; - goto normalizeRoundAndPack; - aExpBigger: - if ( aExp == 0xFF ) { - if ( aSig ) return propagateFloat32NaN( a, b ); - return a; - } - if ( bExp == 0 ) { - --expDiff; - } - else { - bSig |= 0x40000000; - } - shift32RightJamming( bSig, expDiff, &bSig ); - aSig |= 0x40000000; - aBigger: - zSig = aSig - bSig; - zExp = aExp; - normalizeRoundAndPack: - --zExp; - return normalizeRoundAndPackFloat32( zSign, zExp, zSig ); - -} - -/* -------------------------------------------------------------------------------- -Returns the result of adding the single-precision floating-point values `a' -and `b'. The operation is performed according to the IEC/IEEE Standard for -Binary Floating-point Arithmetic. -------------------------------------------------------------------------------- -*/ -float32 float32_add( float32 a, float32 b ) -{ - flag aSign, bSign; - - aSign = extractFloat32Sign( a ); - bSign = extractFloat32Sign( b ); - if ( aSign == bSign ) { - return addFloat32Sigs( a, b, aSign ); - } - else { - return subFloat32Sigs( a, b, aSign ); - } - -} - -/* -------------------------------------------------------------------------------- -Returns the result of subtracting the single-precision floating-point values -`a' and `b'. The operation is performed according to the IEC/IEEE Standard -for Binary Floating-point Arithmetic. -------------------------------------------------------------------------------- -*/ -float32 float32_sub( float32 a, float32 b ) -{ - flag aSign, bSign; - - aSign = extractFloat32Sign( a ); - bSign = extractFloat32Sign( b ); - if ( aSign == bSign ) { - return subFloat32Sigs( a, b, aSign ); - } - else { - return addFloat32Sigs( a, b, aSign ); - } - -} - -/* -------------------------------------------------------------------------------- -Returns the result of multiplying the single-precision floating-point values -`a' and `b'. The operation is performed according to the IEC/IEEE Standard -for Binary Floating-point Arithmetic. -------------------------------------------------------------------------------- -*/ -float32 float32_mul( float32 a, float32 b ) -{ - flag aSign, bSign, zSign; - int16 aExp, bExp, zExp; - bits32 aSig, bSig, zSig0, zSig1; - - aSig = extractFloat32Frac( a ); - aExp = extractFloat32Exp( a ); - aSign = extractFloat32Sign( a ); - bSig = extractFloat32Frac( b ); - bExp = extractFloat32Exp( b ); - bSign = extractFloat32Sign( b ); - zSign = aSign ^ bSign; - if ( aExp == 0xFF ) { - if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { - return propagateFloat32NaN( a, b ); - } - if ( ( bExp | bSig ) == 0 ) { - float_raise( float_flag_invalid ); - return float32_default_nan; - } - return packFloat32( zSign, 0xFF, 0 ); - } - if ( bExp == 0xFF ) { - if ( bSig ) return propagateFloat32NaN( a, b ); - if ( ( aExp | aSig ) == 0 ) { - float_raise( float_flag_invalid ); - return float32_default_nan; - } - return packFloat32( zSign, 0xFF, 0 ); - } - if ( aExp == 0 ) { - if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); - normalizeFloat32Subnormal( aSig, &aExp, &aSig ); - } - if ( bExp == 0 ) { - if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); - normalizeFloat32Subnormal( bSig, &bExp, &bSig ); - } - zExp = aExp + bExp - 0x7F; - aSig = ( aSig | 0x00800000 )<<7; - bSig = ( bSig | 0x00800000 )<<8; - mul32To64( aSig, bSig, &zSig0, &zSig1 ); - zSig0 |= ( zSig1 != 0 ); - if ( 0 <= (sbits32) ( zSig0<<1 ) ) { - zSig0 <<= 1; - --zExp; - } - return roundAndPackFloat32( zSign, zExp, zSig0 ); - -} - -/* -------------------------------------------------------------------------------- -Returns the result of dividing the single-precision floating-point value `a' -by the corresponding value `b'. The operation is performed according to -the IEC/IEEE Standard for Binary Floating-point Arithmetic. -------------------------------------------------------------------------------- -*/ -float32 float32_div( float32 a, float32 b ) -{ - flag aSign, bSign, zSign; - int16 aExp, bExp, zExp; - bits32 aSig, bSig, zSig; - bits32 rem0, rem1; - bits32 term0, term1; - - aSig = extractFloat32Frac( a ); - aExp = extractFloat32Exp( a ); - aSign = extractFloat32Sign( a ); - bSig = extractFloat32Frac( b ); - bExp = extractFloat32Exp( b ); - bSign = extractFloat32Sign( b ); - zSign = aSign ^ bSign; - if ( aExp == 0xFF ) { - if ( aSig ) return propagateFloat32NaN( a, b ); - if ( bExp == 0xFF ) { - if ( bSig ) return propagateFloat32NaN( a, b ); - float_raise( float_flag_invalid ); - return float32_default_nan; - } - return packFloat32( zSign, 0xFF, 0 ); - } - if ( bExp == 0xFF ) { - if ( bSig ) return propagateFloat32NaN( a, b ); - return packFloat32( zSign, 0, 0 ); - } - if ( bExp == 0 ) { - if ( bSig == 0 ) { - if ( ( aExp | aSig ) == 0 ) { - float_raise( float_flag_invalid ); - return float32_default_nan; - } - float_raise( float_flag_divbyzero ); - return packFloat32( zSign, 0xFF, 0 ); - } - normalizeFloat32Subnormal( bSig, &bExp, &bSig ); - } - if ( aExp == 0 ) { - if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); - normalizeFloat32Subnormal( aSig, &aExp, &aSig ); - } - zExp = aExp - bExp + 0x7D; - aSig = ( aSig | 0x00800000 )<<7; - bSig = ( bSig | 0x00800000 )<<8; - if ( bSig <= ( aSig + aSig ) ) { - aSig >>= 1; - ++zExp; - } - zSig = estimateDiv64To32( aSig, 0, bSig ); - if ( ( zSig & 0x3F ) <= 2 ) { - mul32To64( bSig, zSig, &term0, &term1 ); - sub64( aSig, 0, term0, term1, &rem0, &rem1 ); - while ( (sbits32) rem0 < 0 ) { - --zSig; - add64( rem0, rem1, 0, bSig, &rem0, &rem1 ); - } - zSig |= ( rem1 != 0 ); - } - return roundAndPackFloat32( zSign, zExp, zSig ); - -} - -/* -------------------------------------------------------------------------------- -Returns the remainder of the single-precision floating-point value `a' -with respect to the corresponding value `b'. The operation is performed -according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. -------------------------------------------------------------------------------- -*/ -float32 float32_rem( float32 a, float32 b ) -{ - flag aSign, bSign, zSign; - int16 aExp, bExp, expDiff; - bits32 aSig, bSig; - bits32 q, alternateASig; - sbits32 sigMean; - - aSig = extractFloat32Frac( a ); - aExp = extractFloat32Exp( a ); - aSign = extractFloat32Sign( a ); - bSig = extractFloat32Frac( b ); - bExp = extractFloat32Exp( b ); - bSign = extractFloat32Sign( b ); - if ( aExp == 0xFF ) { - if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { - return propagateFloat32NaN( a, b ); - } - float_raise( float_flag_invalid ); - return float32_default_nan; - } - if ( bExp == 0xFF ) { - if ( bSig ) return propagateFloat32NaN( a, b ); - return a; - } - if ( bExp == 0 ) { - if ( bSig == 0 ) { - float_raise( float_flag_invalid ); - return float32_default_nan; - } - normalizeFloat32Subnormal( bSig, &bExp, &bSig ); - } - if ( aExp == 0 ) { - if ( aSig == 0 ) return a; - normalizeFloat32Subnormal( aSig, &aExp, &aSig ); - } - expDiff = aExp - bExp; - aSig = ( aSig | 0x00800000 )<<8; - bSig = ( bSig | 0x00800000 )<<8; - if ( expDiff < 0 ) { - if ( expDiff < -1 ) return a; - aSig >>= 1; - } - q = ( bSig <= aSig ); - if ( q ) aSig -= bSig; - expDiff -= 32; - while ( 0 < expDiff ) { - q = estimateDiv64To32( aSig, 0, bSig ); - q = ( 2 < q ) ? q - 2 : 0; - aSig = - ( ( bSig>>2 ) * q ); - expDiff -= 30; - } - expDiff += 32; - if ( 0 < expDiff ) { - q = estimateDiv64To32( aSig, 0, bSig ); - q = ( 2 < q ) ? q - 2 : 0; - q >>= 32 - expDiff; - bSig >>= 2; - aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; - } - else { - aSig >>= 2; - bSig >>= 2; - } - do { - alternateASig = aSig; - ++q; - aSig -= bSig; - } while ( 0 <= (sbits32) aSig ); - sigMean = aSig + alternateASig; - if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { - aSig = alternateASig; - } - zSign = ( (sbits32) aSig < 0 ); - if ( zSign ) aSig = - aSig; - return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig ); - -} - -/* -------------------------------------------------------------------------------- -Returns the square root of the single-precision floating-point value `a'. -The operation is performed according to the IEC/IEEE Standard for Binary -Floating-point Arithmetic. -------------------------------------------------------------------------------- -*/ -float32 float32_sqrt( float32 a ) -{ - flag aSign; - int16 aExp, zExp; - bits32 aSig, zSig; - bits32 rem0, rem1; - bits32 term0, term1; - - aSig = extractFloat32Frac( a ); - aExp = extractFloat32Exp( a ); - aSign = extractFloat32Sign( a ); - if ( aExp == 0xFF ) { - if ( aSig ) return propagateFloat32NaN( a, 0 ); - if ( ! aSign ) return a; - float_raise( float_flag_invalid ); - return float32_default_nan; - } - if ( aSign ) { - if ( ( aExp | aSig ) == 0 ) return a; - float_raise( float_flag_invalid ); - return float32_default_nan; - } - if ( aExp == 0 ) { - if ( aSig == 0 ) return 0; - normalizeFloat32Subnormal( aSig, &aExp, &aSig ); - } - zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; - aSig = ( aSig | 0x00800000 )<<8; - zSig = estimateSqrt32( aExp, aSig ) + 2; - if ( ( zSig & 0x7F ) <= 5 ) { - if ( zSig < 2 ) { - zSig = 0xFFFFFFFF; - } - else { - aSig >>= aExp & 1; - mul32To64( zSig, zSig, &term0, &term1 ); - sub64( aSig, 0, term0, term1, &rem0, &rem1 ); - while ( (sbits32) rem0 < 0 ) { - --zSig; - shortShift64Left( 0, zSig, 1, &term0, &term1 ); - term1 |= 1; - add64( rem0, rem1, term0, term1, &rem0, &rem1 ); - } - zSig |= ( ( rem0 | rem1 ) != 0 ); - } - } - shift32RightJamming( zSig, 1, &zSig ); - return roundAndPackFloat32( 0, zExp, zSig ); - -} - -/* -------------------------------------------------------------------------------- -Returns 1 if the single-precision floating-point value `a' is equal to the -corresponding value `b', and 0 otherwise. The comparison is performed -according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. -------------------------------------------------------------------------------- -*/ -flag float32_eq( float32 a, float32 b ) -{ - - if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) - || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) - ) { - if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { - float_raise( float_flag_invalid ); - } - return 0; - } - return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); - -} - -/* -------------------------------------------------------------------------------- -Returns 1 if the single-precision floating-point value `a' is less than or -equal to the corresponding value `b', and 0 otherwise. The comparison is -performed according to the IEC/IEEE Standard for Binary Floating-point -Arithmetic. -------------------------------------------------------------------------------- -*/ -flag float32_le( float32 a, float32 b ) -{ - flag aSign, bSign; - - if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) - || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) - ) { - float_raise( float_flag_invalid ); - return 0; - } - aSign = extractFloat32Sign( a ); - bSign = extractFloat32Sign( b ); - if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); - return ( a == b ) || ( aSign ^ ( a < b ) ); - -} - -/* -------------------------------------------------------------------------------- -Returns 1 if the single-precision floating-point value `a' is less than -the corresponding value `b', and 0 otherwise. The comparison is performed -according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. -------------------------------------------------------------------------------- -*/ -flag float32_lt( float32 a, float32 b ) -{ - flag aSign, bSign; - - if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) - || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) - ) { - float_raise( float_flag_invalid ); - return 0; - } - aSign = extractFloat32Sign( a ); - bSign = extractFloat32Sign( b ); - if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); - return ( a != b ) && ( aSign ^ ( a < b ) ); - -} - -/* -------------------------------------------------------------------------------- -Returns 1 if the single-precision floating-point value `a' is equal to the -corresponding value `b', and 0 otherwise. The invalid exception is raised -if either operand is a NaN. Otherwise, the comparison is performed -according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. -------------------------------------------------------------------------------- -*/ -flag float32_eq_signaling( float32 a, float32 b ) -{ - - if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) - || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) - ) { - float_raise( float_flag_invalid ); - return 0; - } - return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); - -} - -/* -------------------------------------------------------------------------------- -Returns 1 if the single-precision floating-point value `a' is less than or -equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not -cause an exception. Otherwise, the comparison is performed according to the -IEC/IEEE Standard for Binary Floating-point Arithmetic. -------------------------------------------------------------------------------- -*/ -flag float32_le_quiet( float32 a, float32 b ) -{ - flag aSign, bSign; - - if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) - || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) - ) { - if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { - float_raise( float_flag_invalid ); - } - return 0; - } - aSign = extractFloat32Sign( a ); - bSign = extractFloat32Sign( b ); - if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); - return ( a == b ) || ( aSign ^ ( a < b ) ); - -} - -/* -------------------------------------------------------------------------------- -Returns 1 if the single-precision floating-point value `a' is less than -the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an -exception. Otherwise, the comparison is performed according to the IEC/IEEE -Standard for Binary Floating-point Arithmetic. -------------------------------------------------------------------------------- -*/ -flag float32_lt_quiet( float32 a, float32 b ) -{ - flag aSign, bSign; - - if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) - || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) - ) { - if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { - float_raise( float_flag_invalid ); - } - return 0; - } - aSign = extractFloat32Sign( a ); - bSign = extractFloat32Sign( b ); - if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); - return ( a != b ) && ( aSign ^ ( a < b ) ); - -} - + +/*============================================================================ + +This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic +Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. + +=============================================================================*/ + +#include "milieu.h" +#include "softfloat.h" + +/*---------------------------------------------------------------------------- +| Floating-point rounding mode and exception flags. +*----------------------------------------------------------------------------*/ +int8 float_rounding_mode = float_round_nearest_even; +int8 float_exception_flags = 0; + +/*---------------------------------------------------------------------------- +| Primitive arithmetic functions, including multi-word arithmetic, and +| division and square root approximations. (Can be specialized to target if +| desired.) +*----------------------------------------------------------------------------*/ +#include "softfloat-macros.h" + +/*---------------------------------------------------------------------------- +| Functions and definitions to determine: (1) whether tininess for underflow +| is detected before or after rounding by default, (2) what (if anything) +| happens when exceptions are raised, (3) how signaling NaNs are distinguished +| from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs +| are propagated from function inputs to output. These details are target- +| specific. +*----------------------------------------------------------------------------*/ +#include "softfloat-specialize.h" + +/*---------------------------------------------------------------------------- +| Returns the fraction bits of the single-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +INLINE bits32 extractFloat32Frac( float32 a ) +{ + + return a & 0x007FFFFF; + +} + +/*---------------------------------------------------------------------------- +| Returns the exponent bits of the single-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +INLINE int16 extractFloat32Exp( float32 a ) +{ + + return ( a>>23 ) & 0xFF; + +} + +/*---------------------------------------------------------------------------- +| Returns the sign bit of the single-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +INLINE flag extractFloat32Sign( float32 a ) +{ + + return a>>31; + +} + +/*---------------------------------------------------------------------------- +| Normalizes the subnormal single-precision floating-point value represented +| by the denormalized significand `aSig'. The normalized exponent and +| significand are stored at the locations pointed to by `zExpPtr' and +| `zSigPtr', respectively. +*----------------------------------------------------------------------------*/ + +static void + normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr ) +{ + int8 shiftCount; + + shiftCount = countLeadingZeros32( aSig ) - 8; + *zSigPtr = aSig<>7; + zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); + if ( zSig == 0 ) zExp = 0; + return packFloat32( zSign, zExp, zSig ); + +} + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent `zExp', +| and significand `zSig', and returns the proper single-precision floating- +| point value corresponding to the abstract input. This routine is just like +| `roundAndPackFloat32' except that `zSig' does not have to be normalized. +| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' +| floating-point exponent. +*----------------------------------------------------------------------------*/ + +static float32 + normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) +{ + int8 shiftCount; + + shiftCount = countLeadingZeros32( zSig ) - 1; + return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<>20 ) & 0x7FF; + +} + +/*---------------------------------------------------------------------------- +| Returns the sign bit of the double-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +INLINE flag extractFloat64Sign( float64 a ) +{ + + return a.high>>31; + +} + +/*---------------------------------------------------------------------------- +| Normalizes the subnormal double-precision floating-point value represented +| by the denormalized significand formed by the concatenation of `aSig0' and +| `aSig1'. The normalized exponent is stored at the location pointed to by +| `zExpPtr'. The most significant 21 bits of the normalized significand are +| stored at the location pointed to by `zSig0Ptr', and the least significant +| 32 bits of the normalized significand are stored at the location pointed to +| by `zSig1Ptr'. +*----------------------------------------------------------------------------*/ + +static void + normalizeFloat64Subnormal( + bits32 aSig0, + bits32 aSig1, + int16 *zExpPtr, + bits32 *zSig0Ptr, + bits32 *zSig1Ptr + ) +{ + int8 shiftCount; + + if ( aSig0 == 0 ) { + shiftCount = countLeadingZeros32( aSig1 ) - 11; + if ( shiftCount < 0 ) { + *zSig0Ptr = aSig1>>( - shiftCount ); + *zSig1Ptr = aSig1<<( shiftCount & 31 ); + } + else { + *zSig0Ptr = aSig1<>( - shiftCount ); + } + if ( aSigExtra ) float_exception_flags |= float_flag_inexact; + roundingMode = float_rounding_mode; + if ( roundingMode == float_round_nearest_even ) { + if ( (sbits32) aSigExtra < 0 ) { + ++z; + if ( (bits32) ( aSigExtra<<1 ) == 0 ) z &= ~1; + } + if ( aSign ) z = - z; + } + else { + aSigExtra = ( aSigExtra != 0 ); + if ( aSign ) { + z += ( roundingMode == float_round_down ) & aSigExtra; + z = - z; + } + else { + z += ( roundingMode == float_round_up ) & aSigExtra; + } + } + } + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the single-precision floating-point value +| `a' to the 32-bit two's complement integer format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic, except that the conversion is always rounded toward zero. +| If `a' is a NaN, the largest positive integer is returned. Otherwise, if +| the conversion overflows, the largest integer with the same sign as `a' is +| returned. +*----------------------------------------------------------------------------*/ + +int32 float32_to_int32_round_to_zero( float32 a ) +{ + flag aSign; + int16 aExp, shiftCount; + bits32 aSig; + int32 z; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + shiftCount = aExp - 0x9E; + if ( 0 <= shiftCount ) { + if ( a != 0xCF000000 ) { + float_raise( float_flag_invalid ); + if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; + } + return (sbits32) 0x80000000; + } + else if ( aExp <= 0x7E ) { + if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; + return 0; + } + aSig = ( aSig | 0x00800000 )<<8; + z = aSig>>( - shiftCount ); + if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { + float_exception_flags |= float_flag_inexact; + } + if ( aSign ) z = - z; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the single-precision floating-point value +| `a' to the double-precision floating-point format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float32_to_float64( float32 a ) +{ + flag aSign; + int16 aExp; + bits32 aSig, zSig0, zSig1; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + if ( aExp == 0xFF ) { + if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) ); + return packFloat64( aSign, 0x7FF, 0, 0 ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloat64( aSign, 0, 0, 0 ); + normalizeFloat32Subnormal( aSig, &aExp, &aSig ); + --aExp; + } + shift64Right( aSig, 0, 3, &zSig0, &zSig1 ); + return packFloat64( aSign, aExp + 0x380, zSig0, zSig1 ); + +} + +/*---------------------------------------------------------------------------- +| Rounds the single-precision floating-point value `a' to an integer, +| and returns the result as a single-precision floating-point value. The +| operation is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float32_round_to_int( float32 a ) +{ + flag aSign; + int16 aExp; + bits32 lastBitMask, roundBitsMask; + int8 roundingMode; + float32 z; + + aExp = extractFloat32Exp( a ); + if ( 0x96 <= aExp ) { + if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { + return propagateFloat32NaN( a, a ); + } + return a; + } + if ( aExp <= 0x7E ) { + if ( (bits32) ( a<<1 ) == 0 ) return a; + float_exception_flags |= float_flag_inexact; + aSign = extractFloat32Sign( a ); + switch ( float_rounding_mode ) { + case float_round_nearest_even: + if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { + return packFloat32( aSign, 0x7F, 0 ); + } + break; + case float_round_down: + return aSign ? 0xBF800000 : 0; + case float_round_up: + return aSign ? 0x80000000 : 0x3F800000; + } + return packFloat32( aSign, 0, 0 ); + } + lastBitMask = 1; + lastBitMask <<= 0x96 - aExp; + roundBitsMask = lastBitMask - 1; + z = a; + roundingMode = float_rounding_mode; + if ( roundingMode == float_round_nearest_even ) { + z += lastBitMask>>1; + if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; + } + else if ( roundingMode != float_round_to_zero ) { + if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) { + z += roundBitsMask; + } + } + z &= ~ roundBitsMask; + if ( z != a ) float_exception_flags |= float_flag_inexact; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the absolute values of the single-precision +| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated +| before being returned. `zSign' is ignored if the result is a NaN. +| The addition is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static float32 addFloat32Sigs( float32 a, float32 b, flag zSign ) +{ + int16 aExp, bExp, zExp; + bits32 aSig, bSig, zSig; + int16 expDiff; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + bSig = extractFloat32Frac( b ); + bExp = extractFloat32Exp( b ); + expDiff = aExp - bExp; + aSig <<= 6; + bSig <<= 6; + if ( 0 < expDiff ) { + if ( aExp == 0xFF ) { + if ( aSig ) return propagateFloat32NaN( a, b ); + return a; + } + if ( bExp == 0 ) { + --expDiff; + } + else { + bSig |= 0x20000000; + } + shift32RightJamming( bSig, expDiff, &bSig ); + zExp = aExp; + } + else if ( expDiff < 0 ) { + if ( bExp == 0xFF ) { + if ( bSig ) return propagateFloat32NaN( a, b ); + return packFloat32( zSign, 0xFF, 0 ); + } + if ( aExp == 0 ) { + ++expDiff; + } + else { + aSig |= 0x20000000; + } + shift32RightJamming( aSig, - expDiff, &aSig ); + zExp = bExp; + } + else { + if ( aExp == 0xFF ) { + if ( aSig | bSig ) return propagateFloat32NaN( a, b ); + return a; + } + if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); + zSig = 0x40000000 + aSig + bSig; + zExp = aExp; + goto roundAndPack; + } + aSig |= 0x20000000; + zSig = ( aSig + bSig )<<1; + --zExp; + if ( (sbits32) zSig < 0 ) { + zSig = aSig + bSig; + ++zExp; + } + roundAndPack: + return roundAndPackFloat32( zSign, zExp, zSig ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the absolute values of the single- +| precision floating-point values `a' and `b'. If `zSign' is 1, the +| difference is negated before being returned. `zSign' is ignored if the +| result is a NaN. The subtraction is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static float32 subFloat32Sigs( float32 a, float32 b, flag zSign ) +{ + int16 aExp, bExp, zExp; + bits32 aSig, bSig, zSig; + int16 expDiff; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + bSig = extractFloat32Frac( b ); + bExp = extractFloat32Exp( b ); + expDiff = aExp - bExp; + aSig <<= 7; + bSig <<= 7; + if ( 0 < expDiff ) goto aExpBigger; + if ( expDiff < 0 ) goto bExpBigger; + if ( aExp == 0xFF ) { + if ( aSig | bSig ) return propagateFloat32NaN( a, b ); + float_raise( float_flag_invalid ); + return float32_default_nan; + } + if ( aExp == 0 ) { + aExp = 1; + bExp = 1; + } + if ( bSig < aSig ) goto aBigger; + if ( aSig < bSig ) goto bBigger; + return packFloat32( float_rounding_mode == float_round_down, 0, 0 ); + bExpBigger: + if ( bExp == 0xFF ) { + if ( bSig ) return propagateFloat32NaN( a, b ); + return packFloat32( zSign ^ 1, 0xFF, 0 ); + } + if ( aExp == 0 ) { + ++expDiff; + } + else { + aSig |= 0x40000000; + } + shift32RightJamming( aSig, - expDiff, &aSig ); + bSig |= 0x40000000; + bBigger: + zSig = bSig - aSig; + zExp = bExp; + zSign ^= 1; + goto normalizeRoundAndPack; + aExpBigger: + if ( aExp == 0xFF ) { + if ( aSig ) return propagateFloat32NaN( a, b ); + return a; + } + if ( bExp == 0 ) { + --expDiff; + } + else { + bSig |= 0x40000000; + } + shift32RightJamming( bSig, expDiff, &bSig ); + aSig |= 0x40000000; + aBigger: + zSig = aSig - bSig; + zExp = aExp; + normalizeRoundAndPack: + --zExp; + return normalizeRoundAndPackFloat32( zSign, zExp, zSig ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the single-precision floating-point values `a' +| and `b'. The operation is performed according to the IEC/IEEE Standard for +| Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float32_add( float32 a, float32 b ) +{ + flag aSign, bSign; + + aSign = extractFloat32Sign( a ); + bSign = extractFloat32Sign( b ); + if ( aSign == bSign ) { + return addFloat32Sigs( a, b, aSign ); + } + else { + return subFloat32Sigs( a, b, aSign ); + } + +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the single-precision floating-point values +| `a' and `b'. The operation is performed according to the IEC/IEEE Standard +| for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float32_sub( float32 a, float32 b ) +{ + flag aSign, bSign; + + aSign = extractFloat32Sign( a ); + bSign = extractFloat32Sign( b ); + if ( aSign == bSign ) { + return subFloat32Sigs( a, b, aSign ); + } + else { + return addFloat32Sigs( a, b, aSign ); + } + +} + +/*---------------------------------------------------------------------------- +| Returns the result of multiplying the single-precision floating-point values +| `a' and `b'. The operation is performed according to the IEC/IEEE Standard +| for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float32_mul( float32 a, float32 b ) +{ + flag aSign, bSign, zSign; + int16 aExp, bExp, zExp; + bits32 aSig, bSig, zSig0, zSig1; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + bSig = extractFloat32Frac( b ); + bExp = extractFloat32Exp( b ); + bSign = extractFloat32Sign( b ); + zSign = aSign ^ bSign; + if ( aExp == 0xFF ) { + if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { + return propagateFloat32NaN( a, b ); + } + if ( ( bExp | bSig ) == 0 ) { + float_raise( float_flag_invalid ); + return float32_default_nan; + } + return packFloat32( zSign, 0xFF, 0 ); + } + if ( bExp == 0xFF ) { + if ( bSig ) return propagateFloat32NaN( a, b ); + if ( ( aExp | aSig ) == 0 ) { + float_raise( float_flag_invalid ); + return float32_default_nan; + } + return packFloat32( zSign, 0xFF, 0 ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); + normalizeFloat32Subnormal( aSig, &aExp, &aSig ); + } + if ( bExp == 0 ) { + if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); + normalizeFloat32Subnormal( bSig, &bExp, &bSig ); + } + zExp = aExp + bExp - 0x7F; + aSig = ( aSig | 0x00800000 )<<7; + bSig = ( bSig | 0x00800000 )<<8; + mul32To64( aSig, bSig, &zSig0, &zSig1 ); + zSig0 |= ( zSig1 != 0 ); + if ( 0 <= (sbits32) ( zSig0<<1 ) ) { + zSig0 <<= 1; + --zExp; + } + return roundAndPackFloat32( zSign, zExp, zSig0 ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of dividing the single-precision floating-point value `a' +| by the corresponding value `b'. The operation is performed according to the +| IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float32_div( float32 a, float32 b ) +{ + flag aSign, bSign, zSign; + int16 aExp, bExp, zExp; + bits32 aSig, bSig, zSig, rem0, rem1, term0, term1; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + bSig = extractFloat32Frac( b ); + bExp = extractFloat32Exp( b ); + bSign = extractFloat32Sign( b ); + zSign = aSign ^ bSign; + if ( aExp == 0xFF ) { + if ( aSig ) return propagateFloat32NaN( a, b ); + if ( bExp == 0xFF ) { + if ( bSig ) return propagateFloat32NaN( a, b ); + float_raise( float_flag_invalid ); + return float32_default_nan; + } + return packFloat32( zSign, 0xFF, 0 ); + } + if ( bExp == 0xFF ) { + if ( bSig ) return propagateFloat32NaN( a, b ); + return packFloat32( zSign, 0, 0 ); + } + if ( bExp == 0 ) { + if ( bSig == 0 ) { + if ( ( aExp | aSig ) == 0 ) { + float_raise( float_flag_invalid ); + return float32_default_nan; + } + float_raise( float_flag_divbyzero ); + return packFloat32( zSign, 0xFF, 0 ); + } + normalizeFloat32Subnormal( bSig, &bExp, &bSig ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); + normalizeFloat32Subnormal( aSig, &aExp, &aSig ); + } + zExp = aExp - bExp + 0x7D; + aSig = ( aSig | 0x00800000 )<<7; + bSig = ( bSig | 0x00800000 )<<8; + if ( bSig <= ( aSig + aSig ) ) { + aSig >>= 1; + ++zExp; + } + zSig = estimateDiv64To32( aSig, 0, bSig ); + if ( ( zSig & 0x3F ) <= 2 ) { + mul32To64( bSig, zSig, &term0, &term1 ); + sub64( aSig, 0, term0, term1, &rem0, &rem1 ); + while ( (sbits32) rem0 < 0 ) { + --zSig; + add64( rem0, rem1, 0, bSig, &rem0, &rem1 ); + } + zSig |= ( rem1 != 0 ); + } + return roundAndPackFloat32( zSign, zExp, zSig ); + +} + +/*---------------------------------------------------------------------------- +| Returns the remainder of the single-precision floating-point value `a' +| with respect to the corresponding value `b'. The operation is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float32_rem( float32 a, float32 b ) +{ + flag aSign, bSign, zSign; + int16 aExp, bExp, expDiff; + bits32 aSig, bSig, q, allZero, alternateASig; + sbits32 sigMean; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + bSig = extractFloat32Frac( b ); + bExp = extractFloat32Exp( b ); + bSign = extractFloat32Sign( b ); + if ( aExp == 0xFF ) { + if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { + return propagateFloat32NaN( a, b ); + } + float_raise( float_flag_invalid ); + return float32_default_nan; + } + if ( bExp == 0xFF ) { + if ( bSig ) return propagateFloat32NaN( a, b ); + return a; + } + if ( bExp == 0 ) { + if ( bSig == 0 ) { + float_raise( float_flag_invalid ); + return float32_default_nan; + } + normalizeFloat32Subnormal( bSig, &bExp, &bSig ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return a; + normalizeFloat32Subnormal( aSig, &aExp, &aSig ); + } + expDiff = aExp - bExp; + aSig = ( aSig | 0x00800000 )<<8; + bSig = ( bSig | 0x00800000 )<<8; + if ( expDiff < 0 ) { + if ( expDiff < -1 ) return a; + aSig >>= 1; + } + q = ( bSig <= aSig ); + if ( q ) aSig -= bSig; + expDiff -= 32; + while ( 0 < expDiff ) { + q = estimateDiv64To32( aSig, 0, bSig ); + q = ( 2 < q ) ? q - 2 : 0; + aSig = - ( ( bSig>>2 ) * q ); + expDiff -= 30; + } + expDiff += 32; + if ( 0 < expDiff ) { + q = estimateDiv64To32( aSig, 0, bSig ); + q = ( 2 < q ) ? q - 2 : 0; + q >>= 32 - expDiff; + bSig >>= 2; + aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; + } + else { + aSig >>= 2; + bSig >>= 2; + } + do { + alternateASig = aSig; + ++q; + aSig -= bSig; + } while ( 0 <= (sbits32) aSig ); + sigMean = aSig + alternateASig; + if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { + aSig = alternateASig; + } + zSign = ( (sbits32) aSig < 0 ); + if ( zSign ) aSig = - aSig; + return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig ); + +} + +/*---------------------------------------------------------------------------- +| Returns the square root of the single-precision floating-point value `a'. +| The operation is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float32_sqrt( float32 a ) +{ + flag aSign; + int16 aExp, zExp; + bits32 aSig, zSig, rem0, rem1, term0, term1; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + if ( aExp == 0xFF ) { + if ( aSig ) return propagateFloat32NaN( a, 0 ); + if ( ! aSign ) return a; + float_raise( float_flag_invalid ); + return float32_default_nan; + } + if ( aSign ) { + if ( ( aExp | aSig ) == 0 ) return a; + float_raise( float_flag_invalid ); + return float32_default_nan; + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return 0; + normalizeFloat32Subnormal( aSig, &aExp, &aSig ); + } + zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; + aSig = ( aSig | 0x00800000 )<<8; + zSig = estimateSqrt32( aExp, aSig ) + 2; + if ( ( zSig & 0x7F ) <= 5 ) { + if ( zSig < 2 ) { + zSig = 0x7FFFFFFF; + goto roundAndPack; + } + else { + aSig >>= aExp & 1; + mul32To64( zSig, zSig, &term0, &term1 ); + sub64( aSig, 0, term0, term1, &rem0, &rem1 ); + while ( (sbits32) rem0 < 0 ) { + --zSig; + shortShift64Left( 0, zSig, 1, &term0, &term1 ); + term1 |= 1; + add64( rem0, rem1, term0, term1, &rem0, &rem1 ); + } + zSig |= ( ( rem0 | rem1 ) != 0 ); + } + } + shift32RightJamming( zSig, 1, &zSig ); + roundAndPack: + return roundAndPackFloat32( 0, zExp, zSig ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is equal to +| the corresponding value `b', and 0 otherwise. The comparison is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +flag float32_eq( float32 a, float32 b ) +{ + + if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) + || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) + ) { + if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid ); + } + return 0; + } + return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is less than +| or equal to the corresponding value `b', and 0 otherwise. The comparison +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +flag float32_le( float32 a, float32 b ) +{ + flag aSign, bSign; + + if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) + || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) + ) { + float_raise( float_flag_invalid ); + return 0; + } + aSign = extractFloat32Sign( a ); + bSign = extractFloat32Sign( b ); + if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); + return ( a == b ) || ( aSign ^ ( a < b ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is less than +| the corresponding value `b', and 0 otherwise. The comparison is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +flag float32_lt( float32 a, float32 b ) +{ + flag aSign, bSign; + + if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) + || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) + ) { + float_raise( float_flag_invalid ); + return 0; + } + aSign = extractFloat32Sign( a ); + bSign = extractFloat32Sign( b ); + if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); + return ( a != b ) && ( aSign ^ ( a < b ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is equal to +| the corresponding value `b', and 0 otherwise. The invalid exception is +| raised if either operand is a NaN. Otherwise, the comparison is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +flag float32_eq_signaling( float32 a, float32 b ) +{ + + if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) + || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) + ) { + float_raise( float_flag_invalid ); + return 0; + } + return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is less than or +| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not +| cause an exception. Otherwise, the comparison is performed according to the +| IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +flag float32_le_quiet( float32 a, float32 b ) +{ + flag aSign, bSign; + int16 aExp, bExp; + + if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) + || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) + ) { + if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid ); + } + return 0; + } + aSign = extractFloat32Sign( a ); + bSign = extractFloat32Sign( b ); + if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); + return ( a == b ) || ( aSign ^ ( a < b ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is less than +| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an +| exception. Otherwise, the comparison is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +flag float32_lt_quiet( float32 a, float32 b ) +{ + flag aSign, bSign; + + if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) + || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) + ) { + if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid ); + } + return 0; + } + aSign = extractFloat32Sign( a ); + bSign = extractFloat32Sign( b ); + if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); + return ( a != b ) && ( aSign ^ ( a < b ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the double-precision floating-point value +| `a' to the 32-bit two's complement integer format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic---which means in particular that the conversion is rounded +| according to the current rounding mode. If `a' is a NaN, the largest +| positive integer is returned. Otherwise, if the conversion overflows, the +| largest integer with the same sign as `a' is returned. +*----------------------------------------------------------------------------*/ + +int32 float64_to_int32( float64 a ) +{ + flag aSign; + int16 aExp, shiftCount; + bits32 aSig0, aSig1, absZ, aSigExtra; + int32 z; + int8 roundingMode; + + aSig1 = extractFloat64Frac1( a ); + aSig0 = extractFloat64Frac0( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + shiftCount = aExp - 0x413; + if ( 0 <= shiftCount ) { + if ( 0x41E < aExp ) { + if ( ( aExp == 0x7FF ) && ( aSig0 | aSig1 ) ) aSign = 0; + goto invalid; + } + shortShift64Left( + aSig0 | 0x00100000, aSig1, shiftCount, &absZ, &aSigExtra ); + if ( 0x80000000 < absZ ) goto invalid; + } + else { + aSig1 = ( aSig1 != 0 ); + if ( aExp < 0x3FE ) { + aSigExtra = aExp | aSig0 | aSig1; + absZ = 0; + } + else { + aSig0 |= 0x00100000; + aSigExtra = ( aSig0<<( shiftCount & 31 ) ) | aSig1; + absZ = aSig0>>( - shiftCount ); + } + } + roundingMode = float_rounding_mode; + if ( roundingMode == float_round_nearest_even ) { + if ( (sbits32) aSigExtra < 0 ) { + ++absZ; + if ( (bits32) ( aSigExtra<<1 ) == 0 ) absZ &= ~1; + } + z = aSign ? - absZ : absZ; + } + else { + aSigExtra = ( aSigExtra != 0 ); + if ( aSign ) { + z = - ( absZ + + ( ( roundingMode == float_round_down ) & aSigExtra ) ); + } + else { + z = absZ + ( ( roundingMode == float_round_up ) & aSigExtra ); + } + } + if ( ( aSign ^ ( z < 0 ) ) && z ) { + invalid: + float_raise( float_flag_invalid ); + return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; + } + if ( aSigExtra ) float_exception_flags |= float_flag_inexact; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the double-precision floating-point value +| `a' to the 32-bit two's complement integer format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic, except that the conversion is always rounded toward zero. +| If `a' is a NaN, the largest positive integer is returned. Otherwise, if +| the conversion overflows, the largest integer with the same sign as `a' is +| returned. +*----------------------------------------------------------------------------*/ + +int32 float64_to_int32_round_to_zero( float64 a ) +{ + flag aSign; + int16 aExp, shiftCount; + bits32 aSig0, aSig1, absZ, aSigExtra; + int32 z; + + aSig1 = extractFloat64Frac1( a ); + aSig0 = extractFloat64Frac0( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + shiftCount = aExp - 0x413; + if ( 0 <= shiftCount ) { + if ( 0x41E < aExp ) { + if ( ( aExp == 0x7FF ) && ( aSig0 | aSig1 ) ) aSign = 0; + goto invalid; + } + shortShift64Left( + aSig0 | 0x00100000, aSig1, shiftCount, &absZ, &aSigExtra ); + } + else { + if ( aExp < 0x3FF ) { + if ( aExp | aSig0 | aSig1 ) { + float_exception_flags |= float_flag_inexact; + } + return 0; + } + aSig0 |= 0x00100000; + aSigExtra = ( aSig0<<( shiftCount & 31 ) ) | aSig1; + absZ = aSig0>>( - shiftCount ); + } + z = aSign ? - absZ : absZ; + if ( ( aSign ^ ( z < 0 ) ) && z ) { + invalid: + float_raise( float_flag_invalid ); + return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; + } + if ( aSigExtra ) float_exception_flags |= float_flag_inexact; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the double-precision floating-point value +| `a' to the single-precision floating-point format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float64_to_float32( float64 a ) +{ + flag aSign; + int16 aExp; + bits32 aSig0, aSig1, zSig; + bits32 allZero; + + aSig1 = extractFloat64Frac1( a ); + aSig0 = extractFloat64Frac0( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + if ( aExp == 0x7FF ) { + if ( aSig0 | aSig1 ) { + return commonNaNToFloat32( float64ToCommonNaN( a ) ); + } + return packFloat32( aSign, 0xFF, 0 ); + } + shift64RightJamming( aSig0, aSig1, 22, &allZero, &zSig ); + if ( aExp ) zSig |= 0x40000000; + return roundAndPackFloat32( aSign, aExp - 0x381, zSig ); + +} + +/*---------------------------------------------------------------------------- +| Rounds the double-precision floating-point value `a' to an integer, +| and returns the result as a double-precision floating-point value. The +| operation is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float64_round_to_int( float64 a ) +{ + flag aSign; + int16 aExp; + bits32 lastBitMask, roundBitsMask; + int8 roundingMode; + float64 z; + + aExp = extractFloat64Exp( a ); + if ( 0x413 <= aExp ) { + if ( 0x433 <= aExp ) { + if ( ( aExp == 0x7FF ) + && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) ) { + return propagateFloat64NaN( a, a ); + } + return a; + } + lastBitMask = 1; + lastBitMask = ( lastBitMask<<( 0x432 - aExp ) )<<1; + roundBitsMask = lastBitMask - 1; + z = a; + roundingMode = float_rounding_mode; + if ( roundingMode == float_round_nearest_even ) { + if ( lastBitMask ) { + add64( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); + if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; + } + else { + if ( (sbits32) z.low < 0 ) { + ++z.high; + if ( (bits32) ( z.low<<1 ) == 0 ) z.high &= ~1; + } + } + } + else if ( roundingMode != float_round_to_zero ) { + if ( extractFloat64Sign( z ) + ^ ( roundingMode == float_round_up ) ) { + add64( z.high, z.low, 0, roundBitsMask, &z.high, &z.low ); + } + } + z.low &= ~ roundBitsMask; + } + else { + if ( aExp <= 0x3FE ) { + if ( ( ( (bits32) ( a.high<<1 ) ) | a.low ) == 0 ) return a; + float_exception_flags |= float_flag_inexact; + aSign = extractFloat64Sign( a ); + switch ( float_rounding_mode ) { + case float_round_nearest_even: + if ( ( aExp == 0x3FE ) + && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) + ) { + return packFloat64( aSign, 0x3FF, 0, 0 ); + } + break; + case float_round_down: + return + aSign ? packFloat64( 1, 0x3FF, 0, 0 ) + : packFloat64( 0, 0, 0, 0 ); + case float_round_up: + return + aSign ? packFloat64( 1, 0, 0, 0 ) + : packFloat64( 0, 0x3FF, 0, 0 ); + } + return packFloat64( aSign, 0, 0, 0 ); + } + lastBitMask = 1; + lastBitMask <<= 0x413 - aExp; + roundBitsMask = lastBitMask - 1; + z.low = 0; + z.high = a.high; + roundingMode = float_rounding_mode; + if ( roundingMode == float_round_nearest_even ) { + z.high += lastBitMask>>1; + if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { + z.high &= ~ lastBitMask; + } + } + else if ( roundingMode != float_round_to_zero ) { + if ( extractFloat64Sign( z ) + ^ ( roundingMode == float_round_up ) ) { + z.high |= ( a.low != 0 ); + z.high += roundBitsMask; + } + } + z.high &= ~ roundBitsMask; + } + if ( ( z.low != a.low ) || ( z.high != a.high ) ) { + float_exception_flags |= float_flag_inexact; + } + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the absolute values of the double-precision +| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated +| before being returned. `zSign' is ignored if the result is a NaN. +| The addition is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static float64 addFloat64Sigs( float64 a, float64 b, flag zSign ) +{ + int16 aExp, bExp, zExp; + bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; + int16 expDiff; + + aSig1 = extractFloat64Frac1( a ); + aSig0 = extractFloat64Frac0( a ); + aExp = extractFloat64Exp( a ); + bSig1 = extractFloat64Frac1( b ); + bSig0 = extractFloat64Frac0( b ); + bExp = extractFloat64Exp( b ); + expDiff = aExp - bExp; + if ( 0 < expDiff ) { + if ( aExp == 0x7FF ) { + if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b ); + return a; + } + if ( bExp == 0 ) { + --expDiff; + } + else { + bSig0 |= 0x00100000; + } + shift64ExtraRightJamming( + bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); + zExp = aExp; + } + else if ( expDiff < 0 ) { + if ( bExp == 0x7FF ) { + if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b ); + return packFloat64( zSign, 0x7FF, 0, 0 ); + } + if ( aExp == 0 ) { + ++expDiff; + } + else { + aSig0 |= 0x00100000; + } + shift64ExtraRightJamming( + aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); + zExp = bExp; + } + else { + if ( aExp == 0x7FF ) { + if ( aSig0 | aSig1 | bSig0 | bSig1 ) { + return propagateFloat64NaN( a, b ); + } + return a; + } + add64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); + if ( aExp == 0 ) return packFloat64( zSign, 0, zSig0, zSig1 ); + zSig2 = 0; + zSig0 |= 0x00200000; + zExp = aExp; + goto shiftRight1; + } + aSig0 |= 0x00100000; + add64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); + --zExp; + if ( zSig0 < 0x00200000 ) goto roundAndPack; + ++zExp; + shiftRight1: + shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); + roundAndPack: + return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the absolute values of the double- +| precision floating-point values `a' and `b'. If `zSign' is 1, the +| difference is negated before being returned. `zSign' is ignored if the +| result is a NaN. The subtraction is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static float64 subFloat64Sigs( float64 a, float64 b, flag zSign ) +{ + int16 aExp, bExp, zExp; + bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; + int16 expDiff; + float64 z; + + aSig1 = extractFloat64Frac1( a ); + aSig0 = extractFloat64Frac0( a ); + aExp = extractFloat64Exp( a ); + bSig1 = extractFloat64Frac1( b ); + bSig0 = extractFloat64Frac0( b ); + bExp = extractFloat64Exp( b ); + expDiff = aExp - bExp; + shortShift64Left( aSig0, aSig1, 10, &aSig0, &aSig1 ); + shortShift64Left( bSig0, bSig1, 10, &bSig0, &bSig1 ); + if ( 0 < expDiff ) goto aExpBigger; + if ( expDiff < 0 ) goto bExpBigger; + if ( aExp == 0x7FF ) { + if ( aSig0 | aSig1 | bSig0 | bSig1 ) { + return propagateFloat64NaN( a, b ); + } + float_raise( float_flag_invalid ); + z.low = float64_default_nan_low; + z.high = float64_default_nan_high; + return z; + } + if ( aExp == 0 ) { + aExp = 1; + bExp = 1; + } + if ( bSig0 < aSig0 ) goto aBigger; + if ( aSig0 < bSig0 ) goto bBigger; + if ( bSig1 < aSig1 ) goto aBigger; + if ( aSig1 < bSig1 ) goto bBigger; + return packFloat64( float_rounding_mode == float_round_down, 0, 0, 0 ); + bExpBigger: + if ( bExp == 0x7FF ) { + if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b ); + return packFloat64( zSign ^ 1, 0x7FF, 0, 0 ); + } + if ( aExp == 0 ) { + ++expDiff; + } + else { + aSig0 |= 0x40000000; + } + shift64RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); + bSig0 |= 0x40000000; + bBigger: + sub64( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); + zExp = bExp; + zSign ^= 1; + goto normalizeRoundAndPack; + aExpBigger: + if ( aExp == 0x7FF ) { + if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b ); + return a; + } + if ( bExp == 0 ) { + --expDiff; + } + else { + bSig0 |= 0x40000000; + } + shift64RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); + aSig0 |= 0x40000000; + aBigger: + sub64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); + zExp = aExp; + normalizeRoundAndPack: + --zExp; + return normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1 ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the double-precision floating-point values `a' +| and `b'. The operation is performed according to the IEC/IEEE Standard for +| Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float64_add( float64 a, float64 b ) +{ + flag aSign, bSign; + + aSign = extractFloat64Sign( a ); + bSign = extractFloat64Sign( b ); + if ( aSign == bSign ) { + return addFloat64Sigs( a, b, aSign ); + } + else { + return subFloat64Sigs( a, b, aSign ); + } + +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the double-precision floating-point values +| `a' and `b'. The operation is performed according to the IEC/IEEE Standard +| for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float64_sub( float64 a, float64 b ) +{ + flag aSign, bSign; + + aSign = extractFloat64Sign( a ); + bSign = extractFloat64Sign( b ); + if ( aSign == bSign ) { + return subFloat64Sigs( a, b, aSign ); + } + else { + return addFloat64Sigs( a, b, aSign ); + } + +} + +/*---------------------------------------------------------------------------- +| Returns the result of multiplying the double-precision floating-point values +| `a' and `b'. The operation is performed according to the IEC/IEEE Standard +| for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float64_mul( float64 a, float64 b ) +{ + flag aSign, bSign, zSign; + int16 aExp, bExp, zExp; + bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; + float64 z; + + aSig1 = extractFloat64Frac1( a ); + aSig0 = extractFloat64Frac0( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + bSig1 = extractFloat64Frac1( b ); + bSig0 = extractFloat64Frac0( b ); + bExp = extractFloat64Exp( b ); + bSign = extractFloat64Sign( b ); + zSign = aSign ^ bSign; + if ( aExp == 0x7FF ) { + if ( ( aSig0 | aSig1 ) + || ( ( bExp == 0x7FF ) && ( bSig0 | bSig1 ) ) ) { + return propagateFloat64NaN( a, b ); + } + if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; + return packFloat64( zSign, 0x7FF, 0, 0 ); + } + if ( bExp == 0x7FF ) { + if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b ); + if ( ( aExp | aSig0 | aSig1 ) == 0 ) { + invalid: + float_raise( float_flag_invalid ); + z.low = float64_default_nan_low; + z.high = float64_default_nan_high; + return z; + } + return packFloat64( zSign, 0x7FF, 0, 0 ); + } + if ( aExp == 0 ) { + if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 ); + normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); + } + if ( bExp == 0 ) { + if ( ( bSig0 | bSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 ); + normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); + } + zExp = aExp + bExp - 0x400; + aSig0 |= 0x00100000; + shortShift64Left( bSig0, bSig1, 12, &bSig0, &bSig1 ); + mul64To128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); + add64( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); + zSig2 |= ( zSig3 != 0 ); + if ( 0x00200000 <= zSig0 ) { + shift64ExtraRightJamming( + zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); + ++zExp; + } + return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of dividing the double-precision floating-point value `a' +| by the corresponding value `b'. The operation is performed according to the +| IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float64_div( float64 a, float64 b ) +{ + flag aSign, bSign, zSign; + int16 aExp, bExp, zExp; + bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; + bits32 rem0, rem1, rem2, rem3, term0, term1, term2, term3; + float64 z; + + aSig1 = extractFloat64Frac1( a ); + aSig0 = extractFloat64Frac0( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + bSig1 = extractFloat64Frac1( b ); + bSig0 = extractFloat64Frac0( b ); + bExp = extractFloat64Exp( b ); + bSign = extractFloat64Sign( b ); + zSign = aSign ^ bSign; + if ( aExp == 0x7FF ) { + if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b ); + if ( bExp == 0x7FF ) { + if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b ); + goto invalid; + } + return packFloat64( zSign, 0x7FF, 0, 0 ); + } + if ( bExp == 0x7FF ) { + if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b ); + return packFloat64( zSign, 0, 0, 0 ); + } + if ( bExp == 0 ) { + if ( ( bSig0 | bSig1 ) == 0 ) { + if ( ( aExp | aSig0 | aSig1 ) == 0 ) { + invalid: + float_raise( float_flag_invalid ); + z.low = float64_default_nan_low; + z.high = float64_default_nan_high; + return z; + } + float_raise( float_flag_divbyzero ); + return packFloat64( zSign, 0x7FF, 0, 0 ); + } + normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); + } + if ( aExp == 0 ) { + if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 ); + normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); + } + zExp = aExp - bExp + 0x3FD; + shortShift64Left( aSig0 | 0x00100000, aSig1, 11, &aSig0, &aSig1 ); + shortShift64Left( bSig0 | 0x00100000, bSig1, 11, &bSig0, &bSig1 ); + if ( le64( bSig0, bSig1, aSig0, aSig1 ) ) { + shift64Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); + ++zExp; + } + zSig0 = estimateDiv64To32( aSig0, aSig1, bSig0 ); + mul64By32To96( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); + sub96( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); + while ( (sbits32) rem0 < 0 ) { + --zSig0; + add96( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); + } + zSig1 = estimateDiv64To32( rem1, rem2, bSig0 ); + if ( ( zSig1 & 0x3FF ) <= 4 ) { + mul64By32To96( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); + sub96( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); + while ( (sbits32) rem1 < 0 ) { + --zSig1; + add96( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); + } + zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); + } + shift64ExtraRightJamming( zSig0, zSig1, 0, 11, &zSig0, &zSig1, &zSig2 ); + return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 ); + +} + +/*---------------------------------------------------------------------------- +| Returns the remainder of the double-precision floating-point value `a' +| with respect to the corresponding value `b'. The operation is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float64_rem( float64 a, float64 b ) +{ + flag aSign, bSign, zSign; + int16 aExp, bExp, expDiff; + bits32 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; + bits32 allZero, alternateASig0, alternateASig1, sigMean1; + sbits32 sigMean0; + float64 z; + + aSig1 = extractFloat64Frac1( a ); + aSig0 = extractFloat64Frac0( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + bSig1 = extractFloat64Frac1( b ); + bSig0 = extractFloat64Frac0( b ); + bExp = extractFloat64Exp( b ); + bSign = extractFloat64Sign( b ); + if ( aExp == 0x7FF ) { + if ( ( aSig0 | aSig1 ) + || ( ( bExp == 0x7FF ) && ( bSig0 | bSig1 ) ) ) { + return propagateFloat64NaN( a, b ); + } + goto invalid; + } + if ( bExp == 0x7FF ) { + if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b ); + return a; + } + if ( bExp == 0 ) { + if ( ( bSig0 | bSig1 ) == 0 ) { + invalid: + float_raise( float_flag_invalid ); + z.low = float64_default_nan_low; + z.high = float64_default_nan_high; + return z; + } + normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); + } + if ( aExp == 0 ) { + if ( ( aSig0 | aSig1 ) == 0 ) return a; + normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); + } + expDiff = aExp - bExp; + if ( expDiff < -1 ) return a; + shortShift64Left( + aSig0 | 0x00100000, aSig1, 11 - ( expDiff < 0 ), &aSig0, &aSig1 ); + shortShift64Left( bSig0 | 0x00100000, bSig1, 11, &bSig0, &bSig1 ); + q = le64( bSig0, bSig1, aSig0, aSig1 ); + if ( q ) sub64( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); + expDiff -= 32; + while ( 0 < expDiff ) { + q = estimateDiv64To32( aSig0, aSig1, bSig0 ); + q = ( 4 < q ) ? q - 4 : 0; + mul64By32To96( bSig0, bSig1, q, &term0, &term1, &term2 ); + shortShift96Left( term0, term1, term2, 29, &term1, &term2, &allZero ); + shortShift64Left( aSig0, aSig1, 29, &aSig0, &allZero ); + sub64( aSig0, 0, term1, term2, &aSig0, &aSig1 ); + expDiff -= 29; + } + if ( -32 < expDiff ) { + q = estimateDiv64To32( aSig0, aSig1, bSig0 ); + q = ( 4 < q ) ? q - 4 : 0; + q >>= - expDiff; + shift64Right( bSig0, bSig1, 8, &bSig0, &bSig1 ); + expDiff += 24; + if ( expDiff < 0 ) { + shift64Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); + } + else { + shortShift64Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); + } + mul64By32To96( bSig0, bSig1, q, &term0, &term1, &term2 ); + sub64( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); + } + else { + shift64Right( aSig0, aSig1, 8, &aSig0, &aSig1 ); + shift64Right( bSig0, bSig1, 8, &bSig0, &bSig1 ); + } + do { + alternateASig0 = aSig0; + alternateASig1 = aSig1; + ++q; + sub64( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); + } while ( 0 <= (sbits32) aSig0 ); + add64( + aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 ); + if ( ( sigMean0 < 0 ) + || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { + aSig0 = alternateASig0; + aSig1 = alternateASig1; + } + zSign = ( (sbits32) aSig0 < 0 ); + if ( zSign ) sub64( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); + return + normalizeRoundAndPackFloat64( aSign ^ zSign, bExp - 4, aSig0, aSig1 ); + +} + +/*---------------------------------------------------------------------------- +| Returns the square root of the double-precision floating-point value `a'. +| The operation is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float64_sqrt( float64 a ) +{ + flag aSign; + int16 aExp, zExp; + bits32 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; + bits32 rem0, rem1, rem2, rem3, term0, term1, term2, term3; + float64 z; + + aSig1 = extractFloat64Frac1( a ); + aSig0 = extractFloat64Frac0( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + if ( aExp == 0x7FF ) { + if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, a ); + if ( ! aSign ) return a; + goto invalid; + } + if ( aSign ) { + if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; + invalid: + float_raise( float_flag_invalid ); + z.low = float64_default_nan_low; + z.high = float64_default_nan_high; + return z; + } + if ( aExp == 0 ) { + if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( 0, 0, 0, 0 ); + normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); + } + zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; + aSig0 |= 0x00100000; + shortShift64Left( aSig0, aSig1, 11, &term0, &term1 ); + zSig0 = ( estimateSqrt32( aExp, term0 )>>1 ) + 1; + if ( zSig0 == 0 ) zSig0 = 0x7FFFFFFF; + doubleZSig0 = zSig0 + zSig0; + shortShift64Left( aSig0, aSig1, 9 - ( aExp & 1 ), &aSig0, &aSig1 ); + mul32To64( zSig0, zSig0, &term0, &term1 ); + sub64( aSig0, aSig1, term0, term1, &rem0, &rem1 ); + while ( (sbits32) rem0 < 0 ) { + --zSig0; + doubleZSig0 -= 2; + add64( rem0, rem1, 0, doubleZSig0 | 1, &rem0, &rem1 ); + } + zSig1 = estimateDiv64To32( rem1, 0, doubleZSig0 ); + if ( ( zSig1 & 0x1FF ) <= 5 ) { + if ( zSig1 == 0 ) zSig1 = 1; + mul32To64( doubleZSig0, zSig1, &term1, &term2 ); + sub64( rem1, 0, term1, term2, &rem1, &rem2 ); + mul32To64( zSig1, zSig1, &term2, &term3 ); + sub96( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); + while ( (sbits32) rem1 < 0 ) { + --zSig1; + shortShift64Left( 0, zSig1, 1, &term2, &term3 ); + term3 |= 1; + term2 |= doubleZSig0; + add96( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); + } + zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); + } + shift64ExtraRightJamming( zSig0, zSig1, 0, 10, &zSig0, &zSig1, &zSig2 ); + return roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2 ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is equal to +| the corresponding value `b', and 0 otherwise. The comparison is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +flag float64_eq( float64 a, float64 b ) +{ + + if ( ( ( extractFloat64Exp( a ) == 0x7FF ) + && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) ) + || ( ( extractFloat64Exp( b ) == 0x7FF ) + && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) ) + ) { + if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid ); + } + return 0; + } + return + ( a.low == b.low ) + && ( ( a.high == b.high ) + || ( ( a.low == 0 ) + && ( (bits32) ( ( a.high | b.high )<<1 ) == 0 ) ) + ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is less than +| or equal to the corresponding value `b', and 0 otherwise. The comparison +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +flag float64_le( float64 a, float64 b ) +{ + flag aSign, bSign; + + if ( ( ( extractFloat64Exp( a ) == 0x7FF ) + && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) ) + || ( ( extractFloat64Exp( b ) == 0x7FF ) + && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) ) + ) { + float_raise( float_flag_invalid ); + return 0; + } + aSign = extractFloat64Sign( a ); + bSign = extractFloat64Sign( b ); + if ( aSign != bSign ) { + return + aSign + || ( ( ( (bits32) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) + == 0 ); + } + return + aSign ? le64( b.high, b.low, a.high, a.low ) + : le64( a.high, a.low, b.high, b.low ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is less than +| the corresponding value `b', and 0 otherwise. The comparison is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +flag float64_lt( float64 a, float64 b ) +{ + flag aSign, bSign; + + if ( ( ( extractFloat64Exp( a ) == 0x7FF ) + && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) ) + || ( ( extractFloat64Exp( b ) == 0x7FF ) + && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) ) + ) { + float_raise( float_flag_invalid ); + return 0; + } + aSign = extractFloat64Sign( a ); + bSign = extractFloat64Sign( b ); + if ( aSign != bSign ) { + return + aSign + && ( ( ( (bits32) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) + != 0 ); + } + return + aSign ? lt64( b.high, b.low, a.high, a.low ) + : lt64( a.high, a.low, b.high, b.low ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is equal to +| the corresponding value `b', and 0 otherwise. The invalid exception is +| raised if either operand is a NaN. Otherwise, the comparison is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +flag float64_eq_signaling( float64 a, float64 b ) +{ + + if ( ( ( extractFloat64Exp( a ) == 0x7FF ) + && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) ) + || ( ( extractFloat64Exp( b ) == 0x7FF ) + && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) ) + ) { + float_raise( float_flag_invalid ); + return 0; + } + return + ( a.low == b.low ) + && ( ( a.high == b.high ) + || ( ( a.low == 0 ) + && ( (bits32) ( ( a.high | b.high )<<1 ) == 0 ) ) + ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is less than or +| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not +| cause an exception. Otherwise, the comparison is performed according to the +| IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +flag float64_le_quiet( float64 a, float64 b ) +{ + flag aSign, bSign; + + if ( ( ( extractFloat64Exp( a ) == 0x7FF ) + && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) ) + || ( ( extractFloat64Exp( b ) == 0x7FF ) + && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) ) + ) { + if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid ); + } + return 0; + } + aSign = extractFloat64Sign( a ); + bSign = extractFloat64Sign( b ); + if ( aSign != bSign ) { + return + aSign + || ( ( ( (bits32) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) + == 0 ); + } + return + aSign ? le64( b.high, b.low, a.high, a.low ) + : le64( a.high, a.low, b.high, b.low ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is less than +| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an +| exception. Otherwise, the comparison is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +flag float64_lt_quiet( float64 a, float64 b ) +{ + flag aSign, bSign; + + if ( ( ( extractFloat64Exp( a ) == 0x7FF ) + && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) ) + || ( ( extractFloat64Exp( b ) == 0x7FF ) + && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) ) + ) { + if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid ); + } + return 0; + } + aSign = extractFloat64Sign( a ); + bSign = extractFloat64Sign( b ); + if ( aSign != bSign ) { + return + aSign + && ( ( ( (bits32) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) + != 0 ); + } + return + aSign ? lt64( b.high, b.low, a.high, a.low ) + : lt64( a.high, a.low, b.high, b.low ); + +} + diff --git a/software/libbase/softfloat.h b/software/libbase/softfloat.h index c987d563..5eb9707a 100644 --- a/software/libbase/softfloat.h +++ b/software/libbase/softfloat.h @@ -1,120 +1,134 @@ - -/* -=============================================================================== - -This C header file is part of the SoftFloat IEC/IEEE Floating-point -Arithmetic Package, Release 2. - -Written by John R. Hauser. This work was made possible in part by the -International Computer Science Institute, located at Suite 600, 1947 Center -Street, Berkeley, California 94704. Funding was partially provided by the -National Science Foundation under grant MIP-9311980. The original version -of this code was written as part of a project to build a fixed-point vector -processor in collaboration with the University of California at Berkeley, -overseen by Profs. Nelson Morgan and John Wawrzynek. More information -is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ -arithmetic/softfloat.html'. - -THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort -has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT -TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO -PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY -AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. - -Derivative works are acceptable, even for commercial purposes, so long as -(1) they include prominent notice that the work is derivative, and (2) they -include prominent notice akin to these three paragraphs for those parts of -this code that are retained. - -=============================================================================== -*/ - -/* -------------------------------------------------------------------------------- -Software IEC/IEEE floating-point types. -------------------------------------------------------------------------------- -*/ -typedef unsigned int float32; - -/* -------------------------------------------------------------------------------- -Software IEC/IEEE floating-point underflow tininess-detection mode. -------------------------------------------------------------------------------- -*/ -extern int float_detect_tininess; -enum { - float_tininess_after_rounding = 0, - float_tininess_before_rounding = 1 -}; - -/* -------------------------------------------------------------------------------- -Software IEC/IEEE floating-point rounding mode. -------------------------------------------------------------------------------- -*/ -extern int float_rounding_mode; -enum { - float_round_nearest_even = 0, - float_round_to_zero = 1, - float_round_up = 2, - float_round_down = 3 -}; - -/* -------------------------------------------------------------------------------- -Software IEC/IEEE floating-point exception flags. -------------------------------------------------------------------------------- -*/ -extern int float_exception_flags; -enum { - float_flag_inexact = 1, - float_flag_divbyzero = 2, - float_flag_underflow = 4, - float_flag_overflow = 8, - float_flag_invalid = 16 -}; - -/* -------------------------------------------------------------------------------- -Routine to raise any or all of the software IEC/IEEE floating-point -exception flags. -------------------------------------------------------------------------------- -*/ -void float_raise( int ); - -/* -------------------------------------------------------------------------------- -Software IEC/IEEE integer-to-floating-point conversion routines. -------------------------------------------------------------------------------- -*/ -float32 int32_to_float32( int ); - -/* -------------------------------------------------------------------------------- -Software IEC/IEEE single-precision conversion routines. -------------------------------------------------------------------------------- -*/ -int float32_to_int32( float32 ); -int float32_to_int32_round_to_zero( float32 ); - -/* -------------------------------------------------------------------------------- -Software IEC/IEEE single-precision operations. -------------------------------------------------------------------------------- -*/ -float32 float32_round_to_int( float32 ); -float32 float32_add( float32, float32 ); -float32 float32_sub( float32, float32 ); -float32 float32_mul( float32, float32 ); -float32 float32_div( float32, float32 ); -float32 float32_rem( float32, float32 ); -float32 float32_sqrt( float32 ); -flag float32_eq( float32, float32 ); -flag float32_le( float32, float32 ); -flag float32_lt( float32, float32 ); -flag float32_eq_signaling( float32, float32 ); -flag float32_le_quiet( float32, float32 ); -flag float32_lt_quiet( float32, float32 ); -flag float32_is_nan( float32 a ); -flag float32_is_signaling_nan( float32 ); - + +/*============================================================================ + +This C header file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic +Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. + +=============================================================================*/ + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point types. +*----------------------------------------------------------------------------*/ +typedef bits32 float32; +typedef struct { + bits32 high, low; +} float64; + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point underflow tininess-detection mode. +*----------------------------------------------------------------------------*/ +extern int8 float_detect_tininess; +enum { + float_tininess_after_rounding = 0, + float_tininess_before_rounding = 1 +}; + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point rounding mode. +*----------------------------------------------------------------------------*/ +extern int8 float_rounding_mode; +enum { + float_round_nearest_even = 0, + float_round_to_zero = 1, + float_round_down = 2, + float_round_up = 3 +}; + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point exception flags. +*----------------------------------------------------------------------------*/ +extern int8 float_exception_flags; +enum { + float_flag_inexact = 1, + float_flag_underflow = 2, + float_flag_overflow = 4, + float_flag_divbyzero = 8, + float_flag_invalid = 16 +}; + +/*---------------------------------------------------------------------------- +| Routine to raise any or all of the software IEC/IEEE floating-point +| exception flags. +*----------------------------------------------------------------------------*/ +void float_raise( int8 ); + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE integer-to-floating-point conversion routines. +*----------------------------------------------------------------------------*/ +float32 int32_to_float32( int32 ); +float64 int32_to_float64( int32 ); + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE single-precision conversion routines. +*----------------------------------------------------------------------------*/ +int32 float32_to_int32( float32 ); +int32 float32_to_int32_round_to_zero( float32 ); +float64 float32_to_float64( float32 ); + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE single-precision operations. +*----------------------------------------------------------------------------*/ +float32 float32_round_to_int( float32 ); +float32 float32_add( float32, float32 ); +float32 float32_sub( float32, float32 ); +float32 float32_mul( float32, float32 ); +float32 float32_div( float32, float32 ); +float32 float32_rem( float32, float32 ); +float32 float32_sqrt( float32 ); +flag float32_eq( float32, float32 ); +flag float32_le( float32, float32 ); +flag float32_lt( float32, float32 ); +flag float32_eq_signaling( float32, float32 ); +flag float32_le_quiet( float32, float32 ); +flag float32_lt_quiet( float32, float32 ); +flag float32_is_signaling_nan( float32 ); + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE double-precision conversion routines. +*----------------------------------------------------------------------------*/ +int32 float64_to_int32( float64 ); +int32 float64_to_int32_round_to_zero( float64 ); +float32 float64_to_float32( float64 ); + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE double-precision operations. +*----------------------------------------------------------------------------*/ +float64 float64_round_to_int( float64 ); +float64 float64_add( float64, float64 ); +float64 float64_sub( float64, float64 ); +float64 float64_mul( float64, float64 ); +float64 float64_div( float64, float64 ); +float64 float64_rem( float64, float64 ); +float64 float64_sqrt( float64 ); +flag float64_eq( float64, float64 ); +flag float64_le( float64, float64 ); +flag float64_lt( float64, float64 ); +flag float64_eq_signaling( float64, float64 ); +flag float64_le_quiet( float64, float64 ); +flag float64_lt_quiet( float64, float64 ); +flag float64_is_signaling_nan( float64 ); +