From 1753dde328506233dea5f3d584c3936cc8dd54a2 Mon Sep 17 00:00:00 2001 From: Luke Kenneth Casson Leighton Date: Thu, 7 Jul 2022 16:19:20 +0100 Subject: [PATCH] tables, move discussion on transcendentals --- openpower/transcendentals.mdwn | 238 ++++------------------ openpower/transcendentals/discussion.mdwn | 169 +++++++++++++++ 2 files changed, 214 insertions(+), 193 deletions(-) create mode 100644 openpower/transcendentals/discussion.mdwn diff --git a/openpower/transcendentals.mdwn b/openpower/transcendentals.mdwn index a789982d9..271ed7b00 100644 --- a/openpower/transcendentals.mdwn +++ b/openpower/transcendentals.mdwn @@ -22,7 +22,6 @@ See: * * -* Discussion: * [[power_trans_ops]] for opcode listing. Extension subsets: @@ -146,51 +145,50 @@ IEEE754-2019 Table 9.1 lists "additional mathematical operations". Interestingly the only functions missing when compared to OpenCL are compound, exp2m1, exp10m1, log2p1, log10p1, pown (integer power) and powr. -[[!table data=""" -opcode | OpenCL FP32 | OpenCL FP16 | OpenCL native | OpenCL fast | IEEE754 | Power ISA | -FSIN | sin | half\_sin | native\_sin | NONE | sin | NONE | -FCOS | cos | half\_cos | native\_cos | NONE | cos | NONE | -FTAN | tan | half\_tan | native\_tan | NONE | tan | NONE | -NONE (1) | sincos | NONE | NONE | NONE | NONE | NONE | -FASIN | asin | NONE | NONE | NONE | asin | NONE | -FACOS | acos | NONE | NONE | NONE | acos | NONE | -FATAN | atan | NONE | NONE | NONE | atan | NONE | -FSINPI | sinpi | NONE | NONE | NONE | sinPi | NONE | -FCOSPI | cospi | NONE | NONE | NONE | cosPi | NONE | -FTANPI | tanpi | NONE | NONE | NONE | tanPi | NONE | -FASINPI | asinpi | NONE | NONE | NONE | asinPi | NONE | -FACOSPI | acospi | NONE | NONE | NONE | acosPi | NONE | -FATANPI | atanpi | NONE | NONE | NONE | atanPi | NONE | -FSINH | sinh | NONE | NONE | NONE | sinh | NONE | -FCOSH | cosh | NONE | NONE | NONE | cosh | NONE | -FTANH | tanh | NONE | NONE | NONE | tanh | NONE | -FASINH | asinh | NONE | NONE | NONE | asinh | 
NONE | -FACOSH | acosh | NONE | NONE | NONE | acosh | NONE | -FATANH | atanh | NONE | NONE | NONE | atanh | NONE | -FATAN2 | atan2 | NONE | NONE | NONE | atan2 | NONE | -FATAN2PI | atan2pi | NONE | NONE | NONE | atan2pi | NONE | -FRSQRT | rsqrt | half\_rsqrt | native\_rsqrt | NONE | rSqrt | fsqrte, fsqrtes (4) | -FCBRT | cbrt | NONE | NONE | NONE | NONE (2) | NONE | -FEXP2 | exp2 | half\_exp2 | native\_exp2 | NONE | exp2 | NONE | -FLOG2 | log2 | half\_log2 | native\_log2 | NONE | log2 | NONE | -FEXPM1 | expm1 | NONE | NONE | NONE | expm1 | NONE | -FLOG1P | log1p | NONE | NONE | NONE | logp1 | NONE | -FEXP | exp | half\_exp | native\_exp | NONE | exp | NONE | -FLOG | log | half\_log | native\_log | NONE | log | NONE | -FEXP10 | exp10 | half\_exp10 | native\_exp10 | NONE | exp10 | NONE | -FLOG10 | log10 | half\_log10 | native\_log10 | NONE | log10 | NONE | -FPOW | pow | NONE | NONE | NONE | pow | NONE | -FPOWN | pown | NONE | NONE | NONE | pown | NONE | -FPOWR | powr | half\_powr | native\_powr | NONE | powr | NONE | -FROOTN | rootn | NONE | NONE | NONE | rootn | NONE | -FHYPOT | hypot | NONE | NONE | NONE | hypot | NONE | -FRECIP | NONE | half\_recip | native\_recip | NONE | NONE (3) | fre, fres (4) | -NONE | NONE | NONE | NONE | NONE | compound | NONE | -NONE | NONE | NONE | NONE | NONE | exp2m1 | NONE | -NONE | NONE | NONE | NONE | NONE | exp10m1 | NONE | -NONE | NONE | NONE | NONE | NONE | log2p1 | NONE | -NONE | NONE | NONE | NONE | NONE | log10p1 | NONE | -"""]] +|opcode |OpenCL FP32|OpenCL FP16|OpenCL native|OpenCL fast|IEEE754 |Power ISA | +|------- |-----------|-----------|-------------|-----------|------- |--------- | +|FSIN |sin |half\_sin |native\_sin |NONE |sin |NONE | +|FCOS |cos |half\_cos |native\_cos |NONE |cos |NONE | +|FTAN |tan |half\_tan |native\_tan |NONE |tan |NONE | +|NONE (1)|sincos |NONE |NONE |NONE |NONE |NONE | +|FASIN |asin |NONE |NONE |NONE |asin |NONE | +|FACOS |acos |NONE |NONE |NONE |acos |NONE | +|FATAN |atan |NONE |NONE |NONE |atan 
|NONE | +|FSINPI |sinpi |NONE |NONE |NONE |sinPi |NONE | +|FCOSPI |cospi |NONE |NONE |NONE |cosPi |NONE | +|FTANPI |tanpi |NONE |NONE |NONE |tanPi |NONE | +|FASINPI |asinpi |NONE |NONE |NONE |asinPi |NONE | +|FACOSPI |acospi |NONE |NONE |NONE |acosPi |NONE | +|FATANPI |atanpi |NONE |NONE |NONE |atanPi |NONE | +|FSINH |sinh |NONE |NONE |NONE |sinh |NONE | +|FCOSH |cosh |NONE |NONE |NONE |cosh |NONE | +|FTANH |tanh |NONE |NONE |NONE |tanh |NONE | +|FASINH |asinh |NONE |NONE |NONE |asinh |NONE | +|FACOSH |acosh |NONE |NONE |NONE |acosh |NONE | +|FATANH |atanh |NONE |NONE |NONE |atanh |NONE | +|FATAN2 |atan2 |NONE |NONE |NONE |atan2 |NONE | +|FATAN2PI|atan2pi |NONE |NONE |NONE |atan2pi |NONE | +|FRSQRT |rsqrt |half\_rsqrt|native\_rsqrt|NONE |rSqrt |fsqrte, fsqrtes (4) | +|FCBRT |cbrt |NONE |NONE |NONE |NONE (2)|NONE | +|FEXP2 |exp2 |half\_exp2 |native\_exp2 |NONE |exp2 |NONE | +|FLOG2 |log2 |half\_log2 |native\_log2 |NONE |log2 |NONE | +|FEXPM1 |expm1 |NONE |NONE |NONE |expm1 |NONE | +|FLOG1P |log1p |NONE |NONE |NONE |logp1 |NONE | +|FEXP |exp |half\_exp |native\_exp |NONE |exp |NONE | +|FLOG |log |half\_log |native\_log |NONE |log |NONE | +|FEXP10 |exp10 |half\_exp10|native\_exp10|NONE |exp10 |NONE | +|FLOG10 |log10 |half\_log10|native\_log10|NONE |log10 |NONE | +|FPOW |pow |NONE |NONE |NONE |pow |NONE | +|FPOWN |pown |NONE |NONE |NONE |pown |NONE | +|FPOWR |powr |half\_powr |native\_powr |NONE |powr |NONE | +|FROOTN |rootn |NONE |NONE |NONE |rootn |NONE | +|FHYPOT |hypot |NONE |NONE |NONE |hypot |NONE | +|FRECIP |NONE |half\_recip|native\_recip|NONE |NONE (3)|fre, fres (4) | +|NONE |NONE |NONE |NONE |NONE |compound|NONE | +|NONE |NONE |NONE |NONE |NONE |exp2m1 |NONE | +|NONE |NONE |NONE |NONE |NONE |exp10m1 |NONE | +|NONE |NONE |NONE |NONE |NONE |log2p1 |NONE | +|NONE |NONE |NONE |NONE |NONE |log10p1 |NONE | Note (1) FSINCOS is macro-op fused (see below). 
@@ -437,151 +435,5 @@ high-performance or correctly-rounding): # Evaluation and commentary -This section will move later to discussion. +Moved to [[discussion]] -## Reciprocal - -Used to be an alias. Some implementors may wish to implement divide as -y times recip(x). - -Others may have shared hardware for recip and divide, others may not. - -To avoid penalising one implementor over another, recip stays. - -## To evaluate: should LOG be replaced with LOG1P (and EXP with EXPM1)? - -RISC principle says "exclude LOG because it's covered by LOGP1 plus an ADD". -Research needed to ensure that implementors are not compromised by such -a decision - - -> > correctly-rounded LOG will return different results than LOGP1 and ADD. -> > Likewise for EXP and EXPM1 - -> ok, they stay in as real opcodes, then. - -## ATAN / ATAN2 commentary - -Discussion starts here: - - -from Mitch Alsup: - -would like to point out that the general implementations of ATAN2 do a -bunch of special case checks and then simply call ATAN. - - double ATAN2( double y, double x ) - { // IEEE 754-2008 quality ATAN2 - - // deal with NANs - if( ISNAN( x ) ) return x; - if( ISNAN( y ) ) return y; - - // deal with infinities - if( x == +∞ && |y|== +∞ ) return copysign( π/4, y ); - if( x == +∞ ) return copysign( 0.0, y ); - if( x == -∞ && |y|== +∞ ) return copysign( 3π/4, y ); - if( x == -∞ ) return copysign( π, y ); - if( |y|== +∞ ) return copysign( π/2, y ); - - // deal with signed zeros - if( x == 0.0 && y != 0.0 ) return copysign( π/2, y ); - if( x >=+0.0 && y == 0.0 ) return copysign( 0.0, y ); - if( x <=-0.0 && y == 0.0 ) return copysign( π, y ); - - // calculate ATAN2 textbook style - if( x > 0.0 ) return ATAN( |y / x| ); - if( x < 0.0 ) return π - ATAN( |y / x| ); - } - - -Yet the proposed encoding makes ATAN2 the primitive and has ATAN invent -a constant and then call/use ATAN2. 
- -When one considers an implementation of ATAN, one must consider several -ranges of evaluation:: - - x [ -∞, -1.0]:: ATAN( x ) = -π/2 + ATAN( 1/x ); - x (-1.0, +1.0]:: ATAN( x ) = + ATAN( x ); - x [ 1.0, +∞]:: ATAN( x ) = +π/2 - ATAN( 1/x ); - -I should point out that the add/sub of π/2 can not lose significance -since the result of ATAN(1/x) is bounded 0..π/2 - -The bottom line is that I think you are choosing to make too many of -these into OpCodes, making the hardware function/calculation unit (and -sequencer) more complicated that necessary. - --------------------------------------------------------- - -We therefore I think have a case for bringing back ATAN and including ATAN2. - -The reason is that whilst a microcode-like GPU-centric platform would -do ATAN2 in terms of ATAN, a UNIX-centric platform would do it the other -way round. - -(that is the hypothesis, to be evaluated for correctness. feedback requested). - -This because we cannot compromise or prioritise one platfrom's -speed/accuracy over another. That is not reasonable or desirable, to -penalise one implementor over another. - -Thus, all implementors, to keep interoperability, must both have both -opcodes and may choose, at the architectural and routing level, which -one to implement in terms of the other. - -Allowing implementors to choose to add either opcode and let traps sort it -out leaves an uncertainty in the software developer's mind: they cannot -trust the hardware, available from many vendors, to be performant right -across the board. - -Standards are a pig. 
- ---- - -I might suggest that if there were a way for a calculation to be performed -and the result of that calculation chained to a subsequent calculation -such that the precision of the result-becomes-operand is wider than -what will fit in a register, then you can dramatically reduce the count -of instructions in this category while retaining - -acceptable accuracy: - - z = x / y - -can be calculated as:: - - z = x * (1/y) - -Where 1/y has about 26-to-32 bits of fraction. No, it's not IEEE 754-2008 -accurate, but GPUs want speed and - -1/y is fully pipelined (F32) while x/y cannot be (at reasonable area). It -is also not "that inaccurate" displaying 0.625-to-0.52 ULP. - -Given that one has the ability to carry (and process) more fraction bits, -one can then do high precision multiplies of π or other transcendental -radixes. - -And GPUs have been doing this almost since the dawn of 3D. - - // calculate ATAN2 high performance style - // Note: at this point x != y - // - if( x > 0.0 ) - { - if( y < 0.0 && |y| < |x| ) return - π/2 - ATAN( x / y ); - if( y < 0.0 && |y| > |x| ) return + ATAN( y / x ); - if( y > 0.0 && |y| < |x| ) return + ATAN( y / x ); - if( y > 0.0 && |y| > |x| ) return + π/2 - ATAN( x / y ); - } - if( x < 0.0 ) - { - if( y < 0.0 && |y| < |x| ) return + π/2 + ATAN( x / y ); - if( y < 0.0 && |y| > |x| ) return + π - ATAN( y / x ); - if( y > 0.0 && |y| < |x| ) return + π - ATAN( y / x ); - if( y > 0.0 && |y| > |x| ) return +3π/2 + ATAN( x / y ); - } - -This way the adds and subtracts from the constant are not in a precision -precarious position. diff --git a/openpower/transcendentals/discussion.mdwn b/openpower/transcendentals/discussion.mdwn new file mode 100644 index 000000000..f8dfa2434 --- /dev/null +++ b/openpower/transcendentals/discussion.mdwn @@ -0,0 +1,169 @@ +# Discussion + +* +* +* Discussion: +* [[power_trans_ops]] for opcode listing. + +TODO: + +* Decision on accuracy, moved to [[zfpacc_proposal]] + +* Errors **MUST** be repeatable. 
+* How about four Platform Specifications? 3DUNIX, UNIX, 3DEmbedded and Embedded? + + Accuracy requirements for dual (triple) purpose implementations must + meet the higher standard. +* Reciprocal Square-root is in its own separate extension (Zfrsqrt) as + it is desirable on its own by other implementors. This to be evaluated. + +# Evaluation and commentary + +This section now in discussion + +## Reciprocal + +Used to be an alias. Some implementors may wish to implement divide as +y times recip(x). + +Others may have shared hardware for recip and divide, others may not. + +To avoid penalising one implementor over another, recip stays. + +## To evaluate: should LOG be replaced with LOG1P (and EXP with EXPM1)? + +RISC principle says "exclude LOG because it's covered by LOGP1 plus an ADD". +Research needed to ensure that implementors are not compromised by such +a decision + + +> > correctly-rounded LOG will return different results than LOGP1 and ADD. +> > Likewise for EXP and EXPM1 + +> ok, they stay in as real opcodes, then. + +## ATAN / ATAN2 commentary + +Discussion starts here: + + +from Mitch Alsup: + +would like to point out that the general implementations of ATAN2 do a +bunch of special case checks and then simply call ATAN. 
+ + double ATAN2( double y, double x ) + { // IEEE 754-2008 quality ATAN2 + + // deal with NANs + if( ISNAN( x ) ) return x; + if( ISNAN( y ) ) return y; + + // deal with infinities + if( x == +∞ && |y|== +∞ ) return copysign( π/4, y ); + if( x == +∞ ) return copysign( 0.0, y ); + if( x == -∞ && |y|== +∞ ) return copysign( 3π/4, y ); + if( x == -∞ ) return copysign( π, y ); + if( |y|== +∞ ) return copysign( π/2, y ); + + // deal with signed zeros + if( x == 0.0 && y != 0.0 ) return copysign( π/2, y ); + if( x >=+0.0 && y == 0.0 ) return copysign( 0.0, y ); + if( x <=-0.0 && y == 0.0 ) return copysign( π, y ); + + // calculate ATAN2 textbook style + if( x > 0.0 ) return ATAN( |y / x| ); + if( x < 0.0 ) return π - ATAN( |y / x| ); + } + + +Yet the proposed encoding makes ATAN2 the primitive and has ATAN invent +a constant and then call/use ATAN2. + +When one considers an implementation of ATAN, one must consider several +ranges of evaluation:: + + x [ -∞, -1.0]:: ATAN( x ) = -π/2 + ATAN( 1/x ); + x (-1.0, +1.0]:: ATAN( x ) = + ATAN( x ); + x [ 1.0, +∞]:: ATAN( x ) = +π/2 - ATAN( 1/x ); + +I should point out that the add/sub of π/2 can not lose significance +since the result of ATAN(1/x) is bounded 0..π/2 + +The bottom line is that I think you are choosing to make too many of +these into OpCodes, making the hardware function/calculation unit (and +sequencer) more complicated than necessary. + +-------------------------------------------------------- + +We therefore I think have a case for bringing back ATAN and including ATAN2. + +The reason is that whilst a microcode-like GPU-centric platform would +do ATAN2 in terms of ATAN, a UNIX-centric platform would do it the other +way round. + +(that is the hypothesis, to be evaluated for correctness. feedback requested). + +This because we cannot compromise or prioritise one platform's +speed/accuracy over another. That is not reasonable or desirable, to +penalise one implementor over another. 
+ +Thus, all implementors, to keep interoperability, must have both +opcodes and may choose, at the architectural and routing level, which +one to implement in terms of the other. + +Allowing implementors to choose to add either opcode and let traps sort it +out leaves an uncertainty in the software developer's mind: they cannot +trust the hardware, available from many vendors, to be performant right +across the board. + +Standards are a pig. + +--- + +I might suggest that if there were a way for a calculation to be performed +and the result of that calculation chained to a subsequent calculation +such that the precision of the result-becomes-operand is wider than +what will fit in a register, then you can dramatically reduce the count +of instructions in this category while retaining + +acceptable accuracy: + + z = x / y + +can be calculated as:: + + z = x * (1/y) + +Where 1/y has about 26-to-32 bits of fraction. No, it's not IEEE 754-2008 +accurate, but GPUs want speed and + +1/y is fully pipelined (F32) while x/y cannot be (at reasonable area). It +is also not "that inaccurate" displaying 0.625-to-0.52 ULP. + +Given that one has the ability to carry (and process) more fraction bits, +one can then do high precision multiplies of π or other transcendental +radixes. + +And GPUs have been doing this almost since the dawn of 3D. 
+ + // calculate ATAN2 high performance style + // Note: at this point x != y + // + if( x > 0.0 ) + { + if( y < 0.0 && |y| < |x| ) return - π/2 - ATAN( x / y ); + if( y < 0.0 && |y| > |x| ) return + ATAN( y / x ); + if( y > 0.0 && |y| < |x| ) return + ATAN( y / x ); + if( y > 0.0 && |y| > |x| ) return + π/2 - ATAN( x / y ); + } + if( x < 0.0 ) + { + if( y < 0.0 && |y| < |x| ) return + π/2 + ATAN( x / y ); + if( y < 0.0 && |y| > |x| ) return + π - ATAN( y / x ); + if( y > 0.0 && |y| < |x| ) return + π - ATAN( y / x ); + if( y > 0.0 && |y| > |x| ) return +3π/2 + ATAN( x / y ); + } + +This way the adds and subtracts from the constant are not in a precision +precarious position. -- 2.30.2