From 1753dde328506233dea5f3d584c3936cc8dd54a2 Mon Sep 17 00:00:00 2001
From: Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Date: Thu, 7 Jul 2022 16:19:20 +0100
Subject: [PATCH] tables, move discussion on transcendentals

---
 openpower/transcendentals.mdwn            | 238 ++++------------------
 openpower/transcendentals/discussion.mdwn | 169 +++++++++++++++
 2 files changed, 214 insertions(+), 193 deletions(-)
 create mode 100644 openpower/transcendentals/discussion.mdwn

diff --git a/openpower/transcendentals.mdwn b/openpower/transcendentals.mdwn
index a789982d9..271ed7b00 100644
--- a/openpower/transcendentals.mdwn
+++ b/openpower/transcendentals.mdwn
@@ -22,7 +22,6 @@ See:
 
 * <http://bugs.libre-soc.org/show_bug.cgi?id=127>
 * <https://www.khronos.org/registry/spir-v/specs/unified1/OpenCL.ExtendedInstructionSet.100.html>
-* Discussion: <http://lists.libre-riscv.org/pipermail/libre-riscv-dev/2019-August/002342.html>
 * [[power_trans_ops]] for opcode listing.
 
 Extension subsets:
@@ -146,51 +145,50 @@ IEEE754-2019 Table 9.1 lists "additional mathematical operations".
 Interestingly the only functions missing when compared to OpenCL are
 compound, exp2m1, exp10m1, log2p1, log10p1, pown (integer power) and powr.
 
-[[!table data="""
-opcode   | OpenCL FP32 | OpenCL FP16 | OpenCL native | OpenCL fast | IEEE754  | Power ISA |
-FSIN     | sin         | half\_sin   | native\_sin   | NONE        | sin      | NONE      |
-FCOS     | cos         | half\_cos   | native\_cos   | NONE        | cos      | NONE      |
-FTAN     | tan         | half\_tan   | native\_tan   | NONE        | tan      | NONE      |
-NONE (1) | sincos      | NONE        | NONE          | NONE        | NONE     | NONE      |
-FASIN    | asin        | NONE        | NONE          | NONE        | asin     | NONE      |
-FACOS    | acos        | NONE        | NONE          | NONE        | acos     | NONE      |
-FATAN    | atan        | NONE        | NONE          | NONE        | atan     | NONE      |
-FSINPI   | sinpi       | NONE        | NONE          | NONE        | sinPi    | NONE      |
-FCOSPI   | cospi       | NONE        | NONE          | NONE        | cosPi    | NONE      |
-FTANPI   | tanpi       | NONE        | NONE          | NONE        | tanPi    | NONE      |
-FASINPI  | asinpi      | NONE        | NONE          | NONE        | asinPi   | NONE      |
-FACOSPI  | acospi      | NONE        | NONE          | NONE        | acosPi   | NONE      |
-FATANPI  | atanpi      | NONE        | NONE          | NONE        | atanPi   | NONE      |
-FSINH    | sinh        | NONE        | NONE          | NONE        | sinh     | NONE      |
-FCOSH    | cosh        | NONE        | NONE          | NONE        | cosh     | NONE      |
-FTANH    | tanh        | NONE        | NONE          | NONE        | tanh     | NONE      |
-FASINH   | asinh       | NONE        | NONE          | NONE        | asinh    | NONE      |
-FACOSH   | acosh       | NONE        | NONE          | NONE        | acosh    | NONE      |
-FATANH   | atanh       | NONE        | NONE          | NONE        | atanh    | NONE      |
-FATAN2   | atan2       | NONE        | NONE          | NONE        | atan2    | NONE      |
-FATAN2PI | atan2pi     | NONE        | NONE          | NONE        | atan2pi  | NONE      |
-FRSQRT   | rsqrt       | half\_rsqrt | native\_rsqrt | NONE        | rSqrt    | fsqrte, fsqrtes (4)   |
-FCBRT    | cbrt        | NONE        | NONE          | NONE        | NONE (2) | NONE      |
-FEXP2    | exp2        | half\_exp2  | native\_exp2  | NONE        | exp2     | NONE      |
-FLOG2    | log2        | half\_log2  | native\_log2  | NONE        | log2     | NONE      |
-FEXPM1   | expm1       | NONE        | NONE          | NONE        | expm1    | NONE      |
-FLOG1P   | log1p       | NONE        | NONE          | NONE        | logp1    | NONE      |
-FEXP     | exp         | half\_exp   | native\_exp   | NONE        | exp      | NONE      |
-FLOG     | log         | half\_log   | native\_log   | NONE        | log      | NONE      |
-FEXP10   | exp10       | half\_exp10 | native\_exp10 | NONE        | exp10    | NONE      |
-FLOG10   | log10       | half\_log10 | native\_log10 | NONE        | log10    | NONE      |
-FPOW     | pow         | NONE        | NONE          | NONE        | pow      | NONE      |
-FPOWN    | pown        | NONE        | NONE          | NONE        | pown     | NONE      |
-FPOWR    | powr        | half\_powr  | native\_powr  | NONE        | powr     | NONE      |
-FROOTN   | rootn       | NONE        | NONE          | NONE        | rootn    | NONE      |
-FHYPOT   | hypot       | NONE        | NONE          | NONE        | hypot    | NONE      |
-FRECIP   | NONE        | half\_recip | native\_recip | NONE        | NONE (3) | fre, fres (4)        |
-NONE     | NONE        | NONE        | NONE          | NONE        | compound | NONE      |
-NONE     | NONE        | NONE        | NONE          | NONE        | exp2m1   | NONE      |
-NONE     | NONE        | NONE        | NONE          | NONE        | exp10m1  | NONE      |
-NONE     | NONE        | NONE        | NONE          | NONE        | log2p1   | NONE      |
-NONE     | NONE        | NONE        | NONE          | NONE        | log10p1  | NONE      |
-"""]]
+|opcode  |OpenCL FP32|OpenCL FP16|OpenCL native|OpenCL fast|IEEE754 |Power ISA |
+|------- |-----------|-----------|-------------|-----------|------- |--------- |
+|FSIN    |sin        |half\_sin  |native\_sin  |NONE       |sin     |NONE      |
+|FCOS    |cos        |half\_cos  |native\_cos  |NONE       |cos     |NONE      |
+|FTAN    |tan        |half\_tan  |native\_tan  |NONE       |tan     |NONE      |
+|NONE (1)|sincos     |NONE       |NONE         |NONE       |NONE    |NONE      |
+|FASIN   |asin       |NONE       |NONE         |NONE       |asin    |NONE      |
+|FACOS   |acos       |NONE       |NONE         |NONE       |acos    |NONE      |
+|FATAN   |atan       |NONE       |NONE         |NONE       |atan    |NONE      |
+|FSINPI  |sinpi      |NONE       |NONE         |NONE       |sinPi   |NONE      |
+|FCOSPI  |cospi      |NONE       |NONE         |NONE       |cosPi   |NONE      |
+|FTANPI  |tanpi      |NONE       |NONE         |NONE       |tanPi   |NONE      |
+|FASINPI |asinpi     |NONE       |NONE         |NONE       |asinPi  |NONE      |
+|FACOSPI |acospi     |NONE       |NONE         |NONE       |acosPi  |NONE      |
+|FATANPI |atanpi     |NONE       |NONE         |NONE       |atanPi  |NONE      |
+|FSINH   |sinh       |NONE       |NONE         |NONE       |sinh    |NONE      |
+|FCOSH   |cosh       |NONE       |NONE         |NONE       |cosh    |NONE      |
+|FTANH   |tanh       |NONE       |NONE         |NONE       |tanh    |NONE      |
+|FASINH  |asinh      |NONE       |NONE         |NONE       |asinh   |NONE      |
+|FACOSH  |acosh      |NONE       |NONE         |NONE       |acosh   |NONE      |
+|FATANH  |atanh      |NONE       |NONE         |NONE       |atanh   |NONE      |
+|FATAN2  |atan2      |NONE       |NONE         |NONE       |atan2   |NONE      |
+|FATAN2PI|atan2pi    |NONE       |NONE         |NONE       |atan2pi |NONE      |
+|FRSQRT  |rsqrt      |half\_rsqrt|native\_rsqrt|NONE       |rSqrt   |fsqrte, fsqrtes (4)   |
+|FCBRT   |cbrt       |NONE       |NONE         |NONE       |NONE (2)|NONE      |
+|FEXP2   |exp2       |half\_exp2 |native\_exp2 |NONE       |exp2    |NONE      |
+|FLOG2   |log2       |half\_log2 |native\_log2 |NONE       |log2    |NONE      |
+|FEXPM1  |expm1      |NONE       |NONE         |NONE       |expm1   |NONE      |
+|FLOG1P  |log1p      |NONE       |NONE         |NONE       |logp1   |NONE      |
+|FEXP    |exp        |half\_exp  |native\_exp  |NONE       |exp     |NONE      |
+|FLOG    |log        |half\_log  |native\_log  |NONE       |log     |NONE      |
+|FEXP10  |exp10      |half\_exp10|native\_exp10|NONE       |exp10   |NONE      |
+|FLOG10  |log10      |half\_log10|native\_log10|NONE       |log10   |NONE      |
+|FPOW    |pow        |NONE       |NONE         |NONE       |pow     |NONE      |
+|FPOWN   |pown       |NONE       |NONE         |NONE       |pown    |NONE      |
+|FPOWR   |powr       |half\_powr |native\_powr |NONE       |powr    |NONE      |
+|FROOTN  |rootn      |NONE       |NONE         |NONE       |rootn   |NONE      |
+|FHYPOT  |hypot      |NONE       |NONE         |NONE       |hypot   |NONE      |
+|FRECIP  |NONE       |half\_recip|native\_recip|NONE       |NONE (3)|fre, fres (4)        |
+|NONE    |NONE       |NONE       |NONE         |NONE       |compound|NONE      |
+|NONE    |NONE       |NONE       |NONE         |NONE       |exp2m1  |NONE      |
+|NONE    |NONE       |NONE       |NONE         |NONE       |exp10m1 |NONE      |
+|NONE    |NONE       |NONE       |NONE         |NONE       |log2p1  |NONE      |
+|NONE    |NONE       |NONE       |NONE         |NONE       |log10p1 |NONE      |
 
 Note (1) FSINCOS is macro-op fused (see below).
 
@@ -437,151 +435,5 @@ high-performance or correctly-rounding):
 
 # Evaluation and commentary
 
-This section will move later to discussion.
+Moved to [[discussion]]
 
-## Reciprocal
-
-Used to be an alias. Some implementors may wish to implement divide as
-y times recip(x).
-
-Others may have shared hardware for recip and divide, others may not.
-
-To avoid penalising one implementor over another, recip stays.
-
-## To evaluate: should LOG be replaced with LOG1P (and EXP with EXPM1)?
-
-RISC principle says "exclude LOG because it's covered by LOGP1 plus an ADD".
-Research needed to ensure that implementors are not compromised by such
-a decision
-<http://lists.libre-riscv.org/pipermail/libre-riscv-dev/2019-August/002358.html>
-
-> > correctly-rounded LOG will return different results than LOGP1 and ADD.
-> > Likewise for EXP and EXPM1
-
-> ok, they stay in as real opcodes, then.
-
-## ATAN / ATAN2 commentary
-
-Discussion starts here:
-<http://lists.libre-riscv.org/pipermail/libre-riscv-dev/2019-August/002470.html>
-
-from Mitch Alsup:
-
-would like to point out that the general implementations of ATAN2 do a
-bunch of special case checks and then simply call ATAN.
-
-    double ATAN2( double y, double x )
-    {   // IEEE 754-2008 quality ATAN2
-
-        // deal with NANs
-        if( ISNAN( x )             ) return x;
-        if( ISNAN( y )             ) return y;
-
-        // deal with infinities
-        if( x == +â    && |y|== +â  ) return copysign(  Ï/4, y );
-        if( x == +â                 ) return copysign(  0.0, y );
-        if( x == -â    && |y|== +â  ) return copysign( 3Ï/4, y );
-        if( x == -â                 ) return copysign(    Ï, y );
-        if(               |y|== +â  ) return copysign(  Ï/2, y );
-
-        // deal with signed zeros
-        if( x == 0.0  &&  y != 0.0 ) return copysign(  Ï/2, y );
-        if( x >=+0.0  &&  y == 0.0 ) return copysign(  0.0, y );
-        if( x <=-0.0  &&  y == 0.0 ) return copysign(    Ï, y );
-
-        // calculate ATAN2 textbook style
-        if( x  > 0.0               ) return     ATAN( |y / x| );
-        if( x  < 0.0               ) return Ï - ATAN( |y / x| );
-    }
-
-
-Yet the proposed encoding makes ATAN2 the primitive and has ATAN invent
-a constant and then call/use ATAN2.
-
-When one considers an implementation of ATAN, one must consider several
-ranges of evaluation::
-
-     x  [  -â, -1.0]:: ATAN( x ) = -Ï/2 + ATAN( 1/x );
-     x  (-1.0, +1.0]:: ATAN( x ) =      + ATAN(   x );
-     x  [ 1.0,   +â]:: ATAN( x ) = +Ï/2 - ATAN( 1/x );
-
-I should point out that the add/sub of Ï/2 can not lose significance
-since the result of ATAN(1/x) is bounded 0..Ï/2
-
-The bottom line is that I think you are choosing to make too many of
-these into OpCodes, making the hardware function/calculation unit (and
-sequencer) more complicated that necessary.
-
---------------------------------------------------------
-
-We therefore I think have a case for bringing back ATAN and including ATAN2.
-
-The reason is that whilst a microcode-like GPU-centric platform would
-do ATAN2 in terms of ATAN, a UNIX-centric platform would do it the other
-way round.
-
-(that is the hypothesis, to be evaluated for correctness. feedback requested).
-
-This because we cannot compromise or prioritise one platfrom's
-speed/accuracy over another. That is not reasonable or desirable, to
-penalise one implementor over another.
-
-Thus, all implementors, to keep interoperability, must both have both
-opcodes and may choose, at the architectural and routing level, which
-one to implement in terms of the other.
-
-Allowing implementors to choose to add either opcode and let traps sort it
-out leaves an uncertainty in the software developer's mind: they cannot
-trust the hardware, available from many vendors, to be performant right
-across the board.
-
-Standards are a pig.
-
----
-
-I might suggest that if there were a way for a calculation to be performed
-and the result of that calculation chained to a subsequent calculation
-such that the precision of the result-becomes-operand is wider than
-what will fit in a register, then you can dramatically reduce the count
-of instructions in this category while retaining
-
-acceptable accuracy:
-
-     z = x / y
-
-can be calculated as::
-
-     z = x * (1/y)
-
-Where 1/y has about 26-to-32 bits of fraction. No, it's not IEEE 754-2008
-accurate, but GPUs want speed and
-
-1/y is fully pipelined (F32) while x/y cannot be (at reasonable area). It
-is also not "that inaccurate" displaying 0.625-to-0.52 ULP.
-
-Given that one has the ability to carry (and process) more fraction bits,
-one can then do high precision multiplies of  Ï or other transcendental
-radixes.
-
-And GPUs have been doing this almost since the dawn of 3D.
-
-    // calculate ATAN2 high performance style
-    // Note: at this point x != y
-    //
-    if( x  > 0.0             )
-    {
-        if( y < 0.0 && |y| < |x| ) return - Ï/2 - ATAN( x / y );
-        if( y < 0.0 && |y| > |x| ) return       + ATAN( y / x );
-        if( y > 0.0 && |y| < |x| ) return       + ATAN( y / x );
-        if( y > 0.0 && |y| > |x| ) return + Ï/2 - ATAN( x / y );
-    }
-    if( x  < 0.0             )
-    {
-        if( y < 0.0 && |y| < |x| ) return + Ï/2 + ATAN( x / y );
-        if( y < 0.0 && |y| > |x| ) return + Ï   - ATAN( y / x );
-        if( y > 0.0 && |y| < |x| ) return + Ï   - ATAN( y / x );
-        if( y > 0.0 && |y| > |x| ) return +3Ï/2 + ATAN( x / y );
-    }
-
-This way the adds and subtracts from the constant are not in a precision
-precarious position.
diff --git a/openpower/transcendentals/discussion.mdwn b/openpower/transcendentals/discussion.mdwn
new file mode 100644
index 000000000..f8dfa2434
--- /dev/null
+++ b/openpower/transcendentals/discussion.mdwn
@@ -0,0 +1,169 @@
+# Discussion
+
+* <http://bugs.libre-soc.org/show_bug.cgi?id=127>
+* <https://www.khronos.org/registry/spir-v/specs/unified1/OpenCL.ExtendedInstructionSet.100.html>
+* Discussion: <http://lists.libre-riscv.org/pipermail/libre-riscv-dev/2019-August/002342.html>
+* [[power_trans_ops]] for opcode listing.
+
+TODO:
+
+* Decision on accuracy, moved to [[zfpacc_proposal]]
+<http://lists.libre-riscv.org/pipermail/libre-riscv-dev/2019-August/002355.html>
+* Errors **MUST** be repeatable.
+* How about four Platform Specifications? 3DUNIX, UNIX, 3DEmbedded and Embedded?
+<http://lists.libre-riscv.org/pipermail/libre-riscv-dev/2019-August/002361.html>
+  Accuracy requirements for dual (triple) purpose implementations must
+  meet the higher standard.
+* Reciprocal Square-root is in its own separate extension (Zfrsqrt) as
+  it is desirable on its own by other implementors.  This to be evaluated.
+
+# Evaluation and commentary
+
+This section was moved here from [[openpower/transcendentals]].
+
+## Reciprocal
+
+Used to be an alias. Some implementors may wish to implement divide as
+y times recip(x).
+
+Others may have shared hardware for recip and divide, others may not.
+
+To avoid penalising one implementor over another, recip stays.
+
+## To evaluate: should LOG be replaced with LOG1P (and EXP with EXPM1)?
+
+RISC principle says "exclude LOG because it's covered by LOGP1 plus an ADD".
+Research needed to ensure that implementors are not compromised by such
+a decision
+<http://lists.libre-riscv.org/pipermail/libre-riscv-dev/2019-August/002358.html>
+
+> > correctly-rounded LOG will return different results than LOGP1 and ADD.
+> > Likewise for EXP and EXPM1
+
+> ok, they stay in as real opcodes, then.
+
+## ATAN / ATAN2 commentary
+
+Discussion starts here:
+<http://lists.libre-riscv.org/pipermail/libre-riscv-dev/2019-August/002470.html>
+
+from Mitch Alsup:
+
+would like to point out that the general implementations of ATAN2 do a
+bunch of special case checks and then simply call ATAN.
+
+    double ATAN2( double y, double x )
+    {   // IEEE 754-2008 quality ATAN2
+
+        // deal with NANs
+        if( ISNAN( x )             ) return x;
+        if( ISNAN( y )             ) return y;
+
+        // deal with infinities
+        if( x == +∞    && |y|== +∞  ) return copysign(  π/4, y );
+        if( x == +∞                 ) return copysign(  0.0, y );
+        if( x == -∞    && |y|== +∞  ) return copysign( 3π/4, y );
+        if( x == -∞                 ) return copysign(    π, y );
+        if(               |y|== +∞  ) return copysign(  π/2, y );
+
+        // deal with signed zeros
+        if( x == 0.0  &&  y != 0.0 ) return copysign(  π/2, y );
+        if( x >=+0.0  &&  y == 0.0 ) return copysign(  0.0, y );
+        if( x <=-0.0  &&  y == 0.0 ) return copysign(    π, y );
+
+        // calculate ATAN2 textbook style
+        if( x  > 0.0               ) return     ATAN( |y / x| );
+        if( x  < 0.0               ) return π - ATAN( |y / x| );
+    }
+
+
+Yet the proposed encoding makes ATAN2 the primitive and has ATAN invent
+a constant and then call/use ATAN2.
+
+When one considers an implementation of ATAN, one must consider several
+ranges of evaluation::
+
+     x  [  -∞, -1.0]:: ATAN( x ) = -π/2 + ATAN( 1/x );
+     x  (-1.0, +1.0]:: ATAN( x ) =      + ATAN(   x );
+     x  [ 1.0,   +∞]:: ATAN( x ) = +π/2 - ATAN( 1/x );
+
+I should point out that the add/sub of π/2 can not lose significance
+since the result of ATAN(1/x) is bounded 0..π/2
+
+The bottom line is that I think you are choosing to make too many of
+these into OpCodes, making the hardware function/calculation unit (and
+sequencer) more complicated than necessary.
+
+--------------------------------------------------------
+
+We therefore I think have a case for bringing back ATAN and including ATAN2.
+
+The reason is that whilst a microcode-like GPU-centric platform would
+do ATAN2 in terms of ATAN, a UNIX-centric platform would do it the other
+way round.
+
+(that is the hypothesis, to be evaluated for correctness. feedback requested).
+
+This because we cannot compromise or prioritise one platform's
+speed/accuracy over another. That is not reasonable or desirable, to
+penalise one implementor over another.
+
+Thus, all implementors, to keep interoperability, must have both
+opcodes and may choose, at the architectural and routing level, which
+one to implement in terms of the other.
+
+Allowing implementors to choose to add either opcode and let traps sort it
+out leaves an uncertainty in the software developer's mind: they cannot
+trust the hardware, available from many vendors, to be performant right
+across the board.
+
+Standards are a pig.
+
+---
+
+I might suggest that if there were a way for a calculation to be performed
+and the result of that calculation chained to a subsequent calculation
+such that the precision of the result-becomes-operand is wider than
+what will fit in a register, then you can dramatically reduce the count
+of instructions in this category while retaining
+
+acceptable accuracy:
+
+     z = x / y
+
+can be calculated as::
+
+     z = x * (1/y)
+
+Where 1/y has about 26-to-32 bits of fraction. No, it's not IEEE 754-2008
+accurate, but GPUs want speed and
+
+1/y is fully pipelined (F32) while x/y cannot be (at reasonable area). It
+is also not "that inaccurate" displaying 0.625-to-0.52 ULP.
+
+Given that one has the ability to carry (and process) more fraction bits,
+one can then do high precision multiplies of  π or other transcendental
+radixes.
+
+And GPUs have been doing this almost since the dawn of 3D.
+
+    // calculate ATAN2 high performance style
+    // Note: at this point x != y
+    //
+    if( x  > 0.0             )
+    {
+        if( y < 0.0 && |y| < |x| ) return - π/2 - ATAN( x / y );
+        if( y < 0.0 && |y| > |x| ) return       + ATAN( y / x );
+        if( y > 0.0 && |y| < |x| ) return       + ATAN( y / x );
+        if( y > 0.0 && |y| > |x| ) return + π/2 - ATAN( x / y );
+    }
+    if( x  < 0.0             )
+    {
+        if( y < 0.0 && |y| < |x| ) return + π/2 + ATAN( x / y );
+        if( y < 0.0 && |y| > |x| ) return + π   - ATAN( y / x );
+        if( y > 0.0 && |y| < |x| ) return + π   - ATAN( y / x );
+        if( y > 0.0 && |y| > |x| ) return +3π/2 + ATAN( x / y );
+    }
+
+This way the adds and subtracts from the constant are not in a precision
+precarious position.
-- 
2.30.2