bug 1244: update pospopcnt.s assembler comments

[libreriscv.git] / openpower / transcendentals.mdwn
diff --git a/openpower/transcendentals.mdwn b/openpower/transcendentals.mdwn

index f4b7f14b694afa8fd81496480de5e4303cc450d1..9e6b86118b1f2687e3a14d18851165041faa155b 100644 (file)
--- a/openpower/transcendentals.mdwn
+++ b/openpower/transcendentals.mdwn
@@ -42,6 +42,7 @@ TODO: rename extension subsets -- we're not on RISC-V anymore.
    acosh, atanh (can be synthesised - see below)
  * **ZftransAdv**: much more complex to implement in hardware
  * **Zfrsqrt**: Reciprocal square-root.
+* **Zfminmax**: Min/Max.
  
  Minimum recommended requirements for 3D: Zftrans, Ztrignpi,
  Zarctrignpi, with Ztrigpi and Zarctrigpi as augmentations.
@@ -233,24 +234,24 @@ Note (6) 4xf32-only, requires VMX.
  | fpowr        | x power of y (x +ve)                 | FRT = exp(FRA log(FRB))                        | ZftransAdv  |
  | frootn       | x power 1/n (n integer)              | FRT = pow(FRA, 1/RB)                           | ZftransAdv  |
  | fhypot       | hypotenuse                           | FRT = sqrt(FRA^2 + FRB^2)                      | ZftransAdv  |
-| fminnum08    | IEEE 754-2008 minNum                 | FRT = minNum(FRA, FRB)  (1)                    | TBD         |
-| fmaxnum08    | IEEE 754-2008 maxNum                 | FRT = maxNum(FRA, FRB)  (1)                    | TBD         |
-| fmin19       | IEEE 754-2019 minimum                | FRT = minimum(FRA, FRB)                        | TBD         |
-| fmax19       | IEEE 754-2019 maximum                | FRT = maximum(FRA, FRB)                        | TBD         |
-| fminnum19    | IEEE 754-2019 minimumNumber          | FRT = minimumNumber(FRA, FRB)                  | TBD         |
-| fmaxnum19    | IEEE 754-2019 maximumNumber          | FRT = maximumNumber(FRA, FRB)                  | TBD         |
-| fminc        | C ternary-op minimum                 | FRT = FRA \< FRB ? FRA : FRB                   | TBD         |
-| fmaxc        | C ternary-op maximum                 | FRT = FRA > FRB ? FRA : FRB                    | TBD         |
-| fminmagnum08 | IEEE 754-2008 minNumMag              | FRT = minmaxmag(FRA, FRB, False, fminnum08) (2)| TBD         |
-| fmaxmagnum08 | IEEE 754-2008 maxNumMag              | FRT = minmaxmag(FRA, FRB, True, fmaxnum08) (2) | TBD         |
-| fminmag19    | IEEE 754-2019 minimumMagnitude       | FRT = minmaxmag(FRA, FRB, False, fmin19) (2)   | TBD         |
-| fmaxmag19    | IEEE 754-2019 maximumMagnitude       | FRT = minmaxmag(FRA, FRB, True, fmax19) (2)    | TBD         |
-| fminmagnum19 | IEEE 754-2019 minimumMagnitudeNumber | FRT = minmaxmag(FRA, FRB, False, fminnum19) (2)| TBD         |
-| fmaxmagnum19 | IEEE 754-2019 maximumMagnitudeNumber | FRT = minmaxmag(FRA, FRB, True, fmaxnum19) (2) | TBD         |
-| fminmagc     | C ternary-op minimum magnitude       | FRT = minmaxmag(FRA, FRB, False, fminc) (2)    | TBD         |
-| fmaxmagc     | C ternary-op maximum magnitude       | FRT = minmaxmag(FRA, FRB, True, fmaxc) (2)     | TBD         |
-| fmod         | modulus                              | FRT = fmod(FRA, FRB)                           | TBD         |
-| fremainder   | IEEE 754 remainder                   | FRT = remainder(FRA, FRB)                      | TBD         |
+| fminnum08    | IEEE 754-2008 minNum                 | FRT = minNum(FRA, FRB)  (1)                    | Zfminmax    |
+| fmaxnum08    | IEEE 754-2008 maxNum                 | FRT = maxNum(FRA, FRB)  (1)                    | Zfminmax    |
+| fmin19       | IEEE 754-2019 minimum                | FRT = minimum(FRA, FRB)                        | Zfminmax    |
+| fmax19       | IEEE 754-2019 maximum                | FRT = maximum(FRA, FRB)                        | Zfminmax    |
+| fminnum19    | IEEE 754-2019 minimumNumber          | FRT = minimumNumber(FRA, FRB)                  | Zfminmax    |
+| fmaxnum19    | IEEE 754-2019 maximumNumber          | FRT = maximumNumber(FRA, FRB)                  | Zfminmax    |
+| fminc        | C ternary-op minimum                 | FRT = FRA \< FRB ? FRA : FRB                   | Zfminmax    |
+| fmaxc        | C ternary-op maximum                 | FRT = FRA > FRB ? FRA : FRB                    | Zfminmax    |
+| fminmagnum08 | IEEE 754-2008 minNumMag              | FRT = minmaxmag(FRA, FRB, False, fminnum08) (2)| Zfminmax    |
+| fmaxmagnum08 | IEEE 754-2008 maxNumMag              | FRT = minmaxmag(FRA, FRB, True, fmaxnum08) (2) | Zfminmax    |
+| fminmag19    | IEEE 754-2019 minimumMagnitude       | FRT = minmaxmag(FRA, FRB, False, fmin19) (2)   | Zfminmax    |
+| fmaxmag19    | IEEE 754-2019 maximumMagnitude       | FRT = minmaxmag(FRA, FRB, True, fmax19) (2)    | Zfminmax    |
+| fminmagnum19 | IEEE 754-2019 minimumMagnitudeNumber | FRT = minmaxmag(FRA, FRB, False, fminnum19) (2)| Zfminmax    |
+| fmaxmagnum19 | IEEE 754-2019 maximumMagnitudeNumber | FRT = minmaxmag(FRA, FRB, True, fmaxnum19) (2) | Zfminmax    |
+| fminmagc     | C ternary-op minimum magnitude       | FRT = minmaxmag(FRA, FRB, False, fminc) (2)    | Zfminmax    |
+| fmaxmagc     | C ternary-op maximum magnitude       | FRT = minmaxmag(FRA, FRB, True, fmaxc) (2)     | Zfminmax    |
+| fmod         | modulus                              | FRT = fmod(FRA, FRB)                           | ZftransExt  |
+| fremainder   | IEEE 754 remainder                   | FRT = remainder(FRA, FRB)                      | ZftransExt  |
  
  Note (1): for the purposes of minNum/maxNum, -0.0 is defined to be less than +0.0. This is left unspecified in IEEE 754-2008.
  
@@ -327,6 +328,8 @@ the less common subsets are still required for IEEE754 HPC.
  MALI Midgard, an embedded / mobile 3D GPU, for example only has the
  following opcodes:
  
+    28 - fmin
+    2C - fmax
      E8 - fatan_pt2
      F0 - frcp (reciprocal)
      F2 - frsqrt (inverse square root, 1/sqrt(x))
@@ -343,6 +346,7 @@ Vivante Embedded/Mobile 3D (etnaviv
  <https://github.com/laanwj/etna_viv/blob/master/rnndb/isa.xml>) 
  only has the following:
  
+    fmin/fmax (implemented using SELECT)
      sin, cos2pi
      cos, sin2pi
      log2, exp
@@ -354,6 +358,7 @@ It also has fast variants of some of these, as a CSR Mode.
  AMD's R600 GPU (R600\_Instruction\_Set\_Architecture.pdf) and the
  RDNA ISA (RDNA\_Shader\_ISA\_5August2019.pdf, Table 22, Section 6.3) have:
  
+    MIN/MAX/MIN_DX10/MAX_DX10
      COS2PI (appx)
      EXP2
      LOG (IEEE754)
@@ -363,7 +368,7 @@ RDNA ISA (RDNA\_Shader\_ISA\_5August2019.pdf, Table 22, Section 6.3) have:
      SIN2PI (appx)
  
  AMD RDNA has F16 and F32 variants of all the above, and also has F64
-variants of SQRT, RSQRT and RECIP.  It is interesting that even the
+variants of SQRT, RSQRT, MIN, MAX, and RECIP.  It is interesting that even the
  modern high-end AMD GPU does not have TAN or ATAN, where MALI Midgard
  does.
  
@@ -392,7 +397,7 @@ They are therefore considered "base" (essential) transcendentals.
  
  ### ZftransExt
  
-LOG, EXP, EXP10, LOG10, LOGP1, EXP1M
+LOG, EXP, EXP10, LOG10, LOGP1, EXPM1, fmod, fremainder
  
  These are extra transcendental functions that are not generally needed
  for 3D; however, they may be useful for Numerical Computation.
@@ -431,6 +436,11 @@ CBRT, POW, POWN, POWR, ROOTN
  These are simply much more complex to implement in hardware, and typically
  will only be put into HPC applications.
  
+Note that `pow` is commonly used in Blinn-Phong shading (the shading model used
+by OpenGL 1.0 and commonly used by shader authors that need basic 3D graphics
+with specular highlights); however, it can be adequately emulated using
+`pow(b, n) = exp2(n*log2(b))`.
+
  * **Zfrsqrt**: Reciprocal square-root.
  
  ## Trigonometric subsets
@@ -482,6 +492,28 @@ is acceptable for 3D.
  
  Therefore they are their own subset extensions.
  
+### Zfminmax
+
+* fminnum08 fmaxnum08
+* fmin19 fmax19
+* fminnum19 fmaxnum19
+* fminc fmaxc
+* fminmagnum08 fmaxmagnum08
+* fminmag19 fmaxmag19
+* fminmagnum19 fmaxmagnum19
+* fminmagc fmaxmagc
+
+These are commonly used for vector reductions, where having them be a single
+instruction is critical. They are also commonly used in GPU shaders, HPC, and
+general-purpose FP algorithms.
+
+These min and max operations are quite cheap to implement hardware-wise,
+being comparable in cost to fcmp + some muxes. They're all in one extension
+because once you implement some of them, the rest require only slightly more
+hardware complexity.
+
+Therefore they are their own subset extension.
+
  # Synthesis, Pseudo-code ops and macro-ops
  
  The pseudo-ops are best left up to the compiler rather than being actual
@@ -502,6 +534,10 @@ high-performance or correctly-rounding):
  
      ASINH( x ) = ln( x + SQRT(x**2+1))
  
+An emulation of `pow` that is sufficient for 3D Graphics (valid for b > 0):
+
+    pow(b, x) = exp2(x * log2(b))
+
  # Evaluation and commentary
  
  Moved to [[discussion]]