(no commit message)

[libreriscv.git] / openpower / sv / bitmanip.mdwn
diff --git a/openpower/sv/bitmanip.mdwn b/openpower/sv/bitmanip.mdwn

index ad120459ef7ef9bae8fea1b8ae0e39a485ef9f07..850a11e9515dc91abbcf009c3bc85969120a0a30 100644 (file)
--- a/openpower/sv/bitmanip.mdwn
+++ b/openpower/sv/bitmanip.mdwn
@@ -1,16 +1,19 @@
  [[!tag standards]]
  
+[[!toc levels=1]]
+
  # Implementation Log
  
  * ternlogi <https://bugs.libre-soc.org/show_bug.cgi?id=745>
  * grev <https://bugs.libre-soc.org/show_bug.cgi?id=755>
  * GF2^M <https://bugs.libre-soc.org/show_bug.cgi?id=782>
  
+
  # bitmanipulation
  
  **DRAFT STATUS**
  
-pseudocode: <https://libre-soc.org/openpower/isa/bitmanip/>
+pseudocode: [[openpower/isa/bitmanip]]
  
  this extension amalgamates bitmanipulation primitives from many sources, including RISC-V bitmanip, Packed SIMD, AVX-512 and OpenPOWER VSX.  Vectorisation and SIMD are removed: these are straight scalar (element) operations making them suitable for embedded applications.
  Vectorisation Context is provided by [[openpower/sv]].
@@ -95,13 +98,10 @@ TODO: convert all instructions to use RT and not RS
  | NN | RT | RA  | RB   | im0-4   | im5-7  00 |1 | grevlog |
  | NN |    |     |      |         | .....  01 |0 | crternlog |
  | NN | RT | RA  | RB   | RC      | mode  010 |Rc| bitmask* |
-| NN | RS | RA  | RB   | RC      | 00    011 |0 | gfbmadd |
-| NN | RS | RA  | RB   | RC      | 00    011 |1 | gfbmaddsub |
-| NN | RS | RA  | RB   | RC      | 01    011 |0 | clmadd |
-| NN | RS | RA  | RB   | RC      | 01    011 |1 | clmaddsub |
-| NN | RS | RA  | RB   | RC      | 10    011 |0 | gfpmadd |
-| NN | RS | RA  | RB   | RC      | 10    011 |1 | gfpmaddsub |
-| NN | RS | RA  | RB   | RC      | 11    011 |  | rsvd |
+| NN |    |     |      |         | 00    011 |  | rsvd |
+| NN |    |     |      |         | 01    011 |  | rsvd |
+| NN |    |     |      |         | 10    011 |  | rsvd |
+| NN |    |     |      |         | 11    011 |Rc| setvl |
  | NN | RT | RA  | RB   | sh0-4   | sh5 1 111 |Rc| bmrevi |
  
  ops (note that av avg and abs as well as vec scalar mask
@@ -123,13 +123,13 @@ double check that instructions didn't need 3 inputs.
  | NN | RA | RB  | RC  | 0  |   01  | 0001 110 |Rc| vec sofm |
  | NN | RA | RB  | RC  | 0  |   10  | 0001 110 |Rc| vec sifm |
  | NN | RA | RB  | RC  | 0  |   11  | 0001 110 |Rc| vec cprop |
+| NN | RT | RA  | RB  | 0  |       | 0101 110 |Rc| rsvd |
  | NN | RT | RA  | RB  | 1  | itype | 0101 110 |Rc| xperm |
-| NN | RA | RB  | RC  | 0  | itype | 0101 110 |Rc| av minmax |
-| NN | RA | RB  | RC  | 1  |   00  | 0101 110 |Rc| av abss |
-| NN | RA | RB  | RC  | 1  |   01  | 0101 110 |Rc| av absu|
-| NN | RA | RB  |     | 1  |   10  | 0101 110 |Rc| av avgadd |
-| NN | RA | RB  |     | 1  |   11  | 0101 110 |Rc| rsvd |
-| NN | RA | RB  |     |    |       | 1001 110 |Rc| rsvd |
+| NN | RA | RB  | RC  | 0  | itype | 1001 110 |Rc| av minmax |
+| NN | RA | RB  | RC  | 1  |   00  | 1001 110 |Rc| av abss |
+| NN | RA | RB  | RC  | 1  |   01  | 1001 110 |Rc| av absu|
+| NN | RA | RB  |     | 1  |   10  | 1001 110 |Rc| av avgadd |
+| NN | RA | RB  |     | 1  |   11  | 1001 110 |Rc| rsvd |
  | NN | RA | RB  |     |    |       | 1101 110 |Rc| rsvd |
  | NN | RA | RB  | RC  | 0  | 00    | 0010 110 |Rc| gorc |
  | NN | RA | RB  | sh  | SH | 00    | 1010 110 |Rc| gorci |
@@ -145,7 +145,7 @@ double check that instructions didn't need 3 inputs.
  | NN | RA | RB  | RC  |    | 10    | --10 110 |Rc| rsvd  |
  | NN | RA | RB  | RC  | 0  | 11    | 1110 110 |Rc| clmulr  |
  | NN | RA | RB  | RC  | 1  | 11    | 1110 110 |Rc| clmulh  |
-| NN |    |     |     |    |       | --11 110 |Rc| setvl  |
+| NN |    |     |     |    |       | --11 110 |Rc| rsvd  |
  
  # ternlog bitops
  
@@ -540,8 +540,63 @@ uint64_t gorc64(uint64_t RA, uint64_t RB)
  }
  
  ```
-
-# Instructions for Carry-less Operations aka. Polynomials with coefficients in `GF(2)`
+# Introduction to Carry-less and GF arithmetic
+
+* obligatory xkcd <https://xkcd.com/2595/>
+
+There are three completely separate types of Galois-Field-based
+arithmetic that we implement which are not well explained even in introductory literature.  A slightly oversimplified explanation
+is followed by more accurate descriptions:
+
+* `GF(2)` carry-less binary arithmetic. this is not actually a Galois Field,
+  but is accidentally referred to as GF(2) - see below as to why.
+* `GF(p)` modulo arithmetic with a Prime number, these are "proper" Galois Fields
+* `GF(2^N)` carry-less binary arithmetic with two limits: modulo a power-of-2
+  (2^N) and a second "reducing" polynomial (similar to a prime number), these
+  are said to be GF(2^N) arithmetic.
+
+further detailed and more precise explanations are provided below
+
+* **Polynomials with coefficients in `GF(2)`**
+  (aka. Carry-less arithmetic -- the `cl*` instructions).
+  This isn't actually a Galois Field, but its coefficients are. This is
+  basically binary integer addition, subtraction, and multiplication like
+  usual, except that carries aren't propagated at all, effectively turning
+  both addition and subtraction into the bitwise xor operation. Division and
+  remainder are defined to match how addition and multiplication works.
+* **Galois Fields with a prime size**
+  (aka. `GF(p)` or Prime Galois Fields -- the `gfp*` instructions).
+  This is basically just the integers mod `p`.
+* **Galois Fields with a power-of-a-prime size**
+  (aka. `GF(p^n)` or `GF(q)` where `q == p^n` for prime `p` and
+  integer `n > 0`).
+  We only implement these for `p == 2`, called Binary Galois Fields
+  (`GF(2^n)` -- the `gfb*` instructions).
+  For any prime `p`, `GF(p^n)` is implemented as polynomials with
+  coefficients in `GF(p)` and degree `< n`, where the polynomials are the
+  remainders of dividing by a specificly chosen polynomial in `GF(p)` called
+  the Reducing Polynomial (we will denote that by `red_poly`). The Reducing
+  Polynomial must be an irreducable polynomial (like primes, but for
+  polynomials), as well as have degree `n`. All `GF(p^n)` for the same `p`
+  and `n` are isomorphic to each other -- the choice of `red_poly` doesn't
+  affect `GF(p^n)`'s mathematical shape, all that changes is the specific
+  polynomials used to implement `GF(p^n)`.
+
+Many implementations and much of the literature do not make a clear
+distinction between these three categories, which makes it confusing
+to understand what their purpose and value is.
+
+* carry-less multiply is extremely common and is used for the ubiquitous
+  CRC32 algorithm. [TODO add many others, helps justify to ISA WG]
+* GF(2^N) forms the basis of Rijndael (the current AES standard) and
+  has significant uses throughout cryptography
+* GF(p) is the basis again of a significant quantity of algorithms
+  (TODO, list them, jacob knows what they are), even though the
+  modulo is limited to be below 64-bit (size of a scalar int)
+
+# Instructions for Carry-less Operations
+
+aka. Polynomials with coefficients in `GF(2)`
  
  Carry-less addition/subtraction is simply XOR, so a `cladd`
  instruction is not provided since the `xor[i]` instruction can be used instead.
@@ -549,7 +604,9 @@ instruction is not provided since the `xor[i]` instruction can be used instead.
  These are operations on polynomials with coefficients in `GF(2)`, with the
  polynomial's coefficients packed into integers with the following algorithm:
  
-[[!inline pagenames="openpower/sv/bitmanip/pack_poly.py" raw="true" feeds="no" actions="yes"]]
+```python
+[[!inline pagenames="gf_reference/pack_poly.py" raw="yes"]]
+```
  
  ## Carry-less Multiply Instructions
  
@@ -563,18 +620,24 @@ They are worth adding as their own non-overwrite operations
  
  ### `clmul` Carry-less Multiply
  
-[[!inline pagenames="openpower/sv/bitmanip/clmul.py" raw="true" feeds="no" actions="yes"]]
+```python
+[[!inline pagenames="gf_reference/clmul.py" raw="yes"]]
+```
  
  ### `clmulh` Carry-less Multiply High
  
-[[!inline pagenames="openpower/sv/bitmanip/clmulh.py" raw="true" feeds="no" actions="yes"]]
+```python
+[[!inline pagenames="gf_reference/clmulh.py" raw="yes"]]
+```
  
  ### `clmulr` Carry-less Multiply (Reversed)
  
  Useful for CRCs. Equivalent to bit-reversing the result of `clmul` on
  bit-reversed inputs.
  
-[[!inline pagenames="openpower/sv/bitmanip/clmulr.py" raw="true" feeds="no" actions="yes"]]
+```python
+[[!inline pagenames="gf_reference/clmulr.py" raw="yes"]]
+```
  
  ## `clmadd` Carry-less Multiply-Add
  
@@ -588,6 +651,17 @@ clmadd RT, RA, RB, RC
  
  ## `cltmadd` Twin Carry-less Multiply-Add (for FFTs)
  
+Used in combination with SV FFT REMAP to perform a full Discrete Fourier
+Transform of Polynomials over GF(2) in-place. Possible by having 3-in 2-out,
+to avoid the need for a temp register. RS is written to as well as RT.
+
+Note: Polynomials over GF(2) are a Ring rather than a Field, so, because the
+definition of the Inverse Discrete Fourier Transform involves calculating a
+multiplicative inverse, which may not exist in every Ring, therefore the
+Inverse Discrete Fourier Transform may not exist. (AFAICT the number of inputs
+to the IDFT must be odd for the IDFT to be defined for Polynomials over GF(2).
+TODO: check with someone who knows for sure if that's correct.)
+
  ```
  cltmadd RT, RA, RB, RC
  ```
@@ -608,7 +682,9 @@ c = (RC)
  `cldivrem` isn't an actual instruction, but is just used in the pseudo-code
  for other instructions.
  
-[[!inline pagenames="openpower/sv/bitmanip/cldivrem.py" raw="true" feeds="no" actions="yes"]]
+```python
+[[!inline pagenames="gf_reference/cldivrem.py" raw="yes"]]
+```
  
  ## `cldiv` Carry-less Division
  
@@ -663,13 +739,17 @@ the LSB set, since otherwise it would be divisible by the polynomial `x`,
  making it reducible, making whatever we're working on no longer a Field.
  Therefore, we can reuse the LSB to indicate `degree == XLEN`.
  
-[[!inline pagenames="openpower/sv/bitmanip/decode_reducing_polynomial.py" raw="true" feeds="no" actions="yes"]]
+```python
+[[!inline pagenames="gf_reference/decode_reducing_polynomial.py" raw="yes"]]
+```
  
  ## `gfbredpoly` -- Set the Reducing Polynomial SPR `GFBREDPOLY`
  
  unless this is an immediate op, `mtspr` is completely sufficient.
  
-[[!inline pagenames="openpower/sv/bitmanip/gfbredpoly.py" raw="true" feeds="no" actions="yes"]]
+```python
+[[!inline pagenames="gf_reference/gfbredpoly.py" raw="yes"]]
+```
  
  ## `gfbmul` -- Binary Galois Field `GF(2^m)` Multiplication
  
@@ -677,7 +757,9 @@ unless this is an immediate op, `mtspr` is completely sufficient.
  gfbmul RT, RA, RB
  ```
  
-[[!inline pagenames="openpower/sv/bitmanip/gfbmul.py" raw="true" feeds="no" actions="yes"]]
+```python
+[[!inline pagenames="gf_reference/gfbmul.py" raw="yes"]]
+```
  
  ## `gfbmadd` -- Binary Galois Field `GF(2^m)` Multiply-Add
  
@@ -685,10 +767,16 @@ gfbmul RT, RA, RB
  gfbmadd RT, RA, RB, RC
  ```
  
-[[!inline pagenames="openpower/sv/bitmanip/gfbmadd.py" raw="true" feeds="no" actions="yes"]]
+```python
+[[!inline pagenames="gf_reference/gfbmadd.py" raw="yes"]]
+```
  
  ## `gfbtmadd` -- Binary Galois Field `GF(2^m)` Twin Multiply-Add (for FFT)
  
+Used in combination with SV FFT REMAP to perform a full `GF(2^m)` Discrete
+Fourier Transform in-place. Possible by having 3-in 2-out, to avoid the need
+for a temp register. RS is written to as well as RT.
+
  ```
  gfbtmadd RT, RA, RB, RC
  ```
@@ -711,17 +799,12 @@ c = (RC)
  gfbinv RT, RA
  ```
  
-[[!inline pagenames="openpower/sv/bitmanip/gfbinv.py" raw="true" feeds="no" actions="yes"]]
-
-# Instructions for Prime Galois Fields `GF(p)`
-
-## Helper algorithms
-
  ```python
-def int_to_gfp(int_value, prime):
-    return int_value % prime # follows Python remainder semantics
+[[!inline pagenames="gf_reference/gfbinv.py" raw="yes"]]
  ```
  
+# Instructions for Prime Galois Fields `GF(p)`
+
  ## `GFPRIME` SPR -- Prime Modulus For `gfp*` Instructions
  
  ## `gfpadd` Prime Galois Field `GF(p)` Addition
@@ -730,8 +813,8 @@ def int_to_gfp(int_value, prime):
  gfpadd RT, RA, RB
  ```
  
-```
-(RT) = int_to_gfp((RA) + (RB), GFPRIME)
+```python
+[[!inline pagenames="gf_reference/gfpadd.py" raw="yes"]]
  ```
  
  the addition happens on infinite-precision integers
@@ -742,8 +825,8 @@ the addition happens on infinite-precision integers
  gfpsub RT, RA, RB
  ```
  
-```
-(RT) = int_to_gfp((RA) - (RB), GFPRIME)
+```python
+[[!inline pagenames="gf_reference/gfpsub.py" raw="yes"]]
  ```
  
  the subtraction happens on infinite-precision integers
@@ -754,8 +837,8 @@ the subtraction happens on infinite-precision integers
  gfpmul RT, RA, RB
  ```
  
-```
-(RT) = int_to_gfp((RA) * (RB), GFPRIME)
+```python
+[[!inline pagenames="gf_reference/gfpmul.py" raw="yes"]]
  ```
  
  the multiplication happens on infinite-precision integers
@@ -769,11 +852,9 @@ gfpinv RT, RA
  Some potential hardware implementations are found in:
  <https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.90.5233&rep=rep1&type=pdf>
  
+```python
+[[!inline pagenames="gf_reference/gfpinv.py" raw="yes"]]
  ```
-(RT) = gfpinv((RA), GFPRIME)
-```
-
-the multiplication happens on infinite-precision integers
  
  ## `gfpmadd` Prime Galois Field `GF(p)` Multiply-Add
  
@@ -781,8 +862,8 @@ the multiplication happens on infinite-precision integers
  gfpmadd RT, RA, RB, RC
  ```
  
-```
-(RT) = int_to_gfp((RA) * (RB) + (RC), GFPRIME)
+```python
+[[!inline pagenames="gf_reference/gfpmadd.py" raw="yes"]]
  ```
  
  the multiplication and addition happens on infinite-precision integers
@@ -793,8 +874,8 @@ the multiplication and addition happens on infinite-precision integers
  gfpmsub RT, RA, RB, RC
  ```
  
-```
-(RT) = int_to_gfp((RA) * (RB) - (RC), GFPRIME)
+```python
+[[!inline pagenames="gf_reference/gfpmsub.py" raw="yes"]]
  ```
  
  the multiplication and subtraction happens on infinite-precision integers
@@ -805,14 +886,19 @@ the multiplication and subtraction happens on infinite-precision integers
  gfpmsubr RT, RA, RB, RC
  ```
  
-```
-(RT) = int_to_gfp((RC) - (RA) * (RB), GFPRIME)
+```python
+[[!inline pagenames="gf_reference/gfpmsubr.py" raw="yes"]]
  ```
  
  the multiplication and subtraction happens on infinite-precision integers
  
  ## `gfpmaddsubr` Prime Galois Field `GF(p)` Multiply-Add and Multiply-Sub-Reversed (for FFT)
  
+Used in combination with SV FFT REMAP to perform
+a full Number-Theoretic-Transform in-place. Possible by having 3-in 2-out,
+to avoid the need for a temp register. RS is written
+to as well as RT.
+
  ```
  gfpmaddsubr RT, RA, RB, RC
  ```
@@ -820,120 +906,15 @@ gfpmaddsubr RT, RA, RB, RC
  TODO: add link to explanation for where `RS` comes from.
  
  ```
-product = (RA) * (RB)
+factor1 = (RA)
+factor2 = (RB)
  term = (RC)
-(RT) = int_to_gfp(product + term, GFPRIME)
-(RS) = int_to_gfp(term - product, GFPRIME)
+# read all inputs before writing to any outputs in case
+# an input overlaps with an output register.
+(RT) = gfpmadd(factor1, factor2, term)
+(RS) = gfpmsubr(factor1, factor2, term)
  ```
  
-the multiplication, addition, and subtraction happens on infinite-precision integers
-
-## Twin Butterfly (Tukey-Cooley) Mul-add-sub
-
-used in combination with SV FFT REMAP to perform
-a full NTT in-place. possible by having 3-in 2-out,
-to avoid the need for a temp register.  RS is written
-to as well as RT.
-
-    gffmadd  RT,RA,RC,RB (Rc=0)
-    gffmadd. RT,RA,RC,RB (Rc=1)
-
-Pseudo-code:
-
-    RT <- GFADD(GFMUL(RA, RC), RB))
-    RS <- GFADD(GFMUL(RA, RC), RB))
-
-
-## Multiply
-
-with the modulo and degree being in an SPR, multiply can be identical
-equivalent to standard integer add
-
-    RS = GFMUL(RA, RB)
-
-| 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
-| -- | -- | --- | --- | --- | ------ |--|
-| NN | RT | RA  | RB  |11000|  01110 |Rc|
-
-
-
-```
-from functools import reduce
-
-def gf_degree(a) :
-  res = 0
-  a >>= 1
-  while (a != 0) :
-    a >>= 1;
-    res += 1;
-  return res
-
-# constants used in the multGF2 function
-mask1 = mask2 = polyred = None
-
-def setGF2(irPoly):
-    """Define parameters of binary finite field GF(2^m)/g(x)
-       - irPoly: coefficients of irreducible polynomial g(x)
-    """
-    # degree: extension degree of binary field
-    degree = gf_degree(irPoly)
-
-    def i2P(sInt):
-        """Convert an integer into a polynomial"""
-        return [(sInt >> i) & 1
-                for i in reversed(range(sInt.bit_length()))]    
-    
-    global mask1, mask2, polyred
-    mask1 = mask2 = 1 << degree
-    mask2 -= 1
-    polyred = reduce(lambda x, y: (x << 1) + y, i2P(irPoly)[1:])
-        
-def multGF2(p1, p2):
-    """Multiply two polynomials in GF(2^m)/g(x)"""
-    p = 0
-    while p2:
-        # standard long-multiplication: check LSB and add
-        if p2 & 1:
-            p ^= p1
-        p1 <<= 1
-        # standard modulo: check MSB and add polynomial
-        if p1 & mask1:
-            p1 ^= polyred
-        p2 >>= 1
-    return p & mask2
-
-if __name__ == "__main__":
-  
-    # Define binary field GF(2^3)/x^3 + x + 1
-    setGF2(0b1011) # degree 3
-
-    # Evaluate the product (x^2 + x + 1)(x^2 + 1)
-    print("{:02x}".format(multGF2(0b111, 0b101)))
-    
-    # Define binary field GF(2^8)/x^8 + x^4 + x^3 + x + 1
-    # (used in the Advanced Encryption Standard-AES)
-    setGF2(0b100011011) # degree 8
-    
-    # Evaluate the product (x^7)(x^7 + x + 1)
-    print("{:02x}".format(multGF2(0b10000000, 0b10000011)))
-```
-
-## carryless Twin Butterfly (Tukey-Cooley) Mul-add-sub
-
-used in combination with SV FFT REMAP to perform
-a full NTT in-place. possible by having 3-in 2-out,
-to avoid the need for a temp register.  RS is written
-to as well as RT.
-
-    clfmadd  RT,RA,RC,RB (Rc=0)
-    clfmadd. RT,RA,RC,RB (Rc=1)
-
-Pseudo-code:
-
-    RT <- CLMUL(RA, RC) ^ RB
-    RS <- CLMUL(RA, RC) ^ RB
-
-
  # bitmatrix
  
  ```
@@ -997,7 +978,7 @@ RA ← EXTZ64(count)
  
  ## bit deposit
  
-vpdepd VRT,VRA,VRB, identical to RV bitmamip bdep, found already in v3.1 p106
+pdepd VRT,VRA,VRB, identical to RV bitmamip bdep, found already in v3.1 p106
  
      do while(m < 64)
         if VSR[VRB+32].dword[i].bit[63-m]=1 then do
@@ -1022,9 +1003,9 @@ uint_xlen_t bdep(uint_xlen_t RA, uint_xlen_t RB)
  
  ```
  
-# bit extract
+## bit extract
  
-other way round: identical to RV bext, found in v3.1 p196
+other way round: identical to RV bext: pextd, found in v3.1 p196
  
  ```
  uint_xlen_t bext(uint_xlen_t RA, uint_xlen_t RB)
@@ -1040,7 +1021,7 @@ uint_xlen_t bext(uint_xlen_t RA, uint_xlen_t RB)
  }
  ```
  
-# centrifuge
+## centrifuge
  
  found in v3.1 p106 so not to be added here
  
@@ -1059,13 +1040,18 @@ do i = 0 to 63
  RA = result
  ```
  
-# bit to byte permute
+## bit to byte permute
  
  similar to matrix permute in RV bitmanip, which has XOR and OR variants,
-these perform a transpose.
+these perform a transpose. TODO this looks VSX is there a scalar variant
+in v3.0/1 already
  
      do j = 0 to 7
        do k = 0 to 7
           b = VSR[VRB+32].dword[i].byte[k].bit[j]
           VSR[VRT+32].dword[i].byte[j].bit[k] = b
  
+# Appendix
+
+see [[bitmanip/appendix]]
+