convert table from code block to standard markdown table

[libreriscv.git] / openpower / sv / bitmanip.mdwn
diff --git a/openpower/sv/bitmanip.mdwn b/openpower/sv/bitmanip.mdwn

index b7a54c7ffcd5b64e3b1d6bbefeec9ed9c11b1909..6b3277bb34954c17f403ecc5b9d86b1faf2cc6fe 100644 (file)
--- a/openpower/sv/bitmanip.mdwn
+++ b/openpower/sv/bitmanip.mdwn
@@ -12,6 +12,8 @@
  
  **DRAFT STATUS**
  
+pseudocode: <https://libre-soc.org/openpower/isa/bitmanip/>
+
  this extension amalgamates bitmanipulation primitives from many sources, including RISC-V bitmanip, Packed SIMD, AVX-512 and OpenPOWER VSX.  Vectorisation and SIMD are removed: these are straight scalar (element) operations making them suitable for embedded applications.
  Vectorisation Context is provided by [[openpower/sv]].
  
@@ -33,17 +35,17 @@ Useful resource:
  
  minor opcode allocation
  
-    |  28.30 |31| name      |
-    | ------ |--| --------- |
-    |   00   |0 | ternlogi  |
-    |  000   |1 | ternlog   |
-    |  100   |1 | grevlog   |
-    |  010   |Rc| bitmask   |
-    |  011   |Rc| gf*       |
-    |  101   |1 | ternlogv  |
-    |  101   |0 | ternlogcr |
-    |  110   |Rc| 1/2-op    |
-    |  111   |Rc| 3-op      |
+|  28.30 |31| name      |
+| ------ |--| --------- |
+|  -00   |0 | ternlogi  |
+|  -00   |1 | grevlog   |
+|  -01   |  | grevlogi  |
+|  010   |Rc| bitmask   |
+|  011   |0 | gfbmadd*  |
+|  011   |1 | clmadd*   |
+|  110   |Rc| 1/2-op    |
+|  111   |1 | ternlogv  |
+|  111   |0 | ternlogcr |
  
  1-op and variants
  
@@ -66,36 +68,35 @@ minor opcode allocation
  | RT   | RA   | RB   | type | minmax | 
  | RT   | RA   | RB   |      | av abs avgadd  | 
  | RT   | RA   | RB   | type | vmask ops | 
-| RT   | RA   | RB   |  |  | 
+| RT   | RA   | RB   |      |       | 
  
  3 ops 
  
-* bitmask set/extract
+* grevlog
  * ternlog bitops
-* GF
+* GF mul-add
+* bitmask-reverse
  
  TODO: convert all instructions to use RT and not RS
  
-| 0.5|6.10|11.15|16.20 |21..25   | 26....30 |31| name |
-| -- | -- | --- | ---  | -----   | -------- |--| ------ |
-| NN | RT | RA  | RB   | RC      | mode 000 |1 | ternlog |
-| NN | RT | RA  | RB   | im0-4   | im5-7 00 |0 | ternlogi |
-| NN | RT | RA  | RB   | / im0-3 | 00   100 |1 | grevlog |
-| NN | RT | RA  | s0-5 | s6 im0-3| 01   100 |1 | grevlogi |
-| NN | RT | RA  |      |         | 1-   100 |1 | rsvd |
-| NN | RS | RA  | RB   | RC      | 00  011  |Rc| gfmul |
-| NN | RS | RA  | RB   | RC      | 01  011  |Rc| gfmaddsub |
-| NN | RT | RA  | RB   |         | 10  011  |Rc| rsvd  |
-| NN | RS | RA  | RB   |         | 11  011  |Rc| rsvd  |
-| NN | RS | RA  | RB   |         | 11  111  |Rc| rsvd   |
+| 0.5|6.10|11.15|16.20 |21..25   | 26....30  |31| name |
+| -- | -- | --- | ---  | -----   | --------  |--| ------ |
+| NN | RT | RA  | RB   | im0-4   | im5-7  00 |0 | ternlogi |
+| NN | RT | RA  | RB   | im0-4   | im5-7  00 |1 | grevlog |
+| NN | RT | RA  | s0-4 | im0-4   | im5-7  01 |s5| grevlogi |
+| NN | RS | RA  | RB   | RC      | 00    011 |0 | gfbmadd |
+| NN | RS | RA  | RB   | RC      | 10    011 |0 | gfbmaddsub |
+| NN | RS | RA  | RB   | RC      | 00    011 |1 | clmadd |
+| NN | RS | RA  | RB   | RC      | 10    011 |1 | clmaddsub |
+| NN | RT | RA  | RB   | sh0-4   | sh5 1 011 |Rc| bmrevi |
  
  | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31| name |
  | -- | -- | --- | ----- | ---- | ----- |--| ------ |
-| NN | RT | RA  | imm   | mask | 101   |1 | ternlogv |
+| NN | RT | RA  | imm   | mask | 111   |1 | ternlogv |
  
  | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31| name |
  | -- | -- | --- | --- |- |-----|----- | -----|--| -------|
-| NN | BA | BB  | BC  |0 |imm  | mask | 101  |0 | ternlogcr |
+| NN | BA | BB  | BC  |0 |imm  | mask | 111  |0 | ternlogcr |
  
  ops (note that av avg and abs as well as vec scalar mask
  are included here)
@@ -114,10 +115,10 @@ double check that instructions didn't need 3 inputs.
  | NN | RA | RB  |     | 1  |   11  | 0100 110 |Rc| rsvd |
  | NN | RA | RB  | sh  | SH | itype | 1000 110 |Rc| bmopsi |
  | NN | RA | RB  |     |    |       | 1100 110 |Rc| rsvd |
-| NN | RT | RA  | RB  | 1  |  00   | 0001 110 |Rc| gfdiv |
-| NN | RT | RA  | RB  | 1  |  01   | 0001 110 |Rc| gfmod |
-| NN | RT | RA  |     | 1  |  10   | 0001 110 |Rc| rsvd |
-| NN | RA | RB  |     | 1  |  11   | 0001 110 |Rc| rsvd |
+| NN | RT | RA  | RB  | 1  |  00   | 0001 110 |Rc| cldiv |
+| NN | RT | RA  | RB  | 1  |  01   | 0001 110 |Rc| clmod |
+| NN | RT | RA  | RB  | 1  |  10   | 0001 110 |Rc|       |
+| NN | RT | RB  | RB  | 1  |  11   | 0001 110 |Rc| clinv |
  | NN | RA | RB  | RC  | 0  |   00  | 0001 110 |Rc| vec sbfm |
  | NN | RA | RB  | RC  | 0  |   01  | 0001 110 |Rc| vec sofm |
  | NN | RA | RB  | RC  | 0  |   10  | 0001 110 |Rc| vec sifm |
@@ -215,7 +216,7 @@ also, another possible variant involving swizzle and vec4:
  
  | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
  | -- | -- | --- | ----- | ---- | ----- |--|
-| NN | RT | RA  | imm   | mask | 101   |1 |
+| NN | RT | RA  | imm   | mask | 111   |1 |
  
      for i in range(8):
          idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]   
@@ -229,7 +230,7 @@ another mode selection would be CRs not Ints.
  
  | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31|
  | -- | -- | --- | --- |- |-----|----- | -----|--|
-| NN | BA | BB  | BC  |0 |imm  | mask | 101  |0 |
+| NN | BA | BB  | BC  |0 |imm  | mask | 111  |0 |
  
      for i in range(4):
          if not mask[i] continue
@@ -265,7 +266,6 @@ bmset(RA=0, RB=0, RC=mask) will produce a run of ones of length "mask" in a sing
  | 0.5|6.10|11.15|16.20|21.25| 26..30  |31| name  |
  | -- | -- | --- | --- | --- | ------- |--| ----- |
  | NN | RT | RA  | RB  | RC  | mode 010 |Rc| bm*   |
-| NN | RT | RA  | RB  | RC  | 0 1  111 |Rc| bmrev |
  
  
  ```
@@ -317,31 +317,40 @@ uint_xlen_t bmextrev(RA, RB, sh)
  
  | 0.5|6.10|11.15|16.20|21.26| 27..30  |31| name   |
  | -- | -- | --- | --- | --- | ------- |--| ------ |
-| NN | RT | RA  | RB  | sh  | 0   111 |Rc| bmrevi |
+| NN | RT | RA  | RB  | sh  | 1   011 |Rc| bmrevi |
  
  
  # grevlut
  
-generalised reverse combined with a LUT2 and allowing
+generalised reverse combined with a pair of LUT2s and allowing
  zero when RA=0 provides a wide range of instructions
  and a means to set regular 64 bit patterns in one
  32 bit instruction.
  
+the two LUT2s are applied left-half (when not swapping)
+and right-half (when swapping) so as to allow a wider
+range of options
+
+<img src="/openpower/sv/grevlut2x2.jpg" width=700 />
+
  ```
  lut2(imm, a, b):
      idx = b << 1 | a
      return imm[idx] # idx by LSB0 order
  
-dorow(imm, step_i, chunksize):
+dorow(imm8, step_i, chunksize):
      for j in 0 to 63:
+        if (j&chunk_size) == 0
+           imm = imm8[0..3]
+        else
+           imm = imm8[4..7]
          step_o[j] = lut2(imm, step_i[j], step_i[j ^ chunk_size])
      return step_o
  
-uint64_t grevlut64(uint64_t RA, uint64_t RB, uint8 lut2)
+uint64_t grevlut64(uint64_t RA, uint64_t RB, uint8 imm)
  {
      uint64_t x = RA;
      int shamt = RB & 63;
-    int imm = lut2 & 0b1111;
      for i in 0 to 6
          step = 1<<i
          if (shamt & step) x = dorow(imm, x, step)
@@ -514,11 +523,11 @@ uint64_t gorc64(uint64_t RA, uint64_t RB)
  
  # Galois Field 2^M
  
-see <https://courses.csail.mit.edu/6.857/2016/files/ffield.py>
-
-
-https://engineering.purdue.edu/kak/compsec/NewLectures/Lecture7.pdf
+see:
  
+* <https://courses.csail.mit.edu/6.857/2016/files/ffield.py>
+* <https://engineering.purdue.edu/kak/compsec/NewLectures/Lecture7.pdf>
+* <https://foss.heptapod.net/math/libgf2/-/blob/branch/default/src/libgf2/gf2.py>
  
  ## SPRs to set modulo and degree
  
@@ -528,34 +537,29 @@ arithmetic the modulo is to be set in an SPR
  ## Twin Butterfly (Tukey-Cooley) Mul-add-sub
  
  used in combination with SV FFT REMAP to perform
-a full NTT in-place
+a full NTT in-place. possible by having 3-in 2-out,
+to avoid the need for a temp register.  RS is written
+to as well as RT.
  
      gffmadd  RT,RA,RC,RB (Rc=0)
      gffmadd. RT,RA,RC,RB (Rc=1)
  
  Pseudo-code:
  
-    RT <- GFMULADD(RA, RC, RB)
-    RS <- GFMULADD(RA, RC, RB)
+    RT <- GFADD(GFMUL(RA, RC), RB))
+    RS <- GFADD(GFMUL(RA, RC), RB))
  
  
  ## Multiply
  
-this requires 3 parameters and a "degree"
-
-    RT = GFMUL(RA, RB, gfdegree, modulo=RC)
+with the modulo and degree being in an SPR, multiply can be identical
+equivalent to standard integer add
  
-realistically with the degree also needing to be an immediate it should be brought down to an overwrite version:
+    RS = GFMUL(RA, RB)
  
-    RS = GFMUL(RS, RA, gfdegree, modulo=RC)
-    RS = GFMUL(RS, RA, gfdegree=RB, modulo=RC)
-
-| 0.5|6.10|11.15|16.20|21.25| 26..30  |31|
-| -- | -- | --- | --- | --- | ------- |--|
-| NN | RS | RA  | deg | RC  | 00  011 |Rc|
-| NN | RS | RA  | RB  | RC  | 11  011 |Rc|
-
-where the SimpleV variant may override RS-as-src differently from RS-as-dest
+| 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
+| -- | -- | --- | --- | --- | ------ |--|
+| NN | RT | RA  | RB  |11000|  01110 |Rc|
  
  
  
@@ -594,9 +598,11 @@ def multGF2(p1, p2):
      """Multiply two polynomials in GF(2^m)/g(x)"""
      p = 0
      while p2:
+        # standard long-multiplication: check LSB and add
          if p2 & 1:
              p ^= p1
          p1 <<= 1
+        # standard modulo: check MSB and add polynomial
          if p1 & mask1:
              p1 ^= polyred
          p2 >>= 1
@@ -617,42 +623,45 @@ if __name__ == "__main__":
      # Evaluate the product (x^7)(x^7 + x + 1)
      print("{:02x}".format(multGF2(0b10000000, 0b10000011)))
  ```
-## GF div and mod
+
+## GF(2^M) Inverse
  
  ```
-def FullDivision(self, f, v, fDegree, vDegree):
-        """
-        Takes four arguments, f, v, fDegree, and vDegree where
-        fDegree and vDegree are the degrees of the field elements
-        f and v represented as a polynomials.
-        This method returns the field elements a and b such that
+# https://bugs.libre-soc.org/show_bug.cgi?id=782#c33
+# https://ftp.libre-soc.org/ARITH18_Kobayashi.pdf
+def gf_invert(a) :
  
-            f(x) = a(x) * v(x) + b(x).  
+    s = getGF2() # get the full polynomial (including the MSB)
+    r = a
+    v = 0
+    u = 1
+    j = 0
  
-        That is, a is the divisor and b is the remainder, or in
-        other words a is like floor(f/v) and b is like f modulo v.
-        """
+    for i in range(1, 2*degree+1):
+        # could use count-trailing-1s here to skip ahead
+        if r & mask1:          # test MSB of r
+            if s & mask1:      # test MSB of s
+                s ^= r
+                v ^= u
+            s <<= 1            # shift left 1
+            if j == 0:
+                r, s = s, r    # swap r,s
+                u, v = v<<1, u # shift v and swap
+                j = 1
+            else:
+                u >>= 1        # right shift left
+                j -= 1
+        else:
+            r <<= 1            # shift left 1
+            u <<= 1            # shift left 1
+            j += 1
  
-        res, rem = 0, f
-        i = fDegree
-        mask = 1 << i
-        while (i >= vDegree):
-            if (mask & rem):
-                res ^= (1 << (i - vDegree))
-                rem ^= ( v << (i - vDegree)))
-            i -= 1
-            mask >>= 1
-        return (res, rem)
+    return u
  ```
  
-| 0.5|6.10|11.15|16.20|21.25| 26..30  |31| name  |
-| -- | -- | --- | --- | --- | ------- |--| ----- |
-| NN | RS | RA  | deg | RC  | 0 1  011 |Rc| gfaddi |
-| NN | RS | RA  | RB  | RC  | 1 1  111 |Rc| gfadd |
-
-GFMOD is a pseudo-op where RA=0
+# GF2 (Carryless)
  
-## gf invert
+## GF2 (carryless) div and mod
  
  ```
  def gf_degree(a) :
@@ -663,33 +672,39 @@ def gf_degree(a) :
      res += 1;
    return res
  
-def gf_invert(a, mod=0x1B) :
-  v = mod
-  g1 = 1
-  g2 = 0
-  j = gf_degree(a) - 8
-
-  while (a != 1) :
-    if (j < 0) :
-      a, v = v, a
-      g1, g2 = g2, g1
-      j = -j
-
-    a ^= v << j
-    g1 ^= g2 << j
+def FullDivision(self, f, v):
+        """
+        Takes two arguments, f, v
+        fDegree and vDegree are the degrees of the field elements
+        f and v represented as a polynomials.
+        This method returns the field elements a and b such that
  
-    a %= 256  # Emulating 8-bit overflow
-    g1 %= 256 # Emulating 8-bit overflow
+            f(x) = a(x) * v(x) + b(x).  
  
-    j = gf_degree(a) - gf_degree(v)
+        That is, a is the divisor and b is the remainder, or in
+        other words a is like floor(f/v) and b is like f modulo v.
+        """
  
-  return g1
+        fDegree, vDegree = gf_degree(f), gf_degree(v)
+        res, rem = 0, f
+        for i in reversed(range(vDegree, fDegree+1):
+            if ((rem >> i) & 1): # check bit
+                res ^= (1 << (i - vDegree))
+                rem ^= ( v << (i - vDegree)))
+        return (res, rem)
  ```
  
-## carryless mul
+| 0.5|6.10|11.15|16.20| 21 | 22.23 | 24....30 |31| name |
+| -- | -- | --- | --- | -- | ----- | -------- |--| ---- |
+| NN | RT | RA  | RB  | 1  |  00   | 0001 110 |Rc| cldiv |
+| NN | RT | RA  | RB  | 1  |  01   | 0001 110 |Rc| clmod |
+
+## GF2 carryless mul
  
  based on RV bitmanip
-see https://en.wikipedia.org/wiki/CLMUL_instruction_set
+see <https://en.wikipedia.org/wiki/CLMUL_instruction_set> and
+<https://www.felixcloutier.com/x86/pclmulqdq> and
+<https://en.m.wikipedia.org/wiki/Carry-less_product>
  
  these are GF2 operations with the modulo set to 2^degree.
  they are worth adding as their own non-overwrite operations