X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=openpower%2Fsv%2Fbitmanip.mdwn;h=6b3277bb34954c17f403ecc5b9d86b1faf2cc6fe;hb=ffc445c716a07de767a78fa7a16143a9b7393342;hp=b7a54c7ffcd5b64e3b1d6bbefeec9ed9c11b1909;hpb=bbbd1296e042ffbfd4cb94e4749f106d25d2ac4a;p=libreriscv.git

diff --git a/openpower/sv/bitmanip.mdwn b/openpower/sv/bitmanip.mdwn
index b7a54c7ff..6b3277bb3 100644
--- a/openpower/sv/bitmanip.mdwn
+++ b/openpower/sv/bitmanip.mdwn
@@ -12,6 +12,8 @@
 
 **DRAFT STATUS**
 
+pseudocode: <https://libre-soc.org/openpower/isa/bitmanip/>
+
 this extension amalgamates bitmanipulation primitives from many sources, including RISC-V bitmanip, Packed SIMD, AVX-512 and OpenPOWER VSX.  Vectorisation and SIMD are removed: these are straight scalar (element) operations making them suitable for embedded applications.
 Vectorisation Context is provided by [[openpower/sv]].
 
@@ -33,17 +35,17 @@ Useful resource:
 
 minor opcode allocation
 
-    |  28.30 |31| name      |
-    | ------ |--| --------- |
-    |   00   |0 | ternlogi  |
-    |  000   |1 | ternlog   |
-    |  100   |1 | grevlog   |
-    |  010   |Rc| bitmask   |
-    |  011   |Rc| gf*       |
-    |  101   |1 | ternlogv  |
-    |  101   |0 | ternlogcr |
-    |  110   |Rc| 1/2-op    |
-    |  111   |Rc| 3-op      |
+|  28.30 |31| name      |
+| ------ |--| --------- |
+|  -00   |0 | ternlogi  |
+|  -00   |1 | grevlog   |
+|  -01   |  | grevlogi  |
+|  010   |Rc| bitmask   |
+|  011   |0 | gfbmadd*  |
+|  011   |1 | clmadd*   |
+|  110   |Rc| 1/2-op    |
+|  111   |1 | ternlogv  |
+|  111   |0 | ternlogcr |
 
 1-op and variants
 
@@ -66,36 +68,35 @@ minor opcode allocation
 | RT   | RA   | RB   | type | minmax | 
 | RT   | RA   | RB   |      | av abs avgadd  | 
 | RT   | RA   | RB   | type | vmask ops | 
-| RT   | RA   | RB   |  |  | 
+| RT   | RA   | RB   |      |       | 
 
 3 ops 
 
-* bitmask set/extract
+* grevlog
 * ternlog bitops
-* GF
+* GF mul-add
+* bitmask-reverse
 
 TODO: convert all instructions to use RT and not RS
 
-| 0.5|6.10|11.15|16.20 |21..25   | 26....30 |31| name |
-| -- | -- | --- | ---  | -----   | -------- |--| ------ |
-| NN | RT | RA  | RB   | RC      | mode 000 |1 | ternlog |
-| NN | RT | RA  | RB   | im0-4   | im5-7 00 |0 | ternlogi |
-| NN | RT | RA  | RB   | / im0-3 | 00   100 |1 | grevlog |
-| NN | RT | RA  | s0-5 | s6 im0-3| 01   100 |1 | grevlogi |
-| NN | RT | RA  |      |         | 1-   100 |1 | rsvd |
-| NN | RS | RA  | RB   | RC      | 00  011  |Rc| gfmul |
-| NN | RS | RA  | RB   | RC      | 01  011  |Rc| gfmaddsub |
-| NN | RT | RA  | RB   |         | 10  011  |Rc| rsvd  |
-| NN | RS | RA  | RB   |         | 11  011  |Rc| rsvd  |
-| NN | RS | RA  | RB   |         | 11  111  |Rc| rsvd   |
+| 0.5|6.10|11.15|16.20 |21..25   | 26....30  |31| name |
+| -- | -- | --- | ---  | -----   | --------  |--| ------ |
+| NN | RT | RA  | RB   | im0-4   | im5-7  00 |0 | ternlogi |
+| NN | RT | RA  | RB   | im0-4   | im5-7  00 |1 | grevlog |
+| NN | RT | RA  | s0-4 | im0-4   | im5-7  01 |s5| grevlogi |
+| NN | RS | RA  | RB   | RC      | 00    011 |0 | gfbmadd |
+| NN | RS | RA  | RB   | RC      | 10    011 |0 | gfbmaddsub |
+| NN | RS | RA  | RB   | RC      | 00    011 |1 | clmadd |
+| NN | RS | RA  | RB   | RC      | 10    011 |1 | clmaddsub |
+| NN | RT | RA  | RB   | sh0-4   | sh5 1 011 |Rc| bmrevi |
 
 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31| name |
 | -- | -- | --- | ----- | ---- | ----- |--| ------ |
-| NN | RT | RA  | imm   | mask | 101   |1 | ternlogv |
+| NN | RT | RA  | imm   | mask | 111   |1 | ternlogv |
 
 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31| name |
 | -- | -- | --- | --- |- |-----|----- | -----|--| -------|
-| NN | BA | BB  | BC  |0 |imm  | mask | 101  |0 | ternlogcr |
+| NN | BA | BB  | BC  |0 |imm  | mask | 111  |0 | ternlogcr |
 
 ops (note that av avg and abs as well as vec scalar mask
 are included here)
@@ -114,10 +115,10 @@ double check that instructions didn't need 3 inputs.
 | NN | RA | RB  |     | 1  |   11  | 0100 110 |Rc| rsvd |
 | NN | RA | RB  | sh  | SH | itype | 1000 110 |Rc| bmopsi |
 | NN | RA | RB  |     |    |       | 1100 110 |Rc| rsvd |
-| NN | RT | RA  | RB  | 1  |  00   | 0001 110 |Rc| gfdiv |
-| NN | RT | RA  | RB  | 1  |  01   | 0001 110 |Rc| gfmod |
-| NN | RT | RA  |     | 1  |  10   | 0001 110 |Rc| rsvd |
-| NN | RA | RB  |     | 1  |  11   | 0001 110 |Rc| rsvd |
+| NN | RT | RA  | RB  | 1  |  00   | 0001 110 |Rc| cldiv |
+| NN | RT | RA  | RB  | 1  |  01   | 0001 110 |Rc| clmod |
+| NN | RT | RA  | RB  | 1  |  10   | 0001 110 |Rc|       |
+| NN | RT | RB  | RB  | 1  |  11   | 0001 110 |Rc| clinv |
 | NN | RA | RB  | RC  | 0  |   00  | 0001 110 |Rc| vec sbfm |
 | NN | RA | RB  | RC  | 0  |   01  | 0001 110 |Rc| vec sofm |
 | NN | RA | RB  | RC  | 0  |   10  | 0001 110 |Rc| vec sifm |
@@ -215,7 +216,7 @@ also, another possible variant involving swizzle and vec4:
 
 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
 | -- | -- | --- | ----- | ---- | ----- |--|
-| NN | RT | RA  | imm   | mask | 101   |1 |
+| NN | RT | RA  | imm   | mask | 111   |1 |
 
     for i in range(8):
         idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]   
@@ -229,7 +230,7 @@ another mode selection would be CRs not Ints.
 
 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31|
 | -- | -- | --- | --- |- |-----|----- | -----|--|
-| NN | BA | BB  | BC  |0 |imm  | mask | 101  |0 |
+| NN | BA | BB  | BC  |0 |imm  | mask | 111  |0 |
 
     for i in range(4):
         if not mask[i] continue
@@ -265,7 +266,6 @@ bmset(RA=0, RB=0, RC=mask) will produce a run of ones of length "mask" in a sing
 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31| name  |
 | -- | -- | --- | --- | --- | ------- |--| ----- |
 | NN | RT | RA  | RB  | RC  | mode 010 |Rc| bm*   |
-| NN | RT | RA  | RB  | RC  | 0 1  111 |Rc| bmrev |
 
 
 ```
@@ -317,31 +317,40 @@ uint_xlen_t bmextrev(RA, RB, sh)
 
 | 0.5|6.10|11.15|16.20|21.26| 27..30  |31| name   |
 | -- | -- | --- | --- | --- | ------- |--| ------ |
-| NN | RT | RA  | RB  | sh  | 0   111 |Rc| bmrevi |
+| NN | RT | RA  | RB  | sh  | 1   011 |Rc| bmrevi |
 
 
 # grevlut
 
-generalised reverse combined with a LUT2 and allowing
+generalised reverse combined with a pair of LUT2s and allowing
 zero when RA=0 provides a wide range of instructions
 and a means to set regular 64 bit patterns in one
 32 bit instruction.
 
+the two LUT2s are applied left-half (when not swapping)
+and right-half (when swapping) so as to allow a wider
+range of options
+
+<img src="/openpower/sv/grevlut2x2.jpg" width=700 />
+
 ```
 lut2(imm, a, b):
     idx = b << 1 | a
     return imm[idx] # idx by LSB0 order
 
-dorow(imm, step_i, chunksize):
+dorow(imm8, step_i, chunksize):
     for j in 0 to 63:
+        if (j&chunk_size) == 0
+           imm = imm8[0..3]
+        else
+           imm = imm8[4..7]
         step_o[j] = lut2(imm, step_i[j], step_i[j ^ chunk_size])
     return step_o
 
-uint64_t grevlut64(uint64_t RA, uint64_t RB, uint8 lut2)
+uint64_t grevlut64(uint64_t RA, uint64_t RB, uint8 imm)
 {
     uint64_t x = RA;
     int shamt = RB & 63;
-    int imm = lut2 & 0b1111;
     for i in 0 to 6
         step = 1<<i
         if (shamt & step) x = dorow(imm, x, step)
@@ -514,11 +523,11 @@ uint64_t gorc64(uint64_t RA, uint64_t RB)
 
 # Galois Field 2^M
 
-see <https://courses.csail.mit.edu/6.857/2016/files/ffield.py>
-
-
-https://engineering.purdue.edu/kak/compsec/NewLectures/Lecture7.pdf
+see:
 
+* <https://courses.csail.mit.edu/6.857/2016/files/ffield.py>
+* <https://engineering.purdue.edu/kak/compsec/NewLectures/Lecture7.pdf>
+* <https://foss.heptapod.net/math/libgf2/-/blob/branch/default/src/libgf2/gf2.py>
 
 ## SPRs to set modulo and degree
 
@@ -528,34 +537,29 @@ arithmetic the modulo is to be set in an SPR
 ## Twin Butterfly (Tukey-Cooley) Mul-add-sub
 
 used in combination with SV FFT REMAP to perform
-a full NTT in-place
+a full NTT in-place. possible by having 3-in 2-out,
+to avoid the need for a temp register.  RS is written
+to as well as RT.
 
     gffmadd  RT,RA,RC,RB (Rc=0)
     gffmadd. RT,RA,RC,RB (Rc=1)
 
 Pseudo-code:
 
-    RT <- GFMULADD(RA, RC, RB)
-    RS <- GFMULADD(RA, RC, RB)
+    RT <- GFADD(GFMUL(RA, RC), RB))
+    RS <- GFADD(GFMUL(RA, RC), RB))
 
 
 ## Multiply
 
-this requires 3 parameters and a "degree"
-
-    RT = GFMUL(RA, RB, gfdegree, modulo=RC)
+with the modulo and degree being in an SPR, multiply can be identical
+equivalent to standard integer add
 
-realistically with the degree also needing to be an immediate it should be brought down to an overwrite version:
+    RS = GFMUL(RA, RB)
 
-    RS = GFMUL(RS, RA, gfdegree, modulo=RC)
-    RS = GFMUL(RS, RA, gfdegree=RB, modulo=RC)
-
-| 0.5|6.10|11.15|16.20|21.25| 26..30  |31|
-| -- | -- | --- | --- | --- | ------- |--|
-| NN | RS | RA  | deg | RC  | 00  011 |Rc|
-| NN | RS | RA  | RB  | RC  | 11  011 |Rc|
-
-where the SimpleV variant may override RS-as-src differently from RS-as-dest
+| 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
+| -- | -- | --- | --- | --- | ------ |--|
+| NN | RT | RA  | RB  |11000|  01110 |Rc|
 
 
 
@@ -594,9 +598,11 @@ def multGF2(p1, p2):
     """Multiply two polynomials in GF(2^m)/g(x)"""
     p = 0
     while p2:
+        # standard long-multiplication: check LSB and add
         if p2 & 1:
             p ^= p1
         p1 <<= 1
+        # standard modulo: check MSB and add polynomial
         if p1 & mask1:
             p1 ^= polyred
         p2 >>= 1
@@ -617,42 +623,45 @@ if __name__ == "__main__":
     # Evaluate the product (x^7)(x^7 + x + 1)
     print("{:02x}".format(multGF2(0b10000000, 0b10000011)))
 ```
-## GF div and mod
+
+## GF(2^M) Inverse
 
 ```
-def FullDivision(self, f, v, fDegree, vDegree):
-        """
-        Takes four arguments, f, v, fDegree, and vDegree where
-        fDegree and vDegree are the degrees of the field elements
-        f and v represented as a polynomials.
-        This method returns the field elements a and b such that
+# https://bugs.libre-soc.org/show_bug.cgi?id=782#c33
+# https://ftp.libre-soc.org/ARITH18_Kobayashi.pdf
+def gf_invert(a) :
 
-            f(x) = a(x) * v(x) + b(x).  
+    s = getGF2() # get the full polynomial (including the MSB)
+    r = a
+    v = 0
+    u = 1
+    j = 0
 
-        That is, a is the divisor and b is the remainder, or in
-        other words a is like floor(f/v) and b is like f modulo v.
-        """
+    for i in range(1, 2*degree+1):
+        # could use count-trailing-1s here to skip ahead
+        if r & mask1:          # test MSB of r
+            if s & mask1:      # test MSB of s
+                s ^= r
+                v ^= u
+            s <<= 1            # shift left 1
+            if j == 0:
+                r, s = s, r    # swap r,s
+                u, v = v<<1, u # shift v and swap
+                j = 1
+            else:
+                u >>= 1        # right shift left
+                j -= 1
+        else:
+            r <<= 1            # shift left 1
+            u <<= 1            # shift left 1
+            j += 1
 
-        res, rem = 0, f
-        i = fDegree
-        mask = 1 << i
-        while (i >= vDegree):
-            if (mask & rem):
-                res ^= (1 << (i - vDegree))
-                rem ^= ( v << (i - vDegree)))
-            i -= 1
-            mask >>= 1
-        return (res, rem)
+    return u
 ```
 
-| 0.5|6.10|11.15|16.20|21.25| 26..30  |31| name  |
-| -- | -- | --- | --- | --- | ------- |--| ----- |
-| NN | RS | RA  | deg | RC  | 0 1  011 |Rc| gfaddi |
-| NN | RS | RA  | RB  | RC  | 1 1  111 |Rc| gfadd |
-
-GFMOD is a pseudo-op where RA=0
+# GF2 (Carryless)
 
-## gf invert
+## GF2 (carryless) div and mod
 
 ```
 def gf_degree(a) :
@@ -663,33 +672,39 @@ def gf_degree(a) :
     res += 1;
   return res
 
-def gf_invert(a, mod=0x1B) :
-  v = mod
-  g1 = 1
-  g2 = 0
-  j = gf_degree(a) - 8
-
-  while (a != 1) :
-    if (j < 0) :
-      a, v = v, a
-      g1, g2 = g2, g1
-      j = -j
-
-    a ^= v << j
-    g1 ^= g2 << j
+def FullDivision(self, f, v):
+        """
+        Takes two arguments, f, v
+        fDegree and vDegree are the degrees of the field elements
+        f and v represented as a polynomials.
+        This method returns the field elements a and b such that
 
-    a %= 256  # Emulating 8-bit overflow
-    g1 %= 256 # Emulating 8-bit overflow
+            f(x) = a(x) * v(x) + b(x).  
 
-    j = gf_degree(a) - gf_degree(v)
+        That is, a is the divisor and b is the remainder, or in
+        other words a is like floor(f/v) and b is like f modulo v.
+        """
 
-  return g1
+        fDegree, vDegree = gf_degree(f), gf_degree(v)
+        res, rem = 0, f
+        for i in reversed(range(vDegree, fDegree+1):
+            if ((rem >> i) & 1): # check bit
+                res ^= (1 << (i - vDegree))
+                rem ^= ( v << (i - vDegree)))
+        return (res, rem)
 ```
 
-## carryless mul
+| 0.5|6.10|11.15|16.20| 21 | 22.23 | 24....30 |31| name |
+| -- | -- | --- | --- | -- | ----- | -------- |--| ---- |
+| NN | RT | RA  | RB  | 1  |  00   | 0001 110 |Rc| cldiv |
+| NN | RT | RA  | RB  | 1  |  01   | 0001 110 |Rc| clmod |
+
+## GF2 carryless mul
 
 based on RV bitmanip
-see https://en.wikipedia.org/wiki/CLMUL_instruction_set
+see <https://en.wikipedia.org/wiki/CLMUL_instruction_set> and
+<https://www.felixcloutier.com/x86/pclmulqdq> and
+<https://en.m.wikipedia.org/wiki/Carry-less_product>
 
 these are GF2 operations with the modulo set to 2^degree.
 they are worth adding as their own non-overwrite operations