X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=openpower%2Fsv%2Fbitmanip.mdwn;h=6b3277bb34954c17f403ecc5b9d86b1faf2cc6fe;hb=ffc445c716a07de767a78fa7a16143a9b7393342;hp=b7a54c7ffcd5b64e3b1d6bbefeec9ed9c11b1909;hpb=bbbd1296e042ffbfd4cb94e4749f106d25d2ac4a;p=libreriscv.git diff --git a/openpower/sv/bitmanip.mdwn b/openpower/sv/bitmanip.mdwn index b7a54c7ff..6b3277bb3 100644 --- a/openpower/sv/bitmanip.mdwn +++ b/openpower/sv/bitmanip.mdwn @@ -12,6 +12,8 @@ **DRAFT STATUS** +pseudocode: + this extension amalgamates bitmanipulation primitives from many sources, including RISC-V bitmanip, Packed SIMD, AVX-512 and OpenPOWER VSX. Vectorisation and SIMD are removed: these are straight scalar (element) operations making them suitable for embedded applications. Vectorisation Context is provided by [[openpower/sv]]. @@ -33,17 +35,17 @@ Useful resource: minor opcode allocation - | 28.30 |31| name | - | ------ |--| --------- | - | 00 |0 | ternlogi | - | 000 |1 | ternlog | - | 100 |1 | grevlog | - | 010 |Rc| bitmask | - | 011 |Rc| gf* | - | 101 |1 | ternlogv | - | 101 |0 | ternlogcr | - | 110 |Rc| 1/2-op | - | 111 |Rc| 3-op | +| 28.30 |31| name | +| ------ |--| --------- | +| -00 |0 | ternlogi | +| -00 |1 | grevlog | +| -01 | | grevlogi | +| 010 |Rc| bitmask | +| 011 |0 | gfbmadd* | +| 011 |1 | clmadd* | +| 110 |Rc| 1/2-op | +| 111 |1 | ternlogv | +| 111 |0 | ternlogcr | 1-op and variants @@ -66,36 +68,35 @@ minor opcode allocation | RT | RA | RB | type | minmax | | RT | RA | RB | | av abs avgadd | | RT | RA | RB | type | vmask ops | -| RT | RA | RB | | | +| RT | RA | RB | | | 3 ops -* bitmask set/extract +* grevlog * ternlog bitops -* GF +* GF mul-add +* bitmask-reverse TODO: convert all instructions to use RT and not RS -| 0.5|6.10|11.15|16.20 |21..25 | 26....30 |31| name | -| -- | -- | --- | --- | ----- | -------- |--| ------ | -| NN | RT | RA | RB | RC | mode 000 |1 | ternlog | -| NN | RT | RA | RB | im0-4 | im5-7 00 |0 | ternlogi | -| NN | RT | RA | RB | / im0-3 | 00 100 |1 | grevlog | -| NN | RT | RA | s0-5 | s6 im0-3| 01 100 |1 | grevlogi | -| NN | RT | RA | | | 1- 100 |1 | rsvd | -| NN | RS | RA | RB | RC | 00 011 |Rc| gfmul | -| NN | RS | RA | RB | RC | 01 011 |Rc| gfmaddsub | -| NN | RT | RA | RB | | 10 011 |Rc| rsvd | -| NN | RS | RA | RB | | 11 011 |Rc| rsvd | -| NN | RS | RA | RB | | 11 111 |Rc| rsvd | +| 0.5|6.10|11.15|16.20 |21..25 | 26....30 |31| name | +| -- | -- | --- | --- | ----- | -------- |--| ------ | +| NN | RT | RA | RB | im0-4 | im5-7 00 |0 | ternlogi | +| NN | RT | RA | RB | im0-4 | im5-7 00 |1 | grevlog | +| NN | RT | RA | s0-4 | im0-4 | im5-7 01 |s5| grevlogi | +| NN | RS | RA | RB | RC | 00 011 |0 | gfbmadd | +| NN | RS | RA | RB | RC | 10 011 |0 | gfbmaddsub | +| NN | RS | RA | RB | RC | 00 011 |1 | clmadd | +| NN | RS | RA | RB | RC | 10 011 |1 | clmaddsub | +| NN | RT | RA | RB | sh0-4 | sh5 1 011 |Rc| bmrevi | | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31| name | | -- | -- | --- | ----- | ---- | ----- |--| ------ | -| NN | RT | RA | imm | mask | 101 |1 | ternlogv | +| NN | RT | RA | imm | mask | 111 |1 | ternlogv | | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31| name | | -- | -- | --- | --- |- |-----|----- | -----|--| -------| -| NN | BA | BB | BC |0 |imm | mask | 101 |0 | ternlogcr | +| NN | BA | BB | BC |0 |imm | mask | 111 |0 | ternlogcr | ops (note that av avg and abs as well as vec scalar mask are included here) @@ -114,10 +115,10 @@ double check that instructions didn't need 3 inputs. | NN | RA | RB | | 1 | 11 | 0100 110 |Rc| rsvd | | NN | RA | RB | sh | SH | itype | 1000 110 |Rc| bmopsi | | NN | RA | RB | | | | 1100 110 |Rc| rsvd | -| NN | RT | RA | RB | 1 | 00 | 0001 110 |Rc| gfdiv | -| NN | RT | RA | RB | 1 | 01 | 0001 110 |Rc| gfmod | -| NN | RT | RA | | 1 | 10 | 0001 110 |Rc| rsvd | -| NN | RA | RB | | 1 | 11 | 0001 110 |Rc| rsvd | +| NN | RT | RA | RB | 1 | 00 | 0001 110 |Rc| cldiv | +| NN | RT | RA | RB | 1 | 01 | 0001 110 |Rc| clmod | +| NN | RT | RA | RB | 1 | 10 | 0001 110 |Rc| | +| NN | RT | RB | RB | 1 | 11 | 0001 110 |Rc| clinv | | NN | RA | RB | RC | 0 | 00 | 0001 110 |Rc| vec sbfm | | NN | RA | RB | RC | 0 | 01 | 0001 110 |Rc| vec sofm | | NN | RA | RB | RC | 0 | 10 | 0001 110 |Rc| vec sifm | @@ -215,7 +216,7 @@ also, another possible variant involving swizzle and vec4: | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31| | -- | -- | --- | ----- | ---- | ----- |--| -| NN | RT | RA | imm | mask | 101 |1 | +| NN | RT | RA | imm | mask | 111 |1 | for i in range(8): idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i] @@ -229,7 +230,7 @@ another mode selection would be CRs not Ints. | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31| | -- | -- | --- | --- |- |-----|----- | -----|--| -| NN | BA | BB | BC |0 |imm | mask | 101 |0 | +| NN | BA | BB | BC |0 |imm | mask | 111 |0 | for i in range(4): if not mask[i] continue @@ -265,7 +266,6 @@ bmset(RA=0, RB=0, RC=mask) will produce a run of ones of length "mask" in a sing | 0.5|6.10|11.15|16.20|21.25| 26..30 |31| name | | -- | -- | --- | --- | --- | ------- |--| ----- | | NN | RT | RA | RB | RC | mode 010 |Rc| bm* | -| NN | RT | RA | RB | RC | 0 1 111 |Rc| bmrev | ``` @@ -317,31 +317,40 @@ uint_xlen_t bmextrev(RA, RB, sh) | 0.5|6.10|11.15|16.20|21.26| 27..30 |31| name | | -- | -- | --- | --- | --- | ------- |--| ------ | -| NN | RT | RA | RB | sh | 0 111 |Rc| bmrevi | +| NN | RT | RA | RB | sh | 1 011 |Rc| bmrevi | # grevlut -generalised reverse combined with a LUT2 and allowing +generalised reverse combined with a pair of LUT2s and allowing zero when RA=0 provides a wide range of instructions and a means to set regular 64 bit patterns in one 32 bit instruction. +the two LUT2s are applied left-half (when not swapping) +and right-half (when swapping) so as to allow a wider +range of options + + + ``` lut2(imm, a, b): idx = b << 1 | a return imm[idx] # idx by LSB0 order -dorow(imm, step_i, chunksize): +dorow(imm8, step_i, chunksize): for j in 0 to 63: + if (j&chunk_size) == 0 + imm = imm8[0..3] + else + imm = imm8[4..7] step_o[j] = lut2(imm, step_i[j], step_i[j ^ chunk_size]) return step_o -uint64_t grevlut64(uint64_t RA, uint64_t RB, uint8 lut2) +uint64_t grevlut64(uint64_t RA, uint64_t RB, uint8 imm) { uint64_t x = RA; int shamt = RB & 63; - int imm = lut2 & 0b1111; for i in 0 to 6 step = 1< - - -https://engineering.purdue.edu/kak/compsec/NewLectures/Lecture7.pdf +see: +* +* +* ## SPRs to set modulo and degree @@ -528,34 +537,29 @@ arithmetic the modulo is to be set in an SPR ## Twin Butterfly (Tukey-Cooley) Mul-add-sub used in combination with SV FFT REMAP to perform -a full NTT in-place +a full NTT in-place. possible by having 3-in 2-out, +to avoid the need for a temp register. RS is written +to as well as RT. gffmadd RT,RA,RC,RB (Rc=0) gffmadd. RT,RA,RC,RB (Rc=1) Pseudo-code: - RT <- GFMULADD(RA, RC, RB) - RS <- GFMULADD(RA, RC, RB) + RT <- GFADD(GFMUL(RA, RC), RB)) + RS <- GFADD(GFMUL(RA, RC), RB)) ## Multiply -this requires 3 parameters and a "degree" - - RT = GFMUL(RA, RB, gfdegree, modulo=RC) +with the modulo and degree being in an SPR, multiply can be identical +equivalent to standard integer add -realistically with the degree also needing to be an immediate it should be brought down to an overwrite version: + RS = GFMUL(RA, RB) - RS = GFMUL(RS, RA, gfdegree, modulo=RC) - RS = GFMUL(RS, RA, gfdegree=RB, modulo=RC) - -| 0.5|6.10|11.15|16.20|21.25| 26..30 |31| -| -- | -- | --- | --- | --- | ------- |--| -| NN | RS | RA | deg | RC | 00 011 |Rc| -| NN | RS | RA | RB | RC | 11 011 |Rc| - -where the SimpleV variant may override RS-as-src differently from RS-as-dest +| 0.5|6.10|11.15|16.20|21.25| 26..30 |31| +| -- | -- | --- | --- | --- | ------ |--| +| NN | RT | RA | RB |11000| 01110 |Rc| @@ -594,9 +598,11 @@ def multGF2(p1, p2): """Multiply two polynomials in GF(2^m)/g(x)""" p = 0 while p2: + # standard long-multiplication: check LSB and add if p2 & 1: p ^= p1 p1 <<= 1 + # standard modulo: check MSB and add polynomial if p1 & mask1: p1 ^= polyred p2 >>= 1 @@ -617,42 +623,45 @@ if __name__ == "__main__": # Evaluate the product (x^7)(x^7 + x + 1) print("{:02x}".format(multGF2(0b10000000, 0b10000011))) ``` -## GF div and mod + +## GF(2^M) Inverse ``` -def FullDivision(self, f, v, fDegree, vDegree): - """ - Takes four arguments, f, v, fDegree, and vDegree where - fDegree and vDegree are the degrees of the field elements - f and v represented as a polynomials. - This method returns the field elements a and b such that +# https://bugs.libre-soc.org/show_bug.cgi?id=782#c33 +# https://ftp.libre-soc.org/ARITH18_Kobayashi.pdf +def gf_invert(a) : - f(x) = a(x) * v(x) + b(x). + s = getGF2() # get the full polynomial (including the MSB) + r = a + v = 0 + u = 1 + j = 0 - That is, a is the divisor and b is the remainder, or in - other words a is like floor(f/v) and b is like f modulo v. - """ + for i in range(1, 2*degree+1): + # could use count-trailing-1s here to skip ahead + if r & mask1: # test MSB of r + if s & mask1: # test MSB of s + s ^= r + v ^= u + s <<= 1 # shift left 1 + if j == 0: + r, s = s, r # swap r,s + u, v = v<<1, u # shift v and swap + j = 1 + else: + u >>= 1 # right shift left + j -= 1 + else: + r <<= 1 # shift left 1 + u <<= 1 # shift left 1 + j += 1 - res, rem = 0, f - i = fDegree - mask = 1 << i - while (i >= vDegree): - if (mask & rem): - res ^= (1 << (i - vDegree)) - rem ^= ( v << (i - vDegree))) - i -= 1 - mask >>= 1 - return (res, rem) + return u ``` -| 0.5|6.10|11.15|16.20|21.25| 26..30 |31| name | -| -- | -- | --- | --- | --- | ------- |--| ----- | -| NN | RS | RA | deg | RC | 0 1 011 |Rc| gfaddi | -| NN | RS | RA | RB | RC | 1 1 111 |Rc| gfadd | - -GFMOD is a pseudo-op where RA=0 +# GF2 (Carryless) -## gf invert +## GF2 (carryless) div and mod ``` def gf_degree(a) : @@ -663,33 +672,39 @@ def gf_degree(a) : res += 1; return res -def gf_invert(a, mod=0x1B) : - v = mod - g1 = 1 - g2 = 0 - j = gf_degree(a) - 8 - - while (a != 1) : - if (j < 0) : - a, v = v, a - g1, g2 = g2, g1 - j = -j - - a ^= v << j - g1 ^= g2 << j +def FullDivision(self, f, v): + """ + Takes two arguments, f, v + fDegree and vDegree are the degrees of the field elements + f and v represented as a polynomials. + This method returns the field elements a and b such that - a %= 256 # Emulating 8-bit overflow - g1 %= 256 # Emulating 8-bit overflow + f(x) = a(x) * v(x) + b(x). - j = gf_degree(a) - gf_degree(v) + That is, a is the divisor and b is the remainder, or in + other words a is like floor(f/v) and b is like f modulo v. + """ - return g1 + fDegree, vDegree = gf_degree(f), gf_degree(v) + res, rem = 0, f + for i in reversed(range(vDegree, fDegree+1): + if ((rem >> i) & 1): # check bit + res ^= (1 << (i - vDegree)) + rem ^= ( v << (i - vDegree))) + return (res, rem) ``` -## carryless mul +| 0.5|6.10|11.15|16.20| 21 | 22.23 | 24....30 |31| name | +| -- | -- | --- | --- | -- | ----- | -------- |--| ---- | +| NN | RT | RA | RB | 1 | 00 | 0001 110 |Rc| cldiv | +| NN | RT | RA | RB | 1 | 01 | 0001 110 |Rc| clmod | + +## GF2 carryless mul based on RV bitmanip -see https://en.wikipedia.org/wiki/CLMUL_instruction_set +see and + and + these are GF2 operations with the modulo set to 2^degree. they are worth adding as their own non-overwrite operations