openpower/sv/bitmanip.mdwn

   1 [[!tag standards]]
   2
   3 # Implementation Log
   4
   5 * ternlogi <https://bugs.libre-soc.org/show_bug.cgi?id=745>
   6 * grev <https://bugs.libre-soc.org/show_bug.cgi?id=755>
   7 * remove Rc=1 from ternlog due to conflicts in encoding as well
   8   as saving space <https://bugs.libre-soc.org/show_bug.cgi?id=753#c5>
   9
  10 # bitmanipulation
  11
  12 **DRAFT STATUS**
  13
  14 this extension amalgamates bitmanipulation primitives from many sources, including RISC-V bitmanip, Packed SIMD, AVX-512 and OpenPOWER VSX.  Vectorisation and SIMD are removed: these are straight scalar (element) operations making them suitable for embedded applications.
  15 Vectorisation Context is provided by [[openpower/sv]].
  16
  17 When combined with SV, scalar variants of bitmanip operations found in VSX are added so that VSX may be retired as "legacy" in the far future (10 to 20 years).  Also, VSX is hundreds of opcodes, requires 128 bit pathways, and is wholly unsuited to low power or embedded scenarios.
  18
  19 ternlogv is experimental and is the only operation that may be considered a "Packed SIMD".  It is added as a variant of the already well-justified ternlog operation (done in AVX512 as an immediate only) "because it looks fun". As it is based on the LUT4 concept it will allow accelerated emulation of FPGAs.  Other vendors of ISAs are buying FPGA companies to achieve similar objectives.
  20
  21 general-purpose Galois Field operations are added so as to avoid huge custom opcode proliferation across many areas of Computer Science.  however for convenience and also to avoid setup costs, some of the more common operations (clmul, crc32) are also added.  The expectation is that these operations would all be covered by the same pipeline.
  22
  23 note that there are brownfield spaces below that could incorporate some of the set-before-first and other scalar operations listed in [[sv/vector_ops]], and
  24 the [[sv/av_opcodes]] as well as [[sv/setvl]]
  25
   26 Useful resources:
  27
  28 * <https://en.wikiversity.org/wiki/Reed%E2%80%93Solomon_codes_for_coders>
  29 * <https://maths-people.anu.edu.au/~brent/pd/rpb232tr.pdf>
  30
  31 # summary
  32
  33 minor opcode allocation
  34
  35     |  28.30 |31| name      |
  36     | ------ |--| --------- |
  37     |   00   |0 | ternlogi  |
  38     |  000   |1 | ternlog   |
  39     |  100   |1 | reserved  |
  40     |  010   |Rc| bitmask   |
  41     |  011   |Rc| gf*       |
  42     |  101   |1 | ternlogv  |
  43     |  101   |0 | ternlogcr |
  44     |  110   |Rc| 1/2-op    |
  45     |  111   |Rc| 3-op      |
  46
  47 1-op and variants
  48
  49 | dest | src1 | subop | op       |
  50 | ---- | ---- | ----- | -------- |
  51 | RT   | RA   | ..    | bmatflip |
  52
  53 2-op and variants
  54
  55 | dest | src1 | src2 | subop | op       |
  56 | ---- | ---- | ---- | ----- | -------- |
  57 | RT   | RA   | RB   | or    | bmatflip |
  58 | RT   | RA   | RB   | xor   | bmatflip |
  59 | RT   | RA   | RB   |       | grev  |
  60 | RT   | RA   | RB   |       | clmul*  |
  61 | RT   | RA   | RB   |       | gorc |
  62 | RT   | RA   | RB   | shuf  | shuffle |
  63 | RT   | RA   | RB   | unshuf| shuffle |
  64 | RT   | RA   | RB   | width | xperm  |
  65 | RT   | RA   | RB   | type | minmax |
  66 | RT   | RA   | RB   |      | av abs avgadd  |
  67 | RT   | RA   | RB   | type | vmask ops |
  68 | RT   | RA   | RB   |  |  |
  69
  70 3 ops
  71
  72 * bitmask set/extract
  73 * ternlog bitops
  74 * GF
  75
  76 | 0.5|6.10|11.15|16.20|21..25 | 26....30 |31| name |
  77 | -- | -- | --- | --- | ----- | -------- |--| ------ |
  78 | NN | RT | RA  | RB  | RC    | mode 000 |1 | ternlog |
  79 | NN | RT | RA  | RB  | im0-4 | im5-7 00 |0 | ternlogi |
  80 | NN | RS | RA  | RB  | RC    | 00  011  |Rc| gfmul |
  81 | NN | RS | RA  | RB  | RC    | 01  011  |Rc| gfadd |
  82 | NN | RT | RA  | RB  | deg   | 10  011  |Rc| gfinv |
  83 | NN | RS | RA  | RB  | deg   | 11  011  |Rc| gfmuli |
  84 | NN | RS | RA  | RB  | deg   | 11  111  |Rc| gfaddi |
  85
  86 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31| name |
  87 | -- | -- | --- | ----- | ---- | ----- |--| ------ |
  88 | NN | RT | RA  | imm   | mask | 101   |1 | ternlogv |
  89
  90 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31| name |
  91 | -- | -- | --- | --- |- |-----|----- | -----|--| -------|
  92 | NN | BA | BB  | BC  |0 |imm  | mask | 101  |0 | ternlogcr |
  93
  94 ops (note that av avg and abs as well as vec scalar mask
  95 are included here)
  96
  97 | 0.5|6.10|11.15|16.20| 21.22 | 23 | 24....30 |31| name |
  98 | -- | -- | --- | --- | ----- | -- | -------- |--| ---- |
  99 | NN | RA | RB  |     |       | 0  | 0000 110 |Rc| rsvd   |
 100 | NN | RA | RB  | RC  | itype | 1  | 0000 110 |Rc| xperm |
 101 | NN | RA | RB  | RC  | itype | 0  | 0100 110 |Rc| minmax |
 102 | NN | RA | RB  | RC  |   00  | 1  | 0100 110 |Rc| av avgadd |
 103 | NN | RA | RB  | RC  |   01  | 1  | 0100 110 |Rc| av abs |
 104 | NN | RA | RB  |     |   10  | 1  | 0100 110 |Rc| rsvd |
 105 | NN | RA | RB  |     |   11  | 1  | 0100 110 |Rc| rsvd |
 106 | NN | RA | RB  | sh  | itype | SH | 1000 110 |Rc| bmopsi |
 107 | NN | RA | RB  |     |       |    | 1100 110 |Rc| rsvd |
 108 | NN | RA | RB  |     |       | 1  | 0001 110 |Rc| rsvd |
 109 | NN | RA | RB  | RC  |   00  | 0  | 0001 110 |Rc| vec sbfm |
 110 | NN | RA | RB  | RC  |   01  | 0  | 0001 110 |Rc| vec sofm |
 111 | NN | RA | RB  | RC  |   10  | 0  | 0001 110 |Rc| vec sifm |
 112 | NN | RA | RB  | RC  |   11  | 0  | 0001 110 |Rc| vec cprop |
 113 | NN | RA | RB  |     |       | 0  | 0101 110 |Rc| rsvd |
 114 | NN | RA | RB  | RC  | 00    | 0  | 0010 110 |Rc| gorc |
 115 | NN | RA | RB  | sh  | 00    | SH | 1010 110 |Rc| gorci |
 116 | NN | RA | RB  | RC  | 00    | 0  | 0110 110 |Rc| gorcw |
 117 | NN | RA | RB  | sh  | 00    | 0  | 1110 110 |Rc| gorcwi |
 118 | NN | RA | RB  | RC  | 00    | 1  | 1110 110 |Rc| bmator  |
 119 | NN | RA | RB  | RC  | 01    | 0  | 0010 110 |Rc| grev |
 120 | NN | RA | RB  | RC  | 01    | 1  | 0010 110 |Rc| clmul |
 121 | NN | RA | RB  | sh  | 01    | SH | 1010 110 |Rc| grevi |
 122 | NN | RA | RB  | RC  | 01    | 0  | 0110 110 |Rc| grevw |
 123 | NN | RA | RB  | sh  | 01    | 0  | 1110 110 |Rc| grevwi |
 124 | NN | RA | RB  | RC  | 01    | 1  | 1110 110 |Rc| bmatxor   |
 125 | NN | RA | RB  | RC  | 10    | 0  | 0010 110 |Rc| shfl |
 126 | NN | RA | RB  | sh  | 10    | SH | 1010 110 |Rc| shfli |
 127 | NN | RA | RB  | RC  | 10    | 0  | 0110 110 |Rc| shflw |
 128 | NN | RA | RB  | RC  | 10    |    | 1110 110 |Rc| rsvd   |
 129 | NN | RA | RB  | RC  | 11    | 0  | 1110 110 |Rc| clmulr  |
 130 | NN | RA | RB  | RC  | 11    | 1  | 1110 110 |Rc| clmulh  |
 131 | NN |    |     |     |       |    | --11 110 |Rc| setvl  |
 132
 133 # bit to byte permute
 134
 135 similar to matrix permute in RV bitmanip, which has XOR and OR variants
 136
 137     do j = 0 to 7
 138       do k = 0 to 7
 139          b = VSR[VRB+32].dword[i].byte[k].bit[j]
 140          VSR[VRT+32].dword[i].byte[j].bit[k] = b
 141
 142 # int min/max
 143
  144 signed and unsigned min/max for integer.  this is sort-of partly synthesisable in [[sv/svp64]] with pred-result as long as the dest reg is one of the sources, but not both signed and unsigned.  when the dest is also one of the sources and the mv fails due to the CR bittest failing this will only overwrite the dest where the src is greater (or less).
 145
 146 signed/unsigned min/max gives more flexibility.
 147
 148 ```
 149 uint_xlen_t min(uint_xlen_t rs1, uint_xlen_t rs2)
 150 { return (int_xlen_t)rs1 < (int_xlen_t)rs2 ? rs1 : rs2;
 151 }
 152 uint_xlen_t max(uint_xlen_t rs1, uint_xlen_t rs2)
 153 { return (int_xlen_t)rs1 > (int_xlen_t)rs2 ? rs1 : rs2;
 154 }
 155 uint_xlen_t minu(uint_xlen_t rs1, uint_xlen_t rs2)
 156 { return rs1 < rs2 ? rs1 : rs2;
 157 }
 158 uint_xlen_t maxu(uint_xlen_t rs1, uint_xlen_t rs2)
 159 { return rs1 > rs2 ? rs1 : rs2;
 160 }
 161 ```
 162
 163
 164 # ternlog bitops
 165
 166 Similar to FPGA LUTs: for every bit perform a lookup into a table using an 8bit immediate, or in another register.
 167
 168 Like the x86 AVX512F [vpternlogd/vpternlogq](https://www.felixcloutier.com/x86/vpternlogd:vpternlogq) instructions.
 169
 170 ## ternlogi
 171
 172 | 0.5|6.10|11.15|16.20| 21..25| 26..30   |31|
 173 | -- | -- | --- | --- | ----- | -------- |--|
 174 | NN | RT | RA  | RB  | im0-4 | im5-7 00 |0 |
 175
 176     for i in range(64):
 177         idx = RT[i] << 2 | RA[i] << 1 | RB[i]
 178         RT[i] = (imm & (1<<idx)) != 0
 179
 180 bits 21..22 may be used to specify a mode, such as treating the whole integer zero/nonzero and putting 1/0 in the result, rather than bitwise test.
 181
 182 ## ternlog
 183
 184 a 4 operand variant which becomes more along the lines of an FPGA:
 185
 186 | 0.5|6.10|11.15|16.20|21.25| 26...30  |31|
 187 | -- | -- | --- | --- | --- | -------- |--|
  188 | NN | RT | RA  | RB  | RC  | mode 000 |1 |
 189
 190     for i in range(64):
 191         idx = RT[i] << 2 | RA[i] << 1 | RB[i]
 192         RT[i] = (RC & (1<<idx)) != 0
 193
 194 mode (2 bit) may be used to do inversion of ordering, similar to carryless mul,
 195 3 modes.
 196
 197 ## ternlogv
 198
 199 also, another possible variant involving swizzle and vec4:
 200
 201 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
 202 | -- | -- | --- | ----- | ---- | ----- |--|
 203 | NN | RT | RA  | imm   | mask | 101   |1 |
 204
 205     for i in range(8):
 206         idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
 207         res = (imm & (1<<idx)) != 0
 208         for j in range(3):
 209              if mask[j]: RT[i+j*8] = res
 210
 211 ## ternlogcr
 212
 213 another mode selection would be CRs not Ints.
 214
 215 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31|
 216 | -- | -- | --- | --- |- |-----|----- | -----|--|
 217 | NN | BA | BB  | BC  |0 |imm  | mask | 101  |0 |
 218
 219     for i in range(4):
 220         if not mask[i] continue
 221         idx = crregs[BA][i] << 2 |
 222               crregs[BB][i] << 1 |
 223               crregs[BC][i]
 224         crregs[BA][i] = (imm & (1<<idx)) != 0
 225
 226 ## cmix
 227
 228 based on RV bitmanip, covered by ternlog bitops
 229
 230 ```
 231 uint_xlen_t cmix(uint_xlen_t RA, uint_xlen_t RB, uint_xlen_t RC) {
 232     return (RA & RB) | (RC & ~RB);
 233 }
 234 ```
 235
 236
 237 # bitmask set
 238
 239 based on RV bitmanip singlebit set, instruction format similar to shift
 240 [[isa/fixedshift]].  bmext is actually covered already (shift-with-mask rldicl but only immediate version).
 241 however bitmask-invert is not, and set/clr are not covered, although they can use the same Shift ALU.
 242
 243 bmext (RB) version is not the same as rldicl because bmext is a right shift by RC, where rldicl is a left rotate.  for the immediate version this does not matter, so a bmexti is not required.
  244 for bmrev, however, there is no direct equivalent, and consequently a bmrevi is required.
 245
 246 bmset (register for mask amount) is particularly useful for creating
 247 predicate masks where the length is a dynamic runtime quantity.
 248 bmset(RA=0, RB=0, RC=mask) will produce a run of ones of length "mask" in a single instruction without needing to initialise or depend on any other registers.
 249
 250 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31| name  |
 251 | -- | -- | --- | --- | --- | ------- |--| ----- |
 252 | NN | RT | RA  | RB  | RC  | mode 010 |Rc| bm*   |
 253 | NN | RT | RA  | RB  | RC  | 0 1  111 |Rc| bmrev |
 254
 255
 256 ```
 257 uint_xlen_t bmset(RA, RB, sh)
 258 {
 259     int shamt = RB & (XLEN - 1);
 260     mask = (2<<sh)-1;
 261     return RA | (mask << shamt);
 262 }
 263
 264 uint_xlen_t bmclr(RA, RB, sh)
 265 {
 266     int shamt = RB & (XLEN - 1);
 267     mask = (2<<sh)-1;
 268     return RA & ~(mask << shamt);
 269 }
 270
 271 uint_xlen_t bminv(RA, RB, sh)
 272 {
 273     int shamt = RB & (XLEN - 1);
 274     mask = (2<<sh)-1;
 275     return RA ^ (mask << shamt);
 276 }
 277
 278 uint_xlen_t bmext(RA, RB, sh)
 279 {
 280     int shamt = RB & (XLEN - 1);
 281     mask = (2<<sh)-1;
 282     return mask & (RA >> shamt);
 283 }
 284 ```
 285
  286 bitmask extract with reverse.  can be done by bit-reversing all of RA and getting bits of RA from the opposite end.
 287
 288 ```
 289 msb = rb[5:0];
 290 rev[0:msb] = ra[msb:0];
 291 rt = ZE(rev[msb:0]);
 292
 293 uint_xlen_t bmextrev(RA, RB, sh)
 294 {
 295     int shamt = (RB & (XLEN - 1));
 296     shamt = (XLEN-1)-shamt;  # shift other end
 297     bra = bitreverse(RA)     # swap LSB-MSB
 298     mask = (2<<sh)-1;
 299     return mask & (bra >> shamt);
 300 }
 301 ```
 302
 303 | 0.5|6.10|11.15|16.20|21.26| 27..30  |31| name   |
 304 | -- | -- | --- | --- | --- | ------- |--| ------ |
 305 | NN | RT | RA  | RB  | sh  | 0   111 |Rc| bmrevi |
 306
 307
 308
 309 # grev
 310
 311 based on RV bitmanip
 312
 313 ```
 314 uint64_t grev64(uint64_t RA, uint64_t RB)
 315 {
 316     uint64_t x = RA;
 317     int shamt = RB & 63;
 318     if (shamt & 1) x = ((x &  0x5555555555555555LL) <<  1) |
 319                         ((x & 0xAAAAAAAAAAAAAAAALL) >>  1);
 320     if (shamt & 2) x = ((x &  0x3333333333333333LL) <<  2) |
 321                         ((x & 0xCCCCCCCCCCCCCCCCLL) >>  2);
 322     if (shamt & 4) x = ((x &  0x0F0F0F0F0F0F0F0FLL) <<  4) |
 323                         ((x & 0xF0F0F0F0F0F0F0F0LL) >>  4);
 324     if (shamt & 8) x = ((x &  0x00FF00FF00FF00FFLL) <<  8) |
 325                         ((x & 0xFF00FF00FF00FF00LL) >>  8);
 326     if (shamt & 16) x = ((x & 0x0000FFFF0000FFFFLL) << 16) |
 327                         ((x & 0xFFFF0000FFFF0000LL) >> 16);
 328     if (shamt & 32) x = ((x & 0x00000000FFFFFFFFLL) << 32) |
 329                         ((x & 0xFFFFFFFF00000000LL) >> 32);
 330     return x;
 331 }
 332
 333 ```
 334
 335 # shuffle / unshuffle
 336
 337 based on RV bitmanip
 338
 339 ```
 340 uint32_t shfl32(uint32_t RA, uint32_t RB)
 341 {
 342     uint32_t x = RA;
 343     int shamt = RB & 15;
 344     if (shamt & 8) x  = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
 345     if (shamt & 4) x  = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
 346     if (shamt & 2) x  = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
 347     if (shamt & 1) x  = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
 348     return x;
 349 }
 350 uint32_t unshfl32(uint32_t RA, uint32_t RB)
 351 {
 352     uint32_t x = RA;
 353     int shamt = RB & 15;
 354     if (shamt & 1) x  = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
 355     if (shamt & 2) x  = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
 356     if (shamt & 4) x  = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
 357     if (shamt & 8) x  = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
 358     return x;
 359 }
 360
 361 uint64_t shuffle64_stage(uint64_t src, uint64_t maskL, uint64_t maskR, int N)
 362 {
 363     uint64_t x = src & ~(maskL | maskR);
 364     x |= ((src << N) & maskL) | ((src >> N) & maskR);
 365     return x;
 366 }
 367 uint64_t shfl64(uint64_t RA, uint64_t RB)
 368 {
 369     uint64_t x = RA;
 370     int shamt = RB & 31;
 371     if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
 372                                            0x00000000ffff0000LL, 16);
 373     if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
 374                                            0x0000ff000000ff00LL, 8);
 375     if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
 376                                            0x00f000f000f000f0LL, 4);
 377     if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
 378                                            0x0c0c0c0c0c0c0c0cLL, 2);
 379     if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
 380                                            0x2222222222222222LL, 1);
 381     return x;
 382 }
 383 uint64_t unshfl64(uint64_t RA, uint64_t RB)
 384 {
 385     uint64_t x = RA;
 386     int shamt = RB & 31;
 387     if (shamt &  1) x = shuffle64_stage(x, 0x4444444444444444LL,
 388                                            0x2222222222222222LL, 1);
 389     if (shamt &  2) x = shuffle64_stage(x, 0x3030303030303030LL,
 390                                            0x0c0c0c0c0c0c0c0cLL, 2);
 391     if (shamt &  4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
 392                                            0x00f000f000f000f0LL, 4);
 393     if (shamt &  8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
 394                                            0x0000ff000000ff00LL, 8);
 395     if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
 396                                            0x00000000ffff0000LL, 16);
 397     return x;
 398 }
 399 ```
 400
 401 # xperm
 402
 403 based on RV bitmanip
 404
 405 ```
 406 uint_xlen_t xperm(uint_xlen_t RA, uint_xlen_t RB, int sz_log2)
 407 {
 408     uint_xlen_t r = 0;
 409     uint_xlen_t sz = 1LL << sz_log2;
 410     uint_xlen_t mask = (1LL << sz) - 1;
 411     for (int i = 0; i < XLEN; i += sz) {
 412         uint_xlen_t pos = ((RB >> i) & mask) << sz_log2;
 413         if (pos < XLEN)
 414             r |= ((RA >> pos) & mask) << i;
 415     }
 416     return r;
 417 }
 418 uint_xlen_t xperm_n (uint_xlen_t RA, uint_xlen_t RB)
 419 {  return xperm(RA, RB, 2); }
 420 uint_xlen_t xperm_b (uint_xlen_t RA, uint_xlen_t RB)
 421 {  return xperm(RA, RB, 3); }
 422 uint_xlen_t xperm_h (uint_xlen_t RA, uint_xlen_t RB)
 423 {  return xperm(RA, RB, 4); }
 424 uint_xlen_t xperm_w (uint_xlen_t RA, uint_xlen_t RB)
 425 {  return xperm(RA, RB, 5); }
 426 ```
 427
 428 # gorc
 429
 430 based on RV bitmanip
 431
 432 ```
 433 uint32_t gorc32(uint32_t RA, uint32_t RB)
 434 {
 435     uint32_t x = RA;
 436     int shamt = RB & 31;
 437     if (shamt & 1) x |= ((x & 0x55555555) << 1)   |  ((x &  0xAAAAAAAA) >> 1);
 438     if (shamt & 2) x |= ((x & 0x33333333) << 2)   |  ((x &  0xCCCCCCCC) >> 2);
 439     if (shamt & 4) x |= ((x & 0x0F0F0F0F) << 4)   |  ((x &  0xF0F0F0F0) >> 4);
 440     if (shamt & 8) x |= ((x & 0x00FF00FF) << 8)   |  ((x &  0xFF00FF00) >> 8);
 441     if (shamt & 16) x |= ((x & 0x0000FFFF) << 16) |  ((x &  0xFFFF0000) >> 16);
 442     return x;
 443 }
 444 uint64_t gorc64(uint64_t RA, uint64_t RB)
 445 {
 446     uint64_t x = RA;
 447     int shamt = RB & 63;
 448     if (shamt & 1) x |= ((x & 0x5555555555555555LL)   <<   1) |
 449                          ((x & 0xAAAAAAAAAAAAAAAALL)  >>  1);
 450     if (shamt & 2) x |= ((x & 0x3333333333333333LL)   <<   2) |
 451                          ((x & 0xCCCCCCCCCCCCCCCCLL)  >>  2);
 452     if (shamt & 4) x |= ((x & 0x0F0F0F0F0F0F0F0FLL)   <<   4) |
 453                          ((x & 0xF0F0F0F0F0F0F0F0LL)  >>  4);
 454     if (shamt & 8) x |= ((x & 0x00FF00FF00FF00FFLL)   <<   8) |
 455                          ((x & 0xFF00FF00FF00FF00LL)  >>  8);
 456     if (shamt & 16) x |= ((x & 0x0000FFFF0000FFFFLL)  << 16) |
 457                          ((x & 0xFFFF0000FFFF0000LL)  >> 16);
 458     if (shamt & 32) x |= ((x & 0x00000000FFFFFFFFLL)  << 32) |
 459                          ((x & 0xFFFFFFFF00000000LL)  >> 32);
 460     return x;
 461 }
 462
 463 ```
 464
 465 # Galois Field
 466
 467 see <https://courses.csail.mit.edu/6.857/2016/files/ffield.py>
 468
 469 ## Multiply
 470
 471 this requires 3 parameters and a "degree"
 472
 473     RT = GFMUL(RA, RB, gfdegree, modulo=RC)
 474
 475 realistically with the degree also needing to be an immediate it should be brought down to an overwrite version:
 476
 477     RS = GFMUL(RS, RA, gfdegree, modulo=RC)
 478     RS = GFMUL(RS, RA, gfdegree=RB, modulo=RC)
 479
 480 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31|
 481 | -- | -- | --- | --- | --- | ------- |--|
 482 | NN | RS | RA  | deg | RC  | 00  011 |Rc|
 483 | NN | RS | RA  | RB  | RC  | 11  011 |Rc|
 484
 485 where the SimpleV variant may override RS-as-src differently from RS-as-dest
 486
 487
 488
 489 ```
 490 from functools import reduce
 491
 492 # constants used in the multGF2 function
 493 mask1 = mask2 = polyred = None
 494
 495 def setGF2(degree, irPoly):
 496     """Define parameters of binary finite field GF(2^m)/g(x)
 497        - degree: extension degree of binary field
 498        - irPoly: coefficients of irreducible polynomial g(x)
 499     """
 500     def i2P(sInt):
 501         """Convert an integer into a polynomial"""
 502         return [(sInt >> i) & 1
 503                 for i in reversed(range(sInt.bit_length()))]
 504
 505     global mask1, mask2, polyred
 506     mask1 = mask2 = 1 << degree
 507     mask2 -= 1
 508     polyred = reduce(lambda x, y: (x << 1) + y, i2P(irPoly)[1:])
 509
 510 def multGF2(p1, p2):
 511     """Multiply two polynomials in GF(2^m)/g(x)"""
 512     p = 0
 513     while p2:
 514         if p2 & 1:
 515             p ^= p1
 516         p1 <<= 1
 517         if p1 & mask1:
 518             p1 ^= polyred
 519         p2 >>= 1
 520     return p & mask2
 521
 522 if __name__ == "__main__":
 523
 524     # Define binary field GF(2^3)/x^3 + x + 1
 525     setGF2(3, 0b1011)
 526
 527     # Evaluate the product (x^2 + x + 1)(x^2 + 1)
 528     print("{:02x}".format(multGF2(0b111, 0b101)))
 529
 530     # Define binary field GF(2^8)/x^8 + x^4 + x^3 + x + 1
 531     # (used in the Advanced Encryption Standard-AES)
 532     setGF2(8, 0b100011011)
 533
 534     # Evaluate the product (x^7)(x^7 + x + 1)
 535     print("{:02x}".format(multGF2(0b10000000, 0b10000011)))
 536 ```
 537 ## GF add
 538
 539     RS = GFADDI(RS, RA|0, gfdegree, modulo=RC)
 540     RS = GFADD(RS, RA|0, gfdegree=RB, modulo=RC)
 541
 542 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31| name  |
 543 | -- | -- | --- | --- | --- | ------- |--| ----- |
 544 | NN | RS | RA  | deg | RC  | 0 1  011 |Rc| gfaddi |
 545 | NN | RS | RA  | RB  | RC  | 1 1  111 |Rc| gfadd |
 546
 547 GFMOD is a pseudo-op where RA=0
 548
 549 ## gf invert
 550
 551 ```
 552 def gf_degree(a) :
 553   res = 0
 554   a >>= 1
 555   while (a != 0) :
 556     a >>= 1;
 557     res += 1;
 558   return res
 559
 560 def gf_invert(a, mod=0x1B) :
 561   v = mod
 562   g1 = 1
 563   g2 = 0
 564   j = gf_degree(a) - 8
 565
 566   while (a != 1) :
 567     if (j < 0) :
 568       a, v = v, a
 569       g1, g2 = g2, g1
 570       j = -j
 571
 572     a ^= v << j
 573     g1 ^= g2 << j
 574
 575     a %= 256  # Emulating 8-bit overflow
 576     g1 %= 256 # Emulating 8-bit overflow
 577
 578     j = gf_degree(a) - gf_degree(v)
 579
 580   return g1
 581 ```
 582
 583 ## carryless mul
 584
 585 based on RV bitmanip
  586 see <https://en.wikipedia.org/wiki/CLMUL_instruction_set>
 587
 588 these are GF2 operations with the modulo set to 2^degree.
 589 they are worth adding as their own non-overwrite operations
 590 (in the same pipeline).
 591
 592 ```
 593 uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
 594 {
 595     uint_xlen_t x = 0;
 596     for (int i = 0; i < XLEN; i++)
 597         if ((RB >> i) & 1)
 598             x ^= RA << i;
 599     return x;
 600 }
 601 uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)
 602 {
 603     uint_xlen_t x = 0;
 604     for (int i = 1; i < XLEN; i++)
 605         if ((RB >> i) & 1)
 606             x ^= RA >> (XLEN-i);
 607     return x;
 608 }
 609 uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)
 610 {
 611     uint_xlen_t x = 0;
 612     for (int i = 0; i < XLEN; i++)
 613         if ((RB >> i) & 1)
 614             x ^= RA >> (XLEN-i-1);
 615     return x;
 616 }
 617 ```
 618
 619 # bitmatrix
 620
 621 ```
 622 uint64_t bmatflip(uint64_t RA)
 623 {
 624     uint64_t x = RA;
 625     x = shfl64(x, 31);
 626     x = shfl64(x, 31);
 627     x = shfl64(x, 31);
 628     return x;
 629 }
 630 uint64_t bmatxor(uint64_t RA, uint64_t RB)
 631 {
 632     // transpose of RB
 633     uint64_t RBt = bmatflip(RB);
 634     uint8_t u[8]; // rows of RA
 635     uint8_t v[8]; // cols of RB
 636     for (int i = 0; i < 8; i++) {
 637         u[i] = RA >> (i*8);
 638         v[i] = RBt >> (i*8);
 639     }
 640     uint64_t x = 0;
 641     for (int i = 0; i < 64; i++) {
 642         if (pcnt(u[i / 8] & v[i % 8]) & 1)
 643             x |= 1LL << i;
 644     }
 645     return x;
 646 }
 647 uint64_t bmator(uint64_t RA, uint64_t RB)
 648 {
 649     // transpose of RB
 650     uint64_t RBt = bmatflip(RB);
 651     uint8_t u[8]; // rows of RA
 652     uint8_t v[8]; // cols of RB
 653     for (int i = 0; i < 8; i++) {
 654         u[i] = RA >> (i*8);
 655         v[i] = RBt >> (i*8);
 656     }
 657     uint64_t x = 0;
 658     for (int i = 0; i < 64; i++) {
 659         if ((u[i / 8] & v[i % 8]) != 0)
 660             x |= 1LL << i;
 661     }
 662     return x;
 663 }
 664
 665 ```
 666
 667 # Already in POWER ISA
 668
 669 ## count leading/trailing zeros with mask
 670
 671 in v3.1 p105
 672
 673 ```
 674 count = 0
 675 do i = 0 to 63 if((RB)i=1) then do
 676 if((RS)i=1) then break end end count ← count + 1
 677 RA ← EXTZ64(count)
 678 ```
 679
 680 ##  bit deposit
 681
  682 vpdepd VRT,VRA,VRB, identical to RV bitmanip bdep, found already in v3.1 p106
 683
 684     do while(m < 64)
 685        if VSR[VRB+32].dword[i].bit[63-m]=1 then do
 686           result = VSR[VRA+32].dword[i].bit[63-k]
 687           VSR[VRT+32].dword[i].bit[63-m] = result
 688           k = k + 1
 689        m = m + 1
 690
 691 ```
 692
 693 uint_xlen_t bdep(uint_xlen_t RA, uint_xlen_t RB)
 694 {
 695     uint_xlen_t r = 0;
 696     for (int i = 0, j = 0; i < XLEN; i++)
 697         if ((RB >> i) & 1) {
 698             if ((RA >> j) & 1)
 699                 r |= uint_xlen_t(1) << i;
 700             j++;
 701         }
 702     return r;
 703 }
 704
 705 ```
 706
 707 # bit extract
 708
 709 other way round: identical to RV bext, found in v3.1 p196
 710
 711 ```
 712 uint_xlen_t bext(uint_xlen_t RA, uint_xlen_t RB)
 713 {
 714     uint_xlen_t r = 0;
 715     for (int i = 0, j = 0; i < XLEN; i++)
 716         if ((RB >> i) & 1) {
 717             if ((RA >> i) & 1)
 718                 r |= uint_xlen_t(1) << j;
 719             j++;
 720         }
 721     return r;
 722 }
 723 ```
 724
 725 # centrifuge
 726
 727 found in v3.1 p106 so not to be added here
 728
 729 ```
 730 ptr0 = 0
 731 ptr1 = 0
 732 do i = 0 to 63
 733     if((RB)i=0) then do
 734        resultptr0 = (RS)i
 735     end
 736     ptr0 = ptr0 + 1
 737     if((RB)63-i==1) then do
 738         result63-ptr1 = (RS)63-i
 739     end
 740     ptr1 = ptr1 + 1
 741 RA = result
 742 ```
 743