openpower/sv/bitmanip.mdwn

   1 [[!tag standards]]
   2
   3 # bitmanipulation
   4
   5 **DRAFT STATUS**
   6
   7 this extension amalgamates bitmanipulation primitives from many sources, including RISC-V bitmanip, Packed SIMD, AVX-512 and OpenPOWER VSX.  Vectorisation and SIMD are removed: these are straight scalar (element) operations making them suitable for embedded applications.
   8 Vectorisation Context is provided by [[openpower/sv]].
   9
  10 Scaoar variants of bitmanip oerations found in VSX are added so that VSX may be retired as "legacy" in the far future (10 to 20 years).  Also, because VSX is hundreds of opcodes, requires 128 bit pathways, and is wholly unsuited to low power or embedded scenarios.
  11
  12 ternaryv is experimental and is the only operation that may be considered a "Packed SIMD".  It is added as a variant of the already well-justified ternary operation (done in AVX512 as an immediate only) "because it looks fun". As it is based on the LUT4 concept it will allow accelerated emulation of FPGAs.  Other vendors of ISAs are buying FPGA companies to achieve a similar objective.
  13
  14 general-purpose Galois Field operations are added so as to avoid huge opcode proliferation across many areas of Computer Science.  however for convenience and also to avoid setup costs, some of the more common operations (clmul, crc32) are also added.  The expectation is that these operations would all be covered by the same pipeline.
  15
  16 note that there are brownfield spaces below that could incorporate some of the set-before-first and other scalar operations listed in [[sv/vector_ops]],
  17 the [[sv/av_opcodes]] and [[sv/cr_int_predication]]
  18
  19 # summary
  20
  21 minor opcode allocation
  22
  23     |  28.30 |31| name      |
  24     | ------ |--| --------- |
  25     |   00   |Rc| ternaryi  |
  26     |  001   |Rc| ternary   |
  27     |  010   |Rc| bitmask   |
  28     |  011   |Rc| gf*       |
  29     |  101   |1 | ternaryv  |
  30     |  101   |0 | ternarycr |
  31     |  110   |Rc| 1/2-op    |
  32     |  111   |Rc| 3-op      |
  33
  34 1-op and variants
  35
  36 | dest | src1 | subop | op       |
  37 | ---- | ---- | ----- | -------- |
  38 | RT   | RA   | ..    | bmatflip |
  39
  40 2-op and variants
  41
  42 | dest | src1 | src2 | subop | op       |
  43 | ---- | ---- | ---- | ----- | -------- |
  44 | RT   | RA   | RB   | or    | bmatflip |
  45 | RT   | RA   | RB   | xor   | bmatflip |
  46 | RT   | RA   | RB   | bdep  | dep/ext  |
  47 | RT   | RA   | RB   | bext  | dep/ext  |
  48 | RT   | RA   | RB   |       | grev  |
  49 | RT   | RA   | RB   |       | clmul*  |
  50 | RT   | RA   | RB   |       | gorc |
  51 | RT   | RA   | RB   | shuf  | shuffle |
  52 | RT   | RA   | RB   | unshuf| shuffle |
  53 | RT   | RA   | RB   | width | xperm  |
  54 | RT   | RA   | RB   | type | minmax |
  55 | RT   | RA   | RB   |  |  |
  56 | RT   | RA   | RB   |  |  |
  57 | RT   | RA   | RB   |  |  |
  58
  59 3 ops
  60
  61 * bitmask set/extract
  62 * ternary bitops
  63 * GF
  64
  65 | 0.5|6.10|11.15|16.20|21..25 | 26....30 |31| name |
  66 | -- | -- | --- | --- | ----- | -------- |--| ------ |
  67 | NN | RT | RA  | RB  | RC    | mode 001 |Rc| ternary |
  68 | NN | RT | RA  | RB  | im0-4 | im5-7 00 |Rc| ternaryi |
  69 | NN | RS | RA  | RB  | RC    | 00  011  |Rc| gfmul |
  70 | NN | RS | RA  | RB  | RC    | 01  011  |Rc| gfadd |
  71 | NN | RT | RA  | RB  | deg   | 10  011  |Rc| gfinv |
  72 | NN | RS | RA  | RB  | deg   | 11  011  |Rc| gfmuli |
  73 | NN | RS | RA  | RB  | deg   | 11  111  |Rc| gfaddi |
  74
  75 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31| name |
  76 | -- | -- | --- | ----- | ---- | ----- |--| ------ |
  77 | NN | RT | RA  | imm   | mask | 101   |1 | ternaryv |
  78
  79 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31| name |
  80 | -- | -- | --- | --- |- |-----|----- | -----|--| -------|
  81 | NN | BA | BB  | BC  |0 |imm  | mask | 101  |0 | ternarycr |
  82
  83 ops
  84
  85 | 0.5|6.10|11.15|16.20| 21.22 | 23 | 24....30 |31| name |
  86 | -- | -- | --- | --- | ----- | -- | -------- |--| ---- |
  87 | NN | RA | RB  |     |       | 0  | 0000 110 |Rc| rsvd   |
  88 | NN | RA | RB  | RC  | itype | 1  | 0000 110 |Rc| xperm |
  89 | NN | RA | RB  | RC  | itype | 0  | 0100 110 |Rc| minmax |
  90 | NN | RA | RB  |     |       | 1  | 0100 110 |Rc| rsvd |
  91 | NN | RA | RB  | sh  | itype | SH | 1000 110 |Rc| bmopsi |
  92 | NN | RA | RB  |     |       |    | 1100 110 |Rc| rsvd |
  93 | NN | RA | RB  |     |       |    | 1100 110 |Rc| rsvd |
  94 | NN | RA | RB  |     |       |    | 1100 110 |Rc| rsvd |
  95 | NN | RA | RB  |     |       |    | 1100 110 |Rc| rsvd |
  96 | NN | RA | RB  |     |       | 0  | 0001 110 |Rc| rsvd |
  97 | NN | RA | RB  |     |       | 0  | 0101 110 |Rc| rsvd |
  98 | NN | RA | RB  | RC  | 00    | 0  | 0010 110 |Rc| gorc |
  99 | NN | RA | RB  | sh  | 00    | SH | 1010 110 |Rc| gorci |
 100 | NN | RA | RB  | RC  | 00    | 0  | 0110 110 |Rc| gorcw |
 101 | NN | RA | RB  | sh  | 00    | 0  | 1110 110 |Rc| gorcwi |
 102 | NN | RA | RB  | RC  | 00    | 1  | 1110 110 |Rc| bmator  |
 103 | NN | RA | RB  | RC  | 01    | 0  | 0010 110 |Rc| grev |
 104 | NN | RA | RB  | RC  | 01    | 1  | 0010 110 |Rc| clmul |
 105 | NN | RA | RB  | sh  | 01    | SH | 1010 110 |Rc| grevi |
 106 | NN | RA | RB  | RC  | 01    | 0  | 0110 110 |Rc| grevw |
 107 | NN | RA | RB  | sh  | 01    | 0  | 1110 110 |Rc| grevwi |
 108 | NN | RA | RB  | RC  | 01    | 1  | 1110 110 |Rc| bmatxor   |
 109 | NN | RA | RB  | RC  | 10    | 0  | 0010 110 |Rc| shfl |
 110 | NN | RA | RB  | sh  | 10    | SH | 1010 110 |Rc| shfli |
 111 | NN | RA | RB  | RC  | 10    | 0  | 0110 110 |Rc| shflw |
 112 | NN | RA | RB  | RC  | 10    | 0  | 1110 110 |Rc| bdep   |
 113 | NN | RA | RB  | RC  | 10    | 1  | 1110 110 |Rc| bext  |
 114 | NN | RA | RB  | RC  | 11    | 0  | 1110 110 |Rc| clmulr  |
 115 | NN | RA | RB  | RC  | 11    | 1  | 1110 110 |Rc| clmulh  |
 116 | NN | RA | RB  |     |       |    | NN11 110 |Rc| rsvd  |
 117
 118 # count leading/trailing zeros with mask
 119
 120 in v3.1 p105
 121
 122 ```
 123 count = 0
 124 do i = 0 to 63 if((RB)i=1) then do
 125 if((RS)i=1) then break end end count ← count + 1
 126 RA ← EXTZ64(count)
 127 ```
 128
 129 # bit to byte permute
 130
 131 similar to matrix permute in RV bitmanip, which has XOR and OR variants
 132
 133     do j = 0 to 7
 134       do k = 0 to 7
 135          b = VSR[VRB+32].dword[i].byte[k].bit[j]
 136          VSR[VRT+32].dword[i].byte[j].bit[k] = b
 137
 138 #  bit deposit
 139
 140 vpdepd VRT,VRA,VRB, identical to RV bitmamip bdep, found already in v3.1 p106
 141
 142     do while(m < 64)
 143        if VSR[VRB+32].dword[i].bit[63-m]=1 then do
 144           result = VSR[VRA+32].dword[i].bit[63-k]
 145           VSR[VRT+32].dword[i].bit[63-m] = result
 146           k = k + 1
 147        m = m + 1
 148
 149 ```
 150
 151 uint_xlen_t bdep(uint_xlen_t RA, uint_xlen_t RB)
 152 {
 153     uint_xlen_t r = 0;
 154     for (int i = 0, j = 0; i < XLEN; i++)
 155         if ((RB >> i) & 1) {
 156             if ((RA >> j) & 1)
 157                 r |= uint_xlen_t(1) << i;
 158             j++;
 159         }
 160     return r;
 161 }
 162
 163 ```
 164
 165 # bit extract
 166
 167 other way round: identical to RV bext, found in v3.1 p196
 168
 169 ```
 170 uint_xlen_t bext(uint_xlen_t RA, uint_xlen_t RB)
 171 {
 172     uint_xlen_t r = 0;
 173     for (int i = 0, j = 0; i < XLEN; i++)
 174         if ((RB >> i) & 1) {
 175             if ((RA >> i) & 1)
 176                 r |= uint_xlen_t(1) << j;
 177             j++;
 178         }
 179     return r;
 180 }
 181 ```
 182
 183 # centrifuge
 184
 185 found in v3.1 p106
 186
 187 ```
 188 ptr0 = 0
 189 ptr1 = 0
 190 do i = 0 to 63
 191     if((RB)i=0) then do
 192        resultptr0 = (RS)i
 193     end
 194     ptr0 = ptr0 + 1
 195     if((RB)63-i==1) then do
 196         result63-ptr1 = (RS)63-i
 197     end
 198     ptr1 = ptr1 + 1
 199 RA = result
 200 ```
 201
 202 # int min/max
 203
 204 signed and unsigned min/max for integer.  this is sort-of partly synthesiseable in [[sv/svp64]] with pred-result as long as the dest reg is one of the sources, but not both signed and unsigned.  when the dest is also one of the srces and the mv fails due to the CR bittest failing this will only overwrite the dest where the src is greater (or less).
 205
 206 signed/unsigned min/max gives more flexibility.
 207
 208 ```
 209 uint_xlen_t min(uint_xlen_t rs1, uint_xlen_t rs2)
 210 { return (int_xlen_t)rs1 < (int_xlen_t)rs2 ? rs1 : rs2;
 211 }
 212 uint_xlen_t max(uint_xlen_t rs1, uint_xlen_t rs2)
 213 { return (int_xlen_t)rs1 > (int_xlen_t)rs2 ? rs1 : rs2;
 214 }
 215 uint_xlen_t minu(uint_xlen_t rs1, uint_xlen_t rs2)
 216 { return rs1 < rs2 ? rs1 : rs2;
 217 }
 218 uint_xlen_t maxu(uint_xlen_t rs1, uint_xlen_t rs2)
 219 { return rs1 > rs2 ? rs1 : rs2;
 220 }
 221 ```
 222
 223
 224 # ternary bitops
 225
 226 Similar to FPGA LUTs: for every bit perform a lookup into a table using an 8bit immediate, or in another register
 227
 228 | 0.5|6.10|11.15|16.20| 21..25| 26..30   |31|
 229 | -- | -- | --- | --- | ----- | -------- |--|
 230 | NN | RT | RA  | RB  | im0-4 | im5-7 00 |Rc|
 231
 232     for i in range(64):
 233         idx = RT[i] << 2 | RA[i] << 1 | RB[i]
 234         RT[i] = (imm & (1<<idx)) != 0
 235
 236 bits 21..22 may be used to specify a mode, such as treating the whole integer zero/nonzero and putting 1/0 in the result, rather than bitwise test.
 237
 238 a 4 operand variant which becomes more along the lines of an FPGA:
 239
 240 | 0.5|6.10|11.15|16.20|21.25| 26...30  |31|
 241 | -- | -- | --- | --- | --- | -------- |--|
 242 | NN | RT | RA  | RB  | RC  | mode 001 |Rc|
 243
 244     for i in range(64):
 245         idx = RT[i] << 2 | RA[i] << 1 | RB[i]
 246         RT[i] = (RC & (1<<idx)) != 0
 247
 248 mode (2 bit) may be used to do inversion of ordering, similar to carryless mul,
 249 3 modes.
 250
 251 also, another possible variant involving swizzle and vec4:
 252
 253 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
 254 | -- | -- | --- | ----- | ---- | ----- |--|
 255 | NN | RT | RA  | imm   | mask | 101   |1 |
 256
 257     for i in range(8):
 258         idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
 259         res = (imm & (1<<idx)) != 0
 260         for j in range(3):
 261              if mask[j]: RT[i+j*8] = res
 262
 263 another mode selection would be CRs not Ints.
 264
 265 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31|
 266 | -- | -- | --- | --- |- |-----|----- | -----|--|
 267 | NN | BA | BB  | BC  |0 |imm  | mask | 101  |0 |
 268
 269     for i in range(4):
 270         if not mask[i] continue
 271         idx = crregs[BA][i] << 2 |
 272               crregs[BB][i] << 1 |
 273               crregs[BC][i]
 274         crregs[BA][i] = (imm & (1<<idx)) != 0
 275
 276 # bitmask set
 277
 278 based on RV bitmanip singlebit set, instruction format similar to shift
 279 [[isa/fixedshift]].  bmext is actually covered already (shift-with-mask rldicl but only immediate version).
 280 however bitmask-invert is not, and set/clr are not covered, although they can use the same Shift ALU.
 281
 282 bmext (RB) version is not the same as rldicl because bmext is a right shift by RC, where rldicl is a left rotate.  for the immediate version this does not matter, so a bmexti is not required.
 283 bmrev however there is no direct equivalent and consequently a bmrevi is required.
 284
 285 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31| name  |
 286 | -- | -- | --- | --- | --- | ------- |--| ----- |
 287 | NN | RT | RA  | RB  | RC  | mode 010 |Rc| bm*   |
 288 | NN | RT | RA  | RB  | RC  | 0 1  111 |Rc| bmrev |
 289
 290
 291 ```
 292 uint_xlen_t bmset(RA, RB, sh)
 293 {
 294     int shamt = RB & (XLEN - 1);
 295     mask = (2<<sh)-1;
 296     return RA | (mask << shamt);
 297 }
 298
 299 uint_xlen_t bmclr(RA, RB, sh)
 300 {
 301     int shamt = RB & (XLEN - 1);
 302     mask = (2<<sh)-1;
 303     return RA & ~(mask << shamt);
 304 }
 305
 306 uint_xlen_t bminv(RA, RB, sh)
 307 {
 308     int shamt = RB & (XLEN - 1);
 309     mask = (2<<sh)-1;
 310     return RA ^ (mask << shamt);
 311 }
 312
 313 uint_xlen_t bmext(RA, RB, sh)
 314 {
 315     int shamt = RB & (XLEN - 1);
 316     mask = (2<<sh)-1;
 317     return mask & (RA >> shamt);
 318 }
 319 ```
 320
 321 bitmask extract with reverse.  can be done by bitinverting all of RA and getting bits of RA from the opposite end.
 322
 323 ```
 324 msb = rb[5:0];
 325 rev[0:msb] = ra[msb:0];
 326 rt = ZE(rev[msb:0]);
 327
 328 uint_xlen_t bmextrev(RA, RB, sh)
 329 {
 330     int shamt = (RB & (XLEN - 1));
 331     shamt = (XLEN-1)-shamt;  # shift other end
 332     bra = bitreverse(RA)     # swap LSB-MSB
 333     mask = (2<<sh)-1;
 334     return mask & (bra >> shamt);
 335 }
 336 ```
 337
 338 | 0.5|6.10|11.15|16.20|21.26| 27..30  |31| name   |
 339 | -- | -- | --- | --- | --- | ------- |--| ------ |
 340 | NN | RT | RA  | RB  | sh  | 0   111 |Rc| bmrevi |
 341
 342
 343
 344 # grev
 345
 346 based on RV bitmanip
 347
 348 ```
 349 uint64_t grev64(uint64_t RA, uint64_t RB)
 350 {
 351     uint64_t x = RA;
 352     int shamt = RB & 63;
 353     if (shamt & 1) x = ((x &  0x5555555555555555LL) <<  1) |
 354                         ((x & 0xAAAAAAAAAAAAAAAALL) >>  1);
 355     if (shamt & 2) x = ((x &  0x3333333333333333LL) <<  2) |
 356                         ((x & 0xCCCCCCCCCCCCCCCCLL) >>  2);
 357     if (shamt & 4) x = ((x &  0x0F0F0F0F0F0F0F0FLL) <<  4) |
 358                         ((x & 0xF0F0F0F0F0F0F0F0LL) >>  4);
 359     if (shamt & 8) x = ((x &  0x00FF00FF00FF00FFLL) <<  8) |
 360                         ((x & 0xFF00FF00FF00FF00LL) >>  8);
 361     if (shamt & 16) x = ((x & 0x0000FFFF0000FFFFLL) << 16) |
 362                         ((x & 0xFFFF0000FFFF0000LL) >> 16);
 363     if (shamt & 32) x = ((x & 0x00000000FFFFFFFFLL) << 32) |
 364                         ((x & 0xFFFFFFFF00000000LL) >> 32);
 365     return x;
 366 }
 367
 368 ```
 369
 370 # shuffle / unshuffle
 371
 372 based on RV bitmanip
 373
 374 ```
 375 uint32_t shfl32(uint32_t RA, uint32_t RB)
 376 {
 377     uint32_t x = RA;
 378     int shamt = RB & 15;
 379     if (shamt & 8) x  = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
 380     if (shamt & 4) x  = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
 381     if (shamt & 2) x  = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
 382     if (shamt & 1) x  = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
 383     return x;
 384 }
 385 uint32_t unshfl32(uint32_t RA, uint32_t RB)
 386 {
 387     uint32_t x = RA;
 388     int shamt = RB & 15;
 389     if (shamt & 1) x  = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
 390     if (shamt & 2) x  = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
 391     if (shamt & 4) x  = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
 392     if (shamt & 8) x  = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
 393     return x;
 394 }
 395
 396 uint64_t shuffle64_stage(uint64_t src, uint64_t maskL, uint64_t maskR, int N)
 397 {
 398     uint64_t x = src & ~(maskL | maskR);
 399     x |= ((src << N) & maskL) | ((src >> N) & maskR);
 400     return x;
 401 }
 402 uint64_t shfl64(uint64_t RA, uint64_t RB)
 403 {
 404     uint64_t x = RA;
 405     int shamt = RB & 31;
 406     if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
 407                                            0x00000000ffff0000LL, 16);
 408     if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
 409                                            0x0000ff000000ff00LL, 8);
 410     if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
 411                                            0x00f000f000f000f0LL, 4);
 412     if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
 413                                            0x0c0c0c0c0c0c0c0cLL, 2);
 414     if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
 415                                            0x2222222222222222LL, 1);
 416     return x;
 417 }
 418 uint64_t unshfl64(uint64_t RA, uint64_t RB)
 419 {
 420     uint64_t x = RA;
 421     int shamt = RB & 31;
 422     if (shamt &  1) x = shuffle64_stage(x, 0x4444444444444444LL,
 423                                            0x2222222222222222LL, 1);
 424     if (shamt &  2) x = shuffle64_stage(x, 0x3030303030303030LL,
 425                                            0x0c0c0c0c0c0c0c0cLL, 2);
 426     if (shamt &  4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
 427                                            0x00f000f000f000f0LL, 4);
 428     if (shamt &  8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
 429                                            0x0000ff000000ff00LL, 8);
 430     if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
 431                                            0x00000000ffff0000LL, 16);
 432     return x;
 433 }
 434 ```
 435
 436 # xperm
 437
 438 based on RV bitmanip
 439
 440 ```
 441 uint_xlen_t xperm(uint_xlen_t RA, uint_xlen_t RB, int sz_log2)
 442 {
 443     uint_xlen_t r = 0;
 444     uint_xlen_t sz = 1LL << sz_log2;
 445     uint_xlen_t mask = (1LL << sz) - 1;
 446     for (int i = 0; i < XLEN; i += sz) {
 447         uint_xlen_t pos = ((RB >> i) & mask) << sz_log2;
 448         if (pos < XLEN)
 449             r |= ((RA >> pos) & mask) << i;
 450     }
 451     return r;
 452 }
 453 uint_xlen_t xperm_n (uint_xlen_t RA, uint_xlen_t RB)
 454 {  return xperm(RA, RB, 2); }
 455 uint_xlen_t xperm_b (uint_xlen_t RA, uint_xlen_t RB)
 456 {  return xperm(RA, RB, 3); }
 457 uint_xlen_t xperm_h (uint_xlen_t RA, uint_xlen_t RB)
 458 {  return xperm(RA, RB, 4); }
 459 uint_xlen_t xperm_w (uint_xlen_t RA, uint_xlen_t RB)
 460 {  return xperm(RA, RB, 5); }
 461 ```
 462
 463 # gorc
 464
 465 based on RV bitmanip
 466
 467 ```
 468 uint32_t gorc32(uint32_t RA, uint32_t RB)
 469 {
 470     uint32_t x = RA;
 471     int shamt = RB & 31;
 472     if (shamt & 1) x |= ((x & 0x55555555) << 1)   |  ((x &  0xAAAAAAAA) >> 1);
 473     if (shamt & 2) x |= ((x & 0x33333333) << 2)   |  ((x &  0xCCCCCCCC) >> 2);
 474     if (shamt & 4) x |= ((x & 0x0F0F0F0F) << 4)   |  ((x &  0xF0F0F0F0) >> 4);
 475     if (shamt & 8) x |= ((x & 0x00FF00FF) << 8)   |  ((x &  0xFF00FF00) >> 8);
 476     if (shamt & 16) x |= ((x & 0x0000FFFF) << 16) |  ((x &  0xFFFF0000) >> 16);
 477     return x;
 478 }
 479 uint64_t gorc64(uint64_t RA, uint64_t RB)
 480 {
 481     uint64_t x = RA;
 482     int shamt = RB & 63;
 483     if (shamt & 1) x |= ((x & 0x5555555555555555LL)   <<   1) |
 484                          ((x & 0xAAAAAAAAAAAAAAAALL)  >>  1);
 485     if (shamt & 2) x |= ((x & 0x3333333333333333LL)   <<   2) |
 486                          ((x & 0xCCCCCCCCCCCCCCCCLL)  >>  2);
 487     if (shamt & 4) x |= ((x & 0x0F0F0F0F0F0F0F0FLL)   <<   4) |
 488                          ((x & 0xF0F0F0F0F0F0F0F0LL)  >>  4);
 489     if (shamt & 8) x |= ((x & 0x00FF00FF00FF00FFLL)   <<   8) |
 490                          ((x & 0xFF00FF00FF00FF00LL)  >>  8);
 491     if (shamt & 16) x |= ((x & 0x0000FFFF0000FFFFLL)  << 16) |
 492                          ((x & 0xFFFF0000FFFF0000LL)  >> 16);
 493     if (shamt & 32) x |= ((x & 0x00000000FFFFFFFFLL)  << 32) |
 494                          ((x & 0xFFFFFFFF00000000LL)  >> 32);
 495     return x;
 496 }
 497
 498 ```
 499
 500 # cmix
 501
 502 based on RV bitmanip, covered by ternary bitops
 503
 504 ```
 505 uint_xlen_t cmix(uint_xlen_t RA, uint_xlen_t RB, uint_xlen_t RC) {
 506     return (RA & RB) | (RC & ~RB);
 507 }
 508 ```
 509
 510 # carryless mul
 511
 512 based on RV bitmanip
 513 see https://en.wikipedia.org/wiki/CLMUL_instruction_set
 514
 515 ```
 516 uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
 517 {
 518     uint_xlen_t x = 0;
 519     for (int i = 0; i < XLEN; i++)
 520         if ((RB >> i) & 1)
 521             x ^= RA << i;
 522     return x;
 523 }
 524 uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)
 525 {
 526     uint_xlen_t x = 0;
 527     for (int i = 1; i < XLEN; i++)
 528         if ((RB >> i) & 1)
 529             x ^= RA >> (XLEN-i);
 530     return x;
 531 }
 532 uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)
 533 {
 534     uint_xlen_t x = 0;
 535     for (int i = 0; i < XLEN; i++)
 536         if ((RB >> i) & 1)
 537             x ^= RA >> (XLEN-i-1);
 538     return x;
 539 }
 540 ```
 541 # Galois Field
 542
 543 see <https://courses.csail.mit.edu/6.857/2016/files/ffield.py>
 544
 545 ## Multiply
 546
 547 this requires 3 parameters and a "degree"
 548
 549     RT = GFMUL(RA, RB, gfdegree, modulo=RC)
 550
 551 realistically with the degree also needing to be an immediate it should be brought down to an overwrite version:
 552
 553     RS = GFMUL(RS, RA, gfdegree, modulo=RB)
 554     RS = GFMUL(RS, RA, gfdegree=RC, modulo=RB)
 555
 556 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31|
 557 | -- | -- | --- | --- | --- | ------- |--|
 558 | NN | RS | RA  | RB  | deg | 00  011 |Rc|
 559 | NN | RS | RA  | RB  | RC  | 11  011 |Rc|
 560
 561 where the SimpleV variant may override RS-as-src differently from RS-as-dest
 562
 563
 564
 565 ```
 566 from functools import reduce
 567
 568 # constants used in the multGF2 function
 569 mask1 = mask2 = polyred = None
 570
 571 def setGF2(degree, irPoly):
 572     """Define parameters of binary finite field GF(2^m)/g(x)
 573        - degree: extension degree of binary field
 574        - irPoly: coefficients of irreducible polynomial g(x)
 575     """
 576     def i2P(sInt):
 577         """Convert an integer into a polynomial"""
 578         return [(sInt >> i) & 1
 579                 for i in reversed(range(sInt.bit_length()))]
 580
 581     global mask1, mask2, polyred
 582     mask1 = mask2 = 1 << degree
 583     mask2 -= 1
 584     polyred = reduce(lambda x, y: (x << 1) + y, i2P(irPoly)[1:])
 585
 586 def multGF2(p1, p2):
 587     """Multiply two polynomials in GF(2^m)/g(x)"""
 588     p = 0
 589     while p2:
 590         if p2 & 1:
 591             p ^= p1
 592         p1 <<= 1
 593         if p1 & mask1:
 594             p1 ^= polyred
 595         p2 >>= 1
 596     return p & mask2
 597
 598 if __name__ == "__main__":
 599
 600     # Define binary field GF(2^3)/x^3 + x + 1
 601     setGF2(3, 0b1011)
 602
 603     # Evaluate the product (x^2 + x + 1)(x^2 + 1)
 604     print("{:02x}".format(multGF2(0b111, 0b101)))
 605
 606     # Define binary field GF(2^8)/x^8 + x^4 + x^3 + x + 1
 607     # (used in the Advanced Encryption Standard-AES)
 608     setGF2(8, 0b100011011)
 609
 610     # Evaluate the product (x^7)(x^7 + x + 1)
 611     print("{:02x}".format(multGF2(0b10000000, 0b10000011)))
 612 ```
 613 ## GF add
 614
 615     RS = GFADDI(RS, RA|0, gfdegree, modulo=RB)
 616     RS = GFADD(RS, RA|0, gfdegree=RC, modulo=RB)
 617
 618 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31| name  |
 619 | -- | -- | --- | --- | --- | ------- |--| ----- |
 620 | NN | RS | RA  | RB  | deg | 0 1  011 |Rc| gfaddi |
 621 | NN | RS | RA  | RB  | RC  | 1 1  111 |Rc| gfadd |
 622
 623 GFMOD is a pseudo-op where RA=0
 624
 625 ## gf invert
 626
 627 ```
 628 def gf_degree(a) :
 629   res = 0
 630   a >>= 1
 631   while (a != 0) :
 632     a >>= 1;
 633     res += 1;
 634   return res
 635
 636 def gf_invert(a, mod=0x1B) :
 637   v = mod
 638   g1 = 1
 639   g2 = 0
 640   j = gf_degree(a) - 8
 641
 642   while (a != 1) :
 643     if (j < 0) :
 644       a, v = v, a
 645       g1, g2 = g2, g1
 646       j = -j
 647
 648     a ^= v << j
 649     g1 ^= g2 << j
 650
 651     a %= 256  # Emulating 8-bit overflow
 652     g1 %= 256 # Emulating 8-bit overflow
 653
 654     j = gf_degree(a) - gf_degree(v)
 655
 656   return g1
 657 ```
 658
 659 # bitmatrix
 660
 661 ```
 662 uint64_t bmatflip(uint64_t RA)
 663 {
 664     uint64_t x = RA;
 665     x = shfl64(x, 31);
 666     x = shfl64(x, 31);
 667     x = shfl64(x, 31);
 668     return x;
 669 }
 670 uint64_t bmatxor(uint64_t RA, uint64_t RB)
 671 {
 672     // transpose of RB
 673     uint64_t RBt = bmatflip(RB);
 674     uint8_t u[8]; // rows of RA
 675     uint8_t v[8]; // cols of RB
 676     for (int i = 0; i < 8; i++) {
 677         u[i] = RA >> (i*8);
 678         v[i] = RBt >> (i*8);
 679     }
 680     uint64_t x = 0;
 681     for (int i = 0; i < 64; i++) {
 682         if (pcnt(u[i / 8] & v[i % 8]) & 1)
 683             x |= 1LL << i;
 684     }
 685     return x;
 686 }
 687 uint64_t bmator(uint64_t RA, uint64_t RB)
 688 {
 689     // transpose of RB
 690     uint64_t RBt = bmatflip(RB);
 691     uint8_t u[8]; // rows of RA
 692     uint8_t v[8]; // cols of RB
 693     for (int i = 0; i < 8; i++) {
 694         u[i] = RA >> (i*8);
 695         v[i] = RBt >> (i*8);
 696     }
 697     uint64_t x = 0;
 698     for (int i = 0; i < 64; i++) {
 699         if ((u[i / 8] & v[i % 8]) != 0)
 700             x |= 1LL << i;
 701     }
 702     return x;
 703 }
 704
 705 ```