openpower/sv/bitmanip.mdwn

   1 [[!tag standards]]
   2
   3 # bitmanipulation
   4
   5 **DRAFT STATUS**
   6
   7 this extension amalgamates bitmanipulation primitives from many sources, including RISC-V bitmanip, Packed SIMD, AVX-512 and OpenPOWER VSX.  Vectorisation and SIMD are removed: these are straight scalar (element) operations.  Vectorisation Context is provided by [[openpower/sv]].
   8
   9 ternaryv is experimental and is the only operation that may be considered a "Packed SIMD".  It is added as a variant of the already well-justified ternary operation (done in AVX512 as an immediate only) "because it looks fun". As it is based on the LUT4 concept it will allow accelerated emulation of FPGAs.  Other vendors of ISAs are buying FPGA companies to achieve a similar objective.
  10
  11 general-purpose Galois Field operations are added so as to avoid huge opcode proliferation across many areas of Computer Science.  however for convenience and also to avoid setup costs, some of the more common operations (clmul, crc32) are also added.  The expectation is that these operations would all be covered by the same pipeline.
  12
  13 # summary
  14
  15 minor opcode allocation
  16
  17     |  28.30 |31| name      |
  18     | ------ |--| --------- |
  19     |   00   |Rc| ternaryi  |
  20     |  001   |Rc| ternary   |
  21     |  010   |Rc| bitmask   |
  22     |  011   |Rc| gf*       |
  23     |  101   |1 | ternaryv  |
  24     |  101   |0 | ternarycr |
  25     |  110   |Rc| 1/2-op    |
  26     |  111   |Rc| 3-op      |
  27
  28 1-op and variants
  29
  30 | dest | src1 | subop | op       |
  31 | ---- | ---- | ----- | -------- |
  32 | RT   | RA   | ..    | bmatflip |
  33
  34 2-op and variants
  35
  36 | dest | src1 | src2 | subop | op       |
  37 | ---- | ---- | ---- | ----- | -------- |
  38 | RT   | RA   | RB   | or    | bmatflip |
  39 | RT   | RA   | RB   | xor   | bmatflip |
  40 | RT   | RA   | RB   | bdep  | dep/ext  |
  41 | RT   | RA   | RB   | bext  | dep/ext  |
  42 | RT   | RA   | RB   |       | grev  |
  43 | RT   | RA   | RB   |       | clmul*  |
  44 | RT   | RA   | RB   |       | gorc |
  45 | RT   | RA   | RB   | shuf  | shuffle |
  46 | RT   | RA   | RB   | unshuf| shuffle |
  47 | RT   | RA   | RB   | width | xperm  |
  48 | RT   | RA   | RB   | type | minmax |
  49 | RT   | RA   | RB   |  |  |
  50 | RT   | RA   | RB   |  |  |
  51 | RT   | RA   | RB   |  |  |
  52
  53 3 ops
  54
  55 * bitmask set/extract
  56 * ternary bitops
  57 * GF
  58
  59 | 0.5|6.10|11.15|16.20|21..25 | 26....30 |31| name |
  60 | -- | -- | --- | --- | ----- | -------- |--| ------ |
  61 | NN | RT | RA  | RB  | RC    | mode 001 |Rc| ternary |
  62 | NN | RT | RA  | RB  | im0-4 | im5-7 00 |Rc| ternaryi |
  63 | NN | RS | RA  | RB  | RC    | 00  011  |Rc| gfmul |
  64 | NN | RS | RA  | RB  | RC    | 01  011  |Rc| gfadd |
  65 | NN | RT | RA  | RB  | deg   | 10  011  |Rc| gfinv |
  66 | NN | RS | RA  | RB  | deg   | 11  011  |Rc| gfmuli |
  67 | NN | RS | RA  | RB  | deg   | 11  111  |Rc| gfaddi |
  68
  69 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31| name |
  70 | -- | -- | --- | ----- | ---- | ----- |--| ------ |
  71 | NN | RT | RA  | imm   | mask | 101   |1 | ternaryv |
  72
  73 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31| name |
  74 | -- | -- | --- | --- |- |-----|----- | -----|--| -------|
  75 | NN | BA | BB  | BC  |0 |imm  | mask | 101  |0 | ternarycr |
  76
  77 ops
  78
  79 | 0.5|6.10|11.15|16.20| 21.22 | 23 | 24....30 |31| name |
  80 | -- | -- | --- | --- | ----- | -- | -------- |--| ---- |
  81 | NN | RA | RB  |     |       | 0  | 0000 110 |Rc| rsvd   |
  82 | NN | RA | RB  | RC  | itype | 1  | 0000 110 |Rc| xperm |
  83 | NN | RA | RB  | RC  | itype | 0  | 0100 110 |Rc| minmax |
  84 | NN | RA | RB  |     |       | 1  | 0100 110 |Rc| rsvd |
  85 | NN | RA | RB  | sh  | itype | SH | 1000 110 |Rc| bmopsi |
  86 | NN | RA | RB  |     |       |    | 1100 110 |Rc| rsvd |
  87 | NN | RA | RB  |     |       |    | 1100 110 |Rc| rsvd |
  88 | NN | RA | RB  |     |       |    | 1100 110 |Rc| rsvd |
  89 | NN | RA | RB  |     |       |    | 1100 110 |Rc| rsvd |
  90 | NN | RA | RB  |     |       | 0  | 0001 110 |Rc| rsvd |
  91 | NN | RA | RB  |     |       | 0  | 0101 110 |Rc| rsvd |
  92 | NN | RA | RB  | RC  | 00    | 0  | 0010 110 |Rc| gorc |
  93 | NN | RA | RB  | sh  | 00    | SH | 1010 110 |Rc| gorci |
  94 | NN | RA | RB  | RC  | 00    | 0  | 0110 110 |Rc| gorcw |
  95 | NN | RA | RB  | sh  | 00    | 0  | 1110 110 |Rc| gorcwi |
  96 | NN | RA | RB  | RC  | 00    | 1  | 1110 110 |Rc| bmator  |
  97 | NN | RA | RB  | RC  | 01    | 0  | 0010 110 |Rc| grev |
  98 | NN | RA | RB  | RC  | 01    | 1  | 0010 110 |Rc| clmul |
  99 | NN | RA | RB  | sh  | 01    | SH | 1010 110 |Rc| grevi |
 100 | NN | RA | RB  | RC  | 01    | 0  | 0110 110 |Rc| grevw |
 101 | NN | RA | RB  | sh  | 01    | 0  | 1110 110 |Rc| grevwi |
 102 | NN | RA | RB  | RC  | 01    | 1  | 1110 110 |Rc| bmatxor   |
 103 | NN | RA | RB  | RC  | 10    | 0  | 0010 110 |Rc| shfl |
 104 | NN | RA | RB  | sh  | 10    | SH | 1010 110 |Rc| shfli |
 105 | NN | RA | RB  | RC  | 10    | 0  | 0110 110 |Rc| shflw |
 106 | NN | RA | RB  | RC  | 10    | 0  | 1110 110 |Rc| bdep   |
 107 | NN | RA | RB  | RC  | 10    | 1  | 1110 110 |Rc| bext  |
 108 | NN | RA | RB  | RC  | 11    | 0  | 1110 110 |Rc| clmulr  |
 109 | NN | RA | RB  | RC  | 11    | 1  | 1110 110 |Rc| clmulh  |
 110 | NN | RA | RB  |     |       |    | NN11 110 |Rc| rsvd  |
 111
 112 # bit to byte permute
 113
 114 similar to matrix permute in RV bitmanip, which has XOR and OR variants
 115
 116     do j = 0 to 7
 117       do k = 0 to 7
 118          b = VSR[VRB+32].dword[i].byte[k].bit[j]
 119          VSR[VRT+32].dword[i].byte[j].bit[k] = b
 120
 121 # vector bit deposit
 122
 123 vpdepd VRT,VRA,VRB, identical to RV bitmanip bdep
 124
 125     do while(m < 64)
 126        if VSR[VRB+32].dword[i].bit[63-m]=1 then do
 127           result = VSR[VRA+32].dword[i].bit[63-k]
 128           VSR[VRT+32].dword[i].bit[63-m] = result
 129           k = k + 1
 130        m = m + 1
 131
 132 ```
 133
 134 uint_xlen_t bdep(uint_xlen_t RA, uint_xlen_t RB)
 135 {   // bit deposit: RB is the mask, RA supplies the bits that get scattered
 136     uint_xlen_t r = 0;
 137     for (int i = 0, j = 0; i < XLEN; i++)   // i: mask/result bit, j: next unused bit of RA
 138         if ((RB >> i) & 1) {                // only positions where RB is 1 receive a bit
 139             if ((RA >> j) & 1)
 140                 r |= uint_xlen_t(1) << i;   // copy RA bit j to result bit i
 141             j++;                            // one RA bit is consumed per set mask bit
 142         }
 143     return r;                               // result bits where RB is 0 stay 0
 144 }
 145
 146 ```
 147
 148 # vector bit extract
 149
 150 other way round: identical to RV bext
 151
 152 ```
 153 uint_xlen_t bext(uint_xlen_t RA, uint_xlen_t RB)
 154 {   // bit extract: gathers the RA bits selected by mask RB into the low end of the result
 155     uint_xlen_t r = 0;
 156     for (int i = 0, j = 0; i < XLEN; i++)   // i: mask/source bit, j: next free result bit
 157         if ((RB >> i) & 1) {                // only positions where RB is 1 are read
 158             if ((RA >> i) & 1)
 159                 r |= uint_xlen_t(1) << j;   // copy RA bit i to result bit j
 160             j++;                            // one result bit is produced per set mask bit
 161         }
 162     return r;                               // result bits at and above the final j stay 0
 163 }
 164 ```
 165
 166 # int min/max
 167
 168 signed and unsigned min/max for integers.  this is sort-of partly synthesisable in [[sv/svp64]] with pred-result as long as the dest reg is one of the sources, but not both signed and unsigned.  when the dest is also one of the sources and the mv fails due to the CR bittest failing this will only overwrite the dest where the src is greater (or less).
 169
 170 signed/unsigned min/max gives more flexibility.
 171
 172 ```
 173 uint_xlen_t min(uint_xlen_t rs1, uint_xlen_t rs2)   // signed minimum
 174 { return (int_xlen_t)rs1 < (int_xlen_t)rs2 ? rs1 : rs2;   // compared as int_xlen_t; rs2 is returned when equal
 175 }
 176 uint_xlen_t max(uint_xlen_t rs1, uint_xlen_t rs2)   // signed maximum
 177 { return (int_xlen_t)rs1 > (int_xlen_t)rs2 ? rs1 : rs2;   // compared as int_xlen_t; rs2 is returned when equal
 178 }
 179 uint_xlen_t minu(uint_xlen_t rs1, uint_xlen_t rs2)   // unsigned minimum
 180 { return rs1 < rs2 ? rs1 : rs2;   // compared as uint_xlen_t; rs2 is returned when equal
 181 }
 182 uint_xlen_t maxu(uint_xlen_t rs1, uint_xlen_t rs2)   // unsigned maximum
 183 { return rs1 > rs2 ? rs1 : rs2;   // compared as uint_xlen_t; rs2 is returned when equal
 184 }
 185 ```
 186
 187
 188 # ternary bitops
 189
 190 Similar to FPGA LUTs: for every bit perform a lookup into a table held either in an 8-bit immediate or in another register
 191
 192 | 0.5|6.10|11.15|16.20| 21..25| 26..30   |31|
 193 | -- | -- | --- | --- | ----- | -------- |--|
 194 | NN | RT | RA  | RB  | im0-4 | im5-7 00 |Rc|
 195
 196     for i in range(64):
 197         idx = RT[i] << 2 | RA[i] << 1 | RB[i]
 198         RT[i] = (imm & (1<<idx)) != 0
 199
 200 bits 21..22 may be used to specify a mode, such as treating the whole integer zero/nonzero and putting 1/0 in the result, rather than bitwise test.
 201
 202 a 4 operand variant which becomes more along the lines of an FPGA:
 203
 204 | 0.5|6.10|11.15|16.20|21.25| 26...30  |31|
 205 | -- | -- | --- | --- | --- | -------- |--|
 206 | NN | RT | RA  | RB  | RC  | mode 001 |Rc|
 207
 208     for i in range(64):
 209         idx = RT[i] << 2 | RA[i] << 1 | RB[i]
 210         RT[i] = (RC & (1<<idx)) != 0
 211
 212 mode (2 bit) may be used to do inversion of ordering, similar to carryless mul,
 213 3 modes.
 214
 215 also, another possible variant involving swizzle and vec4:
 216
 217 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
 218 | -- | -- | --- | ----- | ---- | ----- |--|
 219 | NN | RT | RA  | imm   | mask | 101   |1 |
 220
 221     for i in range(8):
 222         idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
 223         res = (imm & (1<<idx)) != 0
 224         for j in range(3):
 225              if mask[j]: RT[i+j*8] = res
 226
 227 another mode selection would be CRs not Ints.
 228
 229 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31|
 230 | -- | -- | --- | --- |- |-----|----- | -----|--|
 231 | NN | BA | BB  | BC  |0 |imm  | mask | 101  |0 |
 232
 233     for i in range(4):
 234         if not mask[i] continue
 235         idx = crregs[BA][i] << 2 |
 236               crregs[BB][i] << 1 |
 237               crregs[BC][i]
 238         crregs[BA][i] = (imm & (1<<idx)) != 0
 239
 240 # bitmask set
 241
 242 based on RV bitmanip singlebit set, instruction format similar to shift
 243 [[isa/fixedshift]].  bmext is actually covered already (shift-with-mask rldicl but only immediate version).
 244 however bitmask-invert is not, and set/clr are not covered, although they can use the same Shift ALU.
 245
 246 bmext (RB) version is not the same as rldicl because bmext is a right shift by RC, where rldicl is a left rotate.  for the immediate version this does not matter, so a bmexti is not required.
 247 for bmrev, however, there is no direct equivalent and consequently a bmrevi is required.
 248
 249 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31| name  |
 250 | -- | -- | --- | --- | --- | ------- |--| ----- |
 251 | NN | RT | RA  | RB  | RC  | mode 010 |Rc| bm*   |
 252 | NN | RT | RA  | RB  | RC  | 0 1  111 |Rc| bmrev |
 253
 254
 255 ```
 256 uint_xlen_t bmset(RA, RB, sh)   // set a run of bits in RA
 257 {
 258     int shamt = RB & (XLEN - 1);   // start position taken from RB, masked with XLEN-1
 259     mask = (2<<sh)-1;              // sh+1 low bits set
 260     return RA | (mask << shamt);   // OR forces the selected bits to 1
 261 }
 262
 263 uint_xlen_t bmclr(RA, RB, sh)   // clear a run of bits in RA
 264 {
 265     int shamt = RB & (XLEN - 1);   // start position taken from RB, masked with XLEN-1
 266     mask = (2<<sh)-1;              // sh+1 low bits set
 267     return RA & ~(mask << shamt);  // AND with the inverted mask forces the selected bits to 0
 268 }
 269
 270 uint_xlen_t bminv(RA, RB, sh)   // invert a run of bits in RA
 271 {
 272     int shamt = RB & (XLEN - 1);   // start position taken from RB, masked with XLEN-1
 273     mask = (2<<sh)-1;              // sh+1 low bits set
 274     return RA ^ (mask << shamt);   // XOR toggles the selected bits
 275 }
 276
 277 uint_xlen_t bmext(RA, RB, sh)   // extract a run of bits from RA, right-justified
 278 {
 279     int shamt = RB & (XLEN - 1);   // start position taken from RB, masked with XLEN-1
 280     mask = (2<<sh)-1;              // sh+1 low bits set
 281     return mask & (RA >> shamt);   // shift the field down to bit 0, then keep sh+1 bits
 282 }
 283 ```
 284
 285 bitmask extract with reverse.  can be done by bit-reversing all of RA (swapping LSB and MSB order) and getting bits of RA from the opposite end.
 286
 287 ```
 288 msb = rb[5:0];
 289 rev[0:msb] = ra[msb:0];
 290 rt = ZE(rev[msb:0]);
 291
 292 uint_xlen_t bmextrev(RA, RB, sh)   // bitmask extract taken from the bit-reversed RA
 293 {
 294     int shamt = (RB & (XLEN - 1));   // position taken from RB, masked with XLEN-1
 295     shamt = (XLEN-1)-shamt;  // shift other end
 296     bra = bitreverse(RA);    // swap LSB-MSB
 297     mask = (2<<sh)-1;        // sh+1 low bits set
 298     return mask & (bra >> shamt);   // field is returned right-justified
 299 }
 300 ```
 301
 302 | 0.5|6.10|11.15|16.20|21.26| 27..30  |31| name   |
 303 | -- | -- | --- | --- | --- | ------- |--| ------ |
 304 | NN | RT | RA  | RB  | sh  | 0   111 |Rc| bmrevi |
 305
 306
 307
 308 # grev
 309
 310 based on RV bitmanip
 311
 312 ```
 313 uint64_t grev64(uint64_t RA, uint64_t RB)
 314 {   // generalised reverse: each bit of the 6-bit control enables one swap stage
 315     uint64_t x = RA;
 316     int shamt = RB & 63;   // only the low 6 bits of RB are used
 317     if (shamt & 1) x = ((x &  0x5555555555555555LL) <<  1) |
 318                         ((x & 0xAAAAAAAAAAAAAAAALL) >>  1);   // swap adjacent bits
 319     if (shamt & 2) x = ((x &  0x3333333333333333LL) <<  2) |
 320                         ((x & 0xCCCCCCCCCCCCCCCCLL) >>  2);   // swap adjacent 2-bit pairs
 321     if (shamt & 4) x = ((x &  0x0F0F0F0F0F0F0F0FLL) <<  4) |
 322                         ((x & 0xF0F0F0F0F0F0F0F0LL) >>  4);   // swap adjacent nibbles
 323     if (shamt & 8) x = ((x &  0x00FF00FF00FF00FFLL) <<  8) |
 324                         ((x & 0xFF00FF00FF00FF00LL) >>  8);   // swap adjacent bytes
 325     if (shamt & 16) x = ((x & 0x0000FFFF0000FFFFLL) << 16) |
 326                         ((x & 0xFFFF0000FFFF0000LL) >> 16);   // swap adjacent 16-bit halfwords
 327     if (shamt & 32) x = ((x & 0x00000000FFFFFFFFLL) << 32) |
 328                         ((x & 0xFFFFFFFF00000000LL) >> 32);   // swap the two 32-bit halves
 329     return x;
 330 }
 331
 332 ```
 333
 334 # shuffle / unshuffle
 335
 336 based on RV bitmanip
 337
 338 ```
 339 uint32_t shfl32(uint32_t RA, uint32_t RB)
 340 {   // NOTE(review): shuffle32_stage is not defined in this document; presumably the 32-bit analogue of shuffle64_stage - confirm
 341     uint32_t x = RA;
 342     int shamt = RB & 15;   // four control bits, one per stage
 343     if (shamt & 8) x  = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);   // stages run from the largest shift to the smallest
 344     if (shamt & 4) x  = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
 345     if (shamt & 2) x  = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
 346     if (shamt & 1) x  = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
 347     return x;
 348 }
 349 uint32_t unshfl32(uint32_t RA, uint32_t RB)
 350 {   // NOTE(review): shuffle32_stage is not defined in this document; presumably the 32-bit analogue of shuffle64_stage - confirm
 351     uint32_t x = RA;
 352     int shamt = RB & 15;   // four control bits, one per stage
 353     if (shamt & 1) x  = shuffle32_stage(x, 0x44444444, 0x22222222, 1);   // same stages as shfl32, applied in the opposite order
 354     if (shamt & 2) x  = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
 355     if (shamt & 4) x  = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
 356     if (shamt & 8) x  = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
 357     return x;
 358 }
 359
 360 uint64_t shuffle64_stage(uint64_t src, uint64_t maskL, uint64_t maskR, int N)
 361 {   // one shuffle stage: exchanges the bit fields selected by maskL and maskR, which lie N bits apart
 362     uint64_t x = src & ~(maskL | maskR);   // keep every bit that neither mask covers
 363     x |= ((src << N) & maskL) | ((src >> N) & maskR);   // bits moved up by N land in maskL, bits moved down by N land in maskR
 364     return x;
 365 }
 366 uint64_t shfl64(uint64_t RA, uint64_t RB)
 367 {   // generalised shuffle: each bit of the 5-bit control enables one stage, largest shift first
 368     uint64_t x = RA;
 369     int shamt = RB & 31;   // only the low 5 bits of RB are used
 370     if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
 371                                            0x00000000ffff0000LL, 16);   // swap the two middle 16-bit fields
 372     if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
 373                                            0x0000ff000000ff00LL, 8);    // swap the two middle bytes of each 32-bit half
 374     if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
 375                                            0x00f000f000f000f0LL, 4);    // swap the two middle nibbles of each 16-bit field
 376     if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
 377                                            0x0c0c0c0c0c0c0c0cLL, 2);    // swap the two middle bit-pairs of each byte
 378     if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
 379                                            0x2222222222222222LL, 1);    // swap the two middle bits of each nibble
 380     return x;
 381 }
 382 uint64_t unshfl64(uint64_t RA, uint64_t RB)
 383 {   // generalised unshuffle: the same stages as shfl64, applied smallest shift first
 384     uint64_t x = RA;
 385     int shamt = RB & 31;   // only the low 5 bits of RB are used
 386     if (shamt &  1) x = shuffle64_stage(x, 0x4444444444444444LL,
 387                                            0x2222222222222222LL, 1);    // swap the two middle bits of each nibble
 388     if (shamt &  2) x = shuffle64_stage(x, 0x3030303030303030LL,
 389                                            0x0c0c0c0c0c0c0c0cLL, 2);    // swap the two middle bit-pairs of each byte
 390     if (shamt &  4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
 391                                            0x00f000f000f000f0LL, 4);    // swap the two middle nibbles of each 16-bit field
 392     if (shamt &  8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
 393                                            0x0000ff000000ff00LL, 8);    // swap the two middle bytes of each 32-bit half
 394     if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
 395                                            0x00000000ffff0000LL, 16);   // swap the two middle 16-bit fields
 396     return x;
 397 }
 398 ```
 399
 400 # xperm
 401
 402 based on RV bitmanip
 403
 404 ```
 405 uint_xlen_t xperm(uint_xlen_t RA, uint_xlen_t RB, int sz_log2)
 406 {   // element lookup: each sz-bit element of RB is an index selecting an sz-bit element of RA
 407     uint_xlen_t r = 0;
 408     uint_xlen_t sz = 1LL << sz_log2;       // element width in bits
 409     uint_xlen_t mask = (1LL << sz) - 1;    // sz low bits set
 410     for (int i = 0; i < XLEN; i += sz) {   // i: bit offset of the current element
 411         uint_xlen_t pos = ((RB >> i) & mask) << sz_log2;   // index times sz = bit offset into RA
 412         if (pos < XLEN)                    // an out-of-range index leaves this result element 0
 413             r |= ((RA >> pos) & mask) << i;
 414     }
 415     return r;
 416 }
 417 uint_xlen_t xperm_n (uint_xlen_t RA, uint_xlen_t RB)
 418 {  return xperm(RA, RB, 2); }   // sz_log2 = 2: 4-bit (nibble) elements
 419 uint_xlen_t xperm_b (uint_xlen_t RA, uint_xlen_t RB)
 420 {  return xperm(RA, RB, 3); }   // sz_log2 = 3: 8-bit (byte) elements
 421 uint_xlen_t xperm_h (uint_xlen_t RA, uint_xlen_t RB)
 422 {  return xperm(RA, RB, 4); }   // sz_log2 = 4: 16-bit elements
 423 uint_xlen_t xperm_w (uint_xlen_t RA, uint_xlen_t RB)
 424 {  return xperm(RA, RB, 5); }   // sz_log2 = 5: 32-bit elements
 425 ```
 426
 427 # gorc
 428
 429 based on RV bitmanip
 430
 431 ```
 432 uint32_t gorc32(uint32_t RA, uint32_t RB)
 433 {   // generalised OR-combine: each enabled stage ORs the swapped neighbours into x instead of replacing it
 434     uint32_t x = RA;
 435     int shamt = RB & 31;   // only the low 5 bits of RB are used
 436     if (shamt & 1) x |= ((x & 0x55555555) << 1)   |  ((x &  0xAAAAAAAA) >> 1);    // adjacent bits
 437     if (shamt & 2) x |= ((x & 0x33333333) << 2)   |  ((x &  0xCCCCCCCC) >> 2);    // adjacent 2-bit pairs
 438     if (shamt & 4) x |= ((x & 0x0F0F0F0F) << 4)   |  ((x &  0xF0F0F0F0) >> 4);    // adjacent nibbles
 439     if (shamt & 8) x |= ((x & 0x00FF00FF) << 8)   |  ((x &  0xFF00FF00) >> 8);    // adjacent bytes
 440     if (shamt & 16) x |= ((x & 0x0000FFFF) << 16) |  ((x &  0xFFFF0000) >> 16);   // the two 16-bit halves
 441     return x;
 442 }
 443 uint64_t gorc64(uint64_t RA, uint64_t RB)
 444 {   // generalised OR-combine: each enabled stage ORs the swapped neighbours into x instead of replacing it
 445     uint64_t x = RA;
 446     int shamt = RB & 63;   // only the low 6 bits of RB are used
 447     if (shamt & 1) x |= ((x & 0x5555555555555555LL)   <<   1) |
 448                          ((x & 0xAAAAAAAAAAAAAAAALL)  >>  1);   // adjacent bits
 449     if (shamt & 2) x |= ((x & 0x3333333333333333LL)   <<   2) |
 450                          ((x & 0xCCCCCCCCCCCCCCCCLL)  >>  2);   // adjacent 2-bit pairs
 451     if (shamt & 4) x |= ((x & 0x0F0F0F0F0F0F0F0FLL)   <<   4) |
 452                          ((x & 0xF0F0F0F0F0F0F0F0LL)  >>  4);   // adjacent nibbles
 453     if (shamt & 8) x |= ((x & 0x00FF00FF00FF00FFLL)   <<   8) |
 454                          ((x & 0xFF00FF00FF00FF00LL)  >>  8);   // adjacent bytes
 455     if (shamt & 16) x |= ((x & 0x0000FFFF0000FFFFLL)  << 16) |
 456                          ((x & 0xFFFF0000FFFF0000LL)  >> 16);   // adjacent 16-bit halfwords
 457     if (shamt & 32) x |= ((x & 0x00000000FFFFFFFFLL)  << 32) |
 458                          ((x & 0xFFFFFFFF00000000LL)  >> 32);   // the two 32-bit halves
 459     return x;
 460 }
 461
 462 ```
 463
 464 # cmix
 465
 466 based on RV bitmanip, covered by ternary bitops
 467
 468 ```
 469 uint_xlen_t cmix(uint_xlen_t RA, uint_xlen_t RB, uint_xlen_t RC) {   // bitwise select controlled by RB
 470     return (RA & RB) | (RC & ~RB);   // where an RB bit is 1 take the RA bit, otherwise take the RC bit
 471 }
 472 ```
 473
 474 # carryless mul
 475
 476 based on RV bitmanip
 477 see https://en.wikipedia.org/wiki/CLMUL_instruction_set
 478
 479 ```
 480 uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
 481 {   // carry-less multiply: partial products are combined with XOR instead of addition
 482     uint_xlen_t x = 0;
 483     for (int i = 0; i < XLEN; i++)
 484         if ((RB >> i) & 1)      // for every set bit i of RB ...
 485             x ^= RA << i;       // ... XOR in RA shifted left by i
 486     return x;
 487 }
 488 uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)
 489 {   // carry-less multiply, high part: collects the bits that (RA << i) pushes to position XLEN and above
 490     uint_xlen_t x = 0;
 491     for (int i = 1; i < XLEN; i++)       // starts at 1, so the shift below is never by XLEN
 492         if ((RB >> i) & 1)
 493             x ^= RA >> (XLEN-i);         // same bits as (RA << i) >> XLEN
 494     return x;
 495 }
 496 uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)
 497 {   // carry-less multiply, "reversed" variant: like clmulh but taken one bit position lower
 498     uint_xlen_t x = 0;
 499     for (int i = 0; i < XLEN; i++)
 500         if ((RB >> i) & 1)
 501             x ^= RA >> (XLEN-i-1);       // same bits as (RA << i) >> (XLEN-1)
 502     return x;
 503 }
 504 ```
 505 # Galois Field
 506
 507 see <https://courses.csail.mit.edu/6.857/2016/files/ffield.py>
 508
 509 ## Multiply
 510
 511 this requires 3 parameters and a "degree"
 512
 513     RT = GFMUL(RA, RB, gfdegree, modulo=RC)
 514
 515 realistically with the degree also needing to be an immediate it should be brought down to an overwrite version:
 516
 517     RS = GFMUL(RS, RA, gfdegree, modulo=RB)
 518     RS = GFMUL(RS, RA, gfdegree=RC, modulo=RB)
 519
 520 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31|
 521 | -- | -- | --- | --- | --- | ------- |--|
 522 | NN | RS | RA  | RB  | deg | 00  011 |Rc|
 523 | NN | RS | RA  | RB  | RC  | 11  011 |Rc|
 524
 525 where the SimpleV variant may override RS-as-src differently from RS-as-dest
 526
 527
 528
 529 ```
 530 from functools import reduce
 531
 532 # constants used in the multGF2 function
 533 mask1 = mask2 = polyred = None
 534
 535 def setGF2(degree, irPoly):
 536     """Define parameters of binary finite field GF(2^m)/g(x)
 537        - degree: extension degree of binary field
 538        - irPoly: coefficients of irreducible polynomial g(x)
 539     """
 540     def i2P(sInt):
 541         """Convert an integer into a polynomial"""
 542         return [(sInt >> i) & 1
 543                 for i in reversed(range(sInt.bit_length()))]  # list of bits, most significant first
 544
 545     global mask1, mask2, polyred  # results are stored in module-level state read by multGF2
 546     mask1 = mask2 = 1 << degree   # mask1: the single bit at position `degree`
 547     mask2 -= 1                    # mask2: the low `degree` bits set
 548     polyred = reduce(lambda x, y: (x << 1) + y, i2P(irPoly)[1:])  # irPoly with its leading bit dropped, re-packed as an int
 549
 550 def multGF2(p1, p2):
 551     """Multiply two polynomials in GF(2^m)/g(x)"""
 552     p = 0
 553     while p2:               # one iteration per remaining bit of p2
 554         if p2 & 1:
 555             p ^= p1         # add (XOR) the current multiple of p1
 556         p1 <<= 1            # multiply p1 by x
 557         if p1 & mask1:      # bit `degree` became set: reduce
 558             p1 ^= polyred   # bit `degree` itself is not cleared here; the final mask removes it
 559         p2 >>= 1
 560     return p & mask2        # keep only the low `degree` bits
 561
 562 if __name__ == "__main__":
 563
 564     # Define binary field GF(2^3)/x^3 + x + 1
 565     setGF2(3, 0b1011)
 566
 567     # Evaluate the product (x^2 + x + 1)(x^2 + 1)
 568     print("{:02x}".format(multGF2(0b111, 0b101)))  # expected output: 06 (x^2 + x)
 569
 570     # Define binary field GF(2^8)/x^8 + x^4 + x^3 + x + 1
 571     # (used in the Advanced Encryption Standard-AES)
 572     setGF2(8, 0b100011011)
 573
 574     # Evaluate the product (x^7)(x^7 + x + 1)
 575     print("{:02x}".format(multGF2(0b10000000, 0b10000011)))  # expected output: 01
 576 ```
 577 ## GF add
 578
 579     RS = GFADDI(RS, RA|0, gfdegree, modulo=RB)
 580     RS = GFADD(RS, RA|0, gfdegree=RC, modulo=RB)
 581
 582 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31| name  |
 583 | -- | -- | --- | --- | --- | ------- |--| ----- |
 584 | NN | RS | RA  | RB  | deg | 0 1  011 |Rc| gfaddi |
 585 | NN | RS | RA  | RB  | RC  | 1 1  111 |Rc| gfadd |
 586
 587 GFMOD is a pseudo-op where RA=0
 588
 589 ## gf invert
 590
 591 ```
 592 def gf_degree(a) :  # index of the highest set bit of a; returns 0 for both a == 0 and a == 1
 593   res = 0
 594   a >>= 1
 595   while (a != 0) :  # count how many further right-shifts are needed to reach zero
 596     a >>= 1;
 597     res += 1;
 598   return res
 599
 600 def gf_invert(a, mod=0x1B) :  # NOTE(review): the loop does not appear to terminate for a == 0 - confirm
 601   v = mod
 602   g1 = 1
 603   g2 = 0
 604   j = gf_degree(a) - 8  # presumably 8 is the degree of the full modulus, whose x^8 term is omitted from mod - confirm
 605
 606   while (a != 1) :
 607     if (j < 0) :  # swap roles so that the shift amount j is non-negative
 608       a, v = v, a
 609       g1, g2 = g2, g1
 610       j = -j
 611
 612     a ^= v << j    # the same shifted XOR is applied to the (a, v) pair ...
 613     g1 ^= g2 << j  # ... and to the (g1, g2) pair
 614
 615     a %= 256  # Emulating 8-bit overflow
 616     g1 %= 256 # Emulating 8-bit overflow
 617
 618     j = gf_degree(a) - gf_degree(v)  # degree difference selects the next shift
 619
 620   return g1  # value of g1 once a has been reduced to 1
 621 ```
 622
 623 # bitmatrix
 624
 625 ```
 626 uint64_t bmatflip(uint64_t RA)
 627 {   // used by bmatxor/bmator to obtain the transpose of an 8x8 bit matrix held in 64 bits
 628     uint64_t x = RA;
 629     x = shfl64(x, 31);   // 31 enables all five shfl64 stages
 630     x = shfl64(x, 31);   // the full shuffle is applied three times in total
 631     x = shfl64(x, 31);
 632     return x;
 633 }
 634 uint64_t bmatxor(uint64_t RA, uint64_t RB)
 635 {   // 8x8 bit-matrix product where each result bit is the parity (XOR-sum) of row AND column
 636     // transpose of RB
 637     uint64_t RBt = bmatflip(RB);
 638     uint8_t u[8]; // rows of RA
 639     uint8_t v[8]; // cols of RB
 640     for (int i = 0; i < 8; i++) {
 641         u[i] = RA >> (i*8);    // byte i of RA (assignment to uint8_t keeps the low 8 bits)
 642         v[i] = RBt >> (i*8);   // byte i of the transposed RB
 643     }
 644     uint64_t x = 0;
 645     for (int i = 0; i < 64; i++) {   // result bit i uses row i/8 and column i%8
 646         if (pcnt(u[i / 8] & v[i % 8]) & 1)   // pcnt is not defined in this document; presumably population count - confirm
 647             x |= 1LL << i;
 648     }
 649     return x;
 650 }
 651 uint64_t bmator(uint64_t RA, uint64_t RB)
 652 {   // 8x8 bit-matrix product where each result bit is 1 if row AND column share any set bit
 653     // transpose of RB
 654     uint64_t RBt = bmatflip(RB);
 655     uint8_t u[8]; // rows of RA
 656     uint8_t v[8]; // cols of RB
 657     for (int i = 0; i < 8; i++) {
 658         u[i] = RA >> (i*8);    // byte i of RA (assignment to uint8_t keeps the low 8 bits)
 659         v[i] = RBt >> (i*8);   // byte i of the transposed RB
 660     }
 661     uint64_t x = 0;
 662     for (int i = 0; i < 64; i++) {   // result bit i uses row i/8 and column i%8
 663         if ((u[i / 8] & v[i % 8]) != 0)   // OR-reduction of the bitwise AND
 664             x |= 1LL << i;
 665     }
 666     return x;
 667 }
 668
 669 ```