openpower/sv/bitmanip.mdwn

   1 [[!tag standards]]
   2
   3 # bitmanipulation
   4
   5 **DRAFT STATUS**
   6
   7 this extension amalgamates bitmanipulation primitives from many sources, including RISC-V bitmanip, Packed SIMD, AVX-512 and OpenPOWER VSX.  Vectorisation and SIMD are removed: these are straight scalar (element) operations.  Vectorisation Context is provided by [[openpower/sv]].
   8
   9 ternaryv is experimental and is the only operation that may be considered a "Packed SIMD".  It is added as a variant of the already well-justified ternary operation (done in AVX512 as an immediate only) "because it looks fun". As it is based on the LUT4 concept it will allow accelerated emulation of FPGAs.  Other vendors of ISAs are buying FPGA companies to achieve a similar objective.
  10
  11 general-purpose Galois Field operations are added so as to avoid huge opcode proliferation across many areas of Computer Science.  however for convenience and also to avoid setup costs, some of the more common operations (clmul, crc32) are also added.  The expectation is that these operations would all be covered by the same pipeline.
  12
  13 # summary
  14
  15 minor opcode allocation
  16
  17     |  28.30 |31| name      |
  18     | ------ |--| --------- |
  19     |   00   |Rc| ternaryi  |
  20     |  001   |Rc| ternary   |
  21     |  010   |Rc| bitmask   |
  22     |  011   |Rc| gf*       |
  23     |  101   |1 | ternaryv  |
  24     |  101   |0 | ternarycr |
  25     |  110   |Rc| 1/2-op    |
  26     |  111   |Rc| 3-op      |
  27
  28 1-op and variants
  29
  30 | dest | src1 | subop | op       |
  31 | ---- | ---- | ----- | -------- |
  32 | RT   | RA   | ..    | bmatflip |
  33
  34 2-op and variants
  35
  36 | dest | src1 | src2 | subop | op       |
  37 | ---- | ---- | ---- | ----- | -------- |
  38 | RT   | RA   | RB   | or    | bmatflip |
  39 | RT   | RA   | RB   | xor   | bmatflip |
  40 | RT   | RA   | RB   | bdep  | dep/ext  |
  41 | RT   | RA   | RB   | bext  | dep/ext  |
  42 | RT   | RA   | RB   |       | grev  |
  43 | RT   | RA   | RB   |       | clmul*  |
  44 | RT   | RA   | RB   |       | gorc |
  45 | RT   | RA   | RB   | shuf  | shuffle |
  46 | RT   | RA   | RB   | unshuf| shuffle |
  47 | RT   | RA   | RB   | width | xperm  |
  48 | RT   | RA   | RB   | type | minmax |
  49 | RT   | RA   | RB   |  |  |
  50 | RT   | RA   | RB   |  |  |
  51 | RT   | RA   | RB   |  |  |
  52
  53 3 ops
  54
  55 * bitmask set/extract
  56 * ternary bitops
  57 * GF
  58
  59 | 0.5|6.10|11.15|16.20|21..25 | 26....30 |31| name |
  60 | -- | -- | --- | --- | ----- | -------- |--| ------ |
  61 | NN | RT | RA  | RB  | RC    | mode 001 |Rc| ternary |
  62 | NN | RT | RA  | RB  | im0-4 | im5-7 00 |Rc| ternaryi |
  63 | NN | RS | RA  | RB  | RC    | 00  011  |Rc| gfmul |
  64 | NN | RS | RA  | RB  | RC    | 01  011  |Rc| gfadd |
  65 | NN | RT | RA  | RB  | deg   | 10  011  |Rc| gfinv |
  66 | NN | RS | RA  | RB  | deg   | 11  011  |Rc| gfmuli |
  67 | NN | RS | RA  | RB  | deg   | 11  111  |Rc| gfaddi |
  68
  69 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31| name |
  70 | -- | -- | --- | ----- | ---- | ----- |--| ------ |
  71 | NN | RT | RA  | imm   | mask | 101   |1 | ternaryv |
  72
  73 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31| name |
  74 | -- | -- | --- | --- |- |-----|----- | -----|--| -------|
  75 | NN | BA | BB  | BC  |0 |imm  | mask | 101  |0 | ternarycr |
  76
  77 ops
  78
  79 | 0.5|6.10|11.15|16.20| 21.22 | 23 | 24....30 |31| name |
  80 | -- | -- | --- | --- | ----- | -- | -------- |--| ---- |
  81 | NN | RA | RB  |     |       | 0  | 0000 110 |Rc| rsvd   |
  82 | NN | RA | RB  | RC  | itype | 1  | 0000 110 |Rc| xperm |
  83 | NN | RA | RB  | RC  | itype | 0  | 0100 110 |Rc| minmax |
  84 | NN | RA | RB  |     |       | 1  | 0100 110 |Rc| rsvd |
  85 | NN | RA | RB  | sh  | itype | SH | 1000 110 |Rc| bmopsi |
  86 | NN | RA | RB  |     |       |    | 1100 110 |Rc| rsvd |
  87 | NN | RA | RB  |     |       |    | 1100 110 |Rc| rsvd |
  88 | NN | RA | RB  |     |       |    | 1100 110 |Rc| rsvd |
  89 | NN | RA | RB  |     |       |    | 1100 110 |Rc| rsvd |
  90 | NN | RA | RB  |     |       | 0  | 0001 110 |Rc| rsvd |
  91 | NN | RA | RB  |     |       | 0  | 0101 110 |Rc| rsvd |
  92 | NN | RA | RB  | RC  | 00    | 0  | 0010 110 |Rc| gorc |
  93 | NN | RA | RB  | sh  | 00    | SH | 1010 110 |Rc| gorci |
  94 | NN | RA | RB  | RC  | 00    | 0  | 0110 110 |Rc| gorcw |
  95 | NN | RA | RB  | sh  | 00    | 0  | 1110 110 |Rc| gorcwi |
  96 | NN | RA | RB  | RC  | 00    | 1  | 1110 110 |Rc| bmator  |
  97 | NN | RA | RB  | RC  | 01    | 0  | 0010 110 |Rc| grev |
  98 | NN | RA | RB  | RC  | 01    | 1  | 0010 110 |Rc| clmul |
  99 | NN | RA | RB  | sh  | 01    | SH | 1010 110 |Rc| grevi |
 100 | NN | RA | RB  | RC  | 01    | 0  | 0110 110 |Rc| grevw |
 101 | NN | RA | RB  | sh  | 01    | 0  | 1110 110 |Rc| grevwi |
 102 | NN | RA | RB  | RC  | 01    | 1  | 1110 110 |Rc| bmatxor   |
 103 | NN | RA | RB  | RC  | 10    | 0  | 0010 110 |Rc| shfl |
 104 | NN | RA | RB  | sh  | 10    | SH | 1010 110 |Rc| shfli |
 105 | NN | RA | RB  | RC  | 10    | 0  | 0110 110 |Rc| shflw |
 106 | NN | RA | RB  | RC  | 10    | 0  | 1110 110 |Rc| bdep   |
 107 | NN | RA | RB  | RC  | 10    | 1  | 1110 110 |Rc| bext  |
 108 | NN | RA | RB  | RC  | 11    | 0  | 1110 110 |Rc| clmulr  |
 109 | NN | RA | RB  | RC  | 11    | 1  | 1110 110 |Rc| clmulh  |
 110 | NN | RA | RB  |     |       |    | NN11 110 |Rc| rsvd  |
 111
 112 # bit to byte permute
 113
 114 similar to matrix permute in RV bitmanip, which has XOR and OR variants
 115
 116     do j = 0 to 7
 117       do k = 0 to 7
 118          b = VSR[VRB+32].dword[i].byte[k].bit[j]
 119          VSR[VRT+32].dword[i].byte[j].bit[k] = b
 120
 121 #  bit deposit
 122
  123 vpdepd VRT,VRA,VRB, identical to RV bitmanip bdep, found already in v3.1 p106
 124
 125     do while(m < 64)
 126        if VSR[VRB+32].dword[i].bit[63-m]=1 then do
 127           result = VSR[VRA+32].dword[i].bit[63-k]
 128           VSR[VRT+32].dword[i].bit[63-m] = result
 129           k = k + 1
 130        m = m + 1
 131
 132 ```
 133
 134 uint_xlen_t bdep(uint_xlen_t RA, uint_xlen_t RB)
 135 {
 136     uint_xlen_t r = 0;
 137     for (int i = 0, j = 0; i < XLEN; i++)
 138         if ((RB >> i) & 1) {
 139             if ((RA >> j) & 1)
 140                 r |= uint_xlen_t(1) << i;
 141             j++;
 142         }
 143     return r;
 144 }
 145
 146 ```
 147
 148 # bit extract
 149
 150 other way round: identical to RV bext, found in v3.1 p196
 151
 152 ```
 153 uint_xlen_t bext(uint_xlen_t RA, uint_xlen_t RB)
 154 {
 155     uint_xlen_t r = 0;
 156     for (int i = 0, j = 0; i < XLEN; i++)
 157         if ((RB >> i) & 1) {
 158             if ((RA >> i) & 1)
 159                 r |= uint_xlen_t(1) << j;
 160             j++;
 161         }
 162     return r;
 163 }
 164 ```
 165
 166 # centrifuge
 167
 168 found in v3.1 p106
 169
 170 ```
      ptr0 ← 0
      ptr1 ← 0
      do i = 0 to 63
         if((RB)i=0) then do
            result[ptr0] ← (RS)i
            ptr0 ← ptr0 + 1
         end
         if((RB)63-i==1) then do
            result[63-ptr1] ← (RS)63-i
            ptr1 ← ptr1 + 1
         end
      end
      RA ← result
 176 ```
 177
 178 # int min/max
 179
  180 signed and unsigned min/max for integers.  this is sort-of partly synthesisable in [[sv/svp64]] with pred-result as long as the dest reg is one of the sources, but not for both signed and unsigned.  when the dest is also one of the sources and the mv fails due to the CR bittest failing, this will only overwrite the dest where the src is greater (or less).
 181
 182 signed/unsigned min/max gives more flexibility.
 183
 184 ```
 185 uint_xlen_t min(uint_xlen_t rs1, uint_xlen_t rs2)
 186 { return (int_xlen_t)rs1 < (int_xlen_t)rs2 ? rs1 : rs2;
 187 }
 188 uint_xlen_t max(uint_xlen_t rs1, uint_xlen_t rs2)
 189 { return (int_xlen_t)rs1 > (int_xlen_t)rs2 ? rs1 : rs2;
 190 }
 191 uint_xlen_t minu(uint_xlen_t rs1, uint_xlen_t rs2)
 192 { return rs1 < rs2 ? rs1 : rs2;
 193 }
 194 uint_xlen_t maxu(uint_xlen_t rs1, uint_xlen_t rs2)
 195 { return rs1 > rs2 ? rs1 : rs2;
 196 }
 197 ```
 198
 199
 200 # ternary bitops
 201
 202 Similar to FPGA LUTs: for every bit perform a lookup into a table using an 8bit immediate, or in another register
 203
 204 | 0.5|6.10|11.15|16.20| 21..25| 26..30   |31|
 205 | -- | -- | --- | --- | ----- | -------- |--|
 206 | NN | RT | RA  | RB  | im0-4 | im5-7 00 |Rc|
 207
 208     for i in range(64):
 209         idx = RT[i] << 2 | RA[i] << 1 | RB[i]
 210         RT[i] = (imm & (1<<idx)) != 0
 211
 212 bits 21..22 may be used to specify a mode, such as treating the whole integer zero/nonzero and putting 1/0 in the result, rather than bitwise test.
 213
 214 a 4 operand variant which becomes more along the lines of an FPGA:
 215
 216 | 0.5|6.10|11.15|16.20|21.25| 26...30  |31|
 217 | -- | -- | --- | --- | --- | -------- |--|
 218 | NN | RT | RA  | RB  | RC  | mode 001 |Rc|
 219
 220     for i in range(64):
 221         idx = RT[i] << 2 | RA[i] << 1 | RB[i]
 222         RT[i] = (RC & (1<<idx)) != 0
 223
 224 mode (2 bit) may be used to do inversion of ordering, similar to carryless mul,
 225 3 modes.
 226
 227 also, another possible variant involving swizzle and vec4:
 228
 229 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
 230 | -- | -- | --- | ----- | ---- | ----- |--|
 231 | NN | RT | RA  | imm   | mask | 101   |1 |
 232
 233     for i in range(8):
 234         idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
 235         res = (imm & (1<<idx)) != 0
 236         for j in range(3):
 237              if mask[j]: RT[i+j*8] = res
 238
 239 another mode selection would be CRs not Ints.
 240
 241 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31|
 242 | -- | -- | --- | --- |- |-----|----- | -----|--|
 243 | NN | BA | BB  | BC  |0 |imm  | mask | 101  |0 |
 244
 245     for i in range(4):
  246         if not mask[i]: continue
 247         idx = crregs[BA][i] << 2 |
 248               crregs[BB][i] << 1 |
 249               crregs[BC][i]
 250         crregs[BA][i] = (imm & (1<<idx)) != 0
 251
 252 # bitmask set
 253
 254 based on RV bitmanip singlebit set, instruction format similar to shift
 255 [[isa/fixedshift]].  bmext is actually covered already (shift-with-mask rldicl but only immediate version).
 256 however bitmask-invert is not, and set/clr are not covered, although they can use the same Shift ALU.
 257
 258 bmext (RB) version is not the same as rldicl because bmext is a right shift by RC, where rldicl is a left rotate.  for the immediate version this does not matter, so a bmexti is not required.
  259 for bmrev, however, there is no direct equivalent, and consequently a bmrevi is required.
 260
 261 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31| name  |
 262 | -- | -- | --- | --- | --- | ------- |--| ----- |
 263 | NN | RT | RA  | RB  | RC  | mode 010 |Rc| bm*   |
 264 | NN | RT | RA  | RB  | RC  | 0 1  111 |Rc| bmrev |
 265
 266
 267 ```
 268 uint_xlen_t bmset(RA, RB, sh)
 269 {
 270     int shamt = RB & (XLEN - 1);
 271     mask = (2<<sh)-1;
 272     return RA | (mask << shamt);
 273 }
 274
 275 uint_xlen_t bmclr(RA, RB, sh)
 276 {
 277     int shamt = RB & (XLEN - 1);
 278     mask = (2<<sh)-1;
 279     return RA & ~(mask << shamt);
 280 }
 281
 282 uint_xlen_t bminv(RA, RB, sh)
 283 {
 284     int shamt = RB & (XLEN - 1);
 285     mask = (2<<sh)-1;
 286     return RA ^ (mask << shamt);
 287 }
 288
 289 uint_xlen_t bmext(RA, RB, sh)
 290 {
 291     int shamt = RB & (XLEN - 1);
 292     mask = (2<<sh)-1;
 293     return mask & (RA >> shamt);
 294 }
 295 ```
 296
  297 bitmask extract with reverse.  can be done by bit-reversing all of RA (swapping LSB and MSB ends) and getting bits of RA from the opposite end.
 298
 299 ```
 300 msb = rb[5:0];
 301 rev[0:msb] = ra[msb:0];
 302 rt = ZE(rev[msb:0]);
 303
 304 uint_xlen_t bmextrev(RA, RB, sh)
 305 {
 306     int shamt = (RB & (XLEN - 1));
 307     shamt = (XLEN-1)-shamt;  # shift other end
 308     bra = bitreverse(RA)     # swap LSB-MSB
 309     mask = (2<<sh)-1;
 310     return mask & (bra >> shamt);
 311 }
 312 ```
 313
 314 | 0.5|6.10|11.15|16.20|21.26| 27..30  |31| name   |
 315 | -- | -- | --- | --- | --- | ------- |--| ------ |
 316 | NN | RT | RA  | RB  | sh  | 0   111 |Rc| bmrevi |
 317
 318
 319
 320 # grev
 321
 322 based on RV bitmanip
 323
 324 ```
 325 uint64_t grev64(uint64_t RA, uint64_t RB)
 326 {
 327     uint64_t x = RA;
 328     int shamt = RB & 63;
 329     if (shamt & 1) x = ((x &  0x5555555555555555LL) <<  1) |
 330                         ((x & 0xAAAAAAAAAAAAAAAALL) >>  1);
 331     if (shamt & 2) x = ((x &  0x3333333333333333LL) <<  2) |
 332                         ((x & 0xCCCCCCCCCCCCCCCCLL) >>  2);
 333     if (shamt & 4) x = ((x &  0x0F0F0F0F0F0F0F0FLL) <<  4) |
 334                         ((x & 0xF0F0F0F0F0F0F0F0LL) >>  4);
 335     if (shamt & 8) x = ((x &  0x00FF00FF00FF00FFLL) <<  8) |
 336                         ((x & 0xFF00FF00FF00FF00LL) >>  8);
 337     if (shamt & 16) x = ((x & 0x0000FFFF0000FFFFLL) << 16) |
 338                         ((x & 0xFFFF0000FFFF0000LL) >> 16);
 339     if (shamt & 32) x = ((x & 0x00000000FFFFFFFFLL) << 32) |
 340                         ((x & 0xFFFFFFFF00000000LL) >> 32);
 341     return x;
 342 }
 343
 344 ```
 345
 346 # shuffle / unshuffle
 347
 348 based on RV bitmanip
 349
 350 ```
 351 uint32_t shfl32(uint32_t RA, uint32_t RB)
 352 {
 353     uint32_t x = RA;
 354     int shamt = RB & 15;
 355     if (shamt & 8) x  = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
 356     if (shamt & 4) x  = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
 357     if (shamt & 2) x  = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
 358     if (shamt & 1) x  = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
 359     return x;
 360 }
 361 uint32_t unshfl32(uint32_t RA, uint32_t RB)
 362 {
 363     uint32_t x = RA;
 364     int shamt = RB & 15;
 365     if (shamt & 1) x  = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
 366     if (shamt & 2) x  = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
 367     if (shamt & 4) x  = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
 368     if (shamt & 8) x  = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
 369     return x;
 370 }
 371
 372 uint64_t shuffle64_stage(uint64_t src, uint64_t maskL, uint64_t maskR, int N)
 373 {
 374     uint64_t x = src & ~(maskL | maskR);
 375     x |= ((src << N) & maskL) | ((src >> N) & maskR);
 376     return x;
 377 }
 378 uint64_t shfl64(uint64_t RA, uint64_t RB)
 379 {
 380     uint64_t x = RA;
 381     int shamt = RB & 31;
 382     if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
 383                                            0x00000000ffff0000LL, 16);
 384     if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
 385                                            0x0000ff000000ff00LL, 8);
 386     if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
 387                                            0x00f000f000f000f0LL, 4);
 388     if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
 389                                            0x0c0c0c0c0c0c0c0cLL, 2);
 390     if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
 391                                            0x2222222222222222LL, 1);
 392     return x;
 393 }
 394 uint64_t unshfl64(uint64_t RA, uint64_t RB)
 395 {
 396     uint64_t x = RA;
 397     int shamt = RB & 31;
 398     if (shamt &  1) x = shuffle64_stage(x, 0x4444444444444444LL,
 399                                            0x2222222222222222LL, 1);
 400     if (shamt &  2) x = shuffle64_stage(x, 0x3030303030303030LL,
 401                                            0x0c0c0c0c0c0c0c0cLL, 2);
 402     if (shamt &  4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
 403                                            0x00f000f000f000f0LL, 4);
 404     if (shamt &  8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
 405                                            0x0000ff000000ff00LL, 8);
 406     if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
 407                                            0x00000000ffff0000LL, 16);
 408     return x;
 409 }
 410 ```
 411
 412 # xperm
 413
 414 based on RV bitmanip
 415
 416 ```
 417 uint_xlen_t xperm(uint_xlen_t RA, uint_xlen_t RB, int sz_log2)
 418 {
 419     uint_xlen_t r = 0;
 420     uint_xlen_t sz = 1LL << sz_log2;
 421     uint_xlen_t mask = (1LL << sz) - 1;
 422     for (int i = 0; i < XLEN; i += sz) {
 423         uint_xlen_t pos = ((RB >> i) & mask) << sz_log2;
 424         if (pos < XLEN)
 425             r |= ((RA >> pos) & mask) << i;
 426     }
 427     return r;
 428 }
 429 uint_xlen_t xperm_n (uint_xlen_t RA, uint_xlen_t RB)
 430 {  return xperm(RA, RB, 2); }
 431 uint_xlen_t xperm_b (uint_xlen_t RA, uint_xlen_t RB)
 432 {  return xperm(RA, RB, 3); }
 433 uint_xlen_t xperm_h (uint_xlen_t RA, uint_xlen_t RB)
 434 {  return xperm(RA, RB, 4); }
 435 uint_xlen_t xperm_w (uint_xlen_t RA, uint_xlen_t RB)
 436 {  return xperm(RA, RB, 5); }
 437 ```
 438
 439 # gorc
 440
 441 based on RV bitmanip
 442
 443 ```
 444 uint32_t gorc32(uint32_t RA, uint32_t RB)
 445 {
 446     uint32_t x = RA;
 447     int shamt = RB & 31;
 448     if (shamt & 1) x |= ((x & 0x55555555) << 1)   |  ((x &  0xAAAAAAAA) >> 1);
 449     if (shamt & 2) x |= ((x & 0x33333333) << 2)   |  ((x &  0xCCCCCCCC) >> 2);
 450     if (shamt & 4) x |= ((x & 0x0F0F0F0F) << 4)   |  ((x &  0xF0F0F0F0) >> 4);
 451     if (shamt & 8) x |= ((x & 0x00FF00FF) << 8)   |  ((x &  0xFF00FF00) >> 8);
 452     if (shamt & 16) x |= ((x & 0x0000FFFF) << 16) |  ((x &  0xFFFF0000) >> 16);
 453     return x;
 454 }
 455 uint64_t gorc64(uint64_t RA, uint64_t RB)
 456 {
 457     uint64_t x = RA;
 458     int shamt = RB & 63;
 459     if (shamt & 1) x |= ((x & 0x5555555555555555LL)   <<   1) |
 460                          ((x & 0xAAAAAAAAAAAAAAAALL)  >>  1);
 461     if (shamt & 2) x |= ((x & 0x3333333333333333LL)   <<   2) |
 462                          ((x & 0xCCCCCCCCCCCCCCCCLL)  >>  2);
 463     if (shamt & 4) x |= ((x & 0x0F0F0F0F0F0F0F0FLL)   <<   4) |
 464                          ((x & 0xF0F0F0F0F0F0F0F0LL)  >>  4);
 465     if (shamt & 8) x |= ((x & 0x00FF00FF00FF00FFLL)   <<   8) |
 466                          ((x & 0xFF00FF00FF00FF00LL)  >>  8);
 467     if (shamt & 16) x |= ((x & 0x0000FFFF0000FFFFLL)  << 16) |
 468                          ((x & 0xFFFF0000FFFF0000LL)  >> 16);
 469     if (shamt & 32) x |= ((x & 0x00000000FFFFFFFFLL)  << 32) |
 470                          ((x & 0xFFFFFFFF00000000LL)  >> 32);
 471     return x;
 472 }
 473
 474 ```
 475
 476 # cmix
 477
 478 based on RV bitmanip, covered by ternary bitops
 479
 480 ```
 481 uint_xlen_t cmix(uint_xlen_t RA, uint_xlen_t RB, uint_xlen_t RC) {
 482     return (RA & RB) | (RC & ~RB);
 483 }
 484 ```
 485
 486 # carryless mul
 487
 488 based on RV bitmanip
 489 see https://en.wikipedia.org/wiki/CLMUL_instruction_set
 490
 491 ```
 492 uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
 493 {
 494     uint_xlen_t x = 0;
 495     for (int i = 0; i < XLEN; i++)
 496         if ((RB >> i) & 1)
 497             x ^= RA << i;
 498     return x;
 499 }
 500 uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)
 501 {
 502     uint_xlen_t x = 0;
 503     for (int i = 1; i < XLEN; i++)
 504         if ((RB >> i) & 1)
 505             x ^= RA >> (XLEN-i);
 506     return x;
 507 }
 508 uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)
 509 {
 510     uint_xlen_t x = 0;
 511     for (int i = 0; i < XLEN; i++)
 512         if ((RB >> i) & 1)
 513             x ^= RA >> (XLEN-i-1);
 514     return x;
 515 }
 516 ```
 517 # Galois Field
 518
 519 see <https://courses.csail.mit.edu/6.857/2016/files/ffield.py>
 520
 521 ## Multiply
 522
 523 this requires 3 parameters and a "degree"
 524
 525     RT = GFMUL(RA, RB, gfdegree, modulo=RC)
 526
 527 realistically with the degree also needing to be an immediate it should be brought down to an overwrite version:
 528
 529     RS = GFMUL(RS, RA, gfdegree, modulo=RB)
 530     RS = GFMUL(RS, RA, gfdegree=RC, modulo=RB)
 531
 532 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31|
 533 | -- | -- | --- | --- | --- | ------- |--|
 534 | NN | RS | RA  | RB  | deg | 00  011 |Rc|
 535 | NN | RS | RA  | RB  | RC  | 11  011 |Rc|
 536
 537 where the SimpleV variant may override RS-as-src differently from RS-as-dest
 538
 539
 540
 541 ```
 542 from functools import reduce
 543
 544 # constants used in the multGF2 function
 545 mask1 = mask2 = polyred = None
 546
 547 def setGF2(degree, irPoly):
 548     """Define parameters of binary finite field GF(2^m)/g(x)
 549        - degree: extension degree of binary field
 550        - irPoly: coefficients of irreducible polynomial g(x)
 551     """
 552     def i2P(sInt):
 553         """Convert an integer into a polynomial"""
 554         return [(sInt >> i) & 1
 555                 for i in reversed(range(sInt.bit_length()))]
 556
 557     global mask1, mask2, polyred
 558     mask1 = mask2 = 1 << degree
 559     mask2 -= 1
 560     polyred = reduce(lambda x, y: (x << 1) + y, i2P(irPoly)[1:])
 561
 562 def multGF2(p1, p2):
 563     """Multiply two polynomials in GF(2^m)/g(x)"""
 564     p = 0
 565     while p2:
 566         if p2 & 1:
 567             p ^= p1
 568         p1 <<= 1
 569         if p1 & mask1:
 570             p1 ^= polyred
 571         p2 >>= 1
 572     return p & mask2
 573
 574 if __name__ == "__main__":
 575
 576     # Define binary field GF(2^3)/x^3 + x + 1
 577     setGF2(3, 0b1011)
 578
 579     # Evaluate the product (x^2 + x + 1)(x^2 + 1)
 580     print("{:02x}".format(multGF2(0b111, 0b101)))
 581
 582     # Define binary field GF(2^8)/x^8 + x^4 + x^3 + x + 1
 583     # (used in the Advanced Encryption Standard-AES)
 584     setGF2(8, 0b100011011)
 585
 586     # Evaluate the product (x^7)(x^7 + x + 1)
 587     print("{:02x}".format(multGF2(0b10000000, 0b10000011)))
 588 ```
 589 ## GF add
 590
 591     RS = GFADDI(RS, RA|0, gfdegree, modulo=RB)
 592     RS = GFADD(RS, RA|0, gfdegree=RC, modulo=RB)
 593
 594 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31| name  |
 595 | -- | -- | --- | --- | --- | ------- |--| ----- |
 596 | NN | RS | RA  | RB  | deg | 0 1  011 |Rc| gfaddi |
 597 | NN | RS | RA  | RB  | RC  | 1 1  111 |Rc| gfadd |
 598
 599 GFMOD is a pseudo-op where RA=0
 600
 601 ## gf invert
 602
 603 ```
 604 def gf_degree(a) :
 605   res = 0
 606   a >>= 1
 607   while (a != 0) :
 608     a >>= 1;
 609     res += 1;
 610   return res
 611
 612 def gf_invert(a, mod=0x1B) :
 613   v = mod
 614   g1 = 1
 615   g2 = 0
 616   j = gf_degree(a) - 8
 617
 618   while (a != 1) :
 619     if (j < 0) :
 620       a, v = v, a
 621       g1, g2 = g2, g1
 622       j = -j
 623
 624     a ^= v << j
 625     g1 ^= g2 << j
 626
 627     a %= 256  # Emulating 8-bit overflow
 628     g1 %= 256 # Emulating 8-bit overflow
 629
 630     j = gf_degree(a) - gf_degree(v)
 631
 632   return g1
 633 ```
 634
 635 # bitmatrix
 636
 637 ```
 638 uint64_t bmatflip(uint64_t RA)
 639 {
 640     uint64_t x = RA;
 641     x = shfl64(x, 31);
 642     x = shfl64(x, 31);
 643     x = shfl64(x, 31);
 644     return x;
 645 }
 646 uint64_t bmatxor(uint64_t RA, uint64_t RB)
 647 {
 648     // transpose of RB
 649     uint64_t RBt = bmatflip(RB);
 650     uint8_t u[8]; // rows of RA
 651     uint8_t v[8]; // cols of RB
 652     for (int i = 0; i < 8; i++) {
 653         u[i] = RA >> (i*8);
 654         v[i] = RBt >> (i*8);
 655     }
 656     uint64_t x = 0;
 657     for (int i = 0; i < 64; i++) {
 658         if (pcnt(u[i / 8] & v[i % 8]) & 1)
 659             x |= 1LL << i;
 660     }
 661     return x;
 662 }
 663 uint64_t bmator(uint64_t RA, uint64_t RB)
 664 {
 665     // transpose of RB
 666     uint64_t RBt = bmatflip(RB);
 667     uint8_t u[8]; // rows of RA
 668     uint8_t v[8]; // cols of RB
 669     for (int i = 0; i < 8; i++) {
 670         u[i] = RA >> (i*8);
 671         v[i] = RBt >> (i*8);
 672     }
 673     uint64_t x = 0;
 674     for (int i = 0; i < 64; i++) {
 675         if ((u[i / 8] & v[i % 8]) != 0)
 676             x |= 1LL << i;
 677     }
 678     return x;
 679 }
 680
 681 ```