openpower/sv/bitmanip.mdwn

   1 [[!tag standards]]
   2
   3 # bitmanipulation
   4
   5 **DRAFT STATUS**
   6
this extension amalgamates bitmanipulation primitives from many sources, including RISC-V bitmanip, Packed SIMD, AVX-512 and OpenPOWER VSX.  Vectorisation and SIMD are removed: these are straight scalar (element) operations.  Vectorisation Context is provided by [[openpower/sv]].
   8
   9 ternaryv is experimental and is the only operation that may be considered a "Packed SIMD".  It is added as a variant of the already well-justified ternary operation (done in AVX512 as an immediate only) "because it looks fun". As it is based on the LUT4 concept it will allow accelerated emulation of FPGAs.  Other vendors of ISAs are buying FPGA companies to achieve a similar objective.
  10
  11 general-purpose Galois Field operations are added so as to avoid huge opcode proliferation across many areas of Computer Science.  however for convenience and also to avoid setup costs, some of the more common operations (clmul, crc32) are also added.  The expectation is that these operations would all be covered by the same pipeline.
  12
  13 # summary
  14
  15 minor opcode allocation
  16
  17     |  28.30 |31| name      |
  18     | ------ |--| --------- |
  19     |   00   |Rc| ternaryi  |
  20     |  001   |Rc| ternary   |
  21     |  010   |Rc| bitmask   |
  22     |  011   |Rc| gf*       |
  23     |  101   |1 | ternaryv  |
  24     |  101   |0 | ternarycr |
  25     |  110   |Rc| 1/2-op    |
  26     |  111   |Rc| 3-op      |
  27
  28 1-op and variants
  29
  30 | dest | src1 | subop | op       |
  31 | ---- | ---- | ----- | -------- |
  32 | RT   | RA   | ..    | bmatflip |
  33
  34 2-op and variants
  35
  36 | dest | src1 | src2 | subop | op       |
  37 | ---- | ---- | ---- | ----- | -------- |
  38 | RT   | RA   | RB   | or    | bmatflip |
  39 | RT   | RA   | RB   | xor   | bmatflip |
  40 | RT   | RA   | RB   | bdep  | dep/ext  |
  41 | RT   | RA   | RB   | bext  | dep/ext  |
  42 | RT   | RA   | RB   |       | grev  |
  43 | RT   | RA   | RB   |       | clmul*  |
  44 | RT   | RA   | RB   |       | gorc |
  45 | RT   | RA   | RB   | shuf  | shuffle |
  46 | RT   | RA   | RB   | unshuf| shuffle |
  47 | RT   | RA   | RB   | width | xperm  |
  48 | RT   | RA   | RB   | type | minmax |
  49 | RT   | RA   | RB   |  |  |
  50 | RT   | RA   | RB   |  |  |
  51 | RT   | RA   | RB   |  |  |
  52
  53 3 ops
  54
  55 * bitmask set/extract
  56 * ternary bitops
  57 * GF
  58
  59 | 0.5|6.10|11.15|16.20|21..25 | 26....30 |31| name |
  60 | -- | -- | --- | --- | ----- | -------- |--| ------ |
  61 | NN | RT | RA  | RB  | RC    | mode 001 |Rc| ternary |
  62 | NN | RT | RA  | RB  | im0-4 | im5-7 00 |Rc| ternaryi |
  63 | NN | RS | RA  | RB  | RC    | 00  011  |Rc| gfmul |
  64 | NN | RS | RA  | RB  | RC    | 01  011  |Rc| gfadd |
  65 | NN | RT | RA  | RB  | deg   | 10  011  |Rc| gfinv |
  66 | NN | RS | RA  | RB  | deg   | 11  011  |Rc| gfmuli |
  67 | NN | RS | RA  | RB  | deg   | 11  111  |Rc| gfaddi |
  68
  69 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31| name |
  70 | -- | -- | --- | ----- | ---- | ----- |--| ------ |
  71 | NN | RT | RA  | imm   | mask | 101   |1 | ternaryv |
  72
  73 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31| name |
  74 | -- | -- | --- | --- |- |-----|----- | -----|--| -------|
  75 | NN | BA | BB  | BC  |0 |imm  | mask | 101  |0 | ternarycr |
  76
  77 ops
  78
  79 | 0.5|6.10|11.15|16.20| 21.22 | 23 | 24....30 |31| name |
  80 | -- | -- | --- | --- | ----- | -- | -------- |--| ---- |
  81 | NN | RA | RB  |     |       | 0  | 0000 110 |Rc| rsvd   |
  82 | NN | RA | RB  | RC  | itype | 1  | 0000 110 |Rc| xperm |
  83 | NN | RA | RB  | RC  | itype | 0  | 0100 110 |Rc| minmax |
  84 | NN | RA | RB  |     |       | 1  | 0100 110 |Rc| rsvd |
  85 | NN | RA | RB  | sh  | itype | SH | 1000 110 |Rc| bmopsi |
  86 | NN | RA | RB  |     |       |    | 1100 110 |Rc| rsvd |
  87 | NN | RA | RB  |     |       |    | 1100 110 |Rc| rsvd |
  88 | NN | RA | RB  |     |       |    | 1100 110 |Rc| rsvd |
  89 | NN | RA | RB  |     |       |    | 1100 110 |Rc| rsvd |
  90 | NN | RA | RB  |     |       | 0  | 0001 110 |Rc| rsvd |
  91 | NN | RA | RB  |     |       | 0  | 0101 110 |Rc| rsvd |
  92 | NN | RA | RB  | RC  | 00    | 0  | 0010 110 |Rc| gorc |
  93 | NN | RA | RB  | sh  | 00    | SH | 1010 110 |Rc| gorci |
  94 | NN | RA | RB  | RC  | 00    | 0  | 0110 110 |Rc| gorcw |
  95 | NN | RA | RB  | sh  | 00    | 0  | 1110 110 |Rc| gorcwi |
  96 | NN | RA | RB  | RC  | 00    | 1  | 1110 110 |Rc| bmator  |
  97 | NN | RA | RB  | RC  | 01    | 0  | 0010 110 |Rc| grev |
  98 | NN | RA | RB  | RC  | 01    | 1  | 0010 110 |Rc| clmul |
  99 | NN | RA | RB  | sh  | 01    | SH | 1010 110 |Rc| grevi |
 100 | NN | RA | RB  | RC  | 01    | 0  | 0110 110 |Rc| grevw |
 101 | NN | RA | RB  | sh  | 01    | 0  | 1110 110 |Rc| grevwi |
 102 | NN | RA | RB  | RC  | 01    | 1  | 1110 110 |Rc| bmatxor   |
 103 | NN | RA | RB  | RC  | 10    | 0  | 0010 110 |Rc| shfl |
 104 | NN | RA | RB  | sh  | 10    | SH | 1010 110 |Rc| shfli |
 105 | NN | RA | RB  | RC  | 10    | 0  | 0110 110 |Rc| shflw |
 106 | NN | RA | RB  | RC  | 10    | 0  | 1110 110 |Rc| bdep   |
 107 | NN | RA | RB  | RC  | 10    | 1  | 1110 110 |Rc| bext  |
 108 | NN | RA | RB  | RC  | 11    | 0  | 1110 110 |Rc| clmulr  |
 109 | NN | RA | RB  | RC  | 11    | 1  | 1110 110 |Rc| clmulh  |
 110 | NN | RA | RB  |     |       |    | NN11 110 |Rc| rsvd  |
 111
 112 # count leading/trailing zeros with mask
 113
 114 in v3.1 p105
 115
 116 ```
count ← 0
do i = 0 to 63
    if (RB)[i] = 1 then do
        if (RS)[i] = 1 then break
        count ← count + 1
    end
end
RA ← EXTZ64(count)
 121 ```
 122
 123 # bit to byte permute
 124
 125 similar to matrix permute in RV bitmanip, which has XOR and OR variants
 126
 127     do j = 0 to 7
 128       do k = 0 to 7
 129          b = VSR[VRB+32].dword[i].byte[k].bit[j]
 130          VSR[VRT+32].dword[i].byte[j].bit[k] = b
 131
 132 #  bit deposit
 133
vpdepd VRT,VRA,VRB, identical to RV bitmanip bdep, found already in v3.1 p106
 135
 136     do while(m < 64)
 137        if VSR[VRB+32].dword[i].bit[63-m]=1 then do
 138           result = VSR[VRA+32].dword[i].bit[63-k]
 139           VSR[VRT+32].dword[i].bit[63-m] = result
 140           k = k + 1
 141        m = m + 1
 142
 143 ```
 144
 145 uint_xlen_t bdep(uint_xlen_t RA, uint_xlen_t RB)
 146 {
 147     uint_xlen_t r = 0;
 148     for (int i = 0, j = 0; i < XLEN; i++)
 149         if ((RB >> i) & 1) {
 150             if ((RA >> j) & 1)
 151                 r |= uint_xlen_t(1) << i;
 152             j++;
 153         }
 154     return r;
 155 }
 156
 157 ```
 158
 159 # bit extract
 160
 161 other way round: identical to RV bext, found in v3.1 p196
 162
 163 ```
 164 uint_xlen_t bext(uint_xlen_t RA, uint_xlen_t RB)
 165 {
 166     uint_xlen_t r = 0;
 167     for (int i = 0, j = 0; i < XLEN; i++)
 168         if ((RB >> i) & 1) {
 169             if ((RA >> i) & 1)
 170                 r |= uint_xlen_t(1) << j;
 171             j++;
 172         }
 173     return r;
 174 }
 175 ```
 176
 177 # centrifuge
 178
 179 found in v3.1 p106
 180
 181 ```
ptr0 ← 0
ptr1 ← 0
do i = 0 to 63
    if (RB)[i] = 0 then do
        result[ptr0] ← (RS)[i]
        ptr0 ← ptr0 + 1
    end
    if (RB)[63-i] = 1 then do
        result[63-ptr1] ← (RS)[63-i]
        ptr1 ← ptr1 + 1
    end
end
RA ← result
 187 ```
 188
 189 # int min/max
 190
signed and unsigned min/max for integer.  this is sort-of partly synthesisable in [[sv/svp64]] with pred-result as long as the dest reg is one of the sources, but not both signed and unsigned.  when the dest is also one of the sources and the mv fails due to the CR bittest failing this will only overwrite the dest where the src is greater (or less).
 192
 193 signed/unsigned min/max gives more flexibility.
 194
 195 ```
 196 uint_xlen_t min(uint_xlen_t rs1, uint_xlen_t rs2)
 197 { return (int_xlen_t)rs1 < (int_xlen_t)rs2 ? rs1 : rs2;
 198 }
 199 uint_xlen_t max(uint_xlen_t rs1, uint_xlen_t rs2)
 200 { return (int_xlen_t)rs1 > (int_xlen_t)rs2 ? rs1 : rs2;
 201 }
 202 uint_xlen_t minu(uint_xlen_t rs1, uint_xlen_t rs2)
 203 { return rs1 < rs2 ? rs1 : rs2;
 204 }
 205 uint_xlen_t maxu(uint_xlen_t rs1, uint_xlen_t rs2)
 206 { return rs1 > rs2 ? rs1 : rs2;
 207 }
 208 ```
 209
 210
 211 # ternary bitops
 212
 213 Similar to FPGA LUTs: for every bit perform a lookup into a table using an 8bit immediate, or in another register
 214
 215 | 0.5|6.10|11.15|16.20| 21..25| 26..30   |31|
 216 | -- | -- | --- | --- | ----- | -------- |--|
 217 | NN | RT | RA  | RB  | im0-4 | im5-7 00 |Rc|
 218
 219     for i in range(64):
 220         idx = RT[i] << 2 | RA[i] << 1 | RB[i]
 221         RT[i] = (imm & (1<<idx)) != 0
 222
 223 bits 21..22 may be used to specify a mode, such as treating the whole integer zero/nonzero and putting 1/0 in the result, rather than bitwise test.
 224
 225 a 4 operand variant which becomes more along the lines of an FPGA:
 226
 227 | 0.5|6.10|11.15|16.20|21.25| 26...30  |31|
 228 | -- | -- | --- | --- | --- | -------- |--|
 229 | NN | RT | RA  | RB  | RC  | mode 001 |Rc|
 230
 231     for i in range(64):
 232         idx = RT[i] << 2 | RA[i] << 1 | RB[i]
 233         RT[i] = (RC & (1<<idx)) != 0
 234
 235 mode (2 bit) may be used to do inversion of ordering, similar to carryless mul,
 236 3 modes.
 237
 238 also, another possible variant involving swizzle and vec4:
 239
 240 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
 241 | -- | -- | --- | ----- | ---- | ----- |--|
 242 | NN | RT | RA  | imm   | mask | 101   |1 |
 243
 244     for i in range(8):
 245         idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
 246         res = (imm & (1<<idx)) != 0
 247         for j in range(3):
 248              if mask[j]: RT[i+j*8] = res
 249
 250 another mode selection would be CRs not Ints.
 251
 252 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31|
 253 | -- | -- | --- | --- |- |-----|----- | -----|--|
 254 | NN | BA | BB  | BC  |0 |imm  | mask | 101  |0 |
 255
 256     for i in range(4):
        if not mask[i]: continue
 258         idx = crregs[BA][i] << 2 |
 259               crregs[BB][i] << 1 |
 260               crregs[BC][i]
 261         crregs[BA][i] = (imm & (1<<idx)) != 0
 262
 263 # bitmask set
 264
 265 based on RV bitmanip singlebit set, instruction format similar to shift
 266 [[isa/fixedshift]].  bmext is actually covered already (shift-with-mask rldicl but only immediate version).
 267 however bitmask-invert is not, and set/clr are not covered, although they can use the same Shift ALU.
 268
 269 bmext (RB) version is not the same as rldicl because bmext is a right shift by RC, where rldicl is a left rotate.  for the immediate version this does not matter, so a bmexti is not required.
for bmrev, however, there is no direct equivalent and consequently a bmrevi is required.
 271
 272 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31| name  |
 273 | -- | -- | --- | --- | --- | ------- |--| ----- |
 274 | NN | RT | RA  | RB  | RC  | mode 010 |Rc| bm*   |
 275 | NN | RT | RA  | RB  | RC  | 0 1  111 |Rc| bmrev |
 276
 277
 278 ```
 279 uint_xlen_t bmset(RA, RB, sh)
 280 {
 281     int shamt = RB & (XLEN - 1);
 282     mask = (2<<sh)-1;
 283     return RA | (mask << shamt);
 284 }
 285
 286 uint_xlen_t bmclr(RA, RB, sh)
 287 {
 288     int shamt = RB & (XLEN - 1);
 289     mask = (2<<sh)-1;
 290     return RA & ~(mask << shamt);
 291 }
 292
 293 uint_xlen_t bminv(RA, RB, sh)
 294 {
 295     int shamt = RB & (XLEN - 1);
 296     mask = (2<<sh)-1;
 297     return RA ^ (mask << shamt);
 298 }
 299
 300 uint_xlen_t bmext(RA, RB, sh)
 301 {
 302     int shamt = RB & (XLEN - 1);
 303     mask = (2<<sh)-1;
 304     return mask & (RA >> shamt);
 305 }
 306 ```
 307
bitmask extract with reverse.  can be done by bit-reversing all of RA and getting bits of RA from the opposite end.
 309
 310 ```
 311 msb = rb[5:0];
 312 rev[0:msb] = ra[msb:0];
 313 rt = ZE(rev[msb:0]);
 314
 315 uint_xlen_t bmextrev(RA, RB, sh)
 316 {
 317     int shamt = (RB & (XLEN - 1));
 318     shamt = (XLEN-1)-shamt;  # shift other end
 319     bra = bitreverse(RA)     # swap LSB-MSB
 320     mask = (2<<sh)-1;
 321     return mask & (bra >> shamt);
 322 }
 323 ```
 324
 325 | 0.5|6.10|11.15|16.20|21.26| 27..30  |31| name   |
 326 | -- | -- | --- | --- | --- | ------- |--| ------ |
 327 | NN | RT | RA  | RB  | sh  | 0   111 |Rc| bmrevi |
 328
 329
 330
 331 # grev
 332
 333 based on RV bitmanip
 334
 335 ```
 336 uint64_t grev64(uint64_t RA, uint64_t RB)
 337 {
 338     uint64_t x = RA;
 339     int shamt = RB & 63;
 340     if (shamt & 1) x = ((x &  0x5555555555555555LL) <<  1) |
 341                         ((x & 0xAAAAAAAAAAAAAAAALL) >>  1);
 342     if (shamt & 2) x = ((x &  0x3333333333333333LL) <<  2) |
 343                         ((x & 0xCCCCCCCCCCCCCCCCLL) >>  2);
 344     if (shamt & 4) x = ((x &  0x0F0F0F0F0F0F0F0FLL) <<  4) |
 345                         ((x & 0xF0F0F0F0F0F0F0F0LL) >>  4);
 346     if (shamt & 8) x = ((x &  0x00FF00FF00FF00FFLL) <<  8) |
 347                         ((x & 0xFF00FF00FF00FF00LL) >>  8);
 348     if (shamt & 16) x = ((x & 0x0000FFFF0000FFFFLL) << 16) |
 349                         ((x & 0xFFFF0000FFFF0000LL) >> 16);
 350     if (shamt & 32) x = ((x & 0x00000000FFFFFFFFLL) << 32) |
 351                         ((x & 0xFFFFFFFF00000000LL) >> 32);
 352     return x;
 353 }
 354
 355 ```
 356
 357 # shuffle / unshuffle
 358
 359 based on RV bitmanip
 360
 361 ```
 362 uint32_t shfl32(uint32_t RA, uint32_t RB)
 363 {
 364     uint32_t x = RA;
 365     int shamt = RB & 15;
 366     if (shamt & 8) x  = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
 367     if (shamt & 4) x  = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
 368     if (shamt & 2) x  = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
 369     if (shamt & 1) x  = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
 370     return x;
 371 }
 372 uint32_t unshfl32(uint32_t RA, uint32_t RB)
 373 {
 374     uint32_t x = RA;
 375     int shamt = RB & 15;
 376     if (shamt & 1) x  = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
 377     if (shamt & 2) x  = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
 378     if (shamt & 4) x  = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
 379     if (shamt & 8) x  = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
 380     return x;
 381 }
 382
 383 uint64_t shuffle64_stage(uint64_t src, uint64_t maskL, uint64_t maskR, int N)
 384 {
 385     uint64_t x = src & ~(maskL | maskR);
 386     x |= ((src << N) & maskL) | ((src >> N) & maskR);
 387     return x;
 388 }
 389 uint64_t shfl64(uint64_t RA, uint64_t RB)
 390 {
 391     uint64_t x = RA;
 392     int shamt = RB & 31;
 393     if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
 394                                            0x00000000ffff0000LL, 16);
 395     if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
 396                                            0x0000ff000000ff00LL, 8);
 397     if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
 398                                            0x00f000f000f000f0LL, 4);
 399     if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
 400                                            0x0c0c0c0c0c0c0c0cLL, 2);
 401     if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
 402                                            0x2222222222222222LL, 1);
 403     return x;
 404 }
 405 uint64_t unshfl64(uint64_t RA, uint64_t RB)
 406 {
 407     uint64_t x = RA;
 408     int shamt = RB & 31;
 409     if (shamt &  1) x = shuffle64_stage(x, 0x4444444444444444LL,
 410                                            0x2222222222222222LL, 1);
 411     if (shamt &  2) x = shuffle64_stage(x, 0x3030303030303030LL,
 412                                            0x0c0c0c0c0c0c0c0cLL, 2);
 413     if (shamt &  4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
 414                                            0x00f000f000f000f0LL, 4);
 415     if (shamt &  8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
 416                                            0x0000ff000000ff00LL, 8);
 417     if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
 418                                            0x00000000ffff0000LL, 16);
 419     return x;
 420 }
 421 ```
 422
 423 # xperm
 424
 425 based on RV bitmanip
 426
 427 ```
 428 uint_xlen_t xperm(uint_xlen_t RA, uint_xlen_t RB, int sz_log2)
 429 {
 430     uint_xlen_t r = 0;
 431     uint_xlen_t sz = 1LL << sz_log2;
 432     uint_xlen_t mask = (1LL << sz) - 1;
 433     for (int i = 0; i < XLEN; i += sz) {
 434         uint_xlen_t pos = ((RB >> i) & mask) << sz_log2;
 435         if (pos < XLEN)
 436             r |= ((RA >> pos) & mask) << i;
 437     }
 438     return r;
 439 }
 440 uint_xlen_t xperm_n (uint_xlen_t RA, uint_xlen_t RB)
 441 {  return xperm(RA, RB, 2); }
 442 uint_xlen_t xperm_b (uint_xlen_t RA, uint_xlen_t RB)
 443 {  return xperm(RA, RB, 3); }
 444 uint_xlen_t xperm_h (uint_xlen_t RA, uint_xlen_t RB)
 445 {  return xperm(RA, RB, 4); }
 446 uint_xlen_t xperm_w (uint_xlen_t RA, uint_xlen_t RB)
 447 {  return xperm(RA, RB, 5); }
 448 ```
 449
 450 # gorc
 451
 452 based on RV bitmanip
 453
 454 ```
 455 uint32_t gorc32(uint32_t RA, uint32_t RB)
 456 {
 457     uint32_t x = RA;
 458     int shamt = RB & 31;
 459     if (shamt & 1) x |= ((x & 0x55555555) << 1)   |  ((x &  0xAAAAAAAA) >> 1);
 460     if (shamt & 2) x |= ((x & 0x33333333) << 2)   |  ((x &  0xCCCCCCCC) >> 2);
 461     if (shamt & 4) x |= ((x & 0x0F0F0F0F) << 4)   |  ((x &  0xF0F0F0F0) >> 4);
 462     if (shamt & 8) x |= ((x & 0x00FF00FF) << 8)   |  ((x &  0xFF00FF00) >> 8);
 463     if (shamt & 16) x |= ((x & 0x0000FFFF) << 16) |  ((x &  0xFFFF0000) >> 16);
 464     return x;
 465 }
 466 uint64_t gorc64(uint64_t RA, uint64_t RB)
 467 {
 468     uint64_t x = RA;
 469     int shamt = RB & 63;
 470     if (shamt & 1) x |= ((x & 0x5555555555555555LL)   <<   1) |
 471                          ((x & 0xAAAAAAAAAAAAAAAALL)  >>  1);
 472     if (shamt & 2) x |= ((x & 0x3333333333333333LL)   <<   2) |
 473                          ((x & 0xCCCCCCCCCCCCCCCCLL)  >>  2);
 474     if (shamt & 4) x |= ((x & 0x0F0F0F0F0F0F0F0FLL)   <<   4) |
 475                          ((x & 0xF0F0F0F0F0F0F0F0LL)  >>  4);
 476     if (shamt & 8) x |= ((x & 0x00FF00FF00FF00FFLL)   <<   8) |
 477                          ((x & 0xFF00FF00FF00FF00LL)  >>  8);
 478     if (shamt & 16) x |= ((x & 0x0000FFFF0000FFFFLL)  << 16) |
 479                          ((x & 0xFFFF0000FFFF0000LL)  >> 16);
 480     if (shamt & 32) x |= ((x & 0x00000000FFFFFFFFLL)  << 32) |
 481                          ((x & 0xFFFFFFFF00000000LL)  >> 32);
 482     return x;
 483 }
 484
 485 ```
 486
 487 # cmix
 488
 489 based on RV bitmanip, covered by ternary bitops
 490
 491 ```
 492 uint_xlen_t cmix(uint_xlen_t RA, uint_xlen_t RB, uint_xlen_t RC) {
 493     return (RA & RB) | (RC & ~RB);
 494 }
 495 ```
 496
 497 # carryless mul
 498
 499 based on RV bitmanip
 500 see https://en.wikipedia.org/wiki/CLMUL_instruction_set
 501
 502 ```
 503 uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
 504 {
 505     uint_xlen_t x = 0;
 506     for (int i = 0; i < XLEN; i++)
 507         if ((RB >> i) & 1)
 508             x ^= RA << i;
 509     return x;
 510 }
 511 uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)
 512 {
 513     uint_xlen_t x = 0;
 514     for (int i = 1; i < XLEN; i++)
 515         if ((RB >> i) & 1)
 516             x ^= RA >> (XLEN-i);
 517     return x;
 518 }
 519 uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)
 520 {
 521     uint_xlen_t x = 0;
 522     for (int i = 0; i < XLEN; i++)
 523         if ((RB >> i) & 1)
 524             x ^= RA >> (XLEN-i-1);
 525     return x;
 526 }
 527 ```
 528 # Galois Field
 529
 530 see <https://courses.csail.mit.edu/6.857/2016/files/ffield.py>
 531
 532 ## Multiply
 533
 534 this requires 3 parameters and a "degree"
 535
 536     RT = GFMUL(RA, RB, gfdegree, modulo=RC)
 537
 538 realistically with the degree also needing to be an immediate it should be brought down to an overwrite version:
 539
 540     RS = GFMUL(RS, RA, gfdegree, modulo=RB)
 541     RS = GFMUL(RS, RA, gfdegree=RC, modulo=RB)
 542
 543 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31|
 544 | -- | -- | --- | --- | --- | ------- |--|
 545 | NN | RS | RA  | RB  | deg | 00  011 |Rc|
 546 | NN | RS | RA  | RB  | RC  | 11  011 |Rc|
 547
 548 where the SimpleV variant may override RS-as-src differently from RS-as-dest
 549
 550
 551
 552 ```
 553 from functools import reduce
 554
 555 # constants used in the multGF2 function
 556 mask1 = mask2 = polyred = None
 557
 558 def setGF2(degree, irPoly):
 559     """Define parameters of binary finite field GF(2^m)/g(x)
 560        - degree: extension degree of binary field
 561        - irPoly: coefficients of irreducible polynomial g(x)
 562     """
 563     def i2P(sInt):
 564         """Convert an integer into a polynomial"""
 565         return [(sInt >> i) & 1
 566                 for i in reversed(range(sInt.bit_length()))]
 567
 568     global mask1, mask2, polyred
 569     mask1 = mask2 = 1 << degree
 570     mask2 -= 1
 571     polyred = reduce(lambda x, y: (x << 1) + y, i2P(irPoly)[1:])
 572
 573 def multGF2(p1, p2):
 574     """Multiply two polynomials in GF(2^m)/g(x)"""
 575     p = 0
 576     while p2:
 577         if p2 & 1:
 578             p ^= p1
 579         p1 <<= 1
 580         if p1 & mask1:
 581             p1 ^= polyred
 582         p2 >>= 1
 583     return p & mask2
 584
 585 if __name__ == "__main__":
 586
 587     # Define binary field GF(2^3)/x^3 + x + 1
 588     setGF2(3, 0b1011)
 589
 590     # Evaluate the product (x^2 + x + 1)(x^2 + 1)
 591     print("{:02x}".format(multGF2(0b111, 0b101)))
 592
 593     # Define binary field GF(2^8)/x^8 + x^4 + x^3 + x + 1
 594     # (used in the Advanced Encryption Standard-AES)
 595     setGF2(8, 0b100011011)
 596
 597     # Evaluate the product (x^7)(x^7 + x + 1)
 598     print("{:02x}".format(multGF2(0b10000000, 0b10000011)))
 599 ```
 600 ## GF add
 601
 602     RS = GFADDI(RS, RA|0, gfdegree, modulo=RB)
 603     RS = GFADD(RS, RA|0, gfdegree=RC, modulo=RB)
 604
 605 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31| name  |
 606 | -- | -- | --- | --- | --- | ------- |--| ----- |
 607 | NN | RS | RA  | RB  | deg | 0 1  011 |Rc| gfaddi |
 608 | NN | RS | RA  | RB  | RC  | 1 1  111 |Rc| gfadd |
 609
 610 GFMOD is a pseudo-op where RA=0
 611
 612 ## gf invert
 613
 614 ```
 615 def gf_degree(a) :
 616   res = 0
 617   a >>= 1
 618   while (a != 0) :
 619     a >>= 1;
 620     res += 1;
 621   return res
 622
 623 def gf_invert(a, mod=0x1B) :
 624   v = mod
 625   g1 = 1
 626   g2 = 0
 627   j = gf_degree(a) - 8
 628
 629   while (a != 1) :
 630     if (j < 0) :
 631       a, v = v, a
 632       g1, g2 = g2, g1
 633       j = -j
 634
 635     a ^= v << j
 636     g1 ^= g2 << j
 637
 638     a %= 256  # Emulating 8-bit overflow
 639     g1 %= 256 # Emulating 8-bit overflow
 640
 641     j = gf_degree(a) - gf_degree(v)
 642
 643   return g1
 644 ```
 645
 646 # bitmatrix
 647
 648 ```
 649 uint64_t bmatflip(uint64_t RA)
 650 {
 651     uint64_t x = RA;
 652     x = shfl64(x, 31);
 653     x = shfl64(x, 31);
 654     x = shfl64(x, 31);
 655     return x;
 656 }
 657 uint64_t bmatxor(uint64_t RA, uint64_t RB)
 658 {
 659     // transpose of RB
 660     uint64_t RBt = bmatflip(RB);
 661     uint8_t u[8]; // rows of RA
 662     uint8_t v[8]; // cols of RB
 663     for (int i = 0; i < 8; i++) {
 664         u[i] = RA >> (i*8);
 665         v[i] = RBt >> (i*8);
 666     }
 667     uint64_t x = 0;
 668     for (int i = 0; i < 64; i++) {
 669         if (pcnt(u[i / 8] & v[i % 8]) & 1)
 670             x |= 1LL << i;
 671     }
 672     return x;
 673 }
 674 uint64_t bmator(uint64_t RA, uint64_t RB)
 675 {
 676     // transpose of RB
 677     uint64_t RBt = bmatflip(RB);
 678     uint8_t u[8]; // rows of RA
 679     uint8_t v[8]; // cols of RB
 680     for (int i = 0; i < 8; i++) {
 681         u[i] = RA >> (i*8);
 682         v[i] = RBt >> (i*8);
 683     }
 684     uint64_t x = 0;
 685     for (int i = 0; i < 64; i++) {
 686         if ((u[i / 8] & v[i % 8]) != 0)
 687             x |= 1LL << i;
 688     }
 689     return x;
 690 }
 691
 692 ```