openpower/sv/bitmanip.mdwn

   1 [[!tag standards]]
   2
   3 # summary
   4
   5 minor opcode allocation
   6
   7     |  28.30 |31| name      |
   8     | ------ |--| --------- |
   9     |  -00   |Rc| ternaryi  |
  10     |  001   |Rc| ternary   |
  11     |  010   |Rc| bitmask   |
  12     |  011   |Rc| gf*       |
  13     |  101   |1 | ternaryv  |
  14     |  101   |0 | ternarycr |
  15     |  110   |Rc| 1/2-op    |
  16     |  111   |Rc| reserved  |
  17
  18 1-op and variants
  19
  20 | dest | src1 | subop | op       |
  21 | ---- | ---- | ----- | -------- |
  22 | RT   | RA   | ..    | bmatflip |
  23
  24 2-op and variants
  25
  26 | dest | src1 | src2 | subop | op       |
  27 | ---- | ---- | ---- | ----- | -------- |
  28 | RT   | RA   | RB   | or    | bmatflip |
  29 | RT   | RA   | RB   | xor   | bmatflip |
  30 | RT   | RA   | RB   | bdep  | dep/ext  |
  31 | RT   | RA   | RB   | bext  | dep/ext  |
  32 | RT   | RA   | RB   |       | grev  |
  33 | RT   | RA   | RB   |       | clmul*  |
  34 | RT   | RA   | RB   |       | gorc |
  35 | RT   | RA   | RB   | shuf  | shuffle |
  36 | RT   | RA   | RB   | unshuf| shuffle |
  37 | RT   | RA   | RB   | width | xperm  |
  38 | RT   | RA   | RB   | type | minmax |
  39 | RT   | RA   | RB   |  |  |
  40 | RT   | RA   | RB   |  |  |
  41 | RT   | RA   | RB   |  |  |
  42
  43 3 ops
  44
  45 * bitmask set/extract
  46 * ternary bitops
  47 * GF
  48
  49 | 0.5|6.10|11.15|16.20|21..25 | 26....30 |31| name |
  50 | -- | -- | --- | --- | ----- | -------- |--| ------ |
  51 | NN | RT | RA  | RB  | RC    | mode 001 |Rc| ternary |
  52 | NN | RT | RA  | RB  | im0-4 | im5-7 00 |Rc| ternaryi |
  53 | NN | RS | RA  | RB  | deg   | 00  011  |Rc| gfmul |
  54 | NN | RS | RA  | RB  | deg   | 01  011  |Rc| gfadd |
  55 | NN | RT | RA  | RB  | deg   | 10  011  |Rc| gfinv |
  56 | NN | RS | RA  | RB  | deg   | 11  011  |Rc| gf rsvd |
  57
  58 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31| name |
  59 | -- | -- | --- | ----- | ---- | ----- |--| ------ |
  60 | NN | RT | RA  | imm   | mask | 101   |1 | ternaryv |
  61
  62 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31| name |
  63 | -- | -- | --- | --- |- |-----|----- | -----|--| -------|
  64 | NN | BA | BB  | BC  |0 |imm  | mask | 101  |0 | ternarycr |
  65
  66 ops
  67
  68 | 0.5|6.10|11.15|16.20| 21.22 | 23 | 24....30 |31| name |
  69 | -- | -- | --- | --- | ----- | -- | -------- |--| ---- |
  70 | NN | RA | RB  |     |       | 0  | 0000 110 |Rc| rsvd   |
  71 | NN | RA | RB  | RC  | itype | 1  | 0000 110 |Rc| xperm |
  72 | NN | RA | RB  | RC  | itype | 0  | 0100 110 |Rc| minmax |
  73 | NN | RA | RB  |     |       | 1  | 0100 110 |Rc| rsvd |
  74 | NN | RA | RB  | sh  | itype | SH | 1000 110 |Rc| bmopsi |
  75 | NN | RA | RB  |     |       |    | 1100 110 |Rc| rsvd |
  76 | NN | RA | RB  |     |       | 0  | 0001 110 |Rc| rsvd |
  77 | NN | RA | RB  |     |       | 0  | 0101 110 |Rc| rsvd |
  78 | NN | RA | RB  | RC  | 00    | 0  | 0010 110 |Rc| gorc |
  79 | NN | RA | RB  | sh  | 00    | SH | 1010 110 |Rc| gorci |
  80 | NN | RA | RB  | RC  | 00    | 0  | 0110 110 |Rc| gorcw |
  81 | NN | RA | RB  | sh  | 00    | 0  | 1110 110 |Rc| gorcwi |
  82 | NN | RA | RB  | RC  | 00    | 1  | 1110 110 |Rc| bmator  |
  83 | NN | RA | RB  | RC  | 01    | 0  | 0010 110 |Rc| grev |
  84 | NN | RA | RB  | RC  | 01    | 1  | 0010 110 |Rc| clmul |
  85 | NN | RA | RB  | sh  | 01    | SH | 1010 110 |Rc| grevi |
  86 | NN | RA | RB  | RC  | 01    | 0  | 0110 110 |Rc| grevw |
  87 | NN | RA | RB  | sh  | 01    | 0  | 1110 110 |Rc| grevwi |
  88 | NN | RA | RB  | RC  | 01    | 1  | 1110 110 |Rc| bmatxor   |
  89 | NN | RA | RB  | RC  | 10    | 0  | 0010 110 |Rc| shfl |
  90 | NN | RA | RB  | sh  | 10    | SH | 1010 110 |Rc| shfli |
  91 | NN | RA | RB  | RC  | 10    | 0  | 0110 110 |Rc| shflw |
  92 | NN | RA | RB  | RC  | 10    | 0  | 1110 110 |Rc| bdep   |
  93 | NN | RA | RB  | RC  | 10    | 1  | 1110 110 |Rc| bext  |
  94 | NN | RA | RB  | RC  | 11    | 0  | 1110 110 |Rc| clmulr  |
  95 | NN | RA | RB  | RC  | 11    | 1  | 1110 110 |Rc| clmulh  |
  96 | NN | RA | RB  |     |       |    | NN11 110 |Rc| rsvd  |
  97
  98 # bit to byte permute
  99
 100 similar to matrix permute in RV bitmanip, which has XOR and OR variants
 101
 102     do j = 0 to 7
 103       do k = 0 to 7
 104          b = VSR[VRB+32].dword[i].byte[k].bit[j]
 105          VSR[VRT+32].dword[i].byte[j].bit[k] = b
 106
 107 # vector bit deposit
 108
 109 vpdepd VRT,VRA,VRB, identical to RV bitmanip bdep
 110
 111     do while(m < 64)
 112        if VSR[VRB+32].dword[i].bit[63-m]=1 then do
 113           result = VSR[VRA+32].dword[i].bit[63-k]
 114           VSR[VRT+32].dword[i].bit[63-m] = result
 115           k = k + 1
 116        m = m + 1
 117
 118 ```
 119
 120 uint_xlen_t bdep(uint_xlen_t RA, uint_xlen_t RB) // bit deposit: place successive low bits of RA at the positions where RB has a 1
 121 {
 122     uint_xlen_t r = 0;
 123     for (int i = 0, j = 0; i < XLEN; i++) // i walks mask/result positions, j walks source bits of RA
 124         if ((RB >> i) & 1) {               // mask bit i set: this position receives the next source bit
 125             if ((RA >> j) & 1)
 126                 r |= uint_xlen_t(1) << i;
 127             j++;                           // one source bit is consumed per set mask bit
 128         }
 129     return r;                              // positions where RB is 0 stay 0
 130 }
 131
 132 ```
 133
 134 # vector bit extract
 135
 136 other way round: identical to RV bext
 137
 138 ```
 139 uint_xlen_t bext(uint_xlen_t RA, uint_xlen_t RB) // bit extract: gather the bits of RA selected by RB into the low bits of the result
 140 {
 141     uint_xlen_t r = 0;
 142     for (int i = 0, j = 0; i < XLEN; i++) // i walks mask/source positions, j walks packed result bits
 143         if ((RB >> i) & 1) {               // mask bit i set: bit i of RA is selected
 144             if ((RA >> i) & 1)
 145                 r |= uint_xlen_t(1) << j;
 146             j++;                           // next selected bit goes one position higher in the result
 147         }
 148     return r;
 149 }
 150 ```
 151
 152 # int min/max
 153
 154 signed and unsigned min/max for integers.  this is sort-of partly synthesisable in [[sv/svp64]] with pred-result, as long as the dest reg is one of the sources, but not for both signed and unsigned.  when the dest is also one of the sources and the mv fails because the CR bit-test fails, this will only overwrite the dest where the src is greater (or less).
 155
 156 signed/unsigned min/max gives more flexibility.
 157
 158 ```
 159 uint_xlen_t min(uint_xlen_t rs1, uint_xlen_t rs2)
 160 { return (int_xlen_t)rs1 < (int_xlen_t)rs2 ? rs1 : rs2;
 161 }
 162 uint_xlen_t max(uint_xlen_t rs1, uint_xlen_t rs2)
 163 { return (int_xlen_t)rs1 > (int_xlen_t)rs2 ? rs1 : rs2;
 164 }
 165 uint_xlen_t minu(uint_xlen_t rs1, uint_xlen_t rs2)
 166 { return rs1 < rs2 ? rs1 : rs2;
 167 }
 168 uint_xlen_t maxu(uint_xlen_t rs1, uint_xlen_t rs2)
 169 { return rs1 > rs2 ? rs1 : rs2;
 170 }
 171 ```
 172
 173
 174 # ternary bitops
 175
 176 Similar to FPGA LUTs: for every bit, perform a lookup into a table held in an 8-bit immediate or in another register
 177
 178 | 0.5|6.10|11.15|16.20| 21..25| 26..30   |31|
 179 | -- | -- | --- | --- | ----- | -------- |--|
 180 | NN | RT | RA  | RB  | im0-4 | im5-7 00 |Rc|
 181
 182     for i in range(64):
 183         idx = RT[i] << 2 | RA[i] << 1 | RB[i]
 184         RT[i] = (imm & (1<<idx)) != 0
 185
 186 bits 21..22 may be used to specify a mode, such as treating the whole integer zero/nonzero and putting 1/0 in the result, rather than bitwise test.
 187
 188 a 4 operand variant which becomes more along the lines of an FPGA:
 189
 190 | 0.5|6.10|11.15|16.20|21.25| 26...30  |31|
 191 | -- | -- | --- | --- | --- | -------- |--|
 192 | NN | RT | RA  | RB  | RC  | mode 001 |Rc|
 193
 194     for i in range(64):
 195         idx = RT[i] << 2 | RA[i] << 1 | RB[i]
 196         RT[i] = (RC & (1<<idx)) != 0
 197
 198 mode (2 bit) may be used to do inversion of ordering, similar to carryless mul,
 199 3 modes.
 200
 201 also, another possible variant involving swizzle and vec4:
 202
 203 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
 204 | -- | -- | --- | ----- | ---- | ----- |--|
 205 | NN | RT | RA  | imm   | mask | 101   |1 |
 206
 207     for i in range(8):
 208         idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
 209         res = (imm & (1<<idx)) != 0
 210         for j in range(3):
 211              if mask[j]: RT[i+j*8] = res
 212
 213 another mode selection would be CRs not Ints.
 214
 215 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31|
 216 | -- | -- | --- | --- |- |-----|----- | -----|--|
 217 | NN | BA | BB  | BC  |0 |imm  | mask | 101  |0 |
 218
 219     for i in range(4):
 220         if not mask[i] continue
 221         idx = crregs[BA][i] << 2 |
 222               crregs[BB][i] << 1 |
 223               crregs[BC][i]
 224         crregs[BA][i] = (imm & (1<<idx)) != 0
 225
 226 # bitmask set
 227
 228 based on RV bitmanip singlebit set, instruction format similar to shift
 229 [[isa/fixedshift]].  bmext is actually covered already (shift-with-mask rldicl but only immediate version).
 230 however bitmask-invert is not, and set/clr are not covered, although they can use the same Shift ALU.
 231
 232 bmext (RB) version is not the same as rldicl because bmext is a right shift by RC, where rldicl is a left rotate.  for the immediate version this does not matter.
 233
 234 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31|
 235 | -- | -- | --- | --- | --- | ------- |--|
 236 | NN | RT | RA  | RB  | RC  | mode 010 |Rc|
 237
 238 ```
 239 uint_xlen_t bmset(RA, RB, sh) // set a run of sh+1 bits in RA, starting at bit position RB
 240 {
 241     int shamt = RB & (XLEN - 1);                   // start position, wrapped to the register width
 242     uint_xlen_t mask = (uint_xlen_t(2) << sh) - 1; // sh+1 low ones; built at full width so large sh cannot overflow int
 243     return RA | (mask << shamt);
 244 }
 245
 246 uint_xlen_t bmclr(RA, RB, sh) // clear a run of sh+1 bits in RA, starting at bit position RB
 247 {
 248     int shamt = RB & (XLEN - 1);                   // start position, wrapped to the register width
 249     uint_xlen_t mask = (uint_xlen_t(2) << sh) - 1; // sh+1 low ones; built at full width so large sh cannot overflow int
 250     return RA & ~(mask << shamt);
 251 }
 252
 253 uint_xlen_t bminv(RA, RB, sh) // invert a run of sh+1 bits in RA, starting at bit position RB
 254 {
 255     int shamt = RB & (XLEN - 1);                   // start position, wrapped to the register width
 256     uint_xlen_t mask = (uint_xlen_t(2) << sh) - 1; // sh+1 low ones; built at full width so large sh cannot overflow int
 257     return RA ^ (mask << shamt);
 258 }
 259
 260 uint_xlen_t bmext(RA, RB, sh) // extract sh+1 bits of RA, starting at bit position RB, into the low bits
 261 {
 262     int shamt = RB & (XLEN - 1);                   // start position, wrapped to the register width
 263     uint_xlen_t mask = (uint_xlen_t(2) << sh) - 1; // sh+1 low ones; built at full width so large sh cannot overflow int
 264     return mask & (RA >> shamt);                   // right shift first, then keep only the low sh+1 bits
 265 }
 266 ```
 267
 268 # grev
 269
 270 based on RV bitmanip
 271
 272 ```
 273 uint64_t grev64(uint64_t RA, uint64_t RB)
 274 {
 275     uint64_t x = RA;
 276     int shamt = RB & 63;
 277     if (shamt & 1) x = ((x &  0x5555555555555555LL) <<  1) |
 278                         ((x & 0xAAAAAAAAAAAAAAAALL) >>  1);
 279     if (shamt & 2) x = ((x &  0x3333333333333333LL) <<  2) |
 280                         ((x & 0xCCCCCCCCCCCCCCCCLL) >>  2);
 281     if (shamt & 4) x = ((x &  0x0F0F0F0F0F0F0F0FLL) <<  4) |
 282                         ((x & 0xF0F0F0F0F0F0F0F0LL) >>  4);
 283     if (shamt & 8) x = ((x &  0x00FF00FF00FF00FFLL) <<  8) |
 284                         ((x & 0xFF00FF00FF00FF00LL) >>  8);
 285     if (shamt & 16) x = ((x & 0x0000FFFF0000FFFFLL) << 16) |
 286                         ((x & 0xFFFF0000FFFF0000LL) >> 16);
 287     if (shamt & 32) x = ((x & 0x00000000FFFFFFFFLL) << 32) |
 288                         ((x & 0xFFFFFFFF00000000LL) >> 32);
 289     return x;
 290 }
 291
 292 ```
 293
 294 # shuffle / unshuffle
 295
 296 based on RV bitmanip
 297
 298 ```
 299 uint32_t shfl32(uint32_t RA, uint32_t RB) // 32-bit shuffle of RA; the low 4 bits of RB enable the individual stages
 300 {
 301     uint32_t x = RA;
 302     int shamt = RB & 15;                   // only four stage-select bits are used
 303     if (shamt & 8) x  = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8); // NOTE(review): shuffle32_stage is not defined in this document; presumably the 32-bit counterpart of shuffle64_stage - confirm
 304     if (shamt & 4) x  = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
 305     if (shamt & 2) x  = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
 306     if (shamt & 1) x  = shuffle32_stage(x, 0x44444444, 0x22222222, 1); // stages run from the widest distance (8) down to the narrowest (1)
 307     return x;
 308 }
 309 uint32_t unshfl32(uint32_t RA, uint32_t RB) // same stages and masks as shfl32, applied in the opposite order
 310 {
 311     uint32_t x = RA;
 312     int shamt = RB & 15;                   // only four stage-select bits are used
 313     if (shamt & 1) x  = shuffle32_stage(x, 0x44444444, 0x22222222, 1); // NOTE(review): shuffle32_stage is not defined in this document; presumably the 32-bit counterpart of shuffle64_stage - confirm
 314     if (shamt & 2) x  = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
 315     if (shamt & 4) x  = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
 316     if (shamt & 8) x  = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8); // stages run from the narrowest distance (1) up to the widest (8)
 317     return x;
 318 }
 319
 320 uint64_t shuffle64_stage(uint64_t src, uint64_t maskL, uint64_t maskR, int N) // one shuffle stage: move the masked bit groups by N positions
 321 {
 322     uint64_t x = src & ~(maskL | maskR);   // bits covered by neither mask stay where they are
 323     x |= ((src << N) & maskL) | ((src >> N) & maskR); // maskL positions are filled from N bits lower, maskR positions from N bits higher
 324     return x;
 325 }
 326 uint64_t shfl64(uint64_t RA, uint64_t RB)
 327 {
 328     uint64_t x = RA;
 329     int shamt = RB & 31;
 330     if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
 331                                            0x00000000ffff0000LL, 16);
 332     if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
 333                                            0x0000ff000000ff00LL, 8);
 334     if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
 335                                            0x00f000f000f000f0LL, 4);
 336     if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
 337                                            0x0c0c0c0c0c0c0c0cLL, 2);
 338     if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
 339                                            0x2222222222222222LL, 1);
 340     return x;
 341 }
 342 uint64_t unshfl64(uint64_t RA, uint64_t RB)
 343 {
 344     uint64_t x = RA;
 345     int shamt = RB & 31;
 346     if (shamt &  1) x = shuffle64_stage(x, 0x4444444444444444LL,
 347                                            0x2222222222222222LL, 1);
 348     if (shamt &  2) x = shuffle64_stage(x, 0x3030303030303030LL,
 349                                            0x0c0c0c0c0c0c0c0cLL, 2);
 350     if (shamt &  4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
 351                                            0x00f000f000f000f0LL, 4);
 352     if (shamt &  8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
 353                                            0x0000ff000000ff00LL, 8);
 354     if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
 355                                            0x00000000ffff0000LL, 16);
 356     return x;
 357 }
 358 ```
 359
 360 # xperm
 361
 362 based on RV bitmanip
 363
 364 ```
 365 uint_xlen_t xperm(uint_xlen_t RA, uint_xlen_t RB, int sz_log2)
 366 {
 367     uint_xlen_t r = 0;
 368     uint_xlen_t sz = 1LL << sz_log2;
 369     uint_xlen_t mask = (1LL << sz) - 1;
 370     for (int i = 0; i < XLEN; i += sz) {
 371         uint_xlen_t pos = ((RB >> i) & mask) << sz_log2;
 372         if (pos < XLEN)
 373             r |= ((RA >> pos) & mask) << i;
 374     }
 375     return r;
 376 }
 377 uint_xlen_t xperm_n (uint_xlen_t RA, uint_xlen_t RB)
 378 {  return xperm(RA, RB, 2); }
 379 uint_xlen_t xperm_b (uint_xlen_t RA, uint_xlen_t RB)
 380 {  return xperm(RA, RB, 3); }
 381 uint_xlen_t xperm_h (uint_xlen_t RA, uint_xlen_t RB)
 382 {  return xperm(RA, RB, 4); }
 383 uint_xlen_t xperm_w (uint_xlen_t RA, uint_xlen_t RB)
 384 {  return xperm(RA, RB, 5); }
 385 ```
 386
 387 # gorc
 388
 389 based on RV bitmanip
 390
 391 ```
 392 uint32_t gorc32(uint32_t RA, uint32_t RB)
 393 {
 394     uint32_t x = RA;
 395     int shamt = RB & 31;
 396     if (shamt & 1) x |= ((x & 0x55555555) << 1)   |  ((x &  0xAAAAAAAA) >> 1);
 397     if (shamt & 2) x |= ((x & 0x33333333) << 2)   |  ((x &  0xCCCCCCCC) >> 2);
 398     if (shamt & 4) x |= ((x & 0x0F0F0F0F) << 4)   |  ((x &  0xF0F0F0F0) >> 4);
 399     if (shamt & 8) x |= ((x & 0x00FF00FF) << 8)   |  ((x &  0xFF00FF00) >> 8);
 400     if (shamt & 16) x |= ((x & 0x0000FFFF) << 16) |  ((x &  0xFFFF0000) >> 16);
 401     return x;
 402 }
 403 uint64_t gorc64(uint64_t RA, uint64_t RB)
 404 {
 405     uint64_t x = RA;
 406     int shamt = RB & 63;
 407     if (shamt & 1) x |= ((x & 0x5555555555555555LL)   <<   1) |
 408                          ((x & 0xAAAAAAAAAAAAAAAALL)  >>  1);
 409     if (shamt & 2) x |= ((x & 0x3333333333333333LL)   <<   2) |
 410                          ((x & 0xCCCCCCCCCCCCCCCCLL)  >>  2);
 411     if (shamt & 4) x |= ((x & 0x0F0F0F0F0F0F0F0FLL)   <<   4) |
 412                          ((x & 0xF0F0F0F0F0F0F0F0LL)  >>  4);
 413     if (shamt & 8) x |= ((x & 0x00FF00FF00FF00FFLL)   <<   8) |
 414                          ((x & 0xFF00FF00FF00FF00LL)  >>  8);
 415     if (shamt & 16) x |= ((x & 0x0000FFFF0000FFFFLL)  << 16) |
 416                          ((x & 0xFFFF0000FFFF0000LL)  >> 16);
 417     if (shamt & 32) x |= ((x & 0x00000000FFFFFFFFLL)  << 32) |
 418                          ((x & 0xFFFFFFFF00000000LL)  >> 32);
 419     return x;
 420 }
 421
 422 ```
 423
 424 # cmix
 425
 426 based on RV bitmanip, covered by ternary bitops
 427
 428 ```
 429 uint_xlen_t cmix(uint_xlen_t RA, uint_xlen_t RB, uint_xlen_t RC) { // bitwise select controlled by RB
 430     return (RA & RB) | (RC & ~RB);         // per bit: RA where RB is 1, RC where RB is 0
 431 }
 432 ```
 433
 434 # carryless mul
 435
 436 based on RV bitmanip
 437 see https://en.wikipedia.org/wiki/CLMUL_instruction_set
 438
 439 ```
 440 uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
 441 {
 442     uint_xlen_t x = 0;
 443     for (int i = 0; i < XLEN; i++)
 444         if ((RB >> i) & 1)
 445             x ^= RA << i;
 446     return x;
 447 }
 448 uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)
 449 {
 450     uint_xlen_t x = 0;
 451     for (int i = 1; i < XLEN; i++)
 452         if ((RB >> i) & 1)
 453             x ^= RA >> (XLEN-i);
 454     return x;
 455 }
 456 uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)
 457 {
 458     uint_xlen_t x = 0;
 459     for (int i = 0; i < XLEN; i++)
 460         if ((RB >> i) & 1)
 461             x ^= RA >> (XLEN-i-1);
 462     return x;
 463 }
 464 ```
 465 # Galois Field
 466
 467 ## Multiply
 468
 469 this requires 3 parameters and a "degree"
 470
 471     RT = GFMUL(RA, RB, gfdegree, modulo=RC)
 472
 473 realistically with the degree also needing to be an immediate it should be brought down to an overwrite version:
 474
 475     RS = GFMUL(RS, RA, gfdegree, modulo=RB)
 476
 477 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31|
 478 | -- | -- | --- | --- | --- | ------- |--|
 479 | NN | RS | RA  | RB  | deg | 00  011 |Rc|
 480
 481 where the SimpleV variant may override RS-as-src differently from RS-as-dest
 482
 483
 484
 485 ```
 486 from functools import reduce
 487
 488 # constants used in the multGF2 function
 489 mask1 = mask2 = polyred = None
 490
 491 def setGF2(degree, irPoly):
 492     """Define parameters of binary finite field GF(2^m)/g(x)
 493        - degree: extension degree of binary field
 494        - irPoly: coefficients of irreducible polynomial g(x)
 495     """
 496     def i2P(sInt):
 497         """Convert an integer into a polynomial"""
 498         return [(sInt >> i) & 1
 499                 for i in reversed(range(sInt.bit_length()))]
 500
 501     global mask1, mask2, polyred
 502     mask1 = mask2 = 1 << degree
 503     mask2 -= 1
 504     polyred = reduce(lambda x, y: (x << 1) + y, i2P(irPoly)[1:])
 505
 506 def multGF2(p1, p2):
 507     """Multiply two polynomials in GF(2^m)/g(x)"""
 508     p = 0
 509     while p2:
 510         if p2 & 1:
 511             p ^= p1
 512         p1 <<= 1
 513         if p1 & mask1:
 514             p1 ^= polyred
 515         p2 >>= 1
 516     return p & mask2
 517
 518 if __name__ == "__main__":
 519
 520     # Define binary field GF(2^3)/x^3 + x + 1
 521     setGF2(3, 0b1011)
 522
 523     # Evaluate the product (x^2 + x + 1)(x^2 + 1)
 524     print("{:02x}".format(multGF2(0b111, 0b101)))
 525
 526     # Define binary field GF(2^8)/x^8 + x^4 + x^3 + x + 1
 527     # (used in the Advanced Encryption Standard-AES)
 528     setGF2(8, 0b100011011)
 529
 530     # Evaluate the product (x^7)(x^7 + x + 1)
 531     print("{:02x}".format(multGF2(0b10000000, 0b10000011)))
 532 ```
 533 ## GF add
 534
 535     RS = GFADD(RS, RA|0, gfdegree, modulo=RB)
 536
 537 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31|
 538 | -- | -- | --- | --- | --- | ------- |--|
 539 | NN | RS | RA  | RB  | deg | 01  011 |Rc|
 540
 541 ## gf invert
 542
 543 ```
 544 def gf_degree(a) :
 545   res = 0
 546   a >>= 1
 547   while (a != 0) :
 548     a >>= 1;
 549     res += 1;
 550   return res
 551
 552 def gf_invert(a, mod=0x1B) :
 553   v = mod
 554   g1 = 1
 555   g2 = 0
 556   j = gf_degree(a) - 8
 557
 558   while (a != 1) :
 559     if (j < 0) :
 560       a, v = v, a
 561       g1, g2 = g2, g1
 562       j = -j
 563
 564     a ^= v << j
 565     g1 ^= g2 << j
 566
 567     a %= 256  # Emulating 8-bit overflow
 568     g1 %= 256 # Emulating 8-bit overflow
 569
 570     j = gf_degree(a) - gf_degree(v)
 571
 572   return g1
 573 ```
 574
 575 # bitmatrix
 576
 577 ```
 578 uint64_t bmatflip(uint64_t RA) // bmatxor/bmator use the result as the transpose of an 8x8 bit matrix
 579 {
 580     uint64_t x = RA;
 581     x = shfl64(x, 31);                     // 31 enables all five shfl64 stages (16, 8, 4, 2, 1)
 582     x = shfl64(x, 31);
 583     x = shfl64(x, 31);                     // the full shuffle is applied three times in total
 584     return x;
 585 }
 586 uint64_t bmatxor(uint64_t RA, uint64_t RB)
 587 {
 588     // transpose of RB
 589     uint64_t RBt = bmatflip(RB);
 590     uint8_t u[8]; // rows of RA
 591     uint8_t v[8]; // cols of RB
 592     for (int i = 0; i < 8; i++) {
 593         u[i] = RA >> (i*8);
 594         v[i] = RBt >> (i*8);
 595     }
 596     uint64_t x = 0;
 597     for (int i = 0; i < 64; i++) {
 598         if (pcnt(u[i / 8] & v[i % 8]) & 1)
 599             x |= 1LL << i;
 600     }
 601     return x;
 602 }
 603 uint64_t bmator(uint64_t RA, uint64_t RB)
 604 {
 605     // transpose of RB
 606     uint64_t RBt = bmatflip(RB);
 607     uint8_t u[8]; // rows of RA
 608     uint8_t v[8]; // cols of RB
 609     for (int i = 0; i < 8; i++) {
 610         u[i] = RA >> (i*8);
 611         v[i] = RBt >> (i*8);
 612     }
 613     uint64_t x = 0;
 614     for (int i = 0; i < 64; i++) {
 615         if ((u[i / 8] & v[i % 8]) != 0)
 616             x |= 1LL << i;
 617     }
 618     return x;
 619 }
 620
 621 ```