openpower/sv/bitmanip.mdwn

   1 [[!tag standards]]
   2
   3 # summary
   4
   5 minor opcode allocation
   6
   7     |  28.30 |31| name      |
   8     | ------ |--| --------- |
   9     |   00   |Rc| ternaryi  |
  10     |  001   |Rc| ternary   |
  11     |  010   |Rc| bitmask   |
  12     |  011   |Rc| gf*       |
  13     |  101   |1 | ternaryv  |
  14     |  101   |0 | ternarycr |
  15     |  110   |Rc| 1/2-op    |
  16     |  111   |Rc| reserved  |
  17
  18 1-op and variants
  19
  20 | dest | src1 | subop | op       |
  21 | ---- | ---- | ----- | -------- |
  22 | RT   | RA   | ..    | bmatflip |
  23
  24 2-op and variants
  25
  26 | dest | src1 | src2 | subop | op       |
  27 | ---- | ---- | ---- | ----- | -------- |
  28 | RT   | RA   | RB   | or    | bmatflip |
  29 | RT   | RA   | RB   | xor   | bmatflip |
  30 | RT   | RA   | RB   | bdep  | dep/ext  |
  31 | RT   | RA   | RB   | bext  | dep/ext  |
  32 | RT   | RA   | RB   |       | grev  |
  33 | RT   | RA   | RB   |       | clmul*  |
  34 | RT   | RA   | RB   |       | gorc |
  35 | RT   | RA   | RB   | shuf  | shuffle |
  36 | RT   | RA   | RB   | unshuf| shuffle |
  37 | RT   | RA   | RB   | width | xperm  |
  38 | RT   | RA   | RB   | type | minmax |
  39 | RT   | RA   | RB   |  |  |
  40 | RT   | RA   | RB   |  |  |
  41 | RT   | RA   | RB   |  |  |
  42
  43 3 ops
  44
  45 * bitmask set/extract
  46 * ternary bitops
  47 * GF
  48
  49 | 0.5|6.10|11.15|16.20|21..25 | 26....30 |31| name |
  50 | -- | -- | --- | --- | ----- | -------- |--| ------ |
  51 | NN | RT | RA  | RB  | RC    | mode 001 |Rc| ternary |
  52 | NN | RT | RA  | RB  | im0-4 | im5-7 00 |Rc| ternaryi |
  53 | NN | RS | RA  | RB  | deg   | 00  011  |Rc| gfmul |
  54 | NN | RS | RA  | RB  | deg   | 01  011  |Rc| gfadd |
  55 | NN | RT | RA  | RB  | deg   | 10  011  |Rc| gfinv |
  56 | NN | RS | RA  | RB  | deg   | 11  011  |Rc| gf rsvd |
  57
  58 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31| name |
  59 | -- | -- | --- | ----- | ---- | ----- |--| ------ |
  60 | NN | RT | RA  | imm   | mask | 101   |1 | ternaryv |
  61
  62 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31| name |
  63 | -- | -- | --- | --- |- |-----|----- | -----|--| -------|
  64 | NN | BA | BB  | BC  |0 |imm  | mask | 101  |0 | ternarycr |
  65
  66 ops
  67
  68 | 0.5|6.10|11.15|16.20| 21.22 | 23 | 24....30 |31| name |
  69 | -- | -- | --- | --- | ----- | -- | -------- |--| ---- |
  70 | NN | RA | RB  |     |       | 0  | 0000 110 |Rc| rsvd   |
  71 | NN | RA | RB  | RC  | itype | 1  | 0000 110 |Rc| xperm |
  72 | NN | RA | RB  | RC  | itype | 0  | 0100 110 |Rc| minmax |
  73 | NN | RA | RB  |     |       | 1  | 0100 110 |Rc| rsvd |
  74 | NN | RA | RB  | sh  | itype | SH | 1000 110 |Rc| bmopsi |
  75 | NN | RA | RB  |     |       |    | 1100 110 |Rc| rsvd |
  76 | NN | RA | RB  |     |       | 0  | 0001 110 |Rc| rsvd |
  77 | NN | RA | RB  |     |       | 0  | 0101 110 |Rc| rsvd |
  78 | NN | RA | RB  | RC  | 00    | 0  | 0010 110 |Rc| gorc |
  79 | NN | RA | RB  | sh  | 00    | SH | 1010 110 |Rc| gorci |
  80 | NN | RA | RB  | RC  | 00    | 0  | 0110 110 |Rc| gorcw |
  81 | NN | RA | RB  | sh  | 00    | 0  | 1110 110 |Rc| gorcwi |
  82 | NN | RA | RB  | RC  | 00    | 1  | 1110 110 |Rc| bmator  |
  83 | NN | RA | RB  | RC  | 01    | 0  | 0010 110 |Rc| grev |
  84 | NN | RA | RB  | RC  | 01    | 1  | 0010 110 |Rc| clmul |
  85 | NN | RA | RB  | sh  | 01    | SH | 1010 110 |Rc| grevi |
  86 | NN | RA | RB  | RC  | 01    | 0  | 0110 110 |Rc| grevw |
  87 | NN | RA | RB  | sh  | 01    | 0  | 1110 110 |Rc| grevwi |
  88 | NN | RA | RB  | RC  | 01    | 1  | 1110 110 |Rc| bmatxor   |
  89 | NN | RA | RB  | RC  | 10    | 0  | 0010 110 |Rc| shfl |
  90 | NN | RA | RB  | sh  | 10    | SH | 1010 110 |Rc| shfli |
  91 | NN | RA | RB  | RC  | 10    | 0  | 0110 110 |Rc| shflw |
  92 | NN | RA | RB  | RC  | 10    | 0  | 1110 110 |Rc| bdep   |
  93 | NN | RA | RB  | RC  | 10    | 1  | 1110 110 |Rc| bext  |
  94 | NN | RA | RB  | RC  | 11    | 0  | 1110 110 |Rc| clmulr  |
  95 | NN | RA | RB  | RC  | 11    | 1  | 1110 110 |Rc| clmulh  |
  96 | NN | RA | RB  |     |       |    | NN11 110 |Rc| rsvd  |
  97
  98 # bit to byte permute
  99
 100 similar to matrix permute in RV bitmanip, which has XOR and OR variants
 101
 102     do j = 0 to 7
 103       do k = 0 to 7
 104          b = VSR[VRB+32].dword[i].byte[k].bit[j]
 105          VSR[VRT+32].dword[i].byte[j].bit[k] = b
 106
 107 # vector bit deposit
 108
 109 vpdepd VRT,VRA,VRB, identical to RV bitmanip bdep
 110
 111     do while(m < 64)
 112        if VSR[VRB+32].dword[i].bit[63-m]=1 then do
 113           result = VSR[VRA+32].dword[i].bit[63-k]
 114           VSR[VRT+32].dword[i].bit[63-m] = result
 115           k = k + 1
 116        m = m + 1
 117
 118 ```
 119
 120 uint_xlen_t bdep(uint_xlen_t RA, uint_xlen_t RB)
 121 {
 122     uint_xlen_t r = 0;
 123     for (int i = 0, j = 0; i < XLEN; i++)
 124         if ((RB >> i) & 1) {
 125             if ((RA >> j) & 1)
 126                 r |= uint_xlen_t(1) << i;
 127             j++;
 128         }
 129     return r;
 130 }
 131
 132 ```
 133
 134 # vector bit extract
 135
 136 other way round: identical to RV bext
 137
 138 ```
 139 uint_xlen_t bext(uint_xlen_t RA, uint_xlen_t RB)
 140 {
 141     uint_xlen_t r = 0;
 142     for (int i = 0, j = 0; i < XLEN; i++)
 143         if ((RB >> i) & 1) {
 144             if ((RA >> i) & 1)
 145                 r |= uint_xlen_t(1) << j;
 146             j++;
 147         }
 148     return r;
 149 }
 150 ```
 151
 152 # int min/max
 153
 154 signed and unsigned min/max for integers.  this is sort-of partly synthesisable in [[sv/svp64]] with pred-result, as long as the dest reg is one of the sources, but not for both signed and unsigned.  when the dest is also one of the sources and the mv fails due to the CR bit-test failing, this will only overwrite the dest where the src is greater (or less).
 155
 156 signed/unsigned min/max gives more flexibility.
 157
 158 ```
 159 uint_xlen_t min(uint_xlen_t rs1, uint_xlen_t rs2)
 160 { return (int_xlen_t)rs1 < (int_xlen_t)rs2 ? rs1 : rs2;
 161 }
 162 uint_xlen_t max(uint_xlen_t rs1, uint_xlen_t rs2)
 163 { return (int_xlen_t)rs1 > (int_xlen_t)rs2 ? rs1 : rs2;
 164 }
 165 uint_xlen_t minu(uint_xlen_t rs1, uint_xlen_t rs2)
 166 { return rs1 < rs2 ? rs1 : rs2;
 167 }
 168 uint_xlen_t maxu(uint_xlen_t rs1, uint_xlen_t rs2)
 169 { return rs1 > rs2 ? rs1 : rs2;
 170 }
 171 ```
 172
 173
 174 # ternary bitops
 175
 176 Similar to FPGA LUTs: for every bit, perform a lookup into a table held either in an 8-bit immediate or in another register
 177
 178 | 0.5|6.10|11.15|16.20| 21..25| 26..30   |31|
 179 | -- | -- | --- | --- | ----- | -------- |--|
 180 | NN | RT | RA  | RB  | im0-4 | im5-7 00 |Rc|
 181
 182     for i in range(64):
 183         idx = RT[i] << 2 | RA[i] << 1 | RB[i]
 184         RT[i] = (imm & (1<<idx)) != 0
 185
 186 bits 21..22 may be used to specify a mode, such as treating the whole integer zero/nonzero and putting 1/0 in the result, rather than bitwise test.
 187
 188 a 4 operand variant which becomes more along the lines of an FPGA:
 189
 190 | 0.5|6.10|11.15|16.20|21.25| 26...30  |31|
 191 | -- | -- | --- | --- | --- | -------- |--|
 192 | NN | RT | RA  | RB  | RC  | mode 001 |Rc|
 193
 194     for i in range(64):
 195         idx = RT[i] << 2 | RA[i] << 1 | RB[i]
 196         RT[i] = (RC & (1<<idx)) != 0
 197
 198 mode (2 bit) may be used to do inversion of ordering, similar to carryless mul,
 199 3 modes.
 200
 201 also, another possible variant involving swizzle and vec4:
 202
 203 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
 204 | -- | -- | --- | ----- | ---- | ----- |--|
 205 | NN | RT | RA  | imm   | mask | 101   |1 |
 206
 207     for i in range(8):
 208         idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
 209         res = (imm & (1<<idx)) != 0
 210         for j in range(3):
 211              if mask[j]: RT[i+j*8] = res
 212
 213 another mode selection would be CRs not Ints.
 214
 215 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31|
 216 | -- | -- | --- | --- |- |-----|----- | -----|--|
 217 | NN | BA | BB  | BC  |0 |imm  | mask | 101  |0 |
 218
 219     for i in range(4):
 220         if not mask[i] continue
 221         idx = crregs[BA][i] << 2 |
 222               crregs[BB][i] << 1 |
 223               crregs[BC][i]
 224         crregs[BA][i] = (imm & (1<<idx)) != 0
 225
 226 # bitmask set
 227
 228 based on RV bitmanip singlebit set, instruction format similar to shift
 229 [[isa/fixedshift]].  bmext is actually covered already (shift-with-mask rldicl but only immediate version).
 230 however bitmask-invert is not, and set/clr are not covered, although they can use the same Shift ALU.
 231
 232 bmext (RB) version is not the same as rldicl because bmext is a right shift by RC, where rldicl is a left rotate.  for the immediate version this does not matter.
 233
 234 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31|
 235 | -- | -- | --- | --- | --- | ------- |--|
 236 | NN | RT | RA  | RB  | RC  | mode 010 |Rc|
 237
 238 ```
 239 uint_xlen_t bmset(RA, RB, sh)
 240 {
 241     int shamt = RB & (XLEN - 1);
 242     mask = (2<<sh)-1;
 243     return RA | (mask << shamt);
 244 }
 245
 246 uint_xlen_t bmclr(RA, RB, sh)
 247 {
 248     int shamt = RB & (XLEN - 1);
 249     mask = (2<<sh)-1;
 250     return RA & ~(mask << shamt);
 251 }
 252
 253 uint_xlen_t bminv(RA, RB, sh)
 254 {
 255     int shamt = RB & (XLEN - 1);
 256     mask = (2<<sh)-1;
 257     return RA ^ (mask << shamt);
 258 }
 259
 260 uint_xlen_t bmext(RA, RB, sh)
 261 {
 262     int shamt = RB & (XLEN - 1);
 263     mask = (2<<sh)-1;
 264     return mask & (RA >> shamt);
 265 }
 266 ```
 267
 268 bitmask extract with reverse
 269 ```
 270 msb = rb[5:0];
 271 rev[0:msb] = ra[msb:0];
 272 rt = ZE(rev[msb:0]);
 273 ```
 274
 275 # grev
 276
 277 based on RV bitmanip
 278
 279 ```
 280 uint64_t grev64(uint64_t RA, uint64_t RB)
 281 {
 282     uint64_t x = RA;
 283     int shamt = RB & 63;
 284     if (shamt & 1) x = ((x &  0x5555555555555555LL) <<  1) |
 285                         ((x & 0xAAAAAAAAAAAAAAAALL) >>  1);
 286     if (shamt & 2) x = ((x &  0x3333333333333333LL) <<  2) |
 287                         ((x & 0xCCCCCCCCCCCCCCCCLL) >>  2);
 288     if (shamt & 4) x = ((x &  0x0F0F0F0F0F0F0F0FLL) <<  4) |
 289                         ((x & 0xF0F0F0F0F0F0F0F0LL) >>  4);
 290     if (shamt & 8) x = ((x &  0x00FF00FF00FF00FFLL) <<  8) |
 291                         ((x & 0xFF00FF00FF00FF00LL) >>  8);
 292     if (shamt & 16) x = ((x & 0x0000FFFF0000FFFFLL) << 16) |
 293                         ((x & 0xFFFF0000FFFF0000LL) >> 16);
 294     if (shamt & 32) x = ((x & 0x00000000FFFFFFFFLL) << 32) |
 295                         ((x & 0xFFFFFFFF00000000LL) >> 32);
 296     return x;
 297 }
 298
 299 ```
 300
 301 # shuffle / unshuffle
 302
 303 based on RV bitmanip
 304
 305 ```
 306 uint32_t shfl32(uint32_t RA, uint32_t RB)
 307 {
 308     uint32_t x = RA;
 309     int shamt = RB & 15;
 310     if (shamt & 8) x  = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
 311     if (shamt & 4) x  = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
 312     if (shamt & 2) x  = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
 313     if (shamt & 1) x  = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
 314     return x;
 315 }
 316 uint32_t unshfl32(uint32_t RA, uint32_t RB)
 317 {
 318     uint32_t x = RA;
 319     int shamt = RB & 15;
 320     if (shamt & 1) x  = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
 321     if (shamt & 2) x  = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
 322     if (shamt & 4) x  = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
 323     if (shamt & 8) x  = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
 324     return x;
 325 }
 326
 327 uint64_t shuffle64_stage(uint64_t src, uint64_t maskL, uint64_t maskR, int N)
 328 {
 329     uint64_t x = src & ~(maskL | maskR);
 330     x |= ((src << N) & maskL) | ((src >> N) & maskR);
 331     return x;
 332 }
 333 uint64_t shfl64(uint64_t RA, uint64_t RB)
 334 {
 335     uint64_t x = RA;
 336     int shamt = RB & 31;
 337     if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
 338                                            0x00000000ffff0000LL, 16);
 339     if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
 340                                            0x0000ff000000ff00LL, 8);
 341     if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
 342                                            0x00f000f000f000f0LL, 4);
 343     if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
 344                                            0x0c0c0c0c0c0c0c0cLL, 2);
 345     if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
 346                                            0x2222222222222222LL, 1);
 347     return x;
 348 }
 349 uint64_t unshfl64(uint64_t RA, uint64_t RB)
 350 {
 351     uint64_t x = RA;
 352     int shamt = RB & 31;
 353     if (shamt &  1) x = shuffle64_stage(x, 0x4444444444444444LL,
 354                                            0x2222222222222222LL, 1);
 355     if (shamt &  2) x = shuffle64_stage(x, 0x3030303030303030LL,
 356                                            0x0c0c0c0c0c0c0c0cLL, 2);
 357     if (shamt &  4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
 358                                            0x00f000f000f000f0LL, 4);
 359     if (shamt &  8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
 360                                            0x0000ff000000ff00LL, 8);
 361     if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
 362                                            0x00000000ffff0000LL, 16);
 363     return x;
 364 }
 365 ```
 366
 367 # xperm
 368
 369 based on RV bitmanip
 370
 371 ```
 372 uint_xlen_t xperm(uint_xlen_t RA, uint_xlen_t RB, int sz_log2)
 373 {
 374     uint_xlen_t r = 0;
 375     uint_xlen_t sz = 1LL << sz_log2;
 376     uint_xlen_t mask = (1LL << sz) - 1;
 377     for (int i = 0; i < XLEN; i += sz) {
 378         uint_xlen_t pos = ((RB >> i) & mask) << sz_log2;
 379         if (pos < XLEN)
 380             r |= ((RA >> pos) & mask) << i;
 381     }
 382     return r;
 383 }
 384 uint_xlen_t xperm_n (uint_xlen_t RA, uint_xlen_t RB)
 385 {  return xperm(RA, RB, 2); }
 386 uint_xlen_t xperm_b (uint_xlen_t RA, uint_xlen_t RB)
 387 {  return xperm(RA, RB, 3); }
 388 uint_xlen_t xperm_h (uint_xlen_t RA, uint_xlen_t RB)
 389 {  return xperm(RA, RB, 4); }
 390 uint_xlen_t xperm_w (uint_xlen_t RA, uint_xlen_t RB)
 391 {  return xperm(RA, RB, 5); }
 392 ```
 393
 394 # gorc
 395
 396 based on RV bitmanip
 397
 398 ```
 399 uint32_t gorc32(uint32_t RA, uint32_t RB)
 400 {
 401     uint32_t x = RA;
 402     int shamt = RB & 31;
 403     if (shamt & 1) x |= ((x & 0x55555555) << 1)   |  ((x &  0xAAAAAAAA) >> 1);
 404     if (shamt & 2) x |= ((x & 0x33333333) << 2)   |  ((x &  0xCCCCCCCC) >> 2);
 405     if (shamt & 4) x |= ((x & 0x0F0F0F0F) << 4)   |  ((x &  0xF0F0F0F0) >> 4);
 406     if (shamt & 8) x |= ((x & 0x00FF00FF) << 8)   |  ((x &  0xFF00FF00) >> 8);
 407     if (shamt & 16) x |= ((x & 0x0000FFFF) << 16) |  ((x &  0xFFFF0000) >> 16);
 408     return x;
 409 }
 410 uint64_t gorc64(uint64_t RA, uint64_t RB)
 411 {
 412     uint64_t x = RA;
 413     int shamt = RB & 63;
 414     if (shamt & 1) x |= ((x & 0x5555555555555555LL)   <<   1) |
 415                          ((x & 0xAAAAAAAAAAAAAAAALL)  >>  1);
 416     if (shamt & 2) x |= ((x & 0x3333333333333333LL)   <<   2) |
 417                          ((x & 0xCCCCCCCCCCCCCCCCLL)  >>  2);
 418     if (shamt & 4) x |= ((x & 0x0F0F0F0F0F0F0F0FLL)   <<   4) |
 419                          ((x & 0xF0F0F0F0F0F0F0F0LL)  >>  4);
 420     if (shamt & 8) x |= ((x & 0x00FF00FF00FF00FFLL)   <<   8) |
 421                          ((x & 0xFF00FF00FF00FF00LL)  >>  8);
 422     if (shamt & 16) x |= ((x & 0x0000FFFF0000FFFFLL)  << 16) |
 423                          ((x & 0xFFFF0000FFFF0000LL)  >> 16);
 424     if (shamt & 32) x |= ((x & 0x00000000FFFFFFFFLL)  << 32) |
 425                          ((x & 0xFFFFFFFF00000000LL)  >> 32);
 426     return x;
 427 }
 428
 429 ```
 430
 431 # cmix
 432
 433 based on RV bitmanip, covered by ternary bitops
 434
 435 ```
 436 uint_xlen_t cmix(uint_xlen_t RA, uint_xlen_t RB, uint_xlen_t RC) {
 437     return (RA & RB) | (RC & ~RB);
 438 }
 439 ```
 440
 441 # carryless mul
 442
 443 based on RV bitmanip
 444 see https://en.wikipedia.org/wiki/CLMUL_instruction_set
 445
 446 ```
 447 uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
 448 {
 449     uint_xlen_t x = 0;
 450     for (int i = 0; i < XLEN; i++)
 451         if ((RB >> i) & 1)
 452             x ^= RA << i;
 453     return x;
 454 }
 455 uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)
 456 {
 457     uint_xlen_t x = 0;
 458     for (int i = 1; i < XLEN; i++)
 459         if ((RB >> i) & 1)
 460             x ^= RA >> (XLEN-i);
 461     return x;
 462 }
 463 uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)
 464 {
 465     uint_xlen_t x = 0;
 466     for (int i = 0; i < XLEN; i++)
 467         if ((RB >> i) & 1)
 468             x ^= RA >> (XLEN-i-1);
 469     return x;
 470 }
 471 ```
 472 # Galois Field
 473
 474 ## Multiply
 475
 476 this requires 3 parameters and a "degree"
 477
 478     RT = GFMUL(RA, RB, gfdegree, modulo=RC)
 479
 480 realistically with the degree also needing to be an immediate it should be brought down to an overwrite version:
 481
 482     RS = GFMUL(RS, RA, gfdegree, modulo=RB)
 483
 484 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31|
 485 | -- | -- | --- | --- | --- | ------- |--|
 486 | NN | RS | RA  | RB  | deg | 00  011 |Rc|
 487
 488 where the SimpleV variant may override RS-as-src differently from RS-as-dest
 489
 490
 491
 492 ```
 493 from functools import reduce
 494
 495 # constants used in the multGF2 function
 496 mask1 = mask2 = polyred = None
 497
 498 def setGF2(degree, irPoly):
 499     """Define parameters of binary finite field GF(2^m)/g(x)
 500        - degree: extension degree of binary field
 501        - irPoly: coefficients of irreducible polynomial g(x)
 502     """
 503     def i2P(sInt):
 504         """Convert an integer into a polynomial"""
 505         return [(sInt >> i) & 1
 506                 for i in reversed(range(sInt.bit_length()))]
 507
 508     global mask1, mask2, polyred
 509     mask1 = mask2 = 1 << degree
 510     mask2 -= 1
 511     polyred = reduce(lambda x, y: (x << 1) + y, i2P(irPoly)[1:])
 512
 513 def multGF2(p1, p2):
 514     """Multiply two polynomials in GF(2^m)/g(x)"""
 515     p = 0
 516     while p2:
 517         if p2 & 1:
 518             p ^= p1
 519         p1 <<= 1
 520         if p1 & mask1:
 521             p1 ^= polyred
 522         p2 >>= 1
 523     return p & mask2
 524
 525 if __name__ == "__main__":
 526
 527     # Define binary field GF(2^3)/x^3 + x + 1
 528     setGF2(3, 0b1011)
 529
 530     # Evaluate the product (x^2 + x + 1)(x^2 + 1)
 531     print("{:02x}".format(multGF2(0b111, 0b101)))
 532
 533     # Define binary field GF(2^8)/x^8 + x^4 + x^3 + x + 1
 534     # (used in the Advanced Encryption Standard-AES)
 535     setGF2(8, 0b100011011)
 536
 537     # Evaluate the product (x^7)(x^7 + x + 1)
 538     print("{:02x}".format(multGF2(0b10000000, 0b10000011)))
 539 ```
 540 ## GF add
 541
 542     RS = GFADD(RS, RA|0, gfdegree, modulo=RB)
 543
 544 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31|
 545 | -- | -- | --- | --- | --- | ------- |--|
 546 | NN | RS | RA  | RB  | deg | 01  011 |Rc|
 547
 548 ## gf invert
 549
 550 ```
 551 def gf_degree(a) :
 552   res = 0
 553   a >>= 1
 554   while (a != 0) :
 555     a >>= 1;
 556     res += 1;
 557   return res
 558
 559 def gf_invert(a, mod=0x1B) :
 560   v = mod
 561   g1 = 1
 562   g2 = 0
 563   j = gf_degree(a) - 8
 564
 565   while (a != 1) :
 566     if (j < 0) :
 567       a, v = v, a
 568       g1, g2 = g2, g1
 569       j = -j
 570
 571     a ^= v << j
 572     g1 ^= g2 << j
 573
 574     a %= 256  # Emulating 8-bit overflow
 575     g1 %= 256 # Emulating 8-bit overflow
 576
 577     j = gf_degree(a) - gf_degree(v)
 578
 579   return g1
 580 ```
 581
 582 # bitmatrix
 583
 584 ```
 585 uint64_t bmatflip(uint64_t RA)
 586 {
 587     uint64_t x = RA;
 588     x = shfl64(x, 31);
 589     x = shfl64(x, 31);
 590     x = shfl64(x, 31);
 591     return x;
 592 }
 593 uint64_t bmatxor(uint64_t RA, uint64_t RB)
 594 {
 595     // transpose of RB
 596     uint64_t RBt = bmatflip(RB);
 597     uint8_t u[8]; // rows of RA
 598     uint8_t v[8]; // cols of RB
 599     for (int i = 0; i < 8; i++) {
 600         u[i] = RA >> (i*8);
 601         v[i] = RBt >> (i*8);
 602     }
 603     uint64_t x = 0;
 604     for (int i = 0; i < 64; i++) {
 605         if (pcnt(u[i / 8] & v[i % 8]) & 1)
 606             x |= 1LL << i;
 607     }
 608     return x;
 609 }
 610 uint64_t bmator(uint64_t RA, uint64_t RB)
 611 {
 612     // transpose of RB
 613     uint64_t RBt = bmatflip(RB);
 614     uint8_t u[8]; // rows of RA
 615     uint8_t v[8]; // cols of RB
 616     for (int i = 0; i < 8; i++) {
 617         u[i] = RA >> (i*8);
 618         v[i] = RBt >> (i*8);
 619     }
 620     uint64_t x = 0;
 621     for (int i = 0; i < 64; i++) {
 622         if ((u[i / 8] & v[i % 8]) != 0)
 623             x |= 1LL << i;
 624     }
 625     return x;
 626 }
 627
 628 ```