openpower/sv/bitmanip.mdwn

   1 [[!tag standards]]
   2
   3 # summary
   4
   5 1-op and variants
   6
   7 | dest | src1 | subop | op       |
   8 | ---- | ---- | ----- | -------- |
   9 | RT   | RA   | ..    | bmatflip |
  10 | RT   | RA   | size  | crc32    |
  11 | RT   | RA   | size  | crc32c   |
  12
  13 2-op and variants
  14
  15 | dest | src1 | src2 | subop | op       |
  16 | ---- | ---- | ---- | ----- | -------- |
  17 | RT   | RA   | RB   | or    | bmatflip |
  18 | RT   | RA   | RB   | xor   | bmatflip |
  19 | RT   | RA   | RB   | bdep  | dep/ext  |
  20 | RT   | RA   | RB   | bext  | dep/ext  |
  21 | RT   | RA   | RB   |       | grev  |
  22 | RT   | RA   | RB   |       | gorc |
  23 | RT   | RA   | RB   | shuf  | shuffle |
  24 | RT   | RA   | RB   | unshuf| shuffle |
  25 | RT   | RA   | RB   | width | xperm  |
  26 | RT   | RA   | RB   | type  | clmul |
  27 | RT   | RA   | RB   | type | minmax |
  28 | RT   | RA   | RB   |  |  |
  29 | RT   | RA   | RB   |  |  |
  30 | RT   | RA   | RB   |  |  |
  31
  32 3 ops
  33
  34 * bitmask set/extract
  35 * ternary bitops
  36
  37 ops
  38
  39 | 0.5|6.10|11.15|16.20| 21.22 | 23 | 24..30  |31| name |
  40 | -- | -- | --- | --- | ----- | -- | ------- |--| ---- |
  41 | NN | RA | RB  |     |       | 0  | 0000110 |Rc| rsvd   |
  42 | NN | RA | RB  | RC  | itype | 1  | 0000110 |Rc| xperm |
  43 | NN | RA | RB  | RC  | itype | 0  | 0100110 |Rc| minmax |
  44 | NN | RA | RB  |     |       | 1  | 0100110 |Rc| rsvd |
  45 | NN | RA | RB  | sh  | itype | SH | 1000110 |Rc| bmopsi |
  46 | NN | RA | RB  |     |       |    | 1100110 |Rc| rsvd |
  47 | NN | RA | RB  | RC  | itype | 0  | 0001110 |Rc| clmul |
  48 | NN | RA | RB  | sh  | itype | 0  | 0101110 |Rc| clmulw |
  49 | NN | RA | RB  | RC  | 00    | 0  | 0010110 |Rc| gorc |
  50 | NN | RA | RB  | sh  | 00    | SH | 1010110 |Rc| gorci |
  51 | NN | RA | RB  | RC  | 00    | 0  | 0110110 |Rc| gorcw |
  52 | NN | RA | RB  | sh  | 00    | 0  | 1110110 |Rc| gorcwi |
  53 | NN | RA | RB  | RC  | 00    | 1  | 1110110 |Rc| bmator  |
  54 | NN | RA | RB  | RC  | 01    | 0  | 0010110 |Rc| grev |
  55 | NN | RA | RB  | sh  | 01    | SH | 1010110 |Rc| grevi |
  56 | NN | RA | RB  | RC  | 01    | 0  | 0110110 |Rc| grevw |
  57 | NN | RA | RB  | sh  | 01    | 0  | 1110110 |Rc| grevwi |
  58 | NN | RA | RB  | RC  | 01    | 1  | 1110110 |Rc| bmatxor   |
  59 | NN | RA | RB  | RC  | 10    | 0  | 0010110 |Rc| shfl |
  60 | NN | RA | RB  | sh  | 10    | SH | 1010110 |Rc| shfli |
  61 | NN | RA | RB  | RC  | 10    | 0  | 0110110 |Rc| shflw |
  62 | NN | RA | RB  | RC  | 10    | 0  | 1110110 |Rc| bdep   |
  63 | NN | RA | RB  | RC  | 10    | 1  | 1110110 |Rc| bext  |
  64 | NN | RA | RB  |     | 11    |    | 1110110 |Rc| rsvd  |
  65 | NN | RA | RB  |     |       |    | NN11110 |Rc| rsvd  |
  66
  67 # bit to byte permute
  68
  69 similar to matrix permute in RV bitmanip, which has XOR and OR variants
  70
  71     do j = 0 to 7
  72       do k = 0 to 7
  73          b = VSR[VRB+32].dword[i].byte[k].bit[j]
  74          VSR[VRT+32].dword[i].byte[j].bit[k] = b
  75
  76 # vector bit deposit
  77
  78 vpdepd VRT,VRA,VRB, identical to RV bitmanip bdep
  79
  80     do while(m < 64)
  81        if VSR[VRB+32].dword[i].bit[63-m]=1 then do
  82           result = VSR[VRA+32].dword[i].bit[63-k]
  83           VSR[VRT+32].dword[i].bit[63-m] = result
  84           k = k + 1
  85        m = m + 1
  86
  87 ```
  88
  89 uint_xlen_t bdep(uint_xlen_t RA, uint_xlen_t RB)  // bit deposit: scatter the low bits of RA to the set-bit positions of RB
  90 {
  91     uint_xlen_t r = 0;
  92     for (int i = 0, j = 0; i < XLEN; i++)  // i scans RB; j counts the set bits of RB seen so far
  93         if ((RB >> i) & 1) {
  94             if ((RA >> j) & 1)  // bit j of RA ...
  95                 r |= uint_xlen_t(1) << i;  // ... is placed at position i of the result
  96             j++;
  97         }
  98     return r;
  99 }
 100
 101 ```
 102
 103 # vector bit extract
 104
 105 other way round: identical to RV bext
 106
 107 ```
 108 uint_xlen_t bext(uint_xlen_t RA, uint_xlen_t RB)  // bit extract: gather the bits of RA selected by RB into the low bits of the result
 109 {
 110     uint_xlen_t r = 0;
 111     for (int i = 0, j = 0; i < XLEN; i++)  // i scans RB; j counts the set bits of RB seen so far
 112         if ((RB >> i) & 1) {
 113             if ((RA >> i) & 1)  // bit i of RA ...
 114                 r |= uint_xlen_t(1) << j;  // ... is packed into position j of the result
 115             j++;
 116         }
 117     return r;
 118 }
 119 ```
 120
 121 # int min/max
 122
 123 signed and unsigned min/max for integer.  this is sort-of partly synthesisable in [[sv/svp64]] with pred-result as long as the dest reg is one of the sources, but not both signed and unsigned.  when the dest is also one of the sources and the mv fails due to the CR bit-test failing, this will only overwrite the dest where the src is greater (or less).
 124
 125 signed/unsigned min/max gives more flexibility.
 126
 127 # ternary bitops
 128
 129 Similar to FPGA LUTs: for every bit perform a lookup into a table using an 8-bit immediate, or in another register
 130
 131 | 0.5|6.10|11.15|16.20| 21..25| 26..30   |31|
 132 | -- | -- | --- | --- | ----- | -------- |--|
 133 | NN | RT | RA  | RB  | im0-4 | im5-7 00 |Rc|
 134
 135     for i in range(64):
 136         idx = RT[i] << 2 | RA[i] << 1 | RB[i]
 137         RT[i] = (imm & (1<<idx)) != 0
 138
 139 bits 21..22 may be used to specify a mode, such as treating the whole integer as zero/nonzero and putting 1/0 in the result, rather than bitwise test.
 140
 141 a 4 operand variant which becomes more along the lines of an FPGA:
 142
 143 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31|
 144 | -- | -- | --- | --- | --- | ------- |--|
 145 | NN | RT | RA  | RB  | RC  | mode 1  |1 |
 146
 147     for i in range(64):
 148         idx = RT[i] << 2 | RA[i] << 1 | RB[i]
 149         RT[i] = (RC & (1<<idx)) != 0
 150
 151 mode (2 bit) may be used to do inversion of ordering, similar to carryless mul.
 152
 153 also, another possible variant involving swizzle and vec4:
 154
 155     for i in range(8):
 156         idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
 157         RT[i] = (RA.w[i] & (1<<idx)) != 0
 158
 159 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
 160 | -- | -- | --- | ----- | ---- | ----- |--|
 161 | NN | RT | RA  | xyzw  | mask | mode 1 |1 |
 162
 163     for i in range(8):
 164         idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
 165         res = (RA.w[i] & (1<<idx)) != 0
 166         for j in range(4):
 167              if mask[j]: RT[i+j*8] = res
 168
 169 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
 170 | -- | -- | --- | ----- | ---- | ----- |--|
 171 | NN | RT | RA  | imm   | mask | mode 1 |1 |
 172
 173     for i in range(8):
 174         idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]  # 3-bit table index from the x/y/z elements
 175         res = (imm & (1<<idx)) != 0  # look the index up in the 8-bit immediate
 176         for j in range(4):  # mask is the 4-bit field in bits 24.27: one bit per 8-bit group of RT
 177              if mask[j]: RT[i+j*8] = res
 178
 179 another mode selection would be CRs not Ints.
 180
 181 | 0.5|6.8 | 9.11|12.14|15.17|18.20| 21..25| 26.29|30|31|
 182 | -- | -- | --- | --- | --- |-----| ----- | ---- |--|--|
 183 | NN | BT | BA  | BB  | BC  |im5-7| im0-4 | mask |1 |0 |
 184
 185     for i in range(4):  # one iteration per bit index 0..3 of the CR fields
 186         if not mask[i]: continue  # bits whose mask bit is clear are left unchanged
 187         idx = (crregs[BA][i] << 2 |
 188                crregs[BB][i] << 1 |
 189                crregs[BC][i])  # 3-bit table index from the three source CR fields
 190         crregs[BT][i] = (imm & (1<<idx)) != 0  # look the index up in the 8-bit immediate
 191
 192 # bitmask set
 193
 194 based on RV bitmanip singlebit set, instruction format similar to shift
 195 [[isa/fixedshift]].  bmext is actually covered already (shift-with-mask).
 196 however bitmask-invert is not, and set/clr are not covered, although they can use the same Shift ALU.
 197
 198 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31|
 199 | -- | -- | --- | --- | --- | ------- |--|
 200 | NN | RT | RA  | RB  | RC  | mode 010 |Rc|
 201
 202 ```
 203 uint_xlen_t bmset(uint_xlen_t RA, uint_xlen_t RB, int sh)  // set a field of sh+1 bits in RA, starting at bit (RB & (XLEN-1))
 204 {
 205     int shamt = RB & (XLEN - 1);  // field start position: RB masked with XLEN-1
 206     uint_xlen_t mask = ((uint_xlen_t)2<<sh)-1;  // sh+1 low bits set; computed in uint_xlen_t so a large sh does not overflow int
 207     return RA | (mask << shamt);
 208 }
 209
 210 uint_xlen_t bmclr(uint_xlen_t RA, uint_xlen_t RB, int sh)  // clear a field of sh+1 bits in RA, starting at bit (RB & (XLEN-1))
 211 {
 212     int shamt = RB & (XLEN - 1);  // field start position: RB masked with XLEN-1
 213     uint_xlen_t mask = ((uint_xlen_t)2<<sh)-1;  // sh+1 low bits set; computed in uint_xlen_t so a large sh does not overflow int
 214     return RA & ~(mask << shamt);
 215 }
 216
 217 uint_xlen_t bminv(uint_xlen_t RA, uint_xlen_t RB, int sh)  // invert a field of sh+1 bits in RA, starting at bit (RB & (XLEN-1))
 218 {
 219     int shamt = RB & (XLEN - 1);  // field start position: RB masked with XLEN-1
 220     uint_xlen_t mask = ((uint_xlen_t)2<<sh)-1;  // sh+1 low bits set; computed in uint_xlen_t so a large sh does not overflow int
 221     return RA ^ (mask << shamt);
 222 }
 223
 224 uint_xlen_t bmext(uint_xlen_t RA, uint_xlen_t RB, int sh)  // extract a field of sh+1 bits from RA, starting at bit (RB & (XLEN-1))
 225 {
 226     int shamt = RB & (XLEN - 1);  // field start position: RB masked with XLEN-1
 227     uint_xlen_t mask = ((uint_xlen_t)2<<sh)-1;  // sh+1 low bits set; computed in uint_xlen_t so a large sh does not overflow int
 228     return mask & (RA >> shamt);  // field is returned right-aligned
 229 }
 230 ```
 231
 232 # grev
 233
 234 based on RV bitmanip
 235
 236 ```
 237 uint64_t grev64(uint64_t RA, uint64_t RB)  // generalized reverse: each set bit k of shamt swaps adjacent groups of 2^k bits
 238 {
 239     uint64_t x = RA;
 240     int shamt = RB & 63;  // only the low 6 bits of RB are used
 241     if (shamt & 1) x = ((x &  0x5555555555555555LL) <<  1) |
 242                         ((x & 0xAAAAAAAAAAAAAAAALL) >>  1);
 243     if (shamt & 2) x = ((x &  0x3333333333333333LL) <<  2) |
 244                         ((x & 0xCCCCCCCCCCCCCCCCLL) >>  2);
 245     if (shamt & 4) x = ((x &  0x0F0F0F0F0F0F0F0FLL) <<  4) |
 246                         ((x & 0xF0F0F0F0F0F0F0F0LL) >>  4);
 247     if (shamt & 8) x = ((x &  0x00FF00FF00FF00FFLL) <<  8) |
 248                         ((x & 0xFF00FF00FF00FF00LL) >>  8);
 249     if (shamt & 16) x = ((x & 0x0000FFFF0000FFFFLL) << 16) |
 250                         ((x & 0xFFFF0000FFFF0000LL) >> 16);
 251     if (shamt & 32) x = ((x & 0x00000000FFFFFFFFLL) << 32) |
 252                         ((x & 0xFFFFFFFF00000000LL) >> 32);
 253     return x;
 254 }
 255
 256 ```
 257
 258 # shuffle / unshuffle
 259
 260 based on RV bitmanip
 261
 262 ```
 263 uint32_t shfl32(uint32_t RA, uint32_t RB)  // NOTE(review): shuffle32_stage is not defined in the visible text; presumably the 32-bit analogue of shuffle64_stage - confirm
 264 {
 265     uint32_t x = RA;
 266     int shamt = RB & 15;  // low 4 bits of RB select which stages run
 267     if (shamt & 8) x  = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);  // stages run from N=8 down to N=1
 268     if (shamt & 4) x  = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
 269     if (shamt & 2) x  = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
 270     if (shamt & 1) x  = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
 271     return x;
 272 }
 273 uint32_t unshfl32(uint32_t RA, uint32_t RB)  // same stages and masks as shfl32, applied in the opposite order
 274 {
 275     uint32_t x = RA;
 276     int shamt = RB & 15;  // low 4 bits of RB select which stages run
 277     if (shamt & 1) x  = shuffle32_stage(x, 0x44444444, 0x22222222, 1);  // stages run from N=1 up to N=8
 278     if (shamt & 2) x  = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
 279     if (shamt & 4) x  = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
 280     if (shamt & 8) x  = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
 281     return x;
 282 }
 283
 284 uint64_t shuffle64_stage(uint64_t src, uint64_t maskL, uint64_t maskR, int N)  // one shuffle stage: exchange the bit groups selected by maskL/maskR, N positions apart
 285 {
 286     uint64_t x = src & ~(maskL | maskR);  // bits outside both masks stay where they are
 287     x |= ((src << N) & maskL) | ((src >> N) & maskR);  // maskL positions take bits from N lower, maskR positions from N higher
 288     return x;
 289 }
 290 uint64_t shfl64(uint64_t RA, uint64_t RB)  // apply the shuffle stages selected by RB, from N=16 down to N=1
 291 {
 292     uint64_t x = RA;
 293     int shamt = RB & 31;  // low 5 bits of RB select which stages run
 294     if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
 295                                            0x00000000ffff0000LL, 16);
 296     if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
 297                                            0x0000ff000000ff00LL, 8);
 298     if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
 299                                            0x00f000f000f000f0LL, 4);
 300     if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
 301                                            0x0c0c0c0c0c0c0c0cLL, 2);
 302     if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
 303                                            0x2222222222222222LL, 1);
 304     return x;
 305 }
 306 uint64_t unshfl64(uint64_t RA, uint64_t RB)  // same stages and masks as shfl64, applied from N=1 up to N=16
 307 {
 308     uint64_t x = RA;
 309     int shamt = RB & 31;  // low 5 bits of RB select which stages run
 310     if (shamt &  1) x = shuffle64_stage(x, 0x4444444444444444LL,
 311                                            0x2222222222222222LL, 1);
 312     if (shamt &  2) x = shuffle64_stage(x, 0x3030303030303030LL,
 313                                            0x0c0c0c0c0c0c0c0cLL, 2);
 314     if (shamt &  4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
 315                                            0x00f000f000f000f0LL, 4);
 316     if (shamt &  8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
 317                                            0x0000ff000000ff00LL, 8);
 318     if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
 319                                            0x00000000ffff0000LL, 16);
 320     return x;
 321 }
 322 ```
 323
 324 # xperm
 325
 326 based on RV bitmanip
 327
 328 ```
 329 uint_xlen_t xperm(uint_xlen_t RA, uint_xlen_t RB, int sz_log2)  // each element of RB is an index selecting which element of RA to copy
 330 {
 331     uint_xlen_t r = 0;
 332     uint_xlen_t sz = 1LL << sz_log2;  // element width in bits
 333     uint_xlen_t mask = (1LL << sz) - 1;  // low sz bits set
 334     for (int i = 0; i < XLEN; i += sz) {  // one iteration per element
 335         uint_xlen_t pos = ((RB >> i) & mask) << sz_log2;  // index read from RB, times sz = bit offset into RA
 336         if (pos < XLEN)  // an out-of-range index leaves this result element zero
 337             r |= ((RA >> pos) & mask) << i;
 338     }
 339     return r;
 340 }
 341 uint_xlen_t xperm_n (uint_xlen_t RA, uint_xlen_t RB)  // sz_log2 = 2: 4-bit elements
 342 {  return xperm(RA, RB, 2); }
 343 uint_xlen_t xperm_b (uint_xlen_t RA, uint_xlen_t RB)  // sz_log2 = 3: 8-bit elements
 344 {  return xperm(RA, RB, 3); }
 345 uint_xlen_t xperm_h (uint_xlen_t RA, uint_xlen_t RB)  // sz_log2 = 4: 16-bit elements
 346 {  return xperm(RA, RB, 4); }
 347 uint_xlen_t xperm_w (uint_xlen_t RA, uint_xlen_t RB)  // sz_log2 = 5: 32-bit elements
 348 {  return xperm(RA, RB, 5); }
 349 ```
 350
 351 # gorc
 352
 353 based on RV bitmanip
 354
 355 ```
 356 uint32_t gorc32(uint32_t RA, uint32_t RB)  // generalized OR-combine: each set bit k of shamt ORs the swapped adjacent 2^k-bit groups into x
 357 {
 358     uint32_t x = RA;
 359     int shamt = RB & 31;  // only the low 5 bits of RB are used
 360     if (shamt & 1) x |= ((x & 0x55555555) << 1)   |  ((x &  0xAAAAAAAA) >> 1);
 361     if (shamt & 2) x |= ((x & 0x33333333) << 2)   |  ((x &  0xCCCCCCCC) >> 2);
 362     if (shamt & 4) x |= ((x & 0x0F0F0F0F) << 4)   |  ((x &  0xF0F0F0F0) >> 4);
 363     if (shamt & 8) x |= ((x & 0x00FF00FF) << 8)   |  ((x &  0xFF00FF00) >> 8);
 364     if (shamt & 16) x |= ((x & 0x0000FFFF) << 16) |  ((x &  0xFFFF0000) >> 16);
 365     return x;
 366 }
 367 uint64_t gorc64(uint64_t RA, uint64_t RB)  // 64-bit generalized OR-combine: each set bit k of shamt ORs the swapped adjacent 2^k-bit groups into x
 368 {
 369     uint64_t x = RA;
 370     int shamt = RB & 63;  // only the low 6 bits of RB are used
 371     if (shamt & 1) x |= ((x & 0x5555555555555555LL)   <<   1) |
 372                          ((x & 0xAAAAAAAAAAAAAAAALL)  >>  1);
 373     if (shamt & 2) x |= ((x & 0x3333333333333333LL)   <<   2) |
 374                          ((x & 0xCCCCCCCCCCCCCCCCLL)  >>  2);
 375     if (shamt & 4) x |= ((x & 0x0F0F0F0F0F0F0F0FLL)   <<   4) |
 376                          ((x & 0xF0F0F0F0F0F0F0F0LL)  >>  4);
 377     if (shamt & 8) x |= ((x & 0x00FF00FF00FF00FFLL)   <<   8) |
 378                          ((x & 0xFF00FF00FF00FF00LL)  >>  8);
 379     if (shamt & 16) x |= ((x & 0x0000FFFF0000FFFFLL)  << 16) |
 380                          ((x & 0xFFFF0000FFFF0000LL)  >> 16);
 381     if (shamt & 32) x |= ((x & 0x00000000FFFFFFFFLL)  << 32) |
 382                          ((x & 0xFFFFFFFF00000000LL)  >> 32);
 383     return x;
 384 }
 385
 386 ```
 387
 388 # cmix
 389
 390 based on RV bitmanip, covered by ternary bitops
 391
 392 ```
 393 uint_xlen_t cmix(uint_xlen_t RA, uint_xlen_t RB, uint_xlen_t RC) {  // bitwise select controlled by RB
 394     return (RA & RB) | (RC & ~RB);  // per bit: RA where RB is 1, RC where RB is 0
 395 }
 396 ```
 397
 398 # carryless mul
 399
 400 based on RV bitmanip
 401 see https://en.wikipedia.org/wiki/CLMUL_instruction_set
 402
 403 ```
 404 uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)  // carry-less multiply: partial products are combined with XOR instead of addition
 405 {
 406     uint_xlen_t x = 0;
 407     for (int i = 0; i < XLEN; i++)
 408         if ((RB >> i) & 1)
 409             x ^= RA << i;  // bits shifted beyond the width of uint_xlen_t are dropped
 410     return x;
 411 }
 412 uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)  // XORs together the bits that RA << i pushes above bit XLEN-1, for each set bit i of RB
 413 {
 414     uint_xlen_t x = 0;
 415     for (int i = 1; i < XLEN; i++)  // starts at 1: i = 0 would need a shift by XLEN
 416         if ((RB >> i) & 1)
 417             x ^= RA >> (XLEN-i);
 418     return x;
 419 }
 420 uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)  // like clmulh but each term is shifted right by one bit less (XLEN-i-1)
 421 {
 422     uint_xlen_t x = 0;
 423     for (int i = 0; i < XLEN; i++)
 424         if ((RB >> i) & 1)
 425             x ^= RA >> (XLEN-i-1);
 426     return x;
 427 }
 428 ```
 429 # Galois Field Multiply
 430
 431 ```
 432 from functools import reduce
 433
 434 # constants used in the multGF2 function
 435 mask1 = mask2 = polyred = None
 436
 437 def setGF2(degree, irPoly):
 438     """Define parameters of binary finite field GF(2^m)/g(x)
 439        - degree: extension degree of binary field
 440        - irPoly: coefficients of irreducible polynomial g(x)
 441     """
 442     def i2P(sInt):
 443         """Convert an integer into a polynomial"""
 444         return [(sInt >> i) & 1
 445                 for i in reversed(range(sInt.bit_length()))]  # bit list, most significant bit first
 446
 447     global mask1, mask2, polyred
 448     mask1 = mask2 = 1 << degree  # mask1: the single bit at position `degree`
 449     mask2 -= 1  # mask2: the low `degree` bits
 450     polyred = reduce(lambda x, y: (x << 1) + y, i2P(irPoly)[1:])  # irPoly with its leading coefficient dropped
 451
 452 def multGF2(p1, p2):
 453     """Multiply two polynomials in GF(2^m)/g(x)"""
 454     p = 0
 455     while p2:  # one iteration per remaining bit of p2, least significant first
 456         if p2 & 1:
 457             p ^= p1  # add (XOR) the current multiple of p1
 458         p1 <<= 1  # multiply p1 by x
 459         if p1 & mask1:
 460             p1 ^= polyred  # reduce: the bit at position `degree` is replaced by polyred in the low bits
 461         p2 >>= 1
 462     return p & mask2  # bits at or above position `degree` are discarded
 463
 464 if __name__ == "__main__":
 465
 466     # Define binary field GF(2^3)/x^3 + x + 1
 467     setGF2(3, 0b1011)
 468
 469     # Evaluate the product (x^2 + x + 1)(x^2 + 1)
 470     print("{:02x}".format(multGF2(0b111, 0b101)))  # prints 06 (x^2 + x)
 471
 472     # Define binary field GF(2^8)/x^8 + x^4 + x^3 + x + 1
 473     # (used in the Advanced Encryption Standard-AES)
 474     setGF2(8, 0b100011011)
 475
 476     # Evaluate the product (x^7)(x^7 + x + 1)
 477     print("{:02x}".format(multGF2(0b10000000, 0b10000011)))  # prints 01
 478 ```
 479
 480 # crc
 481
 482 ```
 483 uint_xlen_t crc32(uint_xlen_t x, int nbits)  // runs nbits bit-serial CRC steps on x using the constant 0xEDB88320
 484 {
 485     for (int i = 0; i < nbits; i++)
 486         x = (x >> 1) ^ (0xEDB88320 & ~((x&1)-1));  // (x&1)-1 is 0 when the low bit is 1, so the constant is XORed in only then
 487     return x;
 488 }
 489 uint_xlen_t crc32c(uint_xlen_t x, int nbits)  // same loop as crc32 but with the constant 0x82F63B78
 490 {
 491     for (int i = 0; i < nbits; i++)
 492         x = (x >> 1) ^ (0x82F63B78 & ~((x&1)-1));  // (x&1)-1 is 0 when the low bit is 1, so the constant is XORed in only then
 493     return x;
 494 }
 495 uint_xlen_t crc32_b(uint_xlen_t RA) { return crc32(RA, 8); }  // _b/_h/_w/_d suffix selects 8/16/32/64 steps
 496 uint_xlen_t crc32_h(uint_xlen_t RA) { return crc32(RA, 16); }
 497 uint_xlen_t crc32_w(uint_xlen_t RA) { return crc32(RA, 32); }
 498 uint_xlen_t crc32c_b(uint_xlen_t RA) { return crc32c(RA, 8); }
 499 uint_xlen_t crc32c_h(uint_xlen_t RA) { return crc32c(RA, 16); }
 500 uint_xlen_t crc32c_w(uint_xlen_t RA) { return crc32c(RA, 32); }
 501 #if XLEN > 32  // the 64-step variants exist only when XLEN is larger than 32
 502 uint_xlen_t crc32_d (uint_xlen_t RA) { return crc32 (RA, 64); }
 503 uint_xlen_t crc32c_d(uint_xlen_t RA) { return crc32c(RA, 64); }
 504 #endif
 505 ```
 506
 507 # bitmatrix
 508
 509 ```
 510 uint64_t bmatflip(uint64_t RA)  // three full shfl64 passes; bmatxor/bmator call this to obtain the transpose of an 8x8 bit matrix
 511 {
 512     uint64_t x = RA;
 513     x = shfl64(x, 31);  // shamt 31 enables all five shuffle stages
 514     x = shfl64(x, 31);
 515     x = shfl64(x, 31);
 516     return x;
 517 }
 518 uint64_t bmatxor(uint64_t RA, uint64_t RB)  // 8x8 bit-matrix multiply: result bit = parity of (row of RA AND column of RB)
 519 {
 520     // transpose of RB
 521     uint64_t RBt = bmatflip(RB);
 522     uint8_t u[8]; // rows of RA
 523     uint8_t v[8]; // cols of RB
 524     for (int i = 0; i < 8; i++) {
 525         u[i] = RA >> (i*8);  // byte i of RA
 526         v[i] = RBt >> (i*8);  // byte i of the transposed RB
 527     }
 528     uint64_t x = 0;
 529     for (int i = 0; i < 64; i++) {  // result bit i: row i/8, column i%8
 530         if (pcnt(u[i / 8] & v[i % 8]) & 1)  // odd number of matching bits
 531             x |= 1ULL << i;  // unsigned literal: 1LL << 63 would overflow a signed long long
 532     }
 533     return x;
 534 }
 535 uint64_t bmator(uint64_t RA, uint64_t RB)  // 8x8 bit-matrix multiply: result bit = 1 if (row of RA AND column of RB) is nonzero
 536 {
 537     // transpose of RB
 538     uint64_t RBt = bmatflip(RB);
 539     uint8_t u[8]; // rows of RA
 540     uint8_t v[8]; // cols of RB
 541     for (int i = 0; i < 8; i++) {
 542         u[i] = RA >> (i*8);  // byte i of RA
 543         v[i] = RBt >> (i*8);  // byte i of the transposed RB
 544     }
 545     uint64_t x = 0;
 546     for (int i = 0; i < 64; i++) {  // result bit i: row i/8, column i%8
 547         if ((u[i / 8] & v[i % 8]) != 0)  // at least one matching bit
 548             x |= 1ULL << i;  // unsigned literal: 1LL << 63 would overflow a signed long long
 549     }
 550     return x;
 551 }
 552
 553 ```