openpower/sv/bitmanip.mdwn

   1 [[!tag standards]]
   2
   3 # summary
   4
   5 1-op and variants
   6
   7 | dest | src1 | subop | op       |
   8 | ---- | ---- | ----- | -------- |
   9 | RT   | RA   | ..    | bmatflip |
  10 | RT   | RA   | size  | crc32    |
  11 | RT   | RA   | size  | crc32c   |
  12
  13 2-op and variants
  14
  15 | dest | src1 | src2 | subop | op       |
  16 | ---- | ---- | ---- | ----- | -------- |
  17 | RT   | RA   | RB   | or    | bmatflip |
  18 | RT   | RA   | RB   | xor   | bmatflip |
  19 | RT   | RA   | RB   | bdep  | dep/ext  |
  20 | RT   | RA   | RB   | bext  | dep/ext  |
  21 | RT   | RA   | RB   |       | grev  |
  22 | RT   | RA   | RB   |       | gorc |
  23 | RT   | RA   | RB   | shuf  | shuffle |
  24 | RT   | RA   | RB   | unshuf| shuffle |
  25 | RT   | RA   | RB   | width | xperm  |
  26 | RT   | RA   | RB   | type  | clmul |
  27 | RT   | RA   | RB   | type | minmax |
  28 | RT   | RA   | RB   |  |  |
  29 | RT   | RA   | RB   |  |  |
  30 | RT   | RA   | RB   |  |  |
  31
  32 3 ops
  33
  34 * bitmask swt/extract
  35 * ternary bitops
  36
  37 ops
  38
  39 | 0.5|6.10|11.15|16.20| 21.22 | 23 | 24..30  |31| name |
  40 | -- | -- | --- | --- | ----- | -- | ------- |--| ---- |
  41 | NN | RA | RB  |     |       | 0  | 0000110 |Rc| rsvd   |
  42 | NN | RA | RB  | RC  | itype | 1  | 0000110 |Rc| xperm |
  43 | NN | RA | RB  | RC  | itype | 0  | 0100110 |Rc| minmax |
  44 | NN | RA | RB  |     |       | 1  | 0100110 |Rc| rsvd |
  45 | NN | RA | RB  | sh  | itype | SH | 1000110 |Rc| bmopsi |
  46 | NN | RA | RB  |     |       |    | 1100110 |Rc| rsvd |
  47 | NN | RA | RB  | RC  | itype | 0  | 0001110 |Rc| clmul |
  48 | NN | RA | RB  | sh  | itype | 0  | 0101110 |Rc| clmulw |
  49 | NN | RA | RB  | RC  | 00    | 0  | 0010110 |Rc| gorc |
  50 | NN | RA | RB  | sh  | 00    | SH | 1010110 |Rc| gorci |
  51 | NN | RA | RB  | RC  | 00    | 0  | 0110110 |Rc| gorcw |
  52 | NN | RA | RB  | sh  | 00    | 0  | 1110110 |Rc| gorcwi |
  53 | NN | RA | RB  | RC  | 00    | 1  | 1110110 |Rc| bmator  |
  54 | NN | RA | RB  | RC  | 01    | 0  | 0010110 |Rc| grev |
  55 | NN | RA | RB  | sh  | 01    | SH | 1010110 |Rc| grevi |
  56 | NN | RA | RB  | RC  | 01    | 0  | 0110110 |Rc| grevw |
  57 | NN | RA | RB  | sh  | 01    | 0  | 1110110 |Rc| grevwi |
  58 | NN | RA | RB  | RC  | 01    | 1  | 1110110 |Rc| bmatxor   |
  59 | NN | RA | RB  | RC  | 10    | 0  | 0010110 |Rc| shfl |
  60 | NN | RA | RB  | sh  | 10    | SH | 1010110 |Rc| shfli |
  61 | NN | RA | RB  | RC  | 10    | 0  | 0110110 |Rc| shflw |
  62 | NN | RA | RB  | RC  | 10    | 0  | 1110110 |Rc| bdep   |
  63 | NN | RA | RB  | RC  | 10    | 1  | 1110110 |Rc| bext  |
  64 | NN | RA | RB  |     | 11    |    | 1110110 |Rc| rsvd  |
  65 | NN | RA | RB  |     |       |    | NN11110 |Rc| rsvd  |
  66
  67 # bit to byte permute
  68
  69 similar to matrix permute in RV bitmanip, which has XOR and OR variants
  70
  71     do j = 0 to 7
  72       do k = 0 to 7
  73          b = VSR[VRB+32].dword[i].byte[k].bit[j]
  74          VSR[VRT+32].dword[i].byte[j].bit[k] = b
  75
  76 # vector bit deposit
  77
  78 vpdepd VRT,VRA,VRB, identical to RV bitmamip bdep
  79
  80     do while(m < 64)
  81        if VSR[VRB+32].dword[i].bit[63-m]=1 then do
  82           result = VSR[VRA+32].dword[i].bit[63-k]
  83           VSR[VRT+32].dword[i].bit[63-m] = result
  84           k = k + 1
  85        m = m + 1
  86
  87 ```
  88
  89 uint_xlen_t bdep(uint_xlen_t RA, uint_xlen_t RB)
  90 {
  91     uint_xlen_t r = 0;
  92     for (int i = 0, j = 0; i < XLEN; i++)
  93         if ((RB >> i) & 1) {
  94             if ((RA >> j) & 1)
  95                 r |= uint_xlen_t(1) << i;
  96             j++;
  97         }
  98     return r;
  99 }
 100
 101 ```
 102
 103 # vector bit extract
 104
 105 other way round: identical to RV bext
 106
 107 ```
 108 uint_xlen_t bext(uint_xlen_t RA, uint_xlen_t RB)
 109 {
 110     uint_xlen_t r = 0;
 111     for (int i = 0, j = 0; i < XLEN; i++)
 112         if ((RB >> i) & 1) {
 113             if ((RA >> i) & 1)
 114                 r |= uint_xlen_t(1) << j;
 115             j++;
 116         }
 117     return r;
 118 }
 119 ```
 120
 121 # int min/max
 122
 123 signed and unsigned min/max for integer.  this is sort-of partly synthesiseable in [[sv/svp64]] with pred-result as long as the dest reg is one of the sources, but not both signed and unsigned.  when the dest is also one of the srces and the mv fails due to the CR bittest failing this will only overwrite the dest where the src is greater (or less).
 124
 125 signed/unsigned min/max gives more flexibility.
 126
 127 # ternary bitops
 128
 129 Similar to FPGA LUTs: for every bit perform a lookup into a table using an 8bit immediate, or in another register
 130
 131 | 0.5|6.10|11.15|16.20| 21..25| 26..30   |31|
 132 | -- | -- | --- | --- | ----- | -------- |--|
 133 | NN | RT | RA  | RB  | im0-4 | im5-7 00 |Rc|
 134
 135     for i in range(64):
 136         idx = RT[i] << 2 | RA[i] << 1 | RB[i]
 137         RT[i] = (imm & (1<<idx)) != 0
 138
 139 bits 21..22 may be used to specify a mode, such as treating the whole integer zero/nonzero and putting 1/0 in the result, rather than bitwise test.
 140
 141 a 4 operand variant which becomes more along the lines of an FPGA:
 142
 143 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31|
 144 | -- | -- | --- | --- | --- | ------- |--|
 145 | NN | RT | RA  | RB  | RC  | mode 1  |1 |
 146
 147     for i in range(64):
 148         idx = RT[i] << 2 | RA[i] << 1 | RB[i]
 149         RT[i] = (RC & (1<<idx)) != 0
 150
 151 mode (2 bit) may be used to do inversion of ordering, similar to carryless mul.
 152
 153 also, another possible variant involving swizzle and vec4:
 154
 155     for i in range(8):
 156         idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
 157         RT[i] = (RA.w[i] & (1<<idx)) != 0
 158
 159 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
 160 | -- | -- | --- | ----- | ---- | ----- |--|
 161 | NN | RT | RA  | xyzw  | mask | mode 1 |1 |
 162
 163     for i in range(8):
 164         idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
 165         res = (RA.w[i] & (1<<idx)) != 0
 166         for j in range(4):
 167              if mask[j]: RT[i+j*8] = res
 168
 169 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
 170 | -- | -- | --- | ----- | ---- | ----- |--|
 171 | NN | RT | RA  | imm   | mask | mode 1 |1 |
 172
 173     for i in range(8):
 174         idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
 175         res = (imm & (1<<idx)) != 0
 176         for j in range(3):
 177              if mask[j]: RT[i+j*8] = res
 178
 179 another mode selection would be CRs not Ints.
 180
 181 | 0.5|6.8 | 9.11|12.14|15.17|18.20| 21..25| 26.29|30|31|
 182 | -- | -- | --- | --- | --- |-----| ----- | ---- |--|--|
 183 | NN | BT | BA  | BB  | BC  |im5-7| im0-4 | mask |1 |0 |
 184
 185     for i in range(4):
 186         if not mask[i] continue
 187         idx = crregs[BA][i] << 2 |
 188               crregs[BB][i] << 1 |
 189               crregs[BC][i]
 190         crregs[BT][i] = (imm & (1<<idx)) != 0
 191
 192 # bitmask set
 193
 194 based on RV bitmanip singlebit set, instruction format similar to shift
 195 [[isa/fixedshift]].  bmext is actually covered already (shift-with-mask).
 196 however bitmask-invert is not, and set/clr are not covered, although they can ise the same Shift ALU.
 197
 198 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31|
 199 | -- | -- | --- | --- | --- | ------- |--|
 200 | NN | RT | RA  | RB  | RC  | mode 010 |Rc|
 201
 202 ```
 203 uint_xlen_t bmset(RA, RB, sh)
 204 {
 205     int shamt = RB & (XLEN - 1);
 206     mask = (2<<sh)-1;
 207     return RA | (mask << shamt);
 208 }
 209
 210 uint_xlen_t bmclr(RA, RB, sh)
 211 {
 212     int shamt = RB & (XLEN - 1);
 213     mask = (2<<sh)-1;
 214     return RA & ~(mask << shamt);
 215 }
 216
 217 uint_xlen_t bminv(RA, RB, sh)
 218 {
 219     int shamt = RB & (XLEN - 1);
 220     mask = (2<<sh)-1;
 221     return RA ^ (mask << shamt);
 222 }
 223
 224 uint_xlen_t bmext(RA, RB, sh)
 225 {
 226     int shamt = RB & (XLEN - 1);
 227     mask = (2<<sh)-1;
 228     return mask & (RA >> shamt);
 229 }
 230 ```
 231
 232 # grev
 233
 234 based on RV bitmanip
 235
 236 ```
 237 uint64_t grev64(uint64_t RA, uint64_t RB)
 238 {
 239     uint64_t x = RA;
 240     int shamt = RB & 63;
 241     if (shamt & 1) x = ((x &  0x5555555555555555LL) <<  1) |
 242                         ((x & 0xAAAAAAAAAAAAAAAALL) >>  1);
 243     if (shamt & 2) x = ((x &  0x3333333333333333LL) <<  2) |
 244                         ((x & 0xCCCCCCCCCCCCCCCCLL) >>  2);
 245     if (shamt & 4) x = ((x &  0x0F0F0F0F0F0F0F0FLL) <<  4) |
 246                         ((x & 0xF0F0F0F0F0F0F0F0LL) >>  4);
 247     if (shamt & 8) x = ((x &  0x00FF00FF00FF00FFLL) <<  8) |
 248                         ((x & 0xFF00FF00FF00FF00LL) >>  8);
 249     if (shamt & 16) x = ((x & 0x0000FFFF0000FFFFLL) << 16) |
 250                         ((x & 0xFFFF0000FFFF0000LL) >> 16);
 251     if (shamt & 32) x = ((x & 0x00000000FFFFFFFFLL) << 32) |
 252                         ((x & 0xFFFFFFFF00000000LL) >> 32);
 253     return x;
 254 }
 255
 256 ```
 257
 258 # shuffle / unshuffle
 259
 260 based on RV bitmanip
 261
 262 ```
 263 uint32_t shfl32(uint32_t RA, uint32_t RB)
 264 {
 265     uint32_t x = RA;
 266     int shamt = RB & 15;
 267     if (shamt & 8) x  = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
 268     if (shamt & 4) x  = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
 269     if (shamt & 2) x  = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
 270     if (shamt & 1) x  = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
 271     return x;
 272 }
 273 uint32_t unshfl32(uint32_t RA, uint32_t RB)
 274 {
 275     uint32_t x = RA;
 276     int shamt = RB & 15;
 277     if (shamt & 1) x  = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
 278     if (shamt & 2) x  = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
 279     if (shamt & 4) x  = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
 280     if (shamt & 8) x  = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
 281     return x;
 282 }
 283
 284 uint64_t shuffle64_stage(uint64_t src, uint64_t maskL, uint64_t maskR, int N)
 285 {
 286     uint64_t x = src & ~(maskL | maskR);
 287     x |= ((src << N) & maskL) | ((src >> N) & maskR);
 288     return x;
 289 }
 290 uint64_t shfl64(uint64_t RA, uint64_t RB)
 291 {
 292     uint64_t x = RA;
 293     int shamt = RB & 31;
 294     if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
 295                                            0x00000000ffff0000LL, 16);
 296     if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
 297                                            0x0000ff000000ff00LL, 8);
 298     if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
 299                                            0x00f000f000f000f0LL, 4);
 300     if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
 301                                            0x0c0c0c0c0c0c0c0cLL, 2);
 302     if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
 303                                            0x2222222222222222LL, 1);
 304     return x;
 305 }
 306 uint64_t unshfl64(uint64_t RA, uint64_t RB)
 307 {
 308     uint64_t x = RA;
 309     int shamt = RB & 31;
 310     if (shamt &  1) x = shuffle64_stage(x, 0x4444444444444444LL,
 311                                            0x2222222222222222LL, 1);
 312     if (shamt &  2) x = shuffle64_stage(x, 0x3030303030303030LL,
 313                                            0x0c0c0c0c0c0c0c0cLL, 2);
 314     if (shamt &  4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
 315                                            0x00f000f000f000f0LL, 4);
 316     if (shamt &  8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
 317                                            0x0000ff000000ff00LL, 8);
 318     if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
 319                                            0x00000000ffff0000LL, 16);
 320     return x;
 321 }
 322 ```
 323
 324 # xperm
 325
 326 based on RV bitmanip
 327
 328 ```
 329 uint_xlen_t xperm(uint_xlen_t RA, uint_xlen_t RB, int sz_log2)
 330 {
 331     uint_xlen_t r = 0;
 332     uint_xlen_t sz = 1LL << sz_log2;
 333     uint_xlen_t mask = (1LL << sz) - 1;
 334     for (int i = 0; i < XLEN; i += sz) {
 335         uint_xlen_t pos = ((RB >> i) & mask) << sz_log2;
 336         if (pos < XLEN)
 337             r |= ((RA >> pos) & mask) << i;
 338     }
 339     return r;
 340 }
 341 uint_xlen_t xperm_n (uint_xlen_t RA, uint_xlen_t RB)
 342 {  return xperm(RA, RB, 2); }
 343 uint_xlen_t xperm_b (uint_xlen_t RA, uint_xlen_t RB)
 344 {  return xperm(RA, RB, 3); }
 345 uint_xlen_t xperm_h (uint_xlen_t RA, uint_xlen_t RB)
 346 {  return xperm(RA, RB, 4); }
 347 uint_xlen_t xperm_w (uint_xlen_t RA, uint_xlen_t RB)
 348 {  return xperm(RA, RB, 5); }
 349 ```
 350
 351 # gorc
 352
 353 based on RV bitmanip
 354
 355 ```
 356 uint32_t gorc32(uint32_t RA, uint32_t RB)
 357 {
 358     uint32_t x = RA;
 359     int shamt = RB & 31;
 360     if (shamt & 1) x |= ((x & 0x55555555) << 1)   |  ((x &  0xAAAAAAAA) >> 1);
 361     if (shamt & 2) x |= ((x & 0x33333333) << 2)   |  ((x &  0xCCCCCCCC) >> 2);
 362     if (shamt & 4) x |= ((x & 0x0F0F0F0F) << 4)   |  ((x &  0xF0F0F0F0) >> 4);
 363     if (shamt & 8) x |= ((x & 0x00FF00FF) << 8)   |  ((x &  0xFF00FF00) >> 8);
 364     if (shamt & 16) x |= ((x & 0x0000FFFF) << 16) |  ((x &  0xFFFF0000) >> 16);
 365     return x;
 366 }
 367 uint64_t gorc64(uint64_t RA, uint64_t RB)
 368 {
 369     uint64_t x = RA;
 370     int shamt = RB & 63;
 371     if (shamt & 1) x |= ((x & 0x5555555555555555LL)   <<   1) |
 372                          ((x & 0xAAAAAAAAAAAAAAAALL)  >>  1);
 373     if (shamt & 2) x |= ((x & 0x3333333333333333LL)   <<   2) |
 374                          ((x & 0xCCCCCCCCCCCCCCCCLL)  >>  2);
 375     if (shamt & 4) x |= ((x & 0x0F0F0F0F0F0F0F0FLL)   <<   4) |
 376                          ((x & 0xF0F0F0F0F0F0F0F0LL)  >>  4);
 377     if (shamt & 8) x |= ((x & 0x00FF00FF00FF00FFLL)   <<   8) |
 378                          ((x & 0xFF00FF00FF00FF00LL)  >>  8);
 379     if (shamt & 16) x |= ((x & 0x0000FFFF0000FFFFLL)  << 16) |
 380                          ((x & 0xFFFF0000FFFF0000LL)  >> 16);
 381     if (shamt & 32) x |= ((x & 0x00000000FFFFFFFFLL)  << 32) |
 382                          ((x & 0xFFFFFFFF00000000LL)  >> 32);
 383     return x;
 384 }
 385
 386 ```
 387
 388 # cmix
 389
 390 based on RV bitmanip, covered by ternary bitops
 391
 392 ```
 393 uint_xlen_t cmix(uint_xlen_t RA, uint_xlen_t RB, uint_xlen_t RC) {
 394     return (RA & RB) | (RC & ~RB);
 395 }
 396 ```
 397
 398 # carryless mul
 399
 400 based on RV bitmanip
 401 see https://en.wikipedia.org/wiki/CLMUL_instruction_set
 402
 403 ```
 404 uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
 405 {
 406     uint_xlen_t x = 0;
 407     for (int i = 0; i < XLEN; i++)
 408         if ((RB >> i) & 1)
 409             x ^= RA << i;
 410     return x;
 411 }
 412 uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)
 413 {
 414     uint_xlen_t x = 0;
 415     for (int i = 1; i < XLEN; i++)
 416         if ((RB >> i) & 1)
 417             x ^= RA >> (XLEN-i);
 418     return x;
 419 }
 420 uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)
 421 {
 422     uint_xlen_t x = 0;
 423     for (int i = 0; i < XLEN; i++)
 424         if ((RB >> i) & 1)
 425             x ^= RA >> (XLEN-i-1);
 426     return x;
 427 }
 428 ```
 429 # Galois Field Multiply
 430
 431 this requires 3 parameters and a "degree"
 432
 433     RT = GFMUL(RA, RB, gfdegree, modulo=RC)
 434
 435 realistically with the degree also needing to be an immediate it should be brought down to an overwrite version:
 436
 437     RS = GFMUL(RS, RA, gfdegree, modulo=RB)
 438
 439 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31|
 440 | -- | -- | --- | --- | --- | ------- |--|
 441 | NN | RS | RA  | RB  | deg |         |  |
 442
 443 where the SimpleV variant may override RS-as-src differently from RS-as-dest
 444
 445
 446 ```
 447 from functools import reduce
 448
 449 # constants used in the multGF2 function
 450 mask1 = mask2 = polyred = None
 451
 452 def setGF2(degree, irPoly):
 453     """Define parameters of binary finite field GF(2^m)/g(x)
 454        - degree: extension degree of binary field
 455        - irPoly: coefficients of irreducible polynomial g(x)
 456     """
 457     def i2P(sInt):
 458         """Convert an integer into a polynomial"""
 459         return [(sInt >> i) & 1
 460                 for i in reversed(range(sInt.bit_length()))]
 461
 462     global mask1, mask2, polyred
 463     mask1 = mask2 = 1 << degree
 464     mask2 -= 1
 465     polyred = reduce(lambda x, y: (x << 1) + y, i2P(irPoly)[1:])
 466
 467 def multGF2(p1, p2):
 468     """Multiply two polynomials in GF(2^m)/g(x)"""
 469     p = 0
 470     while p2:
 471         if p2 & 1:
 472             p ^= p1
 473         p1 <<= 1
 474         if p1 & mask1:
 475             p1 ^= polyred
 476         p2 >>= 1
 477     return p & mask2
 478
 479 if __name__ == "__main__":
 480
 481     # Define binary field GF(2^3)/x^3 + x + 1
 482     setGF2(3, 0b1011)
 483
 484     # Evaluate the product (x^2 + x + 1)(x^2 + 1)
 485     print("{:02x}".format(multGF2(0b111, 0b101)))
 486
 487     # Define binary field GF(2^8)/x^8 + x^4 + x^3 + x + 1
 488     # (used in the Advanced Encryption Standard-AES)
 489     setGF2(8, 0b100011011)
 490
 491     # Evaluate the product (x^7)(x^7 + x + 1)
 492     print("{:02x}".format(multGF2(0b10000000, 0b10000011)))
 493 ```
 494
 495 # crc
 496
 497 ```
 498 uint_xlen_t crc32(uint_xlen_t x, int nbits)
 499 {
 500     for (int i = 0; i < nbits; i++)
 501         x = (x >> 1) ^ (0xEDB88320 & ~((x&1)-1));
 502     return x;
 503 }
 504 uint_xlen_t crc32c(uint_xlen_t x, int nbits)
 505 {
 506     for (int i = 0; i < nbits; i++)
 507         x = (x >> 1) ^ (0x82F63B78 & ~((x&1)-1));
 508     return x;
 509 }
 510 uint_xlen_t crc32_b(uint_xlen_t RA) { return crc32(RA, 8); }
 511 uint_xlen_t crc32_h(uint_xlen_t RA) { return crc32(RA, 16); }
 512 uint_xlen_t crc32_w(uint_xlen_t RA) { return crc32(RA, 32); }
 513 uint_xlen_t crc32c_b(uint_xlen_t RA) { return crc32c(RA, 8); }
 514 uint_xlen_t crc32c_h(uint_xlen_t RA) { return crc32c(RA, 16); }
 515 uint_xlen_t crc32c_w(uint_xlen_t RA) { return crc32c(RA, 32); }
 516 #if XLEN > 32
 517 uint_xlen_t crc32_d (uint_xlen_t RA) { return crc32 (RA, 64); }
 518 uint_xlen_t crc32c_d(uint_xlen_t RA) { return crc32c(RA, 64); }
 519 #endif
 520 ```
 521
 522 # bitmatrix
 523
 524 ```
 525 uint64_t bmatflip(uint64_t RA)
 526 {
 527     uint64_t x = RA;
 528     x = shfl64(x, 31);
 529     x = shfl64(x, 31);
 530     x = shfl64(x, 31);
 531     return x;
 532 }
 533 uint64_t bmatxor(uint64_t RA, uint64_t RB)
 534 {
 535     // transpose of RB
 536     uint64_t RBt = bmatflip(RB);
 537     uint8_t u[8]; // rows of RA
 538     uint8_t v[8]; // cols of RB
 539     for (int i = 0; i < 8; i++) {
 540         u[i] = RA >> (i*8);
 541         v[i] = RBt >> (i*8);
 542     }
 543     uint64_t x = 0;
 544     for (int i = 0; i < 64; i++) {
 545         if (pcnt(u[i / 8] & v[i % 8]) & 1)
 546             x |= 1LL << i;
 547     }
 548     return x;
 549 }
 550 uint64_t bmator(uint64_t RA, uint64_t RB)
 551 {
 552     // transpose of RB
 553     uint64_t RBt = bmatflip(RB);
 554     uint8_t u[8]; // rows of RA
 555     uint8_t v[8]; // cols of RB
 556     for (int i = 0; i < 8; i++) {
 557         u[i] = RA >> (i*8);
 558         v[i] = RBt >> (i*8);
 559     }
 560     uint64_t x = 0;
 561     for (int i = 0; i < 64; i++) {
 562         if ((u[i / 8] & v[i % 8]) != 0)
 563             x |= 1LL << i;
 564     }
 565     return x;
 566 }
 567
 568 ```