openpower/sv/bitmanip.mdwn

   1 [[!tag standards]]
   2
   3 # summary
   4
   5 1-op and variants
   6
   7 | dest | src1 | subop | op       |
   8 | ---- | ---- | ----- | -------- |
   9 | RT   | RA   | ..    | bmatflip |
  10 | RT   | RA   | size  | crc32    |
  11 | RT   | RA   | size  | crc32c   |
  12
  13 2-op and variants
  14
  15 | dest | src1 | src2 | subop | op       |
  16 | ---- | ---- | ---- | ----- | -------- |
  17 | RT   | RA   | RB   | or    | bmatflip |
  18 | RT   | RA   | RB   | xor   | bmatflip |
  19 | RT   | RA   | RB   | bdep  | dep/ext  |
  20 | RT   | RA   | RB   | bext  | dep/ext  |
  21 | RT   | RA   | RB   |       | grev  |
  22 | RT   | RA   | RB   |       | gorc |
  23 | RT   | RA   | RB   | shuf  | shuffle |
  24 | RT   | RA   | RB   | unshuf| shuffle |
  25 | RT   | RA   | RB   | width | xperm  |
  26 | RT   | RA   | RB   | type  | clmul |
  27 | RT   | RA   | RB   | type | minmax |
  28 | RT   | RA   | RB   |  |  |
  29 | RT   | RA   | RB   |  |  |
  30 | RT   | RA   | RB   |  |  |
  31
  32 3 ops
  33
  34 * bitmask swt/extract
  35 * ternary bitops
  36
  37 ops
  38
  39 | 0.5|6.10|11.15|16.20| 21.22 | 23 | 24..30  |31| name |
  40 | -- | -- | --- | --- | ----- | -- | ------- |--| ---- |
  41 | NN | RA | RB  |     |       | 0  | 0000110 |Rc| rsvd   |
  42 | NN | RA | RB  | RC  | itype | 1  | 0000110 |Rc| xperm |
  43 | NN | RA | RB  | RC  | itype | 0  | 0100110 |Rc| minmax |
  44 | NN | RA | RB  |     |       | 1  | 0100110 |Rc| rsvd |
  45 | NN | RA | RB  | sh  | itype | SH | 1000110 |Rc| bmopsi |
  46 | NN | RA | RB  |     |       |    | 1100110 |Rc| rsvd |
  47 | NN | RA | RB  | RC  | itype | 0  | 0001110 |Rc| clmul |
  48 | NN | RA | RB  | sh  | itype | 0  | 0101110 |Rc| clmulw |
  49 | NN | RA | RB  | RC  | 00    | 0  | 0010110 |Rc| gorc |
  50 | NN | RA | RB  | sh  | 00    | SH | 1010110 |Rc| gorci |
  51 | NN | RA | RB  | RC  | 00    | 0  | 0110110 |Rc| gorcw |
  52 | NN | RA | RB  | sh  | 00    | 0  | 1110110 |Rc| gorcwi |
  53 | NN | RA | RB  | RC  | 00    | 1  | 1110110 |Rc| bmator  |
  54 | NN | RA | RB  | RC  | 01    | 0  | 0010110 |Rc| grev |
  55 | NN | RA | RB  | sh  | 01    | SH | 1010110 |Rc| grevi |
  56 | NN | RA | RB  | RC  | 01    | 0  | 0110110 |Rc| grevw |
  57 | NN | RA | RB  | sh  | 01    | 0  | 1110110 |Rc| grevwi |
  58 | NN | RA | RB  | RC  | 01    | 1  | 1110110 |Rc| bmatxor   |
  59 | NN | RA | RB  | RC  | 10    | 0  | 0010110 |Rc| shfl |
  60 | NN | RA | RB  | sh  | 10    | SH | 1010110 |Rc| shfli |
  61 | NN | RA | RB  | RC  | 10    | 0  | 0110110 |Rc| shflw |
  62 | NN | RA | RB  | RC  | 10    | 0  | 1110110 |Rc| bdep   |
  63 | NN | RA | RB  | RC  | 10    | 1  | 1110110 |Rc| bext  |
  64 | NN | RA | RB  |     | 11    |    | 1110110 |Rc| rsvd  |
  65 | NN | RA | RB  |     |       |    | NN11110 |Rc| rsvd  |
  66
  67 # bit to byte permute
  68
  69 similar to matrix permute in RV bitmanip, which has XOR and OR variants
  70
  71     do j = 0 to 7
  72       do k = 0 to 7
  73          b = VSR[VRB+32].dword[i].byte[k].bit[j]
  74          VSR[VRT+32].dword[i].byte[j].bit[k] = b
  75
  76 # vector bit deposit
  77
  78 vpdepd VRT,VRA,VRB, identical to RV bitmamip bdep
  79
  80     do while(m < 64)
  81        if VSR[VRB+32].dword[i].bit[63-m]=1 then do
  82           result = VSR[VRA+32].dword[i].bit[63-k]
  83           VSR[VRT+32].dword[i].bit[63-m] = result
  84           k = k + 1
  85        m = m + 1
  86
  87 ```
  88
  89 uint_xlen_t bdep(uint_xlen_t RA, uint_xlen_t RB)
  90 {
  91     uint_xlen_t r = 0;
  92     for (int i = 0, j = 0; i < XLEN; i++)
  93         if ((RB >> i) & 1) {
  94             if ((RA >> j) & 1)
  95                 r |= uint_xlen_t(1) << i;
  96             j++;
  97         }
  98     return r;
  99 }
 100
 101 ```
 102
 103 # vector bit extract
 104
 105 other way round: identical to RV bext
 106
 107 ```
 108 uint_xlen_t bext(uint_xlen_t RA, uint_xlen_t RB)
 109 {
 110     uint_xlen_t r = 0;
 111     for (int i = 0, j = 0; i < XLEN; i++)
 112         if ((RB >> i) & 1) {
 113             if ((RA >> i) & 1)
 114                 r |= uint_xlen_t(1) << j;
 115             j++;
 116         }
 117     return r;
 118 }
 119 ```
 120
 121 # int min/max
 122
 123 signed and unsigned min/max for integer.  this is sort-of partly synthesiseable in [[sv/svp64]] with pred-result as long as the dest reg is one of the sources, but not both signed and unsigned.  when the dest is also one of the srces and the mv fails due to the CR bittest failing this will only overwrite the dest where the src is greater (or less).
 124
 125 signed/unsigned min/max gives more flexibility.
 126
 127 # ternary bitops
 128
 129 Similar to FPGA LUTs: for every bit perform a lookup into a table using an 8bit immediate, or in another register
 130
 131 | 0.5|6.10|11.15|16.20| 21..25| 26..30   |31|
 132 | -- | -- | --- | --- | ----- | -------- |--|
 133 | NN | RT | RA  | RB  | im0-4 | im5-7 00 |Rc|
 134
 135     for i in range(64):
 136         idx = RT[i] << 2 | RA[i] << 1 | RB[i]
 137         RT[i] = (imm & (1<<idx)) != 0
 138
 139 bits 21..22 may be used to specify a mode, such as treating the whole integer zero/nonzero and putting 1/0 in the result, rather than bitwise test.
 140
 141 a 4 operand variant which becomes more along the lines of an FPGA:
 142
 143 | 0.5|6.10|11.15|16.20|21.25| 26...30  |31|
 144 | -- | -- | --- | --- | --- | -------- |--|
 145 | NN | RT | RA  | RB  | RC  | mode 001 |Rc|
 146
 147     for i in range(64):
 148         idx = RT[i] << 2 | RA[i] << 1 | RB[i]
 149         RT[i] = (RC & (1<<idx)) != 0
 150
 151 mode (2 bit) may be used to do inversion of ordering, similar to carryless mul,
 152 3 modes.
 153
 154 also, another possible variant involving swizzle and vec4:
 155
 156 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
 157 | -- | -- | --- | ----- | ---- | ----- |--|
 158 | NN | RT | RA  | imm   | mask | 101   |1 |
 159
 160     for i in range(8):
 161         idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
 162         res = (imm & (1<<idx)) != 0
 163         for j in range(3):
 164              if mask[j]: RT[i+j*8] = res
 165
 166 another mode selection would be CRs not Ints.
 167
 168 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31|
 169 | -- | -- | --- | --- |- |-----|----- | -----|--|
 170 | NN | BA | BB  | BC  |0 |imm  | mask | 101  |0 |
 171
 172     for i in range(4):
 173         if not mask[i] continue
 174         idx = crregs[BA][i] << 2 |
 175               crregs[BB][i] << 1 |
 176               crregs[BC][i]
 177         crregs[BA][i] = (imm & (1<<idx)) != 0
 178
 179 # bitmask set
 180
 181 based on RV bitmanip singlebit set, instruction format similar to shift
 182 [[isa/fixedshift]].  bmext is actually covered already (shift-with-mask).
 183 however bitmask-invert is not, and set/clr are not covered, although they can ise the same Shift ALU.
 184
 185 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31|
 186 | -- | -- | --- | --- | --- | ------- |--|
 187 | NN | RT | RA  | RB  | RC  | mode 010 |Rc|
 188
 189 ```
 190 uint_xlen_t bmset(RA, RB, sh)
 191 {
 192     int shamt = RB & (XLEN - 1);
 193     mask = (2<<sh)-1;
 194     return RA | (mask << shamt);
 195 }
 196
 197 uint_xlen_t bmclr(RA, RB, sh)
 198 {
 199     int shamt = RB & (XLEN - 1);
 200     mask = (2<<sh)-1;
 201     return RA & ~(mask << shamt);
 202 }
 203
 204 uint_xlen_t bminv(RA, RB, sh)
 205 {
 206     int shamt = RB & (XLEN - 1);
 207     mask = (2<<sh)-1;
 208     return RA ^ (mask << shamt);
 209 }
 210
 211 uint_xlen_t bmext(RA, RB, sh)
 212 {
 213     int shamt = RB & (XLEN - 1);
 214     mask = (2<<sh)-1;
 215     return mask & (RA >> shamt);
 216 }
 217 ```
 218
 219 # grev
 220
 221 based on RV bitmanip
 222
 223 ```
 224 uint64_t grev64(uint64_t RA, uint64_t RB)
 225 {
 226     uint64_t x = RA;
 227     int shamt = RB & 63;
 228     if (shamt & 1) x = ((x &  0x5555555555555555LL) <<  1) |
 229                         ((x & 0xAAAAAAAAAAAAAAAALL) >>  1);
 230     if (shamt & 2) x = ((x &  0x3333333333333333LL) <<  2) |
 231                         ((x & 0xCCCCCCCCCCCCCCCCLL) >>  2);
 232     if (shamt & 4) x = ((x &  0x0F0F0F0F0F0F0F0FLL) <<  4) |
 233                         ((x & 0xF0F0F0F0F0F0F0F0LL) >>  4);
 234     if (shamt & 8) x = ((x &  0x00FF00FF00FF00FFLL) <<  8) |
 235                         ((x & 0xFF00FF00FF00FF00LL) >>  8);
 236     if (shamt & 16) x = ((x & 0x0000FFFF0000FFFFLL) << 16) |
 237                         ((x & 0xFFFF0000FFFF0000LL) >> 16);
 238     if (shamt & 32) x = ((x & 0x00000000FFFFFFFFLL) << 32) |
 239                         ((x & 0xFFFFFFFF00000000LL) >> 32);
 240     return x;
 241 }
 242
 243 ```
 244
 245 # shuffle / unshuffle
 246
 247 based on RV bitmanip
 248
 249 ```
 250 uint32_t shfl32(uint32_t RA, uint32_t RB)
 251 {
 252     uint32_t x = RA;
 253     int shamt = RB & 15;
 254     if (shamt & 8) x  = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
 255     if (shamt & 4) x  = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
 256     if (shamt & 2) x  = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
 257     if (shamt & 1) x  = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
 258     return x;
 259 }
 260 uint32_t unshfl32(uint32_t RA, uint32_t RB)
 261 {
 262     uint32_t x = RA;
 263     int shamt = RB & 15;
 264     if (shamt & 1) x  = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
 265     if (shamt & 2) x  = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
 266     if (shamt & 4) x  = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
 267     if (shamt & 8) x  = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
 268     return x;
 269 }
 270
 271 uint64_t shuffle64_stage(uint64_t src, uint64_t maskL, uint64_t maskR, int N)
 272 {
 273     uint64_t x = src & ~(maskL | maskR);
 274     x |= ((src << N) & maskL) | ((src >> N) & maskR);
 275     return x;
 276 }
 277 uint64_t shfl64(uint64_t RA, uint64_t RB)
 278 {
 279     uint64_t x = RA;
 280     int shamt = RB & 31;
 281     if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
 282                                            0x00000000ffff0000LL, 16);
 283     if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
 284                                            0x0000ff000000ff00LL, 8);
 285     if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
 286                                            0x00f000f000f000f0LL, 4);
 287     if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
 288                                            0x0c0c0c0c0c0c0c0cLL, 2);
 289     if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
 290                                            0x2222222222222222LL, 1);
 291     return x;
 292 }
 293 uint64_t unshfl64(uint64_t RA, uint64_t RB)
 294 {
 295     uint64_t x = RA;
 296     int shamt = RB & 31;
 297     if (shamt &  1) x = shuffle64_stage(x, 0x4444444444444444LL,
 298                                            0x2222222222222222LL, 1);
 299     if (shamt &  2) x = shuffle64_stage(x, 0x3030303030303030LL,
 300                                            0x0c0c0c0c0c0c0c0cLL, 2);
 301     if (shamt &  4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
 302                                            0x00f000f000f000f0LL, 4);
 303     if (shamt &  8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
 304                                            0x0000ff000000ff00LL, 8);
 305     if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
 306                                            0x00000000ffff0000LL, 16);
 307     return x;
 308 }
 309 ```
 310
 311 # xperm
 312
 313 based on RV bitmanip
 314
 315 ```
 316 uint_xlen_t xperm(uint_xlen_t RA, uint_xlen_t RB, int sz_log2)
 317 {
 318     uint_xlen_t r = 0;
 319     uint_xlen_t sz = 1LL << sz_log2;
 320     uint_xlen_t mask = (1LL << sz) - 1;
 321     for (int i = 0; i < XLEN; i += sz) {
 322         uint_xlen_t pos = ((RB >> i) & mask) << sz_log2;
 323         if (pos < XLEN)
 324             r |= ((RA >> pos) & mask) << i;
 325     }
 326     return r;
 327 }
 328 uint_xlen_t xperm_n (uint_xlen_t RA, uint_xlen_t RB)
 329 {  return xperm(RA, RB, 2); }
 330 uint_xlen_t xperm_b (uint_xlen_t RA, uint_xlen_t RB)
 331 {  return xperm(RA, RB, 3); }
 332 uint_xlen_t xperm_h (uint_xlen_t RA, uint_xlen_t RB)
 333 {  return xperm(RA, RB, 4); }
 334 uint_xlen_t xperm_w (uint_xlen_t RA, uint_xlen_t RB)
 335 {  return xperm(RA, RB, 5); }
 336 ```
 337
 338 # gorc
 339
 340 based on RV bitmanip
 341
 342 ```
 343 uint32_t gorc32(uint32_t RA, uint32_t RB)
 344 {
 345     uint32_t x = RA;
 346     int shamt = RB & 31;
 347     if (shamt & 1) x |= ((x & 0x55555555) << 1)   |  ((x &  0xAAAAAAAA) >> 1);
 348     if (shamt & 2) x |= ((x & 0x33333333) << 2)   |  ((x &  0xCCCCCCCC) >> 2);
 349     if (shamt & 4) x |= ((x & 0x0F0F0F0F) << 4)   |  ((x &  0xF0F0F0F0) >> 4);
 350     if (shamt & 8) x |= ((x & 0x00FF00FF) << 8)   |  ((x &  0xFF00FF00) >> 8);
 351     if (shamt & 16) x |= ((x & 0x0000FFFF) << 16) |  ((x &  0xFFFF0000) >> 16);
 352     return x;
 353 }
 354 uint64_t gorc64(uint64_t RA, uint64_t RB)
 355 {
 356     uint64_t x = RA;
 357     int shamt = RB & 63;
 358     if (shamt & 1) x |= ((x & 0x5555555555555555LL)   <<   1) |
 359                          ((x & 0xAAAAAAAAAAAAAAAALL)  >>  1);
 360     if (shamt & 2) x |= ((x & 0x3333333333333333LL)   <<   2) |
 361                          ((x & 0xCCCCCCCCCCCCCCCCLL)  >>  2);
 362     if (shamt & 4) x |= ((x & 0x0F0F0F0F0F0F0F0FLL)   <<   4) |
 363                          ((x & 0xF0F0F0F0F0F0F0F0LL)  >>  4);
 364     if (shamt & 8) x |= ((x & 0x00FF00FF00FF00FFLL)   <<   8) |
 365                          ((x & 0xFF00FF00FF00FF00LL)  >>  8);
 366     if (shamt & 16) x |= ((x & 0x0000FFFF0000FFFFLL)  << 16) |
 367                          ((x & 0xFFFF0000FFFF0000LL)  >> 16);
 368     if (shamt & 32) x |= ((x & 0x00000000FFFFFFFFLL)  << 32) |
 369                          ((x & 0xFFFFFFFF00000000LL)  >> 32);
 370     return x;
 371 }
 372
 373 ```
 374
 375 # cmix
 376
 377 based on RV bitmanip, covered by ternary bitops
 378
 379 ```
 380 uint_xlen_t cmix(uint_xlen_t RA, uint_xlen_t RB, uint_xlen_t RC) {
 381     return (RA & RB) | (RC & ~RB);
 382 }
 383 ```
 384
 385 # carryless mul
 386
 387 based on RV bitmanip
 388 see https://en.wikipedia.org/wiki/CLMUL_instruction_set
 389
 390 ```
 391 uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
 392 {
 393     uint_xlen_t x = 0;
 394     for (int i = 0; i < XLEN; i++)
 395         if ((RB >> i) & 1)
 396             x ^= RA << i;
 397     return x;
 398 }
 399 uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)
 400 {
 401     uint_xlen_t x = 0;
 402     for (int i = 1; i < XLEN; i++)
 403         if ((RB >> i) & 1)
 404             x ^= RA >> (XLEN-i);
 405     return x;
 406 }
 407 uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)
 408 {
 409     uint_xlen_t x = 0;
 410     for (int i = 0; i < XLEN; i++)
 411         if ((RB >> i) & 1)
 412             x ^= RA >> (XLEN-i-1);
 413     return x;
 414 }
 415 ```
 416 # Galois Field
 417
 418 ## Multiply
 419
 420 this requires 3 parameters and a "degree"
 421
 422     RT = GFMUL(RA, RB, gfdegree, modulo=RC)
 423
 424 realistically with the degree also needing to be an immediate it should be brought down to an overwrite version:
 425
 426     RS = GFMUL(RS, RA, gfdegree, modulo=RB)
 427
 428 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31|
 429 | -- | -- | --- | --- | --- | ------- |--|
 430 | NN | RS | RA  | RB  | deg | 00  011 |Rc|
 431
 432 where the SimpleV variant may override RS-as-src differently from RS-as-dest
 433
 434
 435
 436 ```
 437 from functools import reduce
 438
 439 # constants used in the multGF2 function
 440 mask1 = mask2 = polyred = None
 441
 442 def setGF2(degree, irPoly):
 443     """Define parameters of binary finite field GF(2^m)/g(x)
 444        - degree: extension degree of binary field
 445        - irPoly: coefficients of irreducible polynomial g(x)
 446     """
 447     def i2P(sInt):
 448         """Convert an integer into a polynomial"""
 449         return [(sInt >> i) & 1
 450                 for i in reversed(range(sInt.bit_length()))]
 451
 452     global mask1, mask2, polyred
 453     mask1 = mask2 = 1 << degree
 454     mask2 -= 1
 455     polyred = reduce(lambda x, y: (x << 1) + y, i2P(irPoly)[1:])
 456
 457 def multGF2(p1, p2):
 458     """Multiply two polynomials in GF(2^m)/g(x)"""
 459     p = 0
 460     while p2:
 461         if p2 & 1:
 462             p ^= p1
 463         p1 <<= 1
 464         if p1 & mask1:
 465             p1 ^= polyred
 466         p2 >>= 1
 467     return p & mask2
 468
 469 if __name__ == "__main__":
 470
 471     # Define binary field GF(2^3)/x^3 + x + 1
 472     setGF2(3, 0b1011)
 473
 474     # Evaluate the product (x^2 + x + 1)(x^2 + 1)
 475     print("{:02x}".format(multGF2(0b111, 0b101)))
 476
 477     # Define binary field GF(2^8)/x^8 + x^4 + x^3 + x + 1
 478     # (used in the Advanced Encryption Standard-AES)
 479     setGF2(8, 0b100011011)
 480
 481     # Evaluate the product (x^7)(x^7 + x + 1)
 482     print("{:02x}".format(multGF2(0b10000000, 0b10000011)))
 483 ```
 484 ## GF add
 485
 486     RS = GFADD(RS, RA|0, gfdegree, modulo=RB)
 487
 488 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31|
 489 | -- | -- | --- | --- | --- | ------- |--|
 490 | NN | RS | RA  | RB  | deg | 01  011 |Rc|
 491
 492 ## gf invert
 493
 494 ```
 495 def gf_degree(a) :
 496   res = 0
 497   a >>= 1
 498   while (a != 0) :
 499     a >>= 1;
 500     res += 1;
 501   return res
 502
 503 def gf_invert(a, mod=0x1B) :
 504   v = mod
 505   g1 = 1
 506   g2 = 0
 507   j = gf_degree(a) - 8
 508
 509   while (a != 1) :
 510     if (j < 0) :
 511       a, v = v, a
 512       g1, g2 = g2, g1
 513       j = -j
 514
 515     a ^= v << j
 516     g1 ^= g2 << j
 517
 518     a %= 256  # Emulating 8-bit overflow
 519     g1 %= 256 # Emulating 8-bit overflow
 520
 521     j = gf_degree(a) - gf_degree(v)
 522
 523   return g1
 524 ```
 525
 526 # crc
 527
 528 * <https://stackoverflow.com/questions/21171733/calculating-constants-for-crc32-using-pclmulqdq>
 529 * <https://en.wikipedia.org/wiki/Cyclic_redundancy_check>
 530
 531 ```
 532 uint_xlen_t crc32(uint_xlen_t x, int nbits)
 533 {
 534     for (int i = 0; i < nbits; i++)
 535         x = (x >> 1) ^ (0xEDB88320 & ~((x&1)-1));
 536     return x;
 537 }
 538 uint_xlen_t crc32c(uint_xlen_t x, int nbits)
 539 {
 540     for (int i = 0; i < nbits; i++)
 541         x = (x >> 1) ^ (0x82F63B78 & ~((x&1)-1));
 542     return x;
 543 }
 544 uint_xlen_t crc32_b(uint_xlen_t RA) { return crc32(RA, 8); }
 545 uint_xlen_t crc32_h(uint_xlen_t RA) { return crc32(RA, 16); }
 546 uint_xlen_t crc32_w(uint_xlen_t RA) { return crc32(RA, 32); }
 547 uint_xlen_t crc32c_b(uint_xlen_t RA) { return crc32c(RA, 8); }
 548 uint_xlen_t crc32c_h(uint_xlen_t RA) { return crc32c(RA, 16); }
 549 uint_xlen_t crc32c_w(uint_xlen_t RA) { return crc32c(RA, 32); }
 550 #if XLEN > 32
 551 uint_xlen_t crc32_d (uint_xlen_t RA) { return crc32 (RA, 64); }
 552 uint_xlen_t crc32c_d(uint_xlen_t RA) { return crc32c(RA, 64); }
 553 #endif
 554 ```
 555
 556 # bitmatrix
 557
 558 ```
 559 uint64_t bmatflip(uint64_t RA)
 560 {
 561     uint64_t x = RA;
 562     x = shfl64(x, 31);
 563     x = shfl64(x, 31);
 564     x = shfl64(x, 31);
 565     return x;
 566 }
 567 uint64_t bmatxor(uint64_t RA, uint64_t RB)
 568 {
 569     // transpose of RB
 570     uint64_t RBt = bmatflip(RB);
 571     uint8_t u[8]; // rows of RA
 572     uint8_t v[8]; // cols of RB
 573     for (int i = 0; i < 8; i++) {
 574         u[i] = RA >> (i*8);
 575         v[i] = RBt >> (i*8);
 576     }
 577     uint64_t x = 0;
 578     for (int i = 0; i < 64; i++) {
 579         if (pcnt(u[i / 8] & v[i % 8]) & 1)
 580             x |= 1LL << i;
 581     }
 582     return x;
 583 }
 584 uint64_t bmator(uint64_t RA, uint64_t RB)
 585 {
 586     // transpose of RB
 587     uint64_t RBt = bmatflip(RB);
 588     uint8_t u[8]; // rows of RA
 589     uint8_t v[8]; // cols of RB
 590     for (int i = 0; i < 8; i++) {
 591         u[i] = RA >> (i*8);
 592         v[i] = RBt >> (i*8);
 593     }
 594     uint64_t x = 0;
 595     for (int i = 0; i < 64; i++) {
 596         if ((u[i / 8] & v[i % 8]) != 0)
 597             x |= 1LL << i;
 598     }
 599     return x;
 600 }
 601
 602 ```