7 | dest | src1 | subop | op |
8 | ---- | ---- | ----- | -------- |
9 | RT | RA | .. | bmatflip |
10 | RT | RA | size | crc32 |
11 | RT | RA | size | crc32c |
15 | dest | src1 | src2 | subop | op |
16 | ---- | ---- | ---- | ----- | -------- |
17 | RT | RA | RB | or | bmatflip |
18 | RT | RA | RB | xor | bmatflip |
19 | RT | RA | RB | bdep | dep/ext |
20 | RT | RA | RB | bext | dep/ext |
21 | RT | RA | RB | | grev |
22 | RT | RA | RB | | gorc |
23 | RT | RA | RB | shuf | shuffle |
24 | RT | RA | RB | unshuf| shuffle |
25 | RT | RA | RB | width | xperm |
26 | RT | RA | RB | type | clmul |
27 | RT | RA | RB | type | minmax |
38 | 0.5|6.10|11.15|16.20|21.25| 26....30 |31| name |
39 | -- | -- | --- | --- | --- | -------- |--| ------ |
40 | NN | RT | RA | RB | RC | mode 001 |Rc| ternary |
44 | 0.5|6.10|11.15|16.20| 21.22 | 23 | 24....30 |31| name |
45 | -- | -- | --- | --- | ----- | -- | -------- |--| ---- |
46 | NN | RA | RB | | | 0 | 0000 110 |Rc| rsvd |
47 | NN | RA | RB | RC | itype | 1 | 0000 110 |Rc| xperm |
48 | NN | RA | RB | RC | itype | 0 | 0100 110 |Rc| minmax |
49 | NN | RA | RB | | | 1 | 0100 110 |Rc| rsvd |
50 | NN | RA | RB | sh | itype | SH | 1000 110 |Rc| bmopsi |
51 | NN | RA | RB | | | | 1100 110 |Rc| rsvd |
52 | NN | RA | RB | | | 0 | 0001 110 |Rc| rsvd |
53 | NN | RA | RB | | | 0 | 0101 110 |Rc| rsvd |
54 | NN | RA | RB | RC | 00 | 0 | 0010 110 |Rc| gorc |
55 | NN | RA | RB | sh | 00 | SH | 1010 110 |Rc| gorci |
56 | NN | RA | RB | RC | 00 | 0 | 0110 110 |Rc| gorcw |
57 | NN | RA | RB | sh | 00 | 0 | 1110 110 |Rc| gorcwi |
58 | NN | RA | RB | RC | 00 | 1 | 1110 110 |Rc| bmator |
59 | NN | RA | RB | RC | 01 | 0 | 0010 110 |Rc| grev |
60 | NN | RA | RB | sh | 01 | SH | 1010 110 |Rc| grevi |
61 | NN | RA | RB | RC | 01 | 0 | 0110 110 |Rc| grevw |
62 | NN | RA | RB | sh | 01 | 0 | 1110 110 |Rc| grevwi |
63 | NN | RA | RB | RC | 01 | 1 | 1110 110 |Rc| bmatxor |
64 | NN | RA | RB | RC | 10 | 0 | 0010 110 |Rc| shfl |
65 | NN | RA | RB | sh | 10 | SH | 1010 110 |Rc| shfli |
66 | NN | RA | RB | RC | 10 | 0 | 0110 110 |Rc| shflw |
67 | NN | RA | RB | RC | 10 | 0 | 1110 110 |Rc| bdep |
68 | NN | RA | RB | RC | 10 | 1 | 1110 110 |Rc| bext |
69 | NN | RA | RB | | 11 | | 1110 110 |Rc| rsvd |
70 | NN | RA | RB | | | | NN11 110 |Rc| rsvd |
74 similar to matrix permute in RV bitmanip, which has XOR and OR variants
78 b = VSR[VRB+32].dword[i].byte[k].bit[j]
79 VSR[VRT+32].dword[i].byte[j].bit[k] = b
83 vpdepd VRT,VRA,VRB, identical to RV bitmanip bdep
86 if VSR[VRB+32].dword[i].bit[63-m]=1 then do
87 result = VSR[VRA+32].dword[i].bit[63-k]
88 VSR[VRT+32].dword[i].bit[63-m] = result
94 uint_xlen_t bdep(uint_xlen_t RA, uint_xlen_t RB)
97 for (int i = 0, j = 0; i < XLEN; i++)
100 r |= uint_xlen_t(1) << i;
110 other way round: identical to RV bext
113 uint_xlen_t bext(uint_xlen_t RA, uint_xlen_t RB)
116 for (int i = 0, j = 0; i < XLEN; i++)
119 r |= uint_xlen_t(1) << j;
128 signed and unsigned min/max for integer. this is sort-of partly synthesiseable in [[sv/svp64]] with pred-result as long as the dest reg is one of the sources, but not both signed and unsigned. when the dest is also one of the sources and the mv fails due to the CR bittest failing, this will only overwrite the dest where the src is greater (or less).
130 signed/unsigned min/max gives more flexibility.
134 Similar to FPGA LUTs: for every bit perform a lookup into a table using an 8-bit immediate, or in another register
136 | 0.5|6.10|11.15|16.20| 21..25| 26..30 |31|
137 | -- | -- | --- | --- | ----- | -------- |--|
138 | NN | RT | RA | RB | im0-4 | im5-7 00 |Rc|
141 idx = RT[i] << 2 | RA[i] << 1 | RB[i]
142 RT[i] = (imm & (1<<idx)) != 0
144 bits 21..22 may be used to specify a mode, such as treating the whole integer zero/nonzero and putting 1/0 in the result, rather than bitwise test.
146 a 4 operand variant which becomes more along the lines of an FPGA:
148 | 0.5|6.10|11.15|16.20|21.25| 26...30 |31|
149 | -- | -- | --- | --- | --- | -------- |--|
150 | NN | RT | RA | RB | RC | mode 001 |Rc|
153 idx = RT[i] << 2 | RA[i] << 1 | RB[i]
154 RT[i] = (RC & (1<<idx)) != 0
156 mode (2 bit) may be used to do inversion of ordering, similar to carryless mul,
159 also, another possible variant involving swizzle and vec4:
161 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
162 | -- | -- | --- | ----- | ---- | ----- |--|
163 | NN | RT | RA | imm | mask | 101 |1 |
166 idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
167 res = (imm & (1<<idx)) != 0
169 if mask[j]: RT[i+j*8] = res
171 another mode selection would be CRs not Ints.
173 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31|
174 | -- | -- | --- | --- |- |-----|----- | -----|--|
175 | NN | BA | BB | BC |0 |imm | mask | 101 |0 |
178 if not mask[i] continue
179 idx = crregs[BA][i] << 2 |
182 crregs[BA][i] = (imm & (1<<idx)) != 0
186 based on RV bitmanip singlebit set, instruction format similar to shift
187 [[isa/fixedshift]]. bmext is actually covered already (shift-with-mask).
188 however bitmask-invert is not, and set/clr are not covered, although they can use the same Shift ALU.
190 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
191 | -- | -- | --- | --- | --- | ------- |--|
192 | NN | RT | RA | RB | RC | mode 010 |Rc|
195 uint_xlen_t bmset(RA, RB, sh)
197 int shamt = RB & (XLEN - 1);
199 return RA | (mask << shamt);
202 uint_xlen_t bmclr(RA, RB, sh)
204 int shamt = RB & (XLEN - 1);
206 return RA & ~(mask << shamt);
209 uint_xlen_t bminv(RA, RB, sh)
211 int shamt = RB & (XLEN - 1);
213 return RA ^ (mask << shamt);
216 uint_xlen_t bmext(RA, RB, sh)
218 int shamt = RB & (XLEN - 1);
220 return mask & (RA >> shamt);
229 uint64_t grev64(uint64_t RA, uint64_t RB)
233 if (shamt & 1) x = ((x & 0x5555555555555555LL) << 1) |
234 ((x & 0xAAAAAAAAAAAAAAAALL) >> 1);
235 if (shamt & 2) x = ((x & 0x3333333333333333LL) << 2) |
236 ((x & 0xCCCCCCCCCCCCCCCCLL) >> 2);
237 if (shamt & 4) x = ((x & 0x0F0F0F0F0F0F0F0FLL) << 4) |
238 ((x & 0xF0F0F0F0F0F0F0F0LL) >> 4);
239 if (shamt & 8) x = ((x & 0x00FF00FF00FF00FFLL) << 8) |
240 ((x & 0xFF00FF00FF00FF00LL) >> 8);
241 if (shamt & 16) x = ((x & 0x0000FFFF0000FFFFLL) << 16) |
242 ((x & 0xFFFF0000FFFF0000LL) >> 16);
243 if (shamt & 32) x = ((x & 0x00000000FFFFFFFFLL) << 32) |
244 ((x & 0xFFFFFFFF00000000LL) >> 32);
250 # shuffle / unshuffle
255 uint32_t shfl32(uint32_t RA, uint32_t RB)
259 if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
260 if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
261 if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
262 if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
265 uint32_t unshfl32(uint32_t RA, uint32_t RB)
269 if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
270 if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
271 if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
272 if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
276 uint64_t shuffle64_stage(uint64_t src, uint64_t maskL, uint64_t maskR, int N)
278 uint64_t x = src & ~(maskL | maskR);
279 x |= ((src << N) & maskL) | ((src >> N) & maskR);
282 uint64_t shfl64(uint64_t RA, uint64_t RB)
286 if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
287 0x00000000ffff0000LL, 16);
288 if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
289 0x0000ff000000ff00LL, 8);
290 if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
291 0x00f000f000f000f0LL, 4);
292 if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
293 0x0c0c0c0c0c0c0c0cLL, 2);
294 if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
295 0x2222222222222222LL, 1);
298 uint64_t unshfl64(uint64_t RA, uint64_t RB)
302 if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
303 0x2222222222222222LL, 1);
304 if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
305 0x0c0c0c0c0c0c0c0cLL, 2);
306 if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
307 0x00f000f000f000f0LL, 4);
308 if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
309 0x0000ff000000ff00LL, 8);
310 if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
311 0x00000000ffff0000LL, 16);
321 uint_xlen_t xperm(uint_xlen_t RA, uint_xlen_t RB, int sz_log2)
324 uint_xlen_t sz = 1LL << sz_log2;
325 uint_xlen_t mask = (1LL << sz) - 1;
326 for (int i = 0; i < XLEN; i += sz) {
327 uint_xlen_t pos = ((RB >> i) & mask) << sz_log2;
329 r |= ((RA >> pos) & mask) << i;
333 uint_xlen_t xperm_n (uint_xlen_t RA, uint_xlen_t RB)
334 { return xperm(RA, RB, 2); } // sz_log2=2: 4-bit (nibble) elements
335 uint_xlen_t xperm_b (uint_xlen_t RA, uint_xlen_t RB)
336 { return xperm(RA, RB, 3); } // sz_log2=3: 8-bit (byte) elements
337 uint_xlen_t xperm_h (uint_xlen_t RA, uint_xlen_t RB)
338 { return xperm(RA, RB, 4); } // sz_log2=4: 16-bit (halfword) elements
339 uint_xlen_t xperm_w (uint_xlen_t RA, uint_xlen_t RB)
340 { return xperm(RA, RB, 5); } // sz_log2=5: 32-bit (word) elements
348 uint32_t gorc32(uint32_t RA, uint32_t RB)
352 if (shamt & 1) x |= ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >> 1);
353 if (shamt & 2) x |= ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >> 2);
354 if (shamt & 4) x |= ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >> 4);
355 if (shamt & 8) x |= ((x & 0x00FF00FF) << 8) | ((x & 0xFF00FF00) >> 8);
356 if (shamt & 16) x |= ((x & 0x0000FFFF) << 16) | ((x & 0xFFFF0000) >> 16);
359 uint64_t gorc64(uint64_t RA, uint64_t RB)
363 if (shamt & 1) x |= ((x & 0x5555555555555555LL) << 1) |
364 ((x & 0xAAAAAAAAAAAAAAAALL) >> 1);
365 if (shamt & 2) x |= ((x & 0x3333333333333333LL) << 2) |
366 ((x & 0xCCCCCCCCCCCCCCCCLL) >> 2);
367 if (shamt & 4) x |= ((x & 0x0F0F0F0F0F0F0F0FLL) << 4) |
368 ((x & 0xF0F0F0F0F0F0F0F0LL) >> 4);
369 if (shamt & 8) x |= ((x & 0x00FF00FF00FF00FFLL) << 8) |
370 ((x & 0xFF00FF00FF00FF00LL) >> 8);
371 if (shamt & 16) x |= ((x & 0x0000FFFF0000FFFFLL) << 16) |
372 ((x & 0xFFFF0000FFFF0000LL) >> 16);
373 if (shamt & 32) x |= ((x & 0x00000000FFFFFFFFLL) << 32) |
374 ((x & 0xFFFFFFFF00000000LL) >> 32);
382 based on RV bitmanip, covered by ternary bitops
385 uint_xlen_t cmix(uint_xlen_t RA, uint_xlen_t RB, uint_xlen_t RC) {
386 return (RA & RB) | (RC & ~RB);
393 see https://en.wikipedia.org/wiki/CLMUL_instruction_set
396 uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
399 for (int i = 0; i < XLEN; i++)
404 uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)
407 for (int i = 1; i < XLEN; i++)
412 uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)
415 for (int i = 0; i < XLEN; i++)
417 x ^= RA >> (XLEN-i-1);
425 this requires 3 parameters and a "degree"
427 RT = GFMUL(RA, RB, gfdegree, modulo=RC)
429 realistically, with the degree also needing to be an immediate, it should be brought down to an overwrite version:
431 RS = GFMUL(RS, RA, gfdegree, modulo=RB)
433 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
434 | -- | -- | --- | --- | --- | ------- |--|
435 | NN | RS | RA | RB | deg | 00 011 |Rc|
437 where the SimpleV variant may override RS-as-src differently from RS-as-dest
442 from functools import reduce
444 # constants used in the multGF2 function
445 mask1 = mask2 = polyred = None
447 def setGF2(degree, irPoly):
448 """Define parameters of binary finite field GF(2^m)/g(x)
449 - degree: extension degree of binary field
450 - irPoly: coefficients of irreducible polynomial g(x)
453 """Convert an integer into a polynomial"""
454 return [(sInt >> i) & 1
455 for i in reversed(range(sInt.bit_length()))]
457 global mask1, mask2, polyred
458 mask1 = mask2 = 1 << degree
460 polyred = reduce(lambda x, y: (x << 1) + y, i2P(irPoly)[1:])
463 """Multiply two polynomials in GF(2^m)/g(x)"""
474 if __name__ == "__main__":
476 # Define binary field GF(2^3)/x^3 + x + 1
479 # Evaluate the product (x^2 + x + 1)(x^2 + 1)
480 print("{:02x}".format(multGF2(0b111, 0b101)))
482 # Define binary field GF(2^8)/x^8 + x^4 + x^3 + x + 1
483 # (used in the Advanced Encryption Standard-AES)
484 setGF2(8, 0b100011011)
486 # Evaluate the product (x^7)(x^7 + x + 1)
487 print("{:02x}".format(multGF2(0b10000000, 0b10000011)))
491 RS = GFADD(RS, RA|0, gfdegree, modulo=RB)
493 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
494 | -- | -- | --- | --- | --- | ------- |--|
495 | NN | RS | RA | RB | deg | 01 011 |Rc|
508 def gf_invert(a, mod=0x1B) :
523 a %= 256 # Emulating 8-bit overflow
524 g1 %= 256 # Emulating 8-bit overflow
526 j = gf_degree(a) - gf_degree(v)
533 * <https://stackoverflow.com/questions/21171733/calculating-constants-for-crc32-using-pclmulqdq>
534 * <https://en.wikipedia.org/wiki/Cyclic_redundancy_check>
537 uint_xlen_t crc32(uint_xlen_t x, int nbits)
539 for (int i = 0; i < nbits; i++)
540 x = (x >> 1) ^ (0xEDB88320 & ~((x&1)-1));
543 uint_xlen_t crc32c(uint_xlen_t x, int nbits)
545 for (int i = 0; i < nbits; i++)
546 x = (x >> 1) ^ (0x82F63B78 & ~((x&1)-1));
549 uint_xlen_t crc32_b(uint_xlen_t RA) { return crc32(RA, 8); } // byte: CRC over low 8 bits
550 uint_xlen_t crc32_h(uint_xlen_t RA) { return crc32(RA, 16); } // halfword: low 16 bits
551 uint_xlen_t crc32_w(uint_xlen_t RA) { return crc32(RA, 32); } // word: low 32 bits
552 uint_xlen_t crc32c_b(uint_xlen_t RA) { return crc32c(RA, 8); } // CRC32C (Castagnoli) byte variant
553 uint_xlen_t crc32c_h(uint_xlen_t RA) { return crc32c(RA, 16); } // CRC32C halfword variant
554 uint_xlen_t crc32c_w(uint_xlen_t RA) { return crc32c(RA, 32); } // CRC32C word variant
556 uint_xlen_t crc32_d (uint_xlen_t RA) { return crc32 (RA, 64); } // doubleword: all 64 bits (XLEN=64 only — TODO confirm)
557 uint_xlen_t crc32c_d(uint_xlen_t RA) { return crc32c(RA, 64); } // CRC32C doubleword variant
564 uint64_t bmatflip(uint64_t RA)
572 uint64_t bmatxor(uint64_t RA, uint64_t RB)
575 uint64_t RBt = bmatflip(RB);
576 uint8_t u[8]; // rows of RA
577 uint8_t v[8]; // cols of RB
578 for (int i = 0; i < 8; i++) {
583 for (int i = 0; i < 64; i++) {
584 if (pcnt(u[i / 8] & v[i % 8]) & 1)
589 uint64_t bmator(uint64_t RA, uint64_t RB)
592 uint64_t RBt = bmatflip(RB);
593 uint8_t u[8]; // rows of RA
594 uint8_t v[8]; // cols of RB
595 for (int i = 0; i < 8; i++) {
600 for (int i = 0; i < 64; i++) {
601 if ((u[i / 8] & v[i % 8]) != 0)