5 minor opcode allocation
8 | ------ |--| --------- |
14 | 101 |0 | ternarycr |
20 | dest | src1 | subop | op |
21 | ---- | ---- | ----- | -------- |
22 | RT | RA | .. | bmatflip |
23 | RT | RA | size | crc32 |
24 | RT | RA | size | crc32c |
28 | dest | src1 | src2 | subop | op |
29 | ---- | ---- | ---- | ----- | -------- |
30 | RT | RA | RB | or | bmatflip |
31 | RT | RA | RB | xor | bmatflip |
32 | RT | RA | RB | bdep | dep/ext |
33 | RT | RA | RB | bext | dep/ext |
34 | RT | RA | RB | | grev |
35 | RT | RA | RB | | gorc |
36 | RT | RA | RB | shuf | shuffle |
37 | RT | RA | RB | unshuf| shuffle |
38 | RT | RA | RB | width | xperm |
39 | RT | RA | RB | type | minmax |
50 | 0.5|6.10|11.15|16.20|21..25 | 26....30 |31| name |
51 | -- | -- | --- | --- | ----- | -------- |--| ------ |
52 | NN | RT | RA | RB | RC | mode 001 |Rc| ternary |
53 | NN | RT | RA | RB | im0-4 | im5-7 00 |Rc| ternaryi |
54 | NN | RS | RA | RB | deg | 00 011 |Rc| gfmul |
55 | NN | RS | RA | RB | deg | 01 011 |Rc| gfadd |
56 | NN | RT | RA | RB | deg | 10 011 |Rc| gfinv |
57 | NN | RS | RA | RB | deg | 11 011 |Rc| gf rsvd |
59 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31| name |
60 | -- | -- | --- | ----- | ---- | ----- |--| ------ |
61 | NN | RT | RA | imm | mask | 101 |1 | ternaryv |
63 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31| name |
64 | -- | -- | --- | --- |- |-----|----- | -----|--| -------|
65 | NN | BA | BB | BC |0 |imm | mask | 101 |0 | ternarycr |
69 | 0.5|6.10|11.15|16.20| 21.22 | 23 | 24....30 |31| name |
70 | -- | -- | --- | --- | ----- | -- | -------- |--| ---- |
71 | NN | RA | RB | | | 0 | 0000 110 |Rc| rsvd |
72 | NN | RA | RB | RC | itype | 1 | 0000 110 |Rc| xperm |
73 | NN | RA | RB | RC | itype | 0 | 0100 110 |Rc| minmax |
74 | NN | RA | RB | | | 1 | 0100 110 |Rc| rsvd |
75 | NN | RA | RB | sh | itype | SH | 1000 110 |Rc| bmopsi |
76 | NN | RA | RB | | | | 1100 110 |Rc| rsvd |
77 | NN | RA | RB | | | 0 | 0001 110 |Rc| rsvd |
78 | NN | RA | RB | | | 0 | 0101 110 |Rc| rsvd |
79 | NN | RA | RB | RC | 00 | 0 | 0010 110 |Rc| gorc |
80 | NN | RA | RB | sh | 00 | SH | 1010 110 |Rc| gorci |
81 | NN | RA | RB | RC | 00 | 0 | 0110 110 |Rc| gorcw |
82 | NN | RA | RB | sh | 00 | 0 | 1110 110 |Rc| gorcwi |
83 | NN | RA | RB | RC | 00 | 1 | 1110 110 |Rc| bmator |
84 | NN | RA | RB | RC | 01 | 0 | 0010 110 |Rc| grev |
85 | NN | RA | RB | sh | 01 | SH | 1010 110 |Rc| grevi |
86 | NN | RA | RB | RC | 01 | 0 | 0110 110 |Rc| grevw |
87 | NN | RA | RB | sh | 01 | 0 | 1110 110 |Rc| grevwi |
88 | NN | RA | RB | RC | 01 | 1 | 1110 110 |Rc| bmatxor |
89 | NN | RA | RB | RC | 10 | 0 | 0010 110 |Rc| shfl |
90 | NN | RA | RB | sh | 10 | SH | 1010 110 |Rc| shfli |
91 | NN | RA | RB | RC | 10 | 0 | 0110 110 |Rc| shflw |
92 | NN | RA | RB | RC | 10 | 0 | 1110 110 |Rc| bdep |
93 | NN | RA | RB | RC | 10 | 1 | 1110 110 |Rc| bext |
94 | NN | RA | RB | | 11 | | 1110 110 |Rc| rsvd |
95 | NN | RA | RB | | | | NN11 110 |Rc| rsvd |
99 similar to matrix permute in RV bitmanip, which has XOR and OR variants
103 b = VSR[VRB+32].dword[i].byte[k].bit[j]
104 VSR[VRT+32].dword[i].byte[j].bit[k] = b
108 vpdepd VRT,VRA,VRB, identical to RV bitmanip bdep
111 if VSR[VRB+32].dword[i].bit[63-m]=1 then do
112 result = VSR[VRA+32].dword[i].bit[63-k]
113 VSR[VRT+32].dword[i].bit[63-m] = result
119 uint_xlen_t bdep(uint_xlen_t RA, uint_xlen_t RB)
122 for (int i = 0, j = 0; i < XLEN; i++)
125 r |= uint_xlen_t(1) << i;
135 other way round: identical to RV bext
138 uint_xlen_t bext(uint_xlen_t RA, uint_xlen_t RB)
141 for (int i = 0, j = 0; i < XLEN; i++)
144 r |= uint_xlen_t(1) << j;
153 signed and unsigned min/max for integer. this is sort-of partly synthesiseable in [[sv/svp64]] with pred-result as long as the dest reg is one of the sources, but not both signed and unsigned. when the dest is also one of the sources and the mv fails due to the CR bittest failing this will only overwrite the dest where the src is greater (or less).
155 signed/unsigned min/max gives more flexibility.
159 Similar to FPGA LUTs: for every bit perform a lookup into a table using an 8bit immediate, or in another register
161 | 0.5|6.10|11.15|16.20| 21..25| 26..30 |31|
162 | -- | -- | --- | --- | ----- | -------- |--|
163 | NN | RT | RA | RB | im0-4 | im5-7 00 |Rc|
166 idx = RT[i] << 2 | RA[i] << 1 | RB[i]
167 RT[i] = (imm & (1<<idx)) != 0
169 bits 21..22 may be used to specify a mode, such as treating the whole integer zero/nonzero and putting 1/0 in the result, rather than bitwise test.
171 a 4 operand variant which becomes more along the lines of an FPGA:
173 | 0.5|6.10|11.15|16.20|21.25| 26...30 |31|
174 | -- | -- | --- | --- | --- | -------- |--|
175 | NN | RT | RA | RB | RC | mode 001 |Rc|
178 idx = RT[i] << 2 | RA[i] << 1 | RB[i]
179 RT[i] = (RC & (1<<idx)) != 0
181 mode (2 bit) may be used to do inversion of ordering, similar to carryless mul,
184 also, another possible variant involving swizzle and vec4:
186 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
187 | -- | -- | --- | ----- | ---- | ----- |--|
188 | NN | RT | RA | imm | mask | 101 |1 |
191 idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
192 res = (imm & (1<<idx)) != 0
194 if mask[j]: RT[i+j*8] = res
196 another mode selection would be CRs not Ints.
198 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31|
199 | -- | -- | --- | --- |- |-----|----- | -----|--|
200 | NN | BA | BB | BC |0 |imm | mask | 101 |0 |
203 if not mask[i] continue
204 idx = crregs[BA][i] << 2 |
207 crregs[BA][i] = (imm & (1<<idx)) != 0
211 based on RV bitmanip singlebit set, instruction format similar to shift
212 [[isa/fixedshift]]. bmext is actually covered already (shift-with-mask).
213 however bitmask-invert is not, and set/clr are not covered, although they can use the same Shift ALU.
215 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
216 | -- | -- | --- | --- | --- | ------- |--|
217 | NN | RT | RA | RB | RC | mode 010 |Rc|
220 uint_xlen_t bmset(RA, RB, sh)
222 int shamt = RB & (XLEN - 1);
224 return RA | (mask << shamt);
227 uint_xlen_t bmclr(RA, RB, sh)
229 int shamt = RB & (XLEN - 1);
231 return RA & ~(mask << shamt);
234 uint_xlen_t bminv(RA, RB, sh)
236 int shamt = RB & (XLEN - 1);
238 return RA ^ (mask << shamt);
241 uint_xlen_t bmext(RA, RB, sh)
243 int shamt = RB & (XLEN - 1);
245 return mask & (RA >> shamt);
254 uint64_t grev64(uint64_t RA, uint64_t RB)
258 if (shamt & 1) x = ((x & 0x5555555555555555LL) << 1) |
259 ((x & 0xAAAAAAAAAAAAAAAALL) >> 1);
260 if (shamt & 2) x = ((x & 0x3333333333333333LL) << 2) |
261 ((x & 0xCCCCCCCCCCCCCCCCLL) >> 2);
262 if (shamt & 4) x = ((x & 0x0F0F0F0F0F0F0F0FLL) << 4) |
263 ((x & 0xF0F0F0F0F0F0F0F0LL) >> 4);
264 if (shamt & 8) x = ((x & 0x00FF00FF00FF00FFLL) << 8) |
265 ((x & 0xFF00FF00FF00FF00LL) >> 8);
266 if (shamt & 16) x = ((x & 0x0000FFFF0000FFFFLL) << 16) |
267 ((x & 0xFFFF0000FFFF0000LL) >> 16);
268 if (shamt & 32) x = ((x & 0x00000000FFFFFFFFLL) << 32) |
269 ((x & 0xFFFFFFFF00000000LL) >> 32);
275 # shuffle / unshuffle
280 uint32_t shfl32(uint32_t RA, uint32_t RB)
284 if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
285 if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
286 if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
287 if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
290 uint32_t unshfl32(uint32_t RA, uint32_t RB)
294 if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
295 if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
296 if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
297 if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
301 uint64_t shuffle64_stage(uint64_t src, uint64_t maskL, uint64_t maskR, int N)
303 uint64_t x = src & ~(maskL | maskR);
304 x |= ((src << N) & maskL) | ((src >> N) & maskR);
307 uint64_t shfl64(uint64_t RA, uint64_t RB)
311 if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
312 0x00000000ffff0000LL, 16);
313 if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
314 0x0000ff000000ff00LL, 8);
315 if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
316 0x00f000f000f000f0LL, 4);
317 if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
318 0x0c0c0c0c0c0c0c0cLL, 2);
319 if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
320 0x2222222222222222LL, 1);
323 uint64_t unshfl64(uint64_t RA, uint64_t RB)
327 if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
328 0x2222222222222222LL, 1);
329 if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
330 0x0c0c0c0c0c0c0c0cLL, 2);
331 if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
332 0x00f000f000f000f0LL, 4);
333 if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
334 0x0000ff000000ff00LL, 8);
335 if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
336 0x00000000ffff0000LL, 16);
346 uint_xlen_t xperm(uint_xlen_t RA, uint_xlen_t RB, int sz_log2)
349 uint_xlen_t sz = 1LL << sz_log2;
350 uint_xlen_t mask = (1LL << sz) - 1;
351 for (int i = 0; i < XLEN; i += sz) {
352 uint_xlen_t pos = ((RB >> i) & mask) << sz_log2;
354 r |= ((RA >> pos) & mask) << i;
358 uint_xlen_t xperm_n (uint_xlen_t RA, uint_xlen_t RB)
359 { return xperm(RA, RB, 2); }
360 uint_xlen_t xperm_b (uint_xlen_t RA, uint_xlen_t RB)
361 { return xperm(RA, RB, 3); }
362 uint_xlen_t xperm_h (uint_xlen_t RA, uint_xlen_t RB)
363 { return xperm(RA, RB, 4); }
364 uint_xlen_t xperm_w (uint_xlen_t RA, uint_xlen_t RB)
365 { return xperm(RA, RB, 5); }
373 uint32_t gorc32(uint32_t RA, uint32_t RB)
377 if (shamt & 1) x |= ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >> 1);
378 if (shamt & 2) x |= ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >> 2);
379 if (shamt & 4) x |= ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >> 4);
380 if (shamt & 8) x |= ((x & 0x00FF00FF) << 8) | ((x & 0xFF00FF00) >> 8);
381 if (shamt & 16) x |= ((x & 0x0000FFFF) << 16) | ((x & 0xFFFF0000) >> 16);
384 uint64_t gorc64(uint64_t RA, uint64_t RB)
388 if (shamt & 1) x |= ((x & 0x5555555555555555LL) << 1) |
389 ((x & 0xAAAAAAAAAAAAAAAALL) >> 1);
390 if (shamt & 2) x |= ((x & 0x3333333333333333LL) << 2) |
391 ((x & 0xCCCCCCCCCCCCCCCCLL) >> 2);
392 if (shamt & 4) x |= ((x & 0x0F0F0F0F0F0F0F0FLL) << 4) |
393 ((x & 0xF0F0F0F0F0F0F0F0LL) >> 4);
394 if (shamt & 8) x |= ((x & 0x00FF00FF00FF00FFLL) << 8) |
395 ((x & 0xFF00FF00FF00FF00LL) >> 8);
396 if (shamt & 16) x |= ((x & 0x0000FFFF0000FFFFLL) << 16) |
397 ((x & 0xFFFF0000FFFF0000LL) >> 16);
398 if (shamt & 32) x |= ((x & 0x00000000FFFFFFFFLL) << 32) |
399 ((x & 0xFFFFFFFF00000000LL) >> 32);
407 based on RV bitmanip, covered by ternary bitops
410 uint_xlen_t cmix(uint_xlen_t RA, uint_xlen_t RB, uint_xlen_t RC) {
411 return (RA & RB) | (RC & ~RB);
418 see https://en.wikipedia.org/wiki/CLMUL_instruction_set
421 uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
424 for (int i = 0; i < XLEN; i++)
429 uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)
432 for (int i = 1; i < XLEN; i++)
437 uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)
440 for (int i = 0; i < XLEN; i++)
442 x ^= RA >> (XLEN-i-1);
450 this requires 3 parameters and a "degree"
452 RT = GFMUL(RA, RB, gfdegree, modulo=RC)
454 realistically with the degree also needing to be an immediate it should be brought down to an overwrite version:
456 RS = GFMUL(RS, RA, gfdegree, modulo=RB)
458 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
459 | -- | -- | --- | --- | --- | ------- |--|
460 | NN | RS | RA | RB | deg | 00 011 |Rc|
462 where the SimpleV variant may override RS-as-src differently from RS-as-dest
467 from functools import reduce
469 # constants used in the multGF2 function
470 mask1 = mask2 = polyred = None
472 def setGF2(degree, irPoly):
473 """Define parameters of binary finite field GF(2^m)/g(x)
474 - degree: extension degree of binary field
475 - irPoly: coefficients of irreducible polynomial g(x)
478 """Convert an integer into a polynomial"""
479 return [(sInt >> i) & 1
480 for i in reversed(range(sInt.bit_length()))]
482 global mask1, mask2, polyred
483 mask1 = mask2 = 1 << degree
485 polyred = reduce(lambda x, y: (x << 1) + y, i2P(irPoly)[1:])
488 """Multiply two polynomials in GF(2^m)/g(x)"""
499 if __name__ == "__main__":
501 # Define binary field GF(2^3)/x^3 + x + 1
504 # Evaluate the product (x^2 + x + 1)(x^2 + 1)
505 print("{:02x}".format(multGF2(0b111, 0b101)))
507 # Define binary field GF(2^8)/x^8 + x^4 + x^3 + x + 1
508 # (used in the Advanced Encryption Standard-AES)
509 setGF2(8, 0b100011011)
511 # Evaluate the product (x^7)(x^7 + x + 1)
512 print("{:02x}".format(multGF2(0b10000000, 0b10000011)))
516 RS = GFADD(RS, RA|0, gfdegree, modulo=RB)
518 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
519 | -- | -- | --- | --- | --- | ------- |--|
520 | NN | RS | RA | RB | deg | 01 011 |Rc|
533 def gf_invert(a, mod=0x1B) :
548 a %= 256 # Emulating 8-bit overflow
549 g1 %= 256 # Emulating 8-bit overflow
551 j = gf_degree(a) - gf_degree(v)
558 * <https://stackoverflow.com/questions/21171733/calculating-constants-for-crc32-using-pclmulqdq>
559 * <https://en.wikipedia.org/wiki/Cyclic_redundancy_check>
562 uint_xlen_t crc32(uint_xlen_t x, int nbits)
564 for (int i = 0; i < nbits; i++)
565 x = (x >> 1) ^ (0xEDB88320 & ~((x&1)-1));
568 uint_xlen_t crc32c(uint_xlen_t x, int nbits)
570 for (int i = 0; i < nbits; i++)
571 x = (x >> 1) ^ (0x82F63B78 & ~((x&1)-1));
574 uint_xlen_t crc32_b(uint_xlen_t RA) { return crc32(RA, 8); }
575 uint_xlen_t crc32_h(uint_xlen_t RA) { return crc32(RA, 16); }
576 uint_xlen_t crc32_w(uint_xlen_t RA) { return crc32(RA, 32); }
577 uint_xlen_t crc32c_b(uint_xlen_t RA) { return crc32c(RA, 8); }
578 uint_xlen_t crc32c_h(uint_xlen_t RA) { return crc32c(RA, 16); }
579 uint_xlen_t crc32c_w(uint_xlen_t RA) { return crc32c(RA, 32); }
581 uint_xlen_t crc32_d (uint_xlen_t RA) { return crc32 (RA, 64); }
582 uint_xlen_t crc32c_d(uint_xlen_t RA) { return crc32c(RA, 64); }
589 uint64_t bmatflip(uint64_t RA)
597 uint64_t bmatxor(uint64_t RA, uint64_t RB)
600 uint64_t RBt = bmatflip(RB);
601 uint8_t u[8]; // rows of RA
602 uint8_t v[8]; // cols of RB
603 for (int i = 0; i < 8; i++) {
608 for (int i = 0; i < 64; i++) {
609 if (pcnt(u[i / 8] & v[i % 8]) & 1)
614 uint64_t bmator(uint64_t RA, uint64_t RB)
617 uint64_t RBt = bmatflip(RB);
618 uint8_t u[8]; // rows of RA
619 uint8_t v[8]; // cols of RB
620 for (int i = 0; i < 8; i++) {
625 for (int i = 0; i < 64; i++) {
626 if ((u[i / 8] & v[i % 8]) != 0)