[[!tag standards]]

# summary

1-op and variants

| dest | src1 | subop | op |
| ---- | ---- | ----- | -------- |
| RT | RA | .. | bmatflip |
| RT | RA | size | crc32 |
| RT | RA | size | crc32c |

2-op and variants

| dest | src1 | src2 | subop | op |
| ---- | ---- | ---- | ----- | -------- |
| RT | RA | RB | or | bmatflip |
| RT | RA | RB | xor | bmatflip |
| RT | RA | RB | bdep | dep/ext |
| RT | RA | RB | bext | dep/ext |
| RT | RA | RB | | grev |
| RT | RA | RB | | gorc |
| RT | RA | RB | shuf | shuffle |
| RT | RA | RB | unshuf| shuffle |
| RT | RA | RB | width | xperm |
| RT | RA | RB | type | clmul |
| RT | RA | RB | type | minmax |
| RT | RA | RB | | |
| RT | RA | RB | | |
| RT | RA | RB | | |

3-op and variants

* bitmask set/extract
* ternary bitops

ops

| 0.5|6.10|11.15|16.20| 21.22 | 23 | 24..30 |31| name |
| -- | -- | --- | --- | ----- | -- | ------- |--| ---- |
| NN | RA | RB | | | 0 | 0000110 |Rc| rsvd |
| NN | RA | RB | RC | itype | 1 | 0000110 |Rc| xperm |
| NN | RA | RB | RC | itype | 0 | 0100110 |Rc| minmax |
| NN | RA | RB | | | 1 | 0100110 |Rc| rsvd |
| NN | RA | RB | sh | itype | SH | 1000110 |Rc| bmopsi |
| NN | RA | RB | | | | 1100110 |Rc| rsvd |
| NN | RA | RB | RC | itype | 0 | 0001110 |Rc| clmul |
| NN | RA | RB | sh | itype | 0 | 0101110 |Rc| clmulw |
| NN | RA | RB | RC | 00 | 0 | 0010110 |Rc| gorc |
| NN | RA | RB | sh | 00 | SH | 1010110 |Rc| gorci |
| NN | RA | RB | RC | 00 | 0 | 0110110 |Rc| gorcw |
| NN | RA | RB | sh | 00 | 0 | 1110110 |Rc| gorcwi |
| NN | RA | RB | RC | 00 | 1 | 1110110 |Rc| bmator |
| NN | RA | RB | RC | 01 | 0 | 0010110 |Rc| grev |
| NN | RA | RB | sh | 01 | SH | 1010110 |Rc| grevi |
| NN | RA | RB | RC | 01 | 0 | 0110110 |Rc| grevw |
| NN | RA | RB | sh | 01 | 0 | 1110110 |Rc| grevwi |
| NN | RA | RB | RC | 01 | 1 | 1110110 |Rc| bmatxor |
| NN | RA | RB | RC | 10 | 0 | 0010110 |Rc| shfl |
| NN | RA | RB | sh | 10 | SH | 1010110 |Rc| shfli |
| NN | RA | RB | RC | 10 | 0 | 0110110 |Rc| shflw |
| NN | RA | RB | RC | 10 | 0 | 1110110 |Rc| bdep |
| NN | RA | RB | RC | 10 | 1 | 1110110 |Rc| bext |
| NN | RA | RB | | 11 | | 1110110 |Rc| rsvd |
| NN | RA | RB | | | | NN11110 |Rc| rsvd |

# bit to byte permute

similar to the bit-matrix permute in RV bitmanip, which has XOR and OR variants. for each doubleword i of the source register, the 8x8 bit matrix is transposed:

    do j = 0 to 7
        do k = 0 to 7
            b = VSR[VRB+32].dword[i].byte[k].bit[j]
            VSR[VRT+32].dword[i].byte[j].bit[k] = b

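a scalar C sketch of the same per-doubleword transpose (the function name here is illustrative only):

```
uint64_t bit_byte_permute(uint64_t rb)
{
    // 8x8 bit-matrix transpose: bit j of byte k moves to bit k of byte j
    uint64_t rt = 0;
    for (int j = 0; j < 8; j++)
        for (int k = 0; k < 8; k++) {
            uint64_t b = (rb >> (k*8 + j)) & 1;
            rt |= b << (j*8 + k);
        }
    return rt;
}
```
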
# vector bit deposit

vpdepd VRT,VRA,VRB, identical to RV bitmanip bdep. k and m start at 0 for each doubleword i:

    do while(m < 64)
        if VSR[VRB+32].dword[i].bit[63-m]=1 then do
            result = VSR[VRA+32].dword[i].bit[63-k]
            VSR[VRT+32].dword[i].bit[63-m] = result
            k = k + 1
        m = m + 1

```
uint_xlen_t bdep(uint_xlen_t RA, uint_xlen_t RB)
{
    uint_xlen_t r = 0;
    for (int i = 0, j = 0; i < XLEN; i++)
        if ((RB >> i) & 1) {
            if ((RA >> j) & 1)
                r |= (uint_xlen_t)1 << i;
            j++;
        }
    return r;
}
```

# vector bit extract

the other way around: identical to RV bext

```
uint_xlen_t bext(uint_xlen_t RA, uint_xlen_t RB)
{
    uint_xlen_t r = 0;
    for (int i = 0, j = 0; i < XLEN; i++)
        if ((RB >> i) & 1) {
            if ((RA >> i) & 1)
                r |= (uint_xlen_t)1 << j;
            j++;
        }
    return r;
}
```

# int min/max

signed and unsigned min/max for integers. this is partly synthesiseable in [[sv/svp64]] with pred-result, as long as the dest reg is one of the sources, though not both signed and unsigned. when the dest is also one of the sources and the mv fails due to the CR bit-test failing, this will only overwrite the dest where the src is greater (or less).

dedicated signed/unsigned min/max gives more flexibility.

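a minimal sketch of the four variants the minmax entry could provide, assuming the 2-bit itype field selects between them (int_xlen_t is an assumed signed counterpart of uint_xlen_t):

```
/* illustrative sketch: four helpers standing in for the single minmax op
   whose itype field would select among them */
uint_xlen_t minu(uint_xlen_t RA, uint_xlen_t RB) { return RA < RB ? RA : RB; }
uint_xlen_t maxu(uint_xlen_t RA, uint_xlen_t RB) { return RA > RB ? RA : RB; }
int_xlen_t  mins(int_xlen_t  RA, int_xlen_t  RB) { return RA < RB ? RA : RB; }
int_xlen_t  maxs(int_xlen_t  RA, int_xlen_t  RB) { return RA > RB ? RA : RB; }
```
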
# ternary bitops

Similar to FPGA LUTs: for every bit, perform a lookup into a table using an 8-bit immediate, or an 8-bit value held in another register.

| 0.5|6.10|11.15|16.20| 21..25| 26..30 |31|
| -- | -- | --- | --- | ----- | -------- |--|
| NN | RT | RA | RB | im0-4 | im5-7 00 |Rc|

    for i in range(64):
        idx = RT[i] << 2 | RA[i] << 1 | RB[i]
        RT[i] = (imm & (1<<idx)) != 0

bits 21..22 may be used to specify a mode, such as treating the whole integer as zero/nonzero and putting 1/0 in the result, rather than testing bitwise.

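a scalar C sketch of the immediate form, assuming the 8-bit LUT is the concatenation of the im0-4 and im5-7 fields (the name ternlogi is illustrative only):

```
uint64_t ternlogi(uint64_t RT, uint64_t RA, uint64_t RB, uint8_t imm)
{
    // RT is both a LUT input and the destination: each result bit is looked
    // up in the 8-bit immediate, indexed by the three corresponding input bits
    uint64_t res = 0;
    for (int i = 0; i < 64; i++) {
        int idx = (((RT >> i) & 1) << 2) |
                  (((RA >> i) & 1) << 1) |
                   ((RB >> i) & 1);
        if ((imm >> idx) & 1)
            res |= 1ULL << i;
    }
    return res; // written back to RT
}
```
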
a 4-operand variant which becomes more along the lines of an FPGA:

| 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
| -- | -- | --- | --- | --- | ------- |--|
| NN | RT | RA | RB | RC | mode 1 |1 |

    for i in range(64):
        idx = RT[i] << 2 | RA[i] << 1 | RB[i]
        RT[i] = (RC & (1<<idx)) != 0

mode (2 bit) may be used to invert the ordering, similar to carryless mul.

also, another possible variant involving swizzle and vec4:

    for i in range(8):
        idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
        RT[i] = (RA.w[i] & (1<<idx)) != 0

| 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
| -- | -- | --- | ----- | ---- | ----- |--|
| NN | RT | RA | xyzw | mask | mode 1 |1 |

    for i in range(8):
        idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
        res = (RA.w[i] & (1<<idx)) != 0
        for j in range(4):
            if mask[j]: RT[i+j*8] = res

| 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
| -- | -- | --- | ----- | ---- | ----- |--|
| NN | RT | RA | imm | mask | mode 1 |1 |

    for i in range(8):
        idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
        res = (imm & (1<<idx)) != 0
        for j in range(4):
            if mask[j]: RT[i+j*8] = res

another mode selection would use CRs rather than integer registers:

| 0.5|6.8 | 9.11|12.14|15.17|18.20| 21..25| 26.29|30|31|
| -- | -- | --- | --- | --- |-----| ----- | ---- |--|--|
| NN | BT | BA | BB | BC |im5-7| im0-4 | mask |1 |0 |

    for i in range(4):
        if not mask[i]: continue
        idx = (crregs[BA][i] << 2 |
               crregs[BB][i] << 1 |
               crregs[BC][i])
        crregs[BT][i] = (imm & (1<<idx)) != 0

# bitmask set

based on RV bitmanip singlebit set, instruction format similar to shift
[[isa/fixedshift]]. bmext is actually covered already (shift-with-mask).
however bitmask-invert is not, and set/clr are not covered, although they can use the same Shift ALU.

| 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
| -- | -- | --- | --- | --- | ------- |--|
| NN | RT | RA | RB | RC | mode 010 |Rc|

```
uint_xlen_t bmset(uint_xlen_t RA, uint_xlen_t RB, int sh)
{
    int shamt = RB & (XLEN - 1);
    uint_xlen_t mask = (2 << sh) - 1;
    return RA | (mask << shamt);
}

uint_xlen_t bmclr(uint_xlen_t RA, uint_xlen_t RB, int sh)
{
    int shamt = RB & (XLEN - 1);
    uint_xlen_t mask = (2 << sh) - 1;
    return RA & ~(mask << shamt);
}

uint_xlen_t bminv(uint_xlen_t RA, uint_xlen_t RB, int sh)
{
    int shamt = RB & (XLEN - 1);
    uint_xlen_t mask = (2 << sh) - 1;
    return RA ^ (mask << shamt);
}

uint_xlen_t bmext(uint_xlen_t RA, uint_xlen_t RB, int sh)
{
    int shamt = RB & (XLEN - 1);
    uint_xlen_t mask = (2 << sh) - 1;
    return mask & (RA >> shamt);
}
```

# grev

based on RV bitmanip

```
uint64_t grev64(uint64_t RA, uint64_t RB)
{
    uint64_t x = RA;
    int shamt = RB & 63;
    if (shamt & 1) x = ((x & 0x5555555555555555LL) << 1) |
                       ((x & 0xAAAAAAAAAAAAAAAALL) >> 1);
    if (shamt & 2) x = ((x & 0x3333333333333333LL) << 2) |
                       ((x & 0xCCCCCCCCCCCCCCCCLL) >> 2);
    if (shamt & 4) x = ((x & 0x0F0F0F0F0F0F0F0FLL) << 4) |
                       ((x & 0xF0F0F0F0F0F0F0F0LL) >> 4);
    if (shamt & 8) x = ((x & 0x00FF00FF00FF00FFLL) << 8) |
                       ((x & 0xFF00FF00FF00FF00LL) >> 8);
    if (shamt & 16) x = ((x & 0x0000FFFF0000FFFFLL) << 16) |
                        ((x & 0xFFFF0000FFFF0000LL) >> 16);
    if (shamt & 32) x = ((x & 0x00000000FFFFFFFFLL) << 32) |
                        ((x & 0xFFFFFFFF00000000LL) >> 32);
    return x;
}
```
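
some useful special cases, as in RV bitmanip: a shamt of 63 reverses all 64 bits, 56 reverses the byte order (byte-swap), 32 swaps the two 32-bit halves, and 7 reverses the bits within each byte.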

# shuffle / unshuffle

based on RV bitmanip

```
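// 32-bit stage helper used by shfl32/unshfl32 below; assumed to mirror
// the shuffle64_stage function given further down
uint32_t shuffle32_stage(uint32_t src, uint32_t maskL, uint32_t maskR, int N)
{
    uint32_t x = src & ~(maskL | maskR);
    x |= ((src << N) & maskL) | ((src >> N) & maskR);
    return x;
}
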
uint32_t shfl32(uint32_t RA, uint32_t RB)
{
    uint32_t x = RA;
    int shamt = RB & 15;
    if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
    if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
    if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
    if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
    return x;
}
uint32_t unshfl32(uint32_t RA, uint32_t RB)
{
    uint32_t x = RA;
    int shamt = RB & 15;
    if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
    if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
    if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
    if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
    return x;
}

uint64_t shuffle64_stage(uint64_t src, uint64_t maskL, uint64_t maskR, int N)
{
    uint64_t x = src & ~(maskL | maskR);
    x |= ((src << N) & maskL) | ((src >> N) & maskR);
    return x;
}
uint64_t shfl64(uint64_t RA, uint64_t RB)
{
    uint64_t x = RA;
    int shamt = RB & 31;
    if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
                                           0x00000000ffff0000LL, 16);
    if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
                                          0x0000ff000000ff00LL, 8);
    if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
                                          0x00f000f000f000f0LL, 4);
    if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
                                          0x0c0c0c0c0c0c0c0cLL, 2);
    if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
                                          0x2222222222222222LL, 1);
    return x;
}
uint64_t unshfl64(uint64_t RA, uint64_t RB)
{
    uint64_t x = RA;
    int shamt = RB & 31;
    if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
                                          0x2222222222222222LL, 1);
    if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
                                          0x0c0c0c0c0c0c0c0cLL, 2);
    if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
                                          0x00f000f000f000f0LL, 4);
    if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
                                          0x0000ff000000ff00LL, 8);
    if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
                                           0x00000000ffff0000LL, 16);
    return x;
}
```
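
shfl with the maximum shamt (15 for 32-bit, 31 for 64-bit) fully interleaves ("zips") the two halves of the register; unshfl with the same shamt is the inverse ("unzip").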

# xperm

based on RV bitmanip

```
uint_xlen_t xperm(uint_xlen_t RA, uint_xlen_t RB, int sz_log2)
{
    uint_xlen_t r = 0;
    uint_xlen_t sz = 1LL << sz_log2;
    uint_xlen_t mask = (1LL << sz) - 1;
    for (int i = 0; i < XLEN; i += sz) {
        uint_xlen_t pos = ((RB >> i) & mask) << sz_log2;
        if (pos < XLEN)
            r |= ((RA >> pos) & mask) << i;
    }
    return r;
}
uint_xlen_t xperm_n (uint_xlen_t RA, uint_xlen_t RB)
{  return xperm(RA, RB, 2); }
uint_xlen_t xperm_b (uint_xlen_t RA, uint_xlen_t RB)
{  return xperm(RA, RB, 3); }
uint_xlen_t xperm_h (uint_xlen_t RA, uint_xlen_t RB)
{  return xperm(RA, RB, 4); }
uint_xlen_t xperm_w (uint_xlen_t RA, uint_xlen_t RB)
{  return xperm(RA, RB, 5); }
```
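
xperm is a table lookup / permutation: each nibble (xperm_n), byte (xperm_b), halfword (xperm_h) or word (xperm_w) of RB is an index selecting the corresponding-sized element of RA, with out-of-range indices producing zero.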

# gorc

based on RV bitmanip

```
uint32_t gorc32(uint32_t RA, uint32_t RB)
{
    uint32_t x = RA;
    int shamt = RB & 31;
    if (shamt & 1) x |= ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >> 1);
    if (shamt & 2) x |= ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >> 2);
    if (shamt & 4) x |= ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >> 4);
    if (shamt & 8) x |= ((x & 0x00FF00FF) << 8) | ((x & 0xFF00FF00) >> 8);
    if (shamt & 16) x |= ((x & 0x0000FFFF) << 16) | ((x & 0xFFFF0000) >> 16);
    return x;
}
uint64_t gorc64(uint64_t RA, uint64_t RB)
{
    uint64_t x = RA;
    int shamt = RB & 63;
    if (shamt & 1) x |= ((x & 0x5555555555555555LL) << 1) |
                        ((x & 0xAAAAAAAAAAAAAAAALL) >> 1);
    if (shamt & 2) x |= ((x & 0x3333333333333333LL) << 2) |
                        ((x & 0xCCCCCCCCCCCCCCCCLL) >> 2);
    if (shamt & 4) x |= ((x & 0x0F0F0F0F0F0F0F0FLL) << 4) |
                        ((x & 0xF0F0F0F0F0F0F0F0LL) >> 4);
    if (shamt & 8) x |= ((x & 0x00FF00FF00FF00FFLL) << 8) |
                        ((x & 0xFF00FF00FF00FF00LL) >> 8);
    if (shamt & 16) x |= ((x & 0x0000FFFF0000FFFFLL) << 16) |
                         ((x & 0xFFFF0000FFFF0000LL) >> 16);
    if (shamt & 32) x |= ((x & 0x00000000FFFFFFFFLL) << 32) |
                         ((x & 0xFFFFFFFF00000000LL) >> 32);
    return x;
}
```
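
gorc is a generalised OR-combine: with a shamt of 7 every byte becomes 0xFF if it was non-zero and 0x00 otherwise (the orc.b operation in RV bitmanip, useful for detecting zero bytes in strings).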

# cmix

based on RV bitmanip, covered by ternary bitops

```
uint_xlen_t cmix(uint_xlen_t RA, uint_xlen_t RB, uint_xlen_t RC) {
    return (RA & RB) | (RC & ~RB);
}
```

# carryless mul

based on RV bitmanip

```
uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
{
    uint_xlen_t x = 0;
    for (int i = 0; i < XLEN; i++)
        if ((RB >> i) & 1)
            x ^= RA << i;
    return x;
}
uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)
{
    uint_xlen_t x = 0;
    for (int i = 1; i < XLEN; i++)
        if ((RB >> i) & 1)
            x ^= RA >> (XLEN-i);
    return x;
}
uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)
{
    uint_xlen_t x = 0;
    for (int i = 0; i < XLEN; i++)
        if ((RB >> i) & 1)
            x ^= RA >> (XLEN-i-1);
    return x;
}
```
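
clmul returns the low XLEN bits of the carry-less (XOR) product and clmulh the high bits; clmulr is the bit-reversed form, equivalent to rev(clmul(rev(RA), rev(RB))), which is the variant most directly useful for CRC computation.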

# crc

```
uint_xlen_t crc32(uint_xlen_t x, int nbits)
{
    for (int i = 0; i < nbits; i++)
        x = (x >> 1) ^ (0xEDB88320 & ~((x&1)-1));
    return x;
}
uint_xlen_t crc32c(uint_xlen_t x, int nbits)
{
    for (int i = 0; i < nbits; i++)
        x = (x >> 1) ^ (0x82F63B78 & ~((x&1)-1));
    return x;
}
uint_xlen_t crc32_b(uint_xlen_t RA) { return crc32(RA, 8); }
uint_xlen_t crc32_h(uint_xlen_t RA) { return crc32(RA, 16); }
uint_xlen_t crc32_w(uint_xlen_t RA) { return crc32(RA, 32); }
uint_xlen_t crc32c_b(uint_xlen_t RA) { return crc32c(RA, 8); }
uint_xlen_t crc32c_h(uint_xlen_t RA) { return crc32c(RA, 16); }
uint_xlen_t crc32c_w(uint_xlen_t RA) { return crc32c(RA, 32); }
#if XLEN > 32
uint_xlen_t crc32_d (uint_xlen_t RA) { return crc32 (RA, 64); }
uint_xlen_t crc32c_d(uint_xlen_t RA) { return crc32c(RA, 64); }
#endif
```
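
these perform nbits rounds of the bit-reflected CRC shift, using polynomial 0xEDB88320 (CRC-32) or 0x82F63B78 (CRC-32C); the _b/_h/_w/_d forms process 8, 16, 32 or 64 bits of the register per instruction.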

# bitmatrix

```
uint64_t bmatflip(uint64_t RA)
{
    uint64_t x = RA;
    x = shfl64(x, 31);
    x = shfl64(x, 31);
    x = shfl64(x, 31);
    return x;
}
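
// population-count helper assumed by bmatxor below (counts set bits);
// bmatxor is an 8x8 bit-matrix multiply over GF(2) (AND/XOR), bmator
// the same with OR in place of XOR
int pcnt(uint64_t x)
{
    int c = 0;
    for (; x != 0; x >>= 1)
        c += x & 1;
    return c;
}
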
uint64_t bmatxor(uint64_t RA, uint64_t RB)
{
    // transpose of RB
    uint64_t RBt = bmatflip(RB);
    uint8_t u[8]; // rows of RA
    uint8_t v[8]; // cols of RB
    for (int i = 0; i < 8; i++) {
        u[i] = RA >> (i*8);
        v[i] = RBt >> (i*8);
    }
    uint64_t x = 0;
    for (int i = 0; i < 64; i++) {
        if (pcnt(u[i / 8] & v[i % 8]) & 1)
            x |= 1LL << i;
    }
    return x;
}
uint64_t bmator(uint64_t RA, uint64_t RB)
{
    // transpose of RB
    uint64_t RBt = bmatflip(RB);
    uint8_t u[8]; // rows of RA
    uint8_t v[8]; // cols of RB
    for (int i = 0; i < 8; i++) {
        u[i] = RA >> (i*8);
        v[i] = RBt >> (i*8);
    }
    uint64_t x = 0;
    for (int i = 0; i < 64; i++) {
        if ((u[i / 8] & v[i % 8]) != 0)
            x |= 1LL << i;
    }
    return x;
}
```