[[!tag standards]]

# bit to byte permute

similar to matrix permute in RV bitmanip, which has XOR and OR variants

    # NOTE(review): i is presumably the dword (element) index supplied by
    # the enclosing per-element loop -- TODO confirm in full definition
    do j = 0 to 7
      do k = 0 to 7
         b = VSR[VRB+32].dword[i].byte[k].bit[j]
         VSR[VRT+32].dword[i].byte[j].bit[k] = b

# vector bit deposit

vpdepd VRT,VRA,VRB, identical to RV bitmanip bdep

    # NOTE(review): m and k presumably start at 0 -- TODO confirm in
    # full definition
    do while(m < 64)
       if VSR[VRB+32].dword[i].bit[63-m]=1 then do
          result = VSR[VRA+32].dword[i].bit[63-k]
          VSR[VRT+32].dword[i].bit[63-m] = result
          k = k + 1
       m = m + 1

```
uint_xlen_t bdep(uint_xlen_t RA, uint_xlen_t RB)
{
    uint_xlen_t r = 0;
    for (int i = 0, j = 0; i < XLEN; i++)
        if ((RB >> i) & 1) {
            if ((RA >> j) & 1)
                r |= (uint_xlen_t)1 << i; /* was uint_xlen_t(1): C++ cast syntax, not C */
            j++;
        }
    return r;
}
```

# vector bit extract

other way round: identical to RV bext

```
uint_xlen_t bext(uint_xlen_t RA, uint_xlen_t RB)
{
    uint_xlen_t r = 0;
    for (int i = 0, j = 0; i < XLEN; i++)
        if ((RB >> i) & 1) {
            if ((RA >> i) & 1)
                r |= (uint_xlen_t)1 << j; /* was uint_xlen_t(1): C++ cast syntax, not C */
            j++;
        }
    return r;
}
```

# ternary bitops

Similar to FPGA LUTs: for every bit perform a lookup into a table using
an 8bit immediate, or in another register

| 0.5|6.10|11.15|16.20| 21..25| 26..30 |31|
| -- | -- | --- | --- | ----- | -------- |--|
| NN | RT | RA | RB | im0-4 | im5-7 00 |Rc|

    for i in range(64):
        idx = RT[i] << 2 | RA[i] << 1 | RB[i]
        RT[i] = (imm & (1<<idx)) != 0

<!-- NOTE(review): the original text between "(1<" and "> shamt); }" was
     destroyed by markup stripping (interpreted as an HTML tag). The
     lookup expression above is reconstructed; the stray remnant
     "> shamt); }" indicates a C shift/rotate example followed here --
     recover the missing span from page history. -->

# grev

based on RV bitmanip

```
uint64_t grev64(uint64_t RA, uint64_t RB)
{
    uint64_t x = RA;
    int shamt = RB & 63;
    if (shamt & 1)  x = ((x & 0x5555555555555555LL) <<  1) |
                        ((x & 0xAAAAAAAAAAAAAAAALL) >>  1);
    if (shamt & 2)  x = ((x & 0x3333333333333333LL) <<  2) |
                        ((x & 0xCCCCCCCCCCCCCCCCLL) >>  2);
    if (shamt & 4)  x = ((x & 0x0F0F0F0F0F0F0F0FLL) <<  4) |
                        ((x & 0xF0F0F0F0F0F0F0F0LL) >>  4);
    if (shamt & 8)  x = ((x & 0x00FF00FF00FF00FFLL) <<  8) |
                        ((x & 0xFF00FF00FF00FF00LL) >>  8);
    if (shamt & 16) x = ((x & 0x0000FFFF0000FFFFLL) << 16) |
                        ((x & 0xFFFF0000FFFF0000LL) >> 16);
    if (shamt & 32) x = ((x & 0x00000000FFFFFFFFLL) << 32) |
                        ((x & 0xFFFFFFFF00000000LL) >> 32);
    return x;
}
```

# shuffle / unshuffle

based on RV bitmanip

```
/* shuffle32_stage was missing from this page but is called by
 * shfl32/unshfl32 below; definition taken from the RV Bitmanip spec. */
uint32_t shuffle32_stage(uint32_t src, uint32_t maskL, uint32_t maskR, int N)
{
    uint32_t x = src & ~(maskL | maskR);
    x |= ((src << N) & maskL) | ((src >> N) & maskR);
    return x;
}

uint32_t shfl32(uint32_t RA, uint32_t RB)
{
    uint32_t x = RA;
    int shamt = RB & 15;
    if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
    if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
    if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
    if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
    return x;
}

/* inverse of shfl32: same stages applied in the opposite order */
uint32_t unshfl32(uint32_t RA, uint32_t RB)
{
    uint32_t x = RA;
    int shamt = RB & 15;
    if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
    if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
    if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
    if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
    return x;
}

uint64_t shuffle64_stage(uint64_t src, uint64_t maskL, uint64_t maskR, int N)
{
    uint64_t x = src & ~(maskL | maskR);
    x |= ((src << N) & maskL) | ((src >> N) & maskR);
    return x;
}

uint64_t shfl64(uint64_t RA, uint64_t RB)
{
    uint64_t x = RA;
    int shamt = RB & 31;
    if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
                                           0x00000000ffff0000LL, 16);
    if (shamt &  8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
                                           0x0000ff000000ff00LL, 8);
    if (shamt &  4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
                                           0x00f000f000f000f0LL, 4);
    if (shamt &  2) x = shuffle64_stage(x, 0x3030303030303030LL,
                                           0x0c0c0c0c0c0c0c0cLL, 2);
    if (shamt &  1) x = shuffle64_stage(x, 0x4444444444444444LL,
                                           0x2222222222222222LL, 1);
    return x;
}

/* inverse of shfl64: same stages applied in the opposite order */
uint64_t unshfl64(uint64_t RA, uint64_t RB)
{
    uint64_t x = RA;
    int shamt = RB & 31;
    if (shamt &  1) x = shuffle64_stage(x, 0x4444444444444444LL,
                                           0x2222222222222222LL, 1);
    if (shamt &  2) x = shuffle64_stage(x, 0x3030303030303030LL,
                                           0x0c0c0c0c0c0c0c0cLL, 2);
    if (shamt &  4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
                                           0x00f000f000f000f0LL, 4);
    if (shamt &  8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
                                           0x0000ff000000ff00LL, 8);
    if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
                                           0x00000000ffff0000LL, 16);
    return x;
}
```

# xperm

based on RV bitmanip

```
/* indexed permutation of sz-bit fields: each field of RB selects a
 * field of RA; out-of-range indices produce zero fields */
uint_xlen_t xperm(uint_xlen_t RA, uint_xlen_t RB, int sz_log2)
{
    uint_xlen_t r = 0;
    uint_xlen_t sz = 1LL << sz_log2;
    uint_xlen_t mask = (1LL << sz) - 1;
    for (int i = 0; i < XLEN; i += sz) {
        uint_xlen_t pos = ((RB >> i) & mask) << sz_log2;
        if (pos < XLEN)
            r |= ((RA >> pos) & mask) << i;
    }
    return r;
}

uint_xlen_t xperm_n (uint_xlen_t RA, uint_xlen_t RB) /* nibbles */
{ return xperm(RA, RB, 2); }
uint_xlen_t xperm_b (uint_xlen_t RA, uint_xlen_t RB) /* bytes */
{ return xperm(RA, RB, 3); }
uint_xlen_t xperm_h (uint_xlen_t RA, uint_xlen_t RB) /* halfwords */
{ return xperm(RA, RB, 4); }
uint_xlen_t xperm_w (uint_xlen_t RA, uint_xlen_t RB) /* words */
{ return xperm(RA, RB, 5); }
```

# gorc

based on RV bitmanip

```
uint32_t gorc32(uint32_t RA, uint32_t RB)
{
    uint32_t x = RA;
    int shamt = RB & 31;
    if (shamt &  1) x |= ((x & 0x55555555) <<  1) | ((x & 0xAAAAAAAA) >>  1);
    if (shamt &  2) x |= ((x & 0x33333333) <<  2) | ((x & 0xCCCCCCCC) >>  2);
    if (shamt &  4) x |= ((x & 0x0F0F0F0F) <<  4) | ((x & 0xF0F0F0F0) >>  4);
    if (shamt &  8) x |= ((x & 0x00FF00FF) <<  8) | ((x & 0xFF00FF00) >>  8);
    if (shamt & 16) x |= ((x & 0x0000FFFF) << 16) | ((x & 0xFFFF0000) >> 16);
    return x;
}

uint64_t gorc64(uint64_t RA, uint64_t RB)
{
    uint64_t x = RA;
    int shamt = RB & 63;
    if (shamt &  1) x |= ((x & 0x5555555555555555LL) <<  1) |
                         ((x & 0xAAAAAAAAAAAAAAAALL) >>  1);
    if (shamt &  2) x |= ((x & 0x3333333333333333LL) <<  2) |
                         ((x & 0xCCCCCCCCCCCCCCCCLL) >>  2);
    if (shamt &  4) x |= ((x & 0x0F0F0F0F0F0F0F0FLL) <<  4) |
                         ((x & 0xF0F0F0F0F0F0F0F0LL) >>  4);
    if (shamt &  8) x |= ((x & 0x00FF00FF00FF00FFLL) <<  8) |
                         ((x & 0xFF00FF00FF00FF00LL) >>  8);
    if (shamt & 16) x |= ((x & 0x0000FFFF0000FFFFLL) << 16) |
                         ((x & 0xFFFF0000FFFF0000LL) >> 16);
    if (shamt & 32) x |= ((x & 0x00000000FFFFFFFFLL) << 32) |
                         ((x & 0xFFFFFFFF00000000LL) >> 32);
    return x;
}
```

# cmix

based on RV bitmanip, covered by ternary bitops

```
/* bitwise select: RB chooses between RA (1) and RC (0) */
uint_xlen_t cmix(uint_xlen_t RA, uint_xlen_t RB, uint_xlen_t RC) {
    return (RA & RB) | (RC & ~RB);
}
```

# carryless mul

based on RV bitmanip

```
uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
{
    uint_xlen_t x = 0;
    for (int i = 0; i < XLEN; i++)
        if ((RB >> i) & 1)
            x ^= RA << i;
    return x;
}

/* high XLEN bits of the 2*XLEN-bit carryless product */
uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)
{
    uint_xlen_t x = 0;
    for (int i = 1; i < XLEN; i++)
        if ((RB >> i) & 1)
            x ^= RA >> (XLEN-i);
    return x;
}

/* bit-reversed carryless product */
uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)
{
    uint_xlen_t x = 0;
    for (int i = 0; i < XLEN; i++)
        if ((RB >> i) & 1)
            x ^= RA >> (XLEN-i-1);
    return x;
}
```

# crc

```
/* CRC-32 (polynomial 0xEDB88320, reflected), nbits input bits */
uint_xlen_t crc32(uint_xlen_t x, int nbits)
{
    for (int i = 0; i < nbits; i++)
        x = (x >> 1) ^ (0xEDB88320 & ~((x&1)-1));
    return x;
}

/* CRC-32C / Castagnoli (polynomial 0x82F63B78, reflected) */
uint_xlen_t crc32c(uint_xlen_t x, int nbits)
{
    for (int i = 0; i < nbits; i++)
        x = (x >> 1) ^ (0x82F63B78 & ~((x&1)-1));
    return x;
}

uint_xlen_t crc32_b (uint_xlen_t RA) { return crc32 (RA,  8); }
uint_xlen_t crc32_h (uint_xlen_t RA) { return crc32 (RA, 16); }
uint_xlen_t crc32_w (uint_xlen_t RA) { return crc32 (RA, 32); }
uint_xlen_t crc32c_b(uint_xlen_t RA) { return crc32c(RA,  8); }
uint_xlen_t crc32c_h(uint_xlen_t RA) { return crc32c(RA, 16); }
uint_xlen_t crc32c_w(uint_xlen_t RA) { return crc32c(RA, 32); }
#if XLEN > 32
uint_xlen_t crc32_d (uint_xlen_t RA) { return crc32 (RA, 64); }
uint_xlen_t crc32c_d(uint_xlen_t RA) { return crc32c(RA, 64); }
#endif
```

# bitmatrix

```
/* transpose of the 8x8 bit matrix held in RA: three zip (shfl) passes */
uint64_t bmatflip(uint64_t RA)
{
    uint64_t x = RA;
    x = shfl64(x, 31);
    x = shfl64(x, 31);
    x = shfl64(x, 31);
    return x;
}

/* 8x8 bit-matrix multiply, XOR (GF(2)) accumulation.
 * NOTE(review): pcnt() is RV bitmanip population count; its definition
 * is not included in this document. */
uint64_t bmatxor(uint64_t RA, uint64_t RB)
{
    // transpose of RB
    uint64_t RBt = bmatflip(RB);
    uint8_t u[8]; // rows of RA
    uint8_t v[8]; // cols of RB
    for (int i = 0; i < 8; i++) {
        u[i] = RA >> (i*8);
        v[i] = RBt >> (i*8);
    }
    uint64_t x = 0;
    for (int i = 0; i < 64; i++) {
        if (pcnt(u[i / 8] & v[i % 8]) & 1)
            x |= 1LL << i;
    }
    return x;
}

/* 8x8 bit-matrix multiply, OR accumulation */
uint64_t bmator(uint64_t RA, uint64_t RB)
{
    // transpose of RB
    uint64_t RBt = bmatflip(RB);
    uint8_t u[8]; // rows of RA
    uint8_t v[8]; // cols of RB
    for (int i = 0; i < 8; i++) {
        u[i] = RA >> (i*8);
        v[i] = RBt >> (i*8);
    }
    uint64_t x = 0;
    for (int i = 0; i < 64; i++) {
        if ((u[i / 8] & v[i % 8]) != 0)
            x |= 1LL << i;
    }
    return x;
}
```