[[!tag standards]]

# bit to byte permute

similar to matrix permute in RV bitmanip, which has XOR and OR variants

    # NOTE(review): i is presumably the dword (element) index supplied by
    # the enclosing per-element loop -- TODO confirm in full definition
    do j = 0 to 7
      do k = 0 to 7
         b = VSR[VRB+32].dword[i].byte[k].bit[j]
         VSR[VRT+32].dword[i].byte[j].bit[k] = b

# vector bit deposit

vpdepd VRT,VRA,VRB, identical to RV bitmanip bdep

    # NOTE(review): m and k presumably start at 0 -- TODO confirm in
    # full definition
    do while(m < 64)
       if VSR[VRB+32].dword[i].bit[63-m]=1 then do
          result = VSR[VRA+32].dword[i].bit[63-k]
          VSR[VRT+32].dword[i].bit[63-m] = result
          k = k + 1
       m = m + 1

```
uint_xlen_t bdep(uint_xlen_t RA, uint_xlen_t RB)
{
    uint_xlen_t r = 0;
    for (int i = 0, j = 0; i < XLEN; i++)
        if ((RB >> i) & 1) {
            if ((RA >> j) & 1)
                r |= (uint_xlen_t)1 << i; /* was uint_xlen_t(1): C++ cast syntax, not C */
            j++;
        }
    return r;
}
```

# vector bit extract

other way round: identical to RV bext

```
uint_xlen_t bext(uint_xlen_t RA, uint_xlen_t RB)
{
    uint_xlen_t r = 0;
    for (int i = 0, j = 0; i < XLEN; i++)
        if ((RB >> i) & 1) {
            if ((RA >> i) & 1)
                r |= (uint_xlen_t)1 << j; /* was uint_xlen_t(1): C++ cast syntax, not C */
            j++;
        }
    return r;
}
```

# ternary bitops

Similar to FPGA LUTs: for every bit perform a lookup into a table using
an 8bit immediate, or in another register

| 0.5|6.10|11.15|16.20| 21..25| 26..30 |31|
| -- | -- | --- | --- | ----- | -------- |--|
| NN | RT | RA | RB | im0-4 | im5-7 00 |Rc|

    for i in range(64):
        idx = RT[i] << 2 | RA[i] << 1 | RB[i]
        RT[i] = (imm & (1<<idx)) != 0

<!-- NOTE(review): the original text between "(1<" and "> shamt); }" was
     destroyed by markup stripping (interpreted as an HTML tag). The
     lookup expression above is reconstructed; the stray remnant
     "> shamt); }" indicates a C shift/rotate example followed here --
     recover the missing span from page history. -->

# grev

based on RV bitmanip

```
uint64_t grev64(uint64_t RA, uint64_t RB)
{
    uint64_t x = RA;
    int shamt = RB & 63;
    if (shamt & 1)  x = ((x & 0x5555555555555555LL) <<  1) |
                        ((x & 0xAAAAAAAAAAAAAAAALL) >>  1);
    if (shamt & 2)  x = ((x & 0x3333333333333333LL) <<  2) |
                        ((x & 0xCCCCCCCCCCCCCCCCLL) >>  2);
    if (shamt & 4)  x = ((x & 0x0F0F0F0F0F0F0F0FLL) <<  4) |
                        ((x & 0xF0F0F0F0F0F0F0F0LL) >>  4);
    if (shamt & 8)  x = ((x & 0x00FF00FF00FF00FFLL) <<  8) |
                        ((x & 0xFF00FF00FF00FF00LL) >>  8);
    if (shamt & 16) x = ((x & 0x0000FFFF0000FFFFLL) << 16) |
                        ((x & 0xFFFF0000FFFF0000LL) >> 16);
    if (shamt & 32) x = ((x & 0x00000000FFFFFFFFLL) << 32) |
                        ((x & 0xFFFFFFFF00000000LL) >> 32);
    return x;
}
```

# shuffle / unshuffle

based on RV bitmanip

```
/* shuffle32_stage was missing from this page but is called by
 * shfl32/unshfl32 below; definition taken from the RV Bitmanip spec. */
uint32_t shuffle32_stage(uint32_t src, uint32_t maskL, uint32_t maskR, int N)
{
    uint32_t x = src & ~(maskL | maskR);
    x |= ((src << N) & maskL) | ((src >> N) & maskR);
    return x;
}

uint32_t shfl32(uint32_t RA, uint32_t RB)
{
    uint32_t x = RA;
    int shamt = RB & 15;
    if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
    if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
    if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
    if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
    return x;
}

/* inverse of shfl32: same stages applied in the opposite order */
uint32_t unshfl32(uint32_t RA, uint32_t RB)
{
    uint32_t x = RA;
    int shamt = RB & 15;
    if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
    if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
    if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
    if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
    return x;
}

uint64_t shuffle64_stage(uint64_t src, uint64_t maskL, uint64_t maskR, int N)
{
    uint64_t x = src & ~(maskL | maskR);
    x |= ((src << N) & maskL) | ((src >> N) & maskR);
    return x;
}

uint64_t shfl64(uint64_t RA, uint64_t RB)
{
    uint64_t x = RA;
    int shamt = RB & 31;
    if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
                                           0x00000000ffff0000LL, 16);
    if (shamt &  8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
                                           0x0000ff000000ff00LL, 8);
    if (shamt &  4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
                                           0x00f000f000f000f0LL, 4);
    if (shamt &  2) x = shuffle64_stage(x, 0x3030303030303030LL,
                                           0x0c0c0c0c0c0c0c0cLL, 2);
    if (shamt &  1) x = shuffle64_stage(x, 0x4444444444444444LL,
                                           0x2222222222222222LL, 1);
    return x;
}

/* inverse of shfl64: same stages applied in the opposite order */
uint64_t unshfl64(uint64_t RA, uint64_t RB)
{
    uint64_t x = RA;
    int shamt = RB & 31;
    if (shamt &  1) x = shuffle64_stage(x, 0x4444444444444444LL,
                                           0x2222222222222222LL, 1);
    if (shamt &  2) x = shuffle64_stage(x, 0x3030303030303030LL,
                                           0x0c0c0c0c0c0c0c0cLL, 2);
    if (shamt &  4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
                                           0x00f000f000f000f0LL, 4);
    if (shamt &  8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
                                           0x0000ff000000ff00LL, 8);
    if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
                                           0x00000000ffff0000LL, 16);
    return x;
}
```

# xperm

based on RV bitmanip

```
/* indexed permutation of sz-bit fields: each field of RB selects a
 * field of RA; out-of-range indices produce zero fields */
uint_xlen_t xperm(uint_xlen_t RA, uint_xlen_t RB, int sz_log2)
{
    uint_xlen_t r = 0;
    uint_xlen_t sz = 1LL << sz_log2;
    uint_xlen_t mask = (1LL << sz) - 1;
    for (int i = 0; i < XLEN; i += sz) {
        uint_xlen_t pos = ((RB >> i) & mask) << sz_log2;
        if (pos < XLEN)
            r |= ((RA >> pos) & mask) << i;
    }
    return r;
}

uint_xlen_t xperm_n (uint_xlen_t RA, uint_xlen_t RB) /* nibbles */
{ return xperm(RA, RB, 2); }
uint_xlen_t xperm_b (uint_xlen_t RA, uint_xlen_t RB) /* bytes */
{ return xperm(RA, RB, 3); }
uint_xlen_t xperm_h (uint_xlen_t RA, uint_xlen_t RB) /* halfwords */
{ return xperm(RA, RB, 4); }
uint_xlen_t xperm_w (uint_xlen_t RA, uint_xlen_t RB) /* words */
{ return xperm(RA, RB, 5); }
```

# gorc

based on RV bitmanip

```
uint32_t gorc32(uint32_t RA, uint32_t RB)
{
    uint32_t x = RA;
    int shamt = RB & 31;
    if (shamt &  1) x |= ((x & 0x55555555) <<  1) | ((x & 0xAAAAAAAA) >>  1);
    if (shamt &  2) x |= ((x & 0x33333333) <<  2) | ((x & 0xCCCCCCCC) >>  2);
    if (shamt &  4) x |= ((x & 0x0F0F0F0F) <<  4) | ((x & 0xF0F0F0F0) >>  4);
    if (shamt &  8) x |= ((x & 0x00FF00FF) <<  8) | ((x & 0xFF00FF00) >>  8);
    if (shamt & 16) x |= ((x & 0x0000FFFF) << 16) | ((x & 0xFFFF0000) >> 16);
    return x;
}

uint64_t gorc64(uint64_t RA, uint64_t RB)
{
    uint64_t x = RA;
    int shamt = RB & 63;
    if (shamt &  1) x |= ((x & 0x5555555555555555LL) <<  1) |
                         ((x & 0xAAAAAAAAAAAAAAAALL) >>  1);
    if (shamt &  2) x |= ((x & 0x3333333333333333LL) <<  2) |
                         ((x & 0xCCCCCCCCCCCCCCCCLL) >>  2);
    if (shamt &  4) x |= ((x & 0x0F0F0F0F0F0F0F0FLL) <<  4) |
                         ((x & 0xF0F0F0F0F0F0F0F0LL) >>  4);
    if (shamt &  8) x |= ((x & 0x00FF00FF00FF00FFLL) <<  8) |
                         ((x & 0xFF00FF00FF00FF00LL) >>  8);
    if (shamt & 16) x |= ((x & 0x0000FFFF0000FFFFLL) << 16) |
                         ((x & 0xFFFF0000FFFF0000LL) >> 16);
    if (shamt & 32) x |= ((x & 0x00000000FFFFFFFFLL) << 32) |
                         ((x & 0xFFFFFFFF00000000LL) >> 32);
    return x;
}
```

# cmix

based on RV bitmanip, covered by ternary bitops

```
/* bitwise select: RB chooses between RA (1) and RC (0) */
uint_xlen_t cmix(uint_xlen_t RA, uint_xlen_t RB, uint_xlen_t RC) {
    return (RA & RB) | (RC & ~RB);
}
```

# carryless mul

based on RV bitmanip

```
uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
{
    uint_xlen_t x = 0;
    for (int i = 0; i < XLEN; i++)
        if ((RB >> i) & 1)
            x ^= RA << i;
    return x;
}

/* high XLEN bits of the 2*XLEN-bit carryless product */
uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)
{
    uint_xlen_t x = 0;
    for (int i = 1; i < XLEN; i++)
        if ((RB >> i) & 1)
            x ^= RA >> (XLEN-i);
    return x;
}

/* bit-reversed carryless product */
uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)
{
    uint_xlen_t x = 0;
    for (int i = 0; i < XLEN; i++)
        if ((RB >> i) & 1)
            x ^= RA >> (XLEN-i-1);
    return x;
}
```

# crc

```
/* CRC-32 (polynomial 0xEDB88320, reflected), nbits input bits */
uint_xlen_t crc32(uint_xlen_t x, int nbits)
{
    for (int i = 0; i < nbits; i++)
        x = (x >> 1) ^ (0xEDB88320 & ~((x&1)-1));
    return x;
}

/* CRC-32C / Castagnoli (polynomial 0x82F63B78, reflected) */
uint_xlen_t crc32c(uint_xlen_t x, int nbits)
{
    for (int i = 0; i < nbits; i++)
        x = (x >> 1) ^ (0x82F63B78 & ~((x&1)-1));
    return x;
}

uint_xlen_t crc32_b (uint_xlen_t RA) { return crc32 (RA,  8); }
uint_xlen_t crc32_h (uint_xlen_t RA) { return crc32 (RA, 16); }
uint_xlen_t crc32_w (uint_xlen_t RA) { return crc32 (RA, 32); }
uint_xlen_t crc32c_b(uint_xlen_t RA) { return crc32c(RA,  8); }
uint_xlen_t crc32c_h(uint_xlen_t RA) { return crc32c(RA, 16); }
uint_xlen_t crc32c_w(uint_xlen_t RA) { return crc32c(RA, 32); }
#if XLEN > 32
uint_xlen_t crc32_d (uint_xlen_t RA) { return crc32 (RA, 64); }
uint_xlen_t crc32c_d(uint_xlen_t RA) { return crc32c(RA, 64); }
#endif
```

# bitmatrix

```
/* transpose of the 8x8 bit matrix held in RA: three zip (shfl) passes */
uint64_t bmatflip(uint64_t RA)
{
    uint64_t x = RA;
    x = shfl64(x, 31);
    x = shfl64(x, 31);
    x = shfl64(x, 31);
    return x;
}

/* 8x8 bit-matrix multiply, XOR (GF(2)) accumulation.
 * NOTE(review): pcnt() is RV bitmanip population count; its definition
 * is not included in this document. */
uint64_t bmatxor(uint64_t RA, uint64_t RB)
{
    // transpose of RB
    uint64_t RBt = bmatflip(RB);
    uint8_t u[8]; // rows of RA
    uint8_t v[8]; // cols of RB
    for (int i = 0; i < 8; i++) {
        u[i] = RA >> (i*8);
        v[i] = RBt >> (i*8);
    }
    uint64_t x = 0;
    for (int i = 0; i < 64; i++) {
        if (pcnt(u[i / 8] & v[i % 8]) & 1)
            x |= 1LL << i;
    }
    return x;
}

/* 8x8 bit-matrix multiply, OR accumulation */
uint64_t bmator(uint64_t RA, uint64_t RB)
{
    // transpose of RB
    uint64_t RBt = bmatflip(RB);
    uint8_t u[8]; // rows of RA
    uint8_t v[8]; // cols of RB
    for (int i = 0; i < 8; i++) {
        u[i] = RA >> (i*8);
        v[i] = RBt >> (i*8);
    }
    uint64_t x = 0;
    for (int i = 0; i < 64; i++) {
        if ((u[i / 8] & v[i % 8]) != 0)
            x |= 1LL << i;
    }
    return x;
}
```