# summary
+minor opcode allocation
+
+ | 28.30 |31| name |
+ | ------ |--| --------- |
+ | 00 |Rc| ternaryi |
+ | 001 |Rc| ternary |
+ | 010 |Rc| bitmask |
+ | 011 |Rc| gf* |
+ | 101 |1 | ternaryv |
+ | 101 |0 | ternarycr |
+ | 110 |Rc| 1/2-op |
+ | 111 |Rc| bitmaski |
+
1-op and variants
| dest | src1 | subop | op |
| ---- | ---- | ----- | -------- |
| RT | RA | .. | bmatflip |
-| RT | RA | size | crc32 |
-| RT | RA | size | crc32c |
2-op and variants
| RT | RA | RB | bdep | dep/ext |
| RT | RA | RB | bext | dep/ext |
| RT | RA | RB | | grev |
+| RT | RA | RB | | clmul* |
| RT | RA | RB | | gorc |
| RT | RA | RB | shuf | shuffle |
| RT | RA | RB | unshuf| shuffle |
| RT | RA | RB | width | xperm |
-| RT | RA | RB | type | clmul |
| RT | RA | RB | type | minmax |
| RT | RA | RB | | |
| RT | RA | RB | | |
| RT | RA | RB | | |
+3 ops
+
+* bitmask set/extract
+* ternary bitops
+* GF
+
+| 0.5|6.10|11.15|16.20|21..25 | 26....30 |31| name |
+| -- | -- | --- | --- | ----- | -------- |--| ------ |
+| NN | RT | RA | RB | RC | mode 001 |Rc| ternary |
+| NN | RT | RA | RB | im0-4 | im5-7 00 |Rc| ternaryi |
+| NN | RS | RA | RB | deg | 00 011 |Rc| gfmul |
+| NN | RS | RA | RB | deg | 01 011 |Rc| gfadd |
+| NN | RT | RA | RB | deg | 10 011 |Rc| gfinv |
+| NN | RS | RA | RB | deg | 11 011 |Rc| gf rsvd |
+
+| 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31| name |
+| -- | -- | --- | ----- | ---- | ----- |--| ------ |
+| NN | RT | RA | imm | mask | 101 |1 | ternaryv |
+
+| 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31| name |
+| -- | -- | --- | --- |- |-----|----- | -----|--| -------|
+| NN | BA | BB | BC |0 |imm | mask | 101 |0 | ternarycr |
+
ops
-| 0.5|6.10|11.15|16.20| 21.22 | 23 | 24..30 |31| name |
-| -- | -- | --- | --- | ----- | -- | ------- |--| ---- |
-| NN | RA | RB | RC | itype | 0 | 0000110 |Rc| bmops |
-| NN | RA | RB | RC | itype | 1 | 0000110 |Rc| xperm |
-| NN | RA | RB | RC | itype | 0 | 0100110 |Rc| minmax |
-| NN | RA | RB | | | 1 | 0100110 |Rc| rsvd |
-| NN | RA | RB | sh | itype | SH | 1000110 |Rc| bmopsi |
-| NN | RA | RB | | | | 1100110 |Rc| rsvd |
-| NN | RA | RB | RC | itype | 0 | 0001110 |Rc| clmul |
-| NN | RA | RB | sh | itype | 0 | 0101110 |Rc| clmulw |
-| NN | RA | RB | RC | 00 | 0 | 0010110 |Rc| gorc |
-| NN | RA | RB | sh | 00 | SH | 1010110 |Rc| gorci |
-| NN | RA | RB | RC | 00 | 0 | 0110110 |Rc| gorcw |
-| NN | RA | RB | sh | 00 | 0 | 1110110 |Rc| gorcwi |
-| NN | RA | RB | RC | 00 | 1 | 1110110 |Rc| bmator |
-| NN | RA | RB | RC | 01 | 0 | 0010110 |Rc| grev |
-| NN | RA | RB | sh | 01 | SH | 1010110 |Rc| grevi |
-| NN | RA | RB | RC | 01 | 0 | 0110110 |Rc| grevw |
-| NN | RA | RB | sh | 01 | 0 | 1110110 |Rc| grevwi |
-| NN | RA | RB | RC | 01 | 1 | 1110110 |Rc| bmatxor |
-| NN | RA | RB | RC | 10 | 0 | 0010110 |Rc| shfl |
-| NN | RA | RB | sh | 10 | SH | 1010110 |Rc| shfli |
-| NN | RA | RB | RC | 10 | 0 | 0110110 |Rc| shflw |
-| NN | RA | RB | RC | 10 | 0 | 1110110 |Rc| bdep |
-| NN | RA | RB | RC | 10 | 1 | 1110110 |Rc| bext |
-| NN | RA | RB | | 11 | | 1110110 |Rc| rsvd |
-| NN | RA | RB | | | | NN11110 |Rc| rsvd |
+| 0.5|6.10|11.15|16.20| 21.22 | 23 | 24....30 |31| name |
+| -- | -- | --- | --- | ----- | -- | -------- |--| ---- |
+| NN | RA | RB | | | 0 | 0000 110 |Rc| rsvd |
+| NN | RA | RB | RC | itype | 1 | 0000 110 |Rc| xperm |
+| NN | RA | RB | RC | itype | 0 | 0100 110 |Rc| minmax |
+| NN | RA | RB | | | 1 | 0100 110 |Rc| rsvd |
+| NN | RA | RB | sh | itype | SH | 1000 110 |Rc| bmopsi |
+| NN | RA | RB | | | | 1100 110 |Rc| rsvd |
+| NN | RA | RB | | | 0 | 0001 110 |Rc| rsvd |
+| NN | RA | RB | | | 0 | 0101 110 |Rc| rsvd |
+| NN | RA | RB | RC | 00 | 0 | 0010 110 |Rc| gorc |
+| NN | RA | RB | sh | 00 | SH | 1010 110 |Rc| gorci |
+| NN | RA | RB | RC | 00 | 0 | 0110 110 |Rc| gorcw |
+| NN | RA | RB | sh | 00 | 0 | 1110 110 |Rc| gorcwi |
+| NN | RA | RB | RC | 00 | 1 | 1110 110 |Rc| bmator |
+| NN | RA | RB | RC | 01 | 0 | 0010 110 |Rc| grev |
+| NN | RA | RB | RC | 01 | 1 | 0010 110 |Rc| clmul |
+| NN | RA | RB | sh | 01 | SH | 1010 110 |Rc| grevi |
+| NN | RA | RB | RC | 01 | 0 | 0110 110 |Rc| grevw |
+| NN | RA | RB | sh | 01 | 0 | 1110 110 |Rc| grevwi |
+| NN | RA | RB | RC | 01 | 1 | 1110 110 |Rc| bmatxor |
+| NN | RA | RB | RC | 10 | 0 | 0010 110 |Rc| shfl |
+| NN | RA | RB | sh | 10 | SH | 1010 110 |Rc| shfli |
+| NN | RA | RB | RC | 10 | 0 | 0110 110 |Rc| shflw |
+| NN | RA | RB | RC | 10 | 0 | 1110 110 |Rc| bdep |
+| NN | RA | RB | RC | 10 | 1 | 1110 110 |Rc| bext |
+| NN | RA | RB | RC | 11 | 0 | 1110 110 |Rc| clmulr |
+| NN | RA | RB | RC | 11 | 1 | 1110 110 |Rc| clmulh |
+| NN | RA | RB | | | | NN11 110 |Rc| rsvd |
# bit to byte permute
signed/unsigned min/max gives more flexibility.
+```
+uint_xlen_t min(uint_xlen_t rs1, uint_xlen_t rs2)
+{ return (int_xlen_t)rs1 < (int_xlen_t)rs2 ? rs1 : rs2;
+}
+uint_xlen_t max(uint_xlen_t rs1, uint_xlen_t rs2)
+{ return (int_xlen_t)rs1 > (int_xlen_t)rs2 ? rs1 : rs2;
+}
+uint_xlen_t minu(uint_xlen_t rs1, uint_xlen_t rs2)
+{ return rs1 < rs2 ? rs1 : rs2;
+}
+uint_xlen_t maxu(uint_xlen_t rs1, uint_xlen_t rs2)
+{ return rs1 > rs2 ? rs1 : rs2;
+}
+```
+
+
# ternary bitops
Similar to FPGA LUTs: for every bit perform a lookup into a table using an 8bit immediate, or in another register
a 4 operand variant which becomes more along the lines of an FPGA:
-| 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
-| -- | -- | --- | --- | --- | ------- |--|
-| NN | RT | RA | RB | RC | mode 010 |Rc|
+| 0.5|6.10|11.15|16.20|21.25| 26...30 |31|
+| -- | -- | --- | --- | --- | -------- |--|
+| NN | RT | RA | RB | RC | mode 001 |Rc|
for i in range(64):
idx = RT[i] << 2 | RA[i] << 1 | RB[i]
RT[i] = (RC & (1<<idx)) != 0
-mode (2 bit) may be used to do inversion of ordering, similar to carryless mul.
+mode (2 bit) may be used to do inversion of ordering, similar to carryless mul,
+3 modes.
also, another possible variant involving swizzle and vec4:
- for i in range(8):
- idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
- RT[i] = (RA.w[i] & (1<<idx)) != 0
-
-| 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
-| -- | -- | --- | ----- | ---- | ----- |--|
-| NN | RT | RA | xyzw | mask | 010 |0 |
-
- for i in range(8):
- idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
- res = (RA.w[i] & (1<<idx)) != 0
- for j in range(4):
- if mask[j]: RT[i+j*8] = res
-
| 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
| -- | -- | --- | ----- | ---- | ----- |--|
-| NN | RT | RA | imm | mask | 010 |1 |
+| NN | RT | RA | imm | mask | 101 |1 |
for i in range(8):
idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
another mode selection would be CRs not Ints.
-| 0.5|6.8 | 9.11|12.14|15.17|18.20| 21..25| 26.29|30|31|
-| -- | -- | --- | --- | --- |-----| ----- | ---- |--|--|
-| NN | BT | BA | BB | BC |im5-7| im0-4 | mask |1 |Rc|
+| 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31|
+| -- | -- | --- | --- |- |-----|----- | -----|--|
+| NN | BA | BB | BC |0 |imm | mask | 101 |0 |
for i in range(4):
if not mask[i] continue
idx = crregs[BA][i] << 2 |
crregs[BB][i] << 1 |
crregs[BC][i]
- crregs[BT][i] = (imm & (1<<idx)) != 0
+ crregs[BA][i] = (imm & (1<<idx)) != 0
# bitmask set
based on RV bitmanip singlebit set, instruction format similar to shift
+[[isa/fixedshift]]. bmext is actually covered already (shift-with-mask rldicl but only immediate version).
+however bitmask-invert is not, and set/clr are not covered, although they can use the same Shift ALU.
+
+bmext (RB) version is not the same as rldicl because bmext is a right shift by RC, where rldicl is a left rotate. for the immediate version this does not matter.
+| 0.5|6.10|11.15|16.20|21.25| 26..30 |31| name |
+| -- | -- | --- | --- | --- | ------- |--| ----- |
+| NN | RT | RA | RB | RC | mode 010 |Rc| bm* |
+| NN | RT | RA | RB | RC | 0 1 111 |Rc| bmrev |
+| NN | | | | | 1 1 111 |Rc| rsvd |
```
uint_xlen_t bmset(RA, RB, sh)
}
```
+bitmask extract with reverse
+
+```
+msb = rb[5:0];
+rev[0:msb] = ra[msb:0];
+rt = ZE(rev[msb:0]);
+```
+
+| 0.5|6.10|11.15|16.20|21.26| 27..30 |31| name |
+| -- | -- | --- | --- | --- | ------- |--| ------ |
+| NN | RT | RA | RB | sh | 0 111 |Rc| bmrevi |
+
+
+
# grev
based on RV bitmanip
# carryless mul
based on RV bitmanip
+see https://en.wikipedia.org/wiki/CLMUL_instruction_set
```
uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
return x;
}
```
+# Galois Field
+
+## Multiply
+
+this requires 3 parameters and a "degree"
+
+ RT = GFMUL(RA, RB, gfdegree, modulo=RC)
+
+realistically with the degree also needing to be an immediate it should be brought down to an overwrite version:
+
+ RS = GFMUL(RS, RA, gfdegree, modulo=RB)
+
+| 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
+| -- | -- | --- | --- | --- | ------- |--|
+| NN | RS | RA | RB | deg | 00 011 |Rc|
+
+where the SimpleV variant may override RS-as-src differently from RS-as-dest
+
+
+
+```
+from functools import reduce
+
+# constants used in the multGF2 function
+mask1 = mask2 = polyred = None
+
+def setGF2(degree, irPoly):
+ """Define parameters of binary finite field GF(2^m)/g(x)
+ - degree: extension degree of binary field
+ - irPoly: coefficients of irreducible polynomial g(x)
+ """
+ def i2P(sInt):
+ """Convert an integer into a polynomial"""
+ return [(sInt >> i) & 1
+ for i in reversed(range(sInt.bit_length()))]
+
+ global mask1, mask2, polyred
+ mask1 = mask2 = 1 << degree
+ mask2 -= 1
+ polyred = reduce(lambda x, y: (x << 1) + y, i2P(irPoly)[1:])
+
+def multGF2(p1, p2):
+ """Multiply two polynomials in GF(2^m)/g(x)"""
+ p = 0
+ while p2:
+ if p2 & 1:
+ p ^= p1
+ p1 <<= 1
+ if p1 & mask1:
+ p1 ^= polyred
+ p2 >>= 1
+ return p & mask2
+
+if __name__ == "__main__":
+
+ # Define binary field GF(2^3)/x^3 + x + 1
+ setGF2(3, 0b1011)
+
+ # Evaluate the product (x^2 + x + 1)(x^2 + 1)
+ print("{:02x}".format(multGF2(0b111, 0b101)))
+
+ # Define binary field GF(2^8)/x^8 + x^4 + x^3 + x + 1
+ # (used in the Advanced Encryption Standard-AES)
+ setGF2(8, 0b100011011)
+
+ # Evaluate the product (x^7)(x^7 + x + 1)
+ print("{:02x}".format(multGF2(0b10000000, 0b10000011)))
+```
+## GF add
+
+ RS = GFADD(RS, RA|0, gfdegree, modulo=RB)
+
+| 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
+| -- | -- | --- | --- | --- | ------- |--|
+| NN | RS | RA | RB | deg | 01 011 |Rc|
-# crc
+## gf invert
```
-uint_xlen_t crc32(uint_xlen_t x, int nbits)
-{
- for (int i = 0; i < nbits; i++)
- x = (x >> 1) ^ (0xEDB88320 & ~((x&1)-1));
- return x;
-}
-uint_xlen_t crc32c(uint_xlen_t x, int nbits)
-{
- for (int i = 0; i < nbits; i++)
- x = (x >> 1) ^ (0x82F63B78 & ~((x&1)-1));
- return x;
-}
-uint_xlen_t crc32_b(uint_xlen_t RA) { return crc32(RA, 8); }
-uint_xlen_t crc32_h(uint_xlen_t RA) { return crc32(RA, 16); }
-uint_xlen_t crc32_w(uint_xlen_t RA) { return crc32(RA, 32); }
-uint_xlen_t crc32c_b(uint_xlen_t RA) { return crc32c(RA, 8); }
-uint_xlen_t crc32c_h(uint_xlen_t RA) { return crc32c(RA, 16); }
-uint_xlen_t crc32c_w(uint_xlen_t RA) { return crc32c(RA, 32); }
-#if XLEN > 32
-uint_xlen_t crc32_d (uint_xlen_t RA) { return crc32 (RA, 64); }
-uint_xlen_t crc32c_d(uint_xlen_t RA) { return crc32c(RA, 64); }
-#endif
+def gf_degree(a) :
+ res = 0
+ a >>= 1
+ while (a != 0) :
+ a >>= 1;
+ res += 1;
+ return res
+
+def gf_invert(a, mod=0x1B) :
+ v = mod
+ g1 = 1
+ g2 = 0
+ j = gf_degree(a) - 8
+
+ while (a != 1) :
+ if (j < 0) :
+ a, v = v, a
+ g1, g2 = g2, g1
+ j = -j
+
+ a ^= v << j
+ g1 ^= g2 << j
+
+ a %= 256 # Emulating 8-bit overflow
+ g1 %= 256 # Emulating 8-bit overflow
+
+ j = gf_degree(a) - gf_degree(v)
+
+ return g1
```
# bitmatrix