| 29.30 |31| name |
| ------ |--| --------- |
-| 00 |Rc| ternlogi |
-| 01 |sz| ternlogv |
-| 10 |0 | crternlog |
+| 0 0 |Rc| ternlogi |
+| 0 1 |sz| ternlogv |
+| 1 iv | | grevlogi |
2nd major opcode for other bitmanip: minor opcode allocation
| 28.30 |31| name |
| ------ |--| --------- |
-| -00 |0 | |
+| -00 |0 | xpermi |
| -00 |1 | grevlog |
-| -01 | | grevlogi |
+| -01 | | crternlog |
| 010 |Rc| bitmask |
| 011 | | gf/cl madd* |
| 110 |Rc| 1/2-op |
TODO: convert all instructions to use RT and not RS
+| 0.5|6.8 | 9.11|12.14|15.17|18.20|21.28 | 29.30|31|name|
+| -- | -- | --- | --- | --- |-----|----- | -----|--|----|
+| NN | BT | BA | BB | BC |m0-2 | imm | 10 |m3|crternlog|
+
| 0.5|6.10|11.15|16.20 |21..25 | 26....30 |31| name |
| -- | -- | --- | --- | ----- | -------- |--| ------ |
| NN | RT | RA |itype/| im0-4 | im5-7 00 |0 | xpermi |
| NN | RT | RA | RB | im0-4 | im5-7 00 |1 | grevlog |
-| NN | RT | RA | s0-4 | im0-4 | im5-7 01 |s5| grevlogi |
+| NN | | | | | ..... 01 |0 | crternlog |
| NN | RT | RA | RB | RC | mode 010 |Rc| bitmask* |
| NN | RS | RA | RB | RC | 00 011 |0 | gfbmadd |
| NN | RS | RA | RB | RC | 00 011 |1 | gfbmaddsub |
| 0.5|6.10|11.15|16.20| 21 | 22.23 | 24....30 |31| name |
| -- | -- | --- | --- | -- | ----- | -------- |--| ---- |
-| NN | RT | RA | RB | 0 | | 0000 110 |Rc| rsvd |
-| NN | RT | RA | RB | 1 | itype | 0000 110 |Rc| xperm |
-| NN | RA | RB | RC | 0 | itype | 0100 110 |Rc| minmax |
-| NN | RA | RB | RC | 1 | 00 | 0100 110 |Rc| av avgadd |
-| NN | RA | RB | RC | 1 | 01 | 0100 110 |Rc| av abs |
-| NN | RA | RB | | 1 | 10 | 0100 110 |Rc| rsvd |
-| NN | RA | RB | | 1 | 11 | 0100 110 |Rc| rsvd |
-| NN | RA | RB | sh | SH | itype | 1000 110 |Rc| bmopsi |
+| NN | RS | me | sh | SH | ME 0 | nn00 110 |Rc| bmopsi |
+| NN | RS | RB | sh | SH | / 0 | nn00 110 |Rc| bmopsi |
| NN | RT | RA | RB | | | 1100 110 |Rc| srsvd |
| NN | RT | RA | RB | 1 | 00 | 0001 110 |Rc| cldiv |
| NN | RT | RA | RB | 1 | 01 | 0001 110 |Rc| clmod |
| NN | RA | RB | RC | 0 | 01 | 0001 110 |Rc| vec sofm |
| NN | RA | RB | RC | 0 | 10 | 0001 110 |Rc| vec sifm |
| NN | RA | RB | RC | 0 | 11 | 0001 110 |Rc| vec cprop |
-| NN | RA | RB | | 0 | | 0101 110 |Rc| rsvd |
+| NN | RT | RA | RB | 1 | itype | 0101 110 |Rc| xperm |
+| NN | RA | RB | RC | 0 | itype | 0101 110 |Rc| minmax |
+| NN | RA | RB | RC | 1 | 00 | 0101 110 |Rc| av avgadd |
+| NN | RA | RB | RC | 1 | 01 | 0101 110 |Rc| av abs |
+| NN | RA | RB | | 1 | 10 | 0101 110 |Rc| rsvd |
+| NN | RA | RB | | 1 | 11 | 0101 110 |Rc| rsvd |
| NN | RA | RB | RC | 0 | 00 | 0010 110 |Rc| gorc |
| NN | RA | RB | sh | SH | 00 | 1010 110 |Rc| gorci |
| NN | RA | RB | RC | 0 | 00 | 0110 110 |Rc| gorcw |
| NN | RA | RB | RC | 0 | 01 | 0110 110 |Rc| grevw |
| NN | RA | RB | sh | 0 | 01 | 1110 110 |Rc| grevwi |
| NN | RA | RB | RC | 1 | 01 | 1110 110 |Rc| bmatxor |
-| NN | RA | RB | RC | 0 | 10 | 0010 110 |Rc| shfl |
-| NN | RA | RB | sh | SH | 10 | 1010 110 |Rc| shfli |
-| NN | RA | RB | RC | 0 | 10 | 0110 110 |Rc| shflw |
-| NN | RA | RB | RC | | 10 | 1110 110 |Rc| rsvd |
+| NN | RA | RB | RC | | 10 | --10 110 |Rc| rsvd |
| NN | RA | RB | RC | 0 | 11 | 1110 110 |Rc| clmulr |
| NN | RA | RB | RC | 1 | 11 | 1110 110 |Rc| clmulh |
| NN | | | | | | --11 110 |Rc| setvl |
## ternlogi
-
| 0.5|6.10|11.15|16.20| 21..28|29.30|31|
| -- | -- | --- | --- | ----- | --- |--|
| NN | RT | RA | RB | im0-7 | 00 |Rc|
| 0.5|6.8 | 9.11|12.14|15.17|18.20|21.28 | 29.30|31|
| -- | -- | --- | --- | --- |-----|----- | -----|--|
-| NN | BT | BA | BB | BC |m0-3 | imm | 10 |m4|
+| NN | BT | BA | BB | BC |m0-2 | imm | 10 |m3|
mask = m0-2,m3
for i in range(4):
| NN | RS | RB | sh | SH | itype | 1000 110 |Rc| bm*i |
```
+def MASK(x, y):
+ if x < y:
+ x = x+1
+ mask_a = ((1 << x) - 1) & ((1 << 64) - 1)
+ mask_b = ((1 << y) - 1) & ((1 << 64) - 1)
+ elif x == y:
+ return 1 << x
+ else:
+ x = x+1
+ mask_a = ((1 << x) - 1) & ((1 << 64) - 1)
+ mask_b = (~((1 << y) - 1)) & ((1 << 64) - 1)
+ return mask_a ^ mask_b
+
+
uint_xlen_t bmset(RS, RB, sh)
{
int shamt = RB & (XLEN - 1);
- mask = (2<<sh)-1;
- return RS | (mask << shamt);
+ return RS | MASK(shamt, sh)
}
uint_xlen_t bmclr(RS, RB, sh)
{
int shamt = RB & (XLEN - 1);
- mask = (2<<sh)-1;
- return RS & ~(mask << shamt);
+ return RS & ~MASK(shamt, sh)
}
uint_xlen_t bminv(RS, RB, sh)
{
int shamt = RB & (XLEN - 1);
- mask = (2<<sh)-1;
- return RS ^ (mask << shamt);
+ return RS ^ MASK(shamt, sh)
}
uint_xlen_t bmext(RS, RB, sh)
}
```
-bitmask extract with reverse. can be done by bitinverting all of RB and getting bits of RB from the opposite end.
+bitmask extract with reverse. can be done by bit-order-inverting all of RB and getting bits of RB from the opposite end.
when RA is zero, no shift occurs. this makes bmextrev useful for
simply reversing all bits of a register.
uint_xlen_t bmextrev(RA, RB, sh)
{
int shamt = XLEN-1;
- if (RA != 0) (GPR(RA) & (XLEN - 1));
+ if (RA != 0) shamt = (GPR(RA) & (XLEN - 1));
shamt = (XLEN-1)-shamt; # shift other end
bra = bitreverse(RB) # swap LSB-MSB
mask = (2<<sh)-1;
# grevlut
generalised reverse combined with a pair of LUT2s and allowing
-zero when RA=0 provides a wide range of instructions
+a constant `0b0101...0101` when RA=0, and an option to invert
+(including when RA=0, giving a constant `0b1010...1010` as the
+initial value) provides a wide range of instructions
and a means to set regular 64 bit patterns in one
32 bit instruction.
the two LUT2s are applied left-half (when not swapping)
and right-half (when swapping) so as to allow a wider
-range of options
+range of options.
+
+<img src="/openpower/sv/grevlut2x2.jpg" width=700 />
+
+* A value of `0b11001010` for the immediate provides
+the functionality of a standard "grev".
+* `0b11101110` provides gorc
grevlut should be arranged so as to produce the constants
needed to put into bext (bitextract) so as in turn to
-be able to emulate x86 pmovmask instructions <https://www.felixcloutier.com/x86/pmovmskb>
+be able to emulate x86 pmovmask instructions <https://www.felixcloutier.com/x86/pmovmskb>.
+This only requires 2 instructions (grevlut, bext).
-<img src="/openpower/sv/grevlut2x2.jpg" width=700 />
+Note that if the mask is required to be placed
+directly into CR Fields (for use as CR Predicate
+masks rather than an integer mask) then sv.ori
+may be used instead, bearing in mind that sv.ori
+is a 64-bit instruction, and `VL` must have been
+set to the required length:
+
+ sv.ori./elwid=8 r10.v, r10.v, 0
+
+The following settings provide the required mask constants:
+
+| RA | RB | imm | iv | result |
+| ------- | ------- | ---------- | -- | ---------- |
+| 0x555.. | 0b10 | 0b01101100 | 0 | 0x111111... |
+| 0x555.. | 0b110 | 0b01101100 | 0 | 0x010101... |
+| 0x555.. | 0b1110 | 0b01101100 | 0 | 0x00010001... |
+| 0x555.. | 0b10 | 0b11000110 | 1 | 0x88888... |
+| 0x555.. | 0b110 | 0b11000110 | 1 | 0x808080... |
+| 0x555.. | 0b1110 | 0b11000110 | 1 | 0x80008000... |
+
+Better diagram showing the correct ordering of shamt (RB). A LUT2
+is applied to all locations marked in red using the first 4
+bits of the immediate, and a separate LUT2 applied to all
+locations in green using the upper 4 bits of the immediate.
+
+<img src="/openpower/sv/grevlut.png" width=700 />
+
+demo code [[openpower/sv/grevlut.py]]
```
lut2(imm, a, b):
step_o[j] = lut2(imm, step_i[j], step_i[j ^ chunk_size])
return step_o
-uint64_t grevlut64(uint64_t RA, uint64_t RB, uint8 imm)
+uint64_t grevlut64(uint64_t RA, uint64_t RB, uint8 imm, bool iv)
{
- uint64_t x = RA;
+ uint64_t x = 0x5555_5555_5555_5555;
+ if (RA != 0) x = GPR(RA);
+ if (iv) x = ~x;
int shamt = RB & 63;
for i in 0 to 6
step = 1<<i
```
+| 0.5|6.10|11.15|16.20 |21..25 | 26....30 |31| name |
+| -- | -- | --- | --- | ----- | -------- |--| ------ |
+| NN | RT | RA | s0-4 | im0-4 | im5-7 1 iv |s5| grevlogi |
+| NN | RT | RA | RB | im0-4 | im5-7 00 |1 | grevlog |
+
+
# grev
based on RV bitmanip, this is also known as a butterfly network. however
```
-# shuffle / unshuffle
-
-based on RV bitmanip
-
-```
-uint32_t shfl32(uint32_t RA, uint32_t RB)
-{
- uint32_t x = RA;
- int shamt = RB & 15;
- if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
- if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
- if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
- if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
- return x;
-}
-uint32_t unshfl32(uint32_t RA, uint32_t RB)
-{
- uint32_t x = RA;
- int shamt = RB & 15;
- if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
- if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
- if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
- if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
- return x;
-}
-
-uint64_t shuffle64_stage(uint64_t src, uint64_t maskL, uint64_t maskR, int N)
-{
- uint64_t x = src & ~(maskL | maskR);
- x |= ((src << N) & maskL) | ((src >> N) & maskR);
- return x;
-}
-uint64_t shfl64(uint64_t RA, uint64_t RB)
-{
- uint64_t x = RA;
- int shamt = RB & 31;
- if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
- 0x00000000ffff0000LL, 16);
- if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
- 0x0000ff000000ff00LL, 8);
- if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
- 0x00f000f000f000f0LL, 4);
- if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
- 0x0c0c0c0c0c0c0c0cLL, 2);
- if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
- 0x2222222222222222LL, 1);
- return x;
-}
-uint64_t unshfl64(uint64_t RA, uint64_t RB)
-{
- uint64_t x = RA;
- int shamt = RB & 31;
- if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
- 0x2222222222222222LL, 1);
- if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
- 0x0c0c0c0c0c0c0c0cLL, 2);
- if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
- 0x00f000f000f000f0LL, 4);
- if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
- 0x0000ff000000ff00LL, 8);
- if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
- 0x00000000ffff0000LL, 16);
- return x;
-}
-```
-
# xperm
based on RV bitmanip.
These are operations on polynomials with coefficients in `GF(2)`, with the
polynomial's coefficients packed into integers with the following algorithm:
-```python
-def pack_poly(poly):
- """`poly` is a list where `poly[i]` is the coefficient for `x ** i`"""
- retval = 0
- for i, v in enumerate(poly):
- retval |= v << i
- return retval
-
-def unpack_poly(v):
- """returns a list `poly`, where `poly[i]` is the coefficient for `x ** i`.
- """
- poly = []
- while v != 0:
- poly.append(v & 1)
- v >>= 1
- return poly
-```
+[[!inline pagenames="openpower/sv/bitmanip/pack_poly.py" raw="true" feeds="no" actions="yes"]]
## Carry-less Multiply Instructions
### `clmul` Carry-less Multiply
-```c
-uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
-{
- uint_xlen_t x = 0;
- for (int i = 0; i < XLEN; i++)
- if ((RB >> i) & 1)
- x ^= RA << i;
- return x;
-}
-```
+[[!inline pagenames="openpower/sv/bitmanip/clmul.py" raw="true" feeds="no" actions="yes"]]
### `clmulh` Carry-less Multiply High
-```c
-uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)
-{
- uint_xlen_t x = 0;
- for (int i = 1; i < XLEN; i++)
- if ((RB >> i) & 1)
- x ^= RA >> (XLEN-i);
- return x;
-}
-```
+[[!inline pagenames="openpower/sv/bitmanip/clmulh.py" raw="true" feeds="no" actions="yes"]]
### `clmulr` Carry-less Multiply (Reversed)
Useful for CRCs. Equivalent to bit-reversing the result of `clmul` on
bit-reversed inputs.
-```c
-uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)
-{
- uint_xlen_t x = 0;
- for (int i = 0; i < XLEN; i++)
- if ((RB >> i) & 1)
- x ^= RA >> (XLEN-i-1);
- return x;
-}
-```
+[[!inline pagenames="openpower/sv/bitmanip/clmulr.py" raw="true" feeds="no" actions="yes"]]
## `clmadd` Carry-less Multiply-Add
(RS) = temp
```
+## `cldivrem` Carry-less Division and Remainder
+
+`cldivrem` isn't an actual instruction, but is just used in the pseudo-code
+for other instructions.
+
+[[!inline pagenames="openpower/sv/bitmanip/cldivrem.py" raw="true" feeds="no" actions="yes"]]
+
## `cldiv` Carry-less Division
```
cldiv RT, RA, RB
```
-TODO: decide what happens on division by zero
-
```
-(RT) = cldiv((RA), (RB))
+n = (RA)
+d = (RB)
+q, r = cldivrem(n, d, width=XLEN)
+(RT) = q
```
## `clrem` Carry-less Remainder
clrem RT, RA, RB
```
-TODO: decide what happens on division by zero
-
```
-(RT) = clrem((RA), (RB))
+n = (RA)
+d = (RB)
+q, r = cldivrem(n, d, width=XLEN)
+(RT) = r
```
# Instructions for Binary Galois Fields `GF(2^m)`