(no commit message)

[libreriscv.git] / openpower / sv / bitmanip.mdwn
diff --git a/openpower/sv/bitmanip.mdwn b/openpower/sv/bitmanip.mdwn

index b165dd7ff8fa556bbaf0e92aa74da8365f6795cd..fb95b2dbfc67cc6a709b8f2a0e433bdc1a67fa18 100644 (file)
--- a/openpower/sv/bitmanip.mdwn
+++ b/openpower/sv/bitmanip.mdwn
@@ -39,17 +39,17 @@ ternlog has its own major opcode
  
  |  29.30 |31| name      |
  | ------ |--| --------- |
-|   00   |Rc| ternlogi  |
-|   01   |sz| ternlogv  |
-|   10   |0 | crternlog |
+|   0  0   |Rc| ternlogi  |
+|   0  1   |sz| ternlogv  |
+|   1 iv   |  | grevlogi |
  
  2nd major opcode for other bitmanip: minor opcode allocation
  
  |  28.30 |31| name      |
  | ------ |--| --------- |
-|  -00   |0 |           |
+|  -00   |0 | xpermi    |
  |  -00   |1 | grevlog   |
-|  -01   |  | grevlogi  |
+|  -01   |  | crternlog  |
  |  010   |Rc| bitmask   |
  |  011   |  | gf/cl madd*  |
  |  110   |Rc| 1/2-op    |
@@ -87,11 +87,15 @@ ternlog has its own major opcode
  
  TODO: convert all instructions to use RT and not RS
  
+| 0.5|6.8 | 9.11|12.14|15.17|18.20|21.28 | 29.30|31|name|
+| -- | -- | --- | --- | --- |-----|----- | -----|--|----|
+| NN | BT | BA  | BB  | BC  |m0-2 | imm  |  10  |m3|crternlog|
+
  | 0.5|6.10|11.15|16.20 |21..25   | 26....30  |31| name |
  | -- | -- | --- | ---  | -----   | --------  |--| ------ |
  | NN | RT | RA  |itype/| im0-4   | im5-7  00 |0 | xpermi  |
  | NN | RT | RA  | RB   | im0-4   | im5-7  00 |1 | grevlog |
-| NN | RT | RA  | s0-4 | im0-4   | im5-7  01 |s5| grevlogi |
+| NN |    |     |      |         | .....  01 |0 | crternlog |
  | NN | RT | RA  | RB   | RC      | mode  010 |Rc| bitmask* |
  | NN | RS | RA  | RB   | RC      | 00    011 |0 | gfbmadd |
  | NN | RS | RA  | RB   | RC      | 00    011 |1 | gfbmaddsub |
@@ -110,14 +114,8 @@ double check that instructions didn't need 3 inputs.
  
  | 0.5|6.10|11.15|16.20| 21 | 22.23 | 24....30 |31| name |
  | -- | -- | --- | --- | -- | ----- | -------- |--| ---- |
-| NN | RT | RA  | RB  | 0  |       | 0000 110 |Rc| rsvd   |
-| NN | RT | RA  | RB  | 1  | itype | 0000 110 |Rc| xperm |
-| NN | RA | RB  | RC  | 0  | itype | 0100 110 |Rc| minmax |
-| NN | RA | RB  | RC  | 1  |   00  | 0100 110 |Rc| av avgadd |
-| NN | RA | RB  | RC  | 1  |   01  | 0100 110 |Rc| av abs |
-| NN | RA | RB  |     | 1  |   10  | 0100 110 |Rc| rsvd |
-| NN | RA | RB  |     | 1  |   11  | 0100 110 |Rc| rsvd |
-| NN | RA | RB  | sh  | SH | itype | 1000 110 |Rc| bmopsi |
+| NN | RS | me  | sh  | SH | ME 0  | nn00 110 |Rc| bmopsi |
+| NN | RS | RB  | sh  | SH | /   0 | nn00 110 |Rc| bmopsi |
  | NN | RT | RA  | RB  |    |       | 1100 110 |Rc| srsvd |
  | NN | RT | RA  | RB  | 1  |  00   | 0001 110 |Rc| cldiv |
  | NN | RT | RA  | RB  | 1  |  01   | 0001 110 |Rc| clmod |
@@ -127,7 +125,12 @@ double check that instructions didn't need 3 inputs.
  | NN | RA | RB  | RC  | 0  |   01  | 0001 110 |Rc| vec sofm |
  | NN | RA | RB  | RC  | 0  |   10  | 0001 110 |Rc| vec sifm |
  | NN | RA | RB  | RC  | 0  |   11  | 0001 110 |Rc| vec cprop |
-| NN | RA | RB  |     | 0  |       | 0101 110 |Rc| rsvd |
+| NN | RT | RA  | RB  | 1  | itype | 0101 110 |Rc| xperm |
+| NN | RA | RB  | RC  | 0  | itype | 0101 110 |Rc| minmax |
+| NN | RA | RB  | RC  | 1  |   00  | 0101 110 |Rc| av avgadd |
+| NN | RA | RB  | RC  | 1  |   01  | 0101 110 |Rc| av abs |
+| NN | RA | RB  |     | 1  |   10  | 0101 110 |Rc| rsvd |
+| NN | RA | RB  |     | 1  |   11  | 0101 110 |Rc| rsvd |
  | NN | RA | RB  | RC  | 0  | 00    | 0010 110 |Rc| gorc |
  | NN | RA | RB  | sh  | SH | 00    | 1010 110 |Rc| gorci |
  | NN | RA | RB  | RC  | 0  | 00    | 0110 110 |Rc| gorcw |
@@ -139,10 +142,7 @@ double check that instructions didn't need 3 inputs.
  | NN | RA | RB  | RC  | 0  | 01    | 0110 110 |Rc| grevw |
  | NN | RA | RB  | sh  | 0  | 01    | 1110 110 |Rc| grevwi |
  | NN | RA | RB  | RC  | 1  | 01    | 1110 110 |Rc| bmatxor   |
-| NN | RA | RB  | RC  | 0  | 10    | 0010 110 |Rc| shfl |
-| NN | RA | RB  | sh  | SH | 10    | 1010 110 |Rc| shfli |
-| NN | RA | RB  | RC  | 0  | 10    | 0110 110 |Rc| shflw |
-| NN | RA | RB  | RC  |    | 10    | 1110 110 |Rc| rsvd    |
+| NN | RA | RB  | RC  |    | 10    | --10 110 |Rc| rsvd  |
  | NN | RA | RB  | RC  | 0  | 11    | 1110 110 |Rc| clmulr  |
  | NN | RA | RB  | RC  | 1  | 11    | 1110 110 |Rc| clmulh  |
  | NN |    |     |     |    |       | --11 110 |Rc| setvl  |
@@ -155,7 +155,6 @@ Like the x86 AVX512F [vpternlogd/vpternlogq](https://www.felixcloutier.com/x86/v
  
  ## ternlogi
  
-
  | 0.5|6.10|11.15|16.20| 21..28|29.30|31|
  | -- | -- | --- | --- | ----- | --- |--|
  | NN | RT | RA  | RB  | im0-7 |  00 |Rc|
@@ -202,7 +201,7 @@ another mode selection would be CRs not Ints.
  
  | 0.5|6.8 | 9.11|12.14|15.17|18.20|21.28 | 29.30|31|
  | -- | -- | --- | --- | --- |-----|----- | -----|--|
-| NN | BT | BA  | BB  | BC  |m0-3 | imm  |  10  |m4|
+| NN | BT | BA  | BB  | BC  |m0-2 | imm  |  10  |m3|
  
      mask = m0-2,m3
      for i in range(4):
@@ -270,25 +269,36 @@ Immediate-variant is an overwrite form:
  | NN | RS | RB  | sh  | SH | itype | 1000 110 |Rc| bm*i |
  
  ```
+def MASK(x, y):
+     if x < y:
+         x = x+1
+         mask_a = ((1 << x) - 1) & ((1 << 64) - 1)
+         mask_b = ((1 << y) - 1) & ((1 << 64) - 1)
+     elif x == y:
+         return 1 << x
+     else:
+         x = x+1
+         mask_a = ((1 << x) - 1) & ((1 << 64) - 1)
+         mask_b = (~((1 << y) - 1)) & ((1 << 64) - 1)
+     return mask_a ^ mask_b
+
+
  uint_xlen_t bmset(RS, RB, sh)
  {
      int shamt = RB & (XLEN - 1);
-    mask = (2<<sh)-1;
-    return RS | (mask << shamt);
+    return RS | MASK(shamt, sh)
  }
  
  uint_xlen_t bmclr(RS, RB, sh)
  {
      int shamt = RB & (XLEN - 1);
-    mask = (2<<sh)-1;
-    return RS & ~(mask << shamt);
+    return RS & ~MASK(shamt, sh)
  }
  
  uint_xlen_t bminv(RS, RB, sh)
  {
      int shamt = RB & (XLEN - 1);
-    mask = (2<<sh)-1;
-    return RS ^ (mask << shamt);
+    return RS ^ MASK(shamt, sh)
  }
  
  uint_xlen_t bmext(RS, RB, sh)
@@ -299,7 +309,7 @@ uint_xlen_t bmext(RS, RB, sh)
  }
  ```
  
-bitmask extract with reverse.  can be done by bitinverting all of RB and getting bits of RB from the opposite end.
+bitmask extract with reverse.  can be done by bit-order-inverting all of RB and getting bits of RB from the opposite end.
  
  when RA is zero, no shift occurs. this makes bmextrev useful for
  simply reversing all bits of a register.
@@ -312,7 +322,7 @@ rt = ZE(rev[msb:0]);
  uint_xlen_t bmextrev(RA, RB, sh)
  {
      int shamt = XLEN-1;
-    if (RA != 0) (GPR(RA) & (XLEN - 1));
+    if (RA != 0) shamt = (GPR(RA) & (XLEN - 1));
      shamt = (XLEN-1)-shamt;  # shift other end
      bra = bitreverse(RB)     # swap LSB-MSB
      mask = (2<<sh)-1;
@@ -328,19 +338,55 @@ uint_xlen_t bmextrev(RA, RB, sh)
  # grevlut
  
  generalised reverse combined with a pair of LUT2s and allowing
-zero when RA=0 provides a wide range of instructions
+a constant `0b0101...0101` when RA=0, and an option to invert
+(including when RA=0, giving a constant `0b1010...1010` as the
+initial value) provides a wide range of instructions
  and a means to set regular 64 bit patterns in one
  32 bit instruction.
  
  the two LUT2s are applied left-half (when not swapping)
  and right-half (when swapping) so as to allow a wider
-range of options
+range of options.
+
+<img src="/openpower/sv/grevlut2x2.jpg" width=700 />
+
+* A value of `0b11001010` for the immediate provides
+the functionality of a standard "grev".  
+* `0b11101110` provides gorc
  
  grevlut should be arranged so as to produce the constants
  needed to put into bext (bitextract) so as in turn to
-be able to emulate x86 pmovmask instructions <https://www.felixcloutier.com/x86/pmovmskb>
+be able to emulate x86 pmovmask instructions <https://www.felixcloutier.com/x86/pmovmskb>.
+This only requires 2 instructions (grevlut, bext).
  
-<img src="/openpower/sv/grevlut2x2.jpg" width=700 />
+Note that if the mask is required to be placed
+directly into CR Fields (for use as CR Predicate
+masks rather than an integer mask) then sv.ori
+may be used instead, bearing in mind that sv.ori
+is a 64-bit instruction, and `VL` must have been
+set to the required length:
+
+    sv.ori./elwid=8 r10.v, r10.v, 0
+
+The following settings provide the required mask constants:
+
+| RA       | RB      | imm        | iv | result        |
+| -------  | ------- | ---------- | -- | ----------    |
+| 0x555..  | 0b10    | 0b01101100 | 0  | 0x111111...   |
+| 0x555..  | 0b110   | 0b01101100 | 0  | 0x010101...   |
+| 0x555..  | 0b1110  | 0b01101100 | 0  | 0x00010001...   |
+| 0x555..  | 0b10    | 0b11000110 | 1  | 0x88888...   |
+| 0x555..  | 0b110   | 0b11000110 | 1  | 0x808080...   |
+| 0x555..  | 0b1110  | 0b11000110 | 1  | 0x80008000...   |
+
+A better diagram, below, shows the correct ordering of shamt (RB).  A LUT2
+is applied to all locations marked in red using the first 4
+bits of the immediate, and a separate LUT2 applied to all
+locations in green using the upper 4 bits of the immediate.
+
+<img src="/openpower/sv/grevlut.png" width=700 />
+
+demo code [[openpower/sv/grevlut.py]]
  
  ```
  lut2(imm, a, b):
@@ -356,9 +402,11 @@ dorow(imm8, step_i, chunksize):
          step_o[j] = lut2(imm, step_i[j], step_i[j ^ chunk_size])
      return step_o
  
-uint64_t grevlut64(uint64_t RA, uint64_t RB, uint8 imm)
+uint64_t grevlut64(uint64_t RA, uint64_t RB, uint8 imm, bool iv)
  {
-    uint64_t x = RA;
+    uint64_t x = 0x5555_5555_5555_5555;
+    if (RA != 0) x = GPR(RA);
+    if (iv) x = ~x;
      int shamt = RB & 63;
      for i in 0 to 6
          step = 1<<i
@@ -368,6 +416,12 @@ uint64_t grevlut64(uint64_t RA, uint64_t RB, uint8 imm)
  
  ```
  
+| 0.5|6.10|11.15|16.20 |21..25   | 26....30    |31| name |
+| -- | -- | --- | ---  | -----   | --------    |--| ------ |
+| NN | RT | RA  | s0-4 | im0-4   | im5-7  1 iv |s5| grevlogi |
+| NN | RT | RA  | RB   | im0-4   | im5-7  00   |1 | grevlog |
+
+
  # grev
  
  based on RV bitmanip, this is also known as a butterfly network. however
@@ -400,72 +454,6 @@ uint64_t grev64(uint64_t RA, uint64_t RB)
  
  ```
  
-# shuffle / unshuffle
-
-based on RV bitmanip
-
-```
-uint32_t shfl32(uint32_t RA, uint32_t RB)
-{
-    uint32_t x = RA;
-    int shamt = RB & 15;
-    if (shamt & 8) x  = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
-    if (shamt & 4) x  = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
-    if (shamt & 2) x  = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
-    if (shamt & 1) x  = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
-    return x;
-}
-uint32_t unshfl32(uint32_t RA, uint32_t RB)
-{
-    uint32_t x = RA;
-    int shamt = RB & 15;
-    if (shamt & 1) x  = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
-    if (shamt & 2) x  = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
-    if (shamt & 4) x  = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
-    if (shamt & 8) x  = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
-    return x;
-}
-
-uint64_t shuffle64_stage(uint64_t src, uint64_t maskL, uint64_t maskR, int N)
-{
-    uint64_t x = src & ~(maskL | maskR);
-    x |= ((src << N) & maskL) | ((src >> N) & maskR);
-    return x;
-}
-uint64_t shfl64(uint64_t RA, uint64_t RB)
-{
-    uint64_t x = RA;
-    int shamt = RB & 31;
-    if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
-                                           0x00000000ffff0000LL, 16);
-    if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
-                                           0x0000ff000000ff00LL, 8);
-    if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
-                                           0x00f000f000f000f0LL, 4);
-    if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
-                                           0x0c0c0c0c0c0c0c0cLL, 2);
-    if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
-                                           0x2222222222222222LL, 1);
-    return x;
-}
-uint64_t unshfl64(uint64_t RA, uint64_t RB)
-{
-    uint64_t x = RA;
-    int shamt = RB & 31;
-    if (shamt &  1) x = shuffle64_stage(x, 0x4444444444444444LL,
-                                           0x2222222222222222LL, 1);
-    if (shamt &  2) x = shuffle64_stage(x, 0x3030303030303030LL,
-                                           0x0c0c0c0c0c0c0c0cLL, 2);
-    if (shamt &  4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
-                                           0x00f000f000f000f0LL, 4);
-    if (shamt &  8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
-                                           0x0000ff000000ff00LL, 8);
-    if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
-                                           0x00000000ffff0000LL, 16);
-    return x;
-}
-```
-
  # xperm
  
  based on RV bitmanip.
@@ -555,23 +543,7 @@ instruction is not provided since the `xor[i]` instruction can be used instead.
  These are operations on polynomials with coefficients in `GF(2)`, with the
  polynomial's coefficients packed into integers with the following algorithm:
  
-```python
-def pack_poly(poly):
-    """`poly` is a list where `poly[i]` is the coefficient for `x ** i`"""
-    retval = 0
-    for i, v in enumerate(poly):
-        retval |= v << i
-    return retval
-
-def unpack_poly(v):
-    """returns a list `poly`, where `poly[i]` is the coefficient for `x ** i`.
-    """
-    poly = []
-    while v != 0:
-        poly.append(v & 1)
-        v >>= 1
-    return poly
-```
+[[!inline pagenames="openpower/sv/bitmanip/pack_poly.py" raw="true" feeds="no" actions="yes"]]
  
  ## Carry-less Multiply Instructions
  
@@ -585,45 +557,18 @@ They are worth adding as their own non-overwrite operations
  
  ### `clmul` Carry-less Multiply
  
-```c
-uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
-{
-    uint_xlen_t x = 0;
-    for (int i = 0; i < XLEN; i++)
-        if ((RB >> i) & 1)
-            x ^= RA << i;
-    return x;
-}
-```
+[[!inline pagenames="openpower/sv/bitmanip/clmul.py" raw="true" feeds="no" actions="yes"]]
  
  ### `clmulh` Carry-less Multiply High
  
-```c
-uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)
-{
-    uint_xlen_t x = 0;
-    for (int i = 1; i < XLEN; i++)
-        if ((RB >> i) & 1)
-            x ^= RA >> (XLEN-i);
-    return x;
-}
-```
+[[!inline pagenames="openpower/sv/bitmanip/clmulh.py" raw="true" feeds="no" actions="yes"]]
  
  ### `clmulr` Carry-less Multiply (Reversed)
  
  Useful for CRCs. Equivalent to bit-reversing the result of `clmul` on
  bit-reversed inputs.
  
-```c
-uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)
-{
-    uint_xlen_t x = 0;
-    for (int i = 0; i < XLEN; i++)
-        if ((RB >> i) & 1)
-            x ^= RA >> (XLEN-i-1);
-    return x;
-}
-```
+[[!inline pagenames="openpower/sv/bitmanip/clmulr.py" raw="true" feeds="no" actions="yes"]]
  
  ## `clmadd` Carry-less Multiply-Add
  
@@ -649,16 +594,24 @@ temp = clmul((RA), (RB)) ^ (RC)
  (RS) = temp
  ```
  
+## `cldivrem` Carry-less Division and Remainder
+
+`cldivrem` isn't an actual instruction, but is just used in the pseudo-code
+for other instructions.
+
+[[!inline pagenames="openpower/sv/bitmanip/cldivrem.py" raw="true" feeds="no" actions="yes"]]
+
  ## `cldiv` Carry-less Division
  
  ```
  cldiv RT, RA, RB
  ```
  
-TODO: decide what happens on division by zero
-
  ```
-(RT) = cldiv((RA), (RB))
+n = (RA)
+d = (RB)
+q, r = cldivrem(n, d, width=XLEN)
+(RT) = q
  ```
  
  ## `clrem` Carry-less Remainder
@@ -667,10 +620,11 @@ TODO: decide what happens on division by zero
  clrem RT, RA, RB
  ```
  
-TODO: decide what happens on division by zero
-
  ```
-(RT) = clrem((RA), (RB))
+n = (RA)
+d = (RB)
+q, r = cldivrem(n, d, width=XLEN)
+(RT) = r
  ```
  
  # Instructions for Binary Galois Fields `GF(2^m)`