(no commit message)

[libreriscv.git] / openpower / sv / bitmanip.mdwn
diff --git a/openpower/sv/bitmanip.mdwn b/openpower/sv/bitmanip.mdwn

index c023f6cec6a717b098c66f01a3fe608aaa921cf1..ea18797f8437d5462cb1f239e20732b7e359b0ac 100644 (file)
--- a/openpower/sv/bitmanip.mdwn
+++ b/openpower/sv/bitmanip.mdwn
@@ -4,8 +4,6 @@
  
  * ternlogi <https://bugs.libre-soc.org/show_bug.cgi?id=745>
  * grev <https://bugs.libre-soc.org/show_bug.cgi?id=755>
-* remove Rc=1 from ternlog due to conflicts in encoding as well
-  as saving space <https://bugs.libre-soc.org/show_bug.cgi?id=753#c5>
  * GF2^M <https://bugs.libre-soc.org/show_bug.cgi?id=782>
  
  # bitmanipulation
@@ -39,18 +37,17 @@ ternlog has its own major opcode
  
  |  29.30 |31| name      |
  | ------ |--| --------- |
-|   00   |Rc| ternlogi  |
-|   01   |0 | ternlog   |
-|   01   |1 | ternlogv  |
-|   10   |0 | crternlog |
+|   0  0   |Rc| ternlogi  |
+|   0  1   |sz| ternlogv  |
+|   1 iv   |  | grevlogi |
  
  2nd major opcode for other bitmanip: minor opcode allocation
  
  |  28.30 |31| name      |
  | ------ |--| --------- |
-|  -00   |0 |           |
+|  -00   |0 | xpermi    |
  |  -00   |1 | grevlog   |
-|  -01   |  | grevlogi  |
+|  -01   |  | crternlog  |
  |  010   |Rc| bitmask   |
  |  011   |  | gf/cl madd*  |
  |  110   |Rc| 1/2-op    |
@@ -88,11 +85,15 @@ ternlog has its own major opcode
  
  TODO: convert all instructions to use RT and not RS
  
+| 0.5|6.8 | 9.11|12.14|15.17|18.20|21.28 | 29.30|31|name|
+| -- | -- | --- | --- | --- |-----|----- | -----|--|----|
+| NN | BT | BA  | BB  | BC  |m0-2 | imm  |  10  |m3|crternlog|
+
  | 0.5|6.10|11.15|16.20 |21..25   | 26....30  |31| name |
  | -- | -- | --- | ---  | -----   | --------  |--| ------ |
-| NN | RT | RA  | RB   |         |        00 |0 | rsvd   |
+| NN | RT | RA  |itype/| im0-4   | im5-7  00 |0 | xpermi  |
  | NN | RT | RA  | RB   | im0-4   | im5-7  00 |1 | grevlog |
-| NN | RT | RA  | s0-4 | im0-4   | im5-7  01 |s5| grevlogi |
+| NN |    |     |      |         | .....  01 |0 | crternlog |
  | NN | RT | RA  | RB   | RC      | mode  010 |Rc| bitmask* |
  | NN | RS | RA  | RB   | RC      | 00    011 |0 | gfbmadd |
  | NN | RS | RA  | RB   | RC      | 00    011 |1 | gfbmaddsub |
@@ -104,22 +105,16 @@ TODO: convert all instructions to use RT and not RS
  | NN | RT | RA  | RB   | sh0-4   | sh5 1 111 |Rc| bmrevi |
  
  ops (note that av avg and abs as well as vec scalar mask
-are included here)
+are included here: see [[sv/vector_ops]] and
+[[sv/av_opcodes]])
  
  TODO: convert from RA, RB, and RC to correct field names of RT, RA, and RB, and
  double check that instructions didn't need 3 inputs.
  
  | 0.5|6.10|11.15|16.20| 21 | 22.23 | 24....30 |31| name |
  | -- | -- | --- | --- | -- | ----- | -------- |--| ---- |
-| NN | RA | RB  |     | 0  |       | 0000 110 |Rc| rsvd   |
-| NN | RA | RB  | RC  | 1  | itype | 0000 110 |Rc| xperm |
-| NN | RA | RB  | RC  | 0  | itype | 0100 110 |Rc| minmax |
-| NN | RA | RB  | RC  | 1  |   00  | 0100 110 |Rc| av avgadd |
-| NN | RA | RB  | RC  | 1  |   01  | 0100 110 |Rc| av abs |
-| NN | RA | RB  |     | 1  |   10  | 0100 110 |Rc| rsvd |
-| NN | RA | RB  |     | 1  |   11  | 0100 110 |Rc| rsvd |
-| NN | RA | RB  | sh  | SH | itype | 1000 110 |Rc| bmopsi |
-| NN | RA | RB  |     |    |       | 1100 110 |Rc| rsvd |
+| NN | RS | me  | sh  | SH | ME 0  | nn00 110 |Rc| bmopsi |
+| NN | RS | RB  | sh  | SH | /   1 | nn00 110 |Rc| bmopsi |
  | NN | RT | RA  | RB  | 1  |  00   | 0001 110 |Rc| cldiv |
  | NN | RT | RA  | RB  | 1  |  01   | 0001 110 |Rc| clmod |
  | NN | RT | RA  | RB  | 1  |  10   | 0001 110 |Rc|       |
@@ -128,7 +123,14 @@ double check that instructions didn't need 3 inputs.
  | NN | RA | RB  | RC  | 0  |   01  | 0001 110 |Rc| vec sofm |
  | NN | RA | RB  | RC  | 0  |   10  | 0001 110 |Rc| vec sifm |
  | NN | RA | RB  | RC  | 0  |   11  | 0001 110 |Rc| vec cprop |
-| NN | RA | RB  |     | 0  |       | 0101 110 |Rc| rsvd |
+| NN | RT | RA  | RB  | 1  | itype | 0101 110 |Rc| xperm |
+| NN | RA | RB  | RC  | 0  | itype | 0101 110 |Rc| minmax |
+| NN | RA | RB  | RC  | 1  |   00  | 0101 110 |Rc| av avgadds |
+| NN | RA | RB  | RC  | 1  |   01  | 0101 110 |Rc| av avgaddu|
+| NN | RA | RB  |     | 1  |   10  | 0101 110 |Rc| avg abs |
+| NN | RA | RB  |     | 1  |   11  | 0101 110 |Rc| rsvd |
+| NN | RA | RB  |     |    |       | 1001 110 |Rc| rsvd |
+| NN | RA | RB  |     |    |       | 1101 110 |Rc| rsvd |
  | NN | RA | RB  | RC  | 0  | 00    | 0010 110 |Rc| gorc |
  | NN | RA | RB  | sh  | SH | 00    | 1010 110 |Rc| gorci |
  | NN | RA | RB  | RC  | 0  | 00    | 0110 110 |Rc| gorcw |
@@ -140,10 +142,7 @@ double check that instructions didn't need 3 inputs.
  | NN | RA | RB  | RC  | 0  | 01    | 0110 110 |Rc| grevw |
  | NN | RA | RB  | sh  | 0  | 01    | 1110 110 |Rc| grevwi |
  | NN | RA | RB  | RC  | 1  | 01    | 1110 110 |Rc| bmatxor   |
-| NN | RA | RB  | RC  | 0  | 10    | 0010 110 |Rc| shfl |
-| NN | RA | RB  | sh  | SH | 10    | 1010 110 |Rc| shfli |
-| NN | RA | RB  | RC  | 0  | 10    | 0110 110 |Rc| shflw |
-| NN | RA | RB  | RC  |    | 10    | 1110 110 |Rc| rsvd    |
+| NN | RA | RB  | RC  |    | 10    | --10 110 |Rc| rsvd  |
  | NN | RA | RB  | RC  | 0  | 11    | 1110 110 |Rc| clmulr  |
  | NN | RA | RB  | RC  | 1  | 11    | 1110 110 |Rc| clmulh  |
  | NN |    |     |     |    |       | --11 110 |Rc| setvl  |
@@ -156,11 +155,9 @@ Like the x86 AVX512F [vpternlogd/vpternlogq](https://www.felixcloutier.com/x86/v
  
  ## ternlogi
  
-TODO: if/when we get more encoding space, add Rc=1 option back to ternlogi, for consistency with OpenPower base logical instructions (and./xor./or./etc.). <https://bugs.libre-soc.org/show_bug.cgi?id=745#c56>
-
-| 0.5|6.10|11.15|16.20| 21..25| 26..30   |31|
-| -- | -- | --- | --- | ----- | -------- |--|
-| NN | RT | RA  | RB  | im0-4 | im5-7 00 |Rc|
+| 0.5|6.10|11.15|16.20| 21..28|29.30|31|
+| -- | -- | --- | --- | ----- | --- |--|
+| NN | RT | RA  | RB  | im0-7 |  00 |Rc|
  
      lut3(imm, a, b, c):
          idx = c << 2 | b << 1 | a
@@ -169,30 +166,11 @@ TODO: if/when we get more encoding space, add Rc=1 option back to ternlogi, for
      for i in range(64): 
          RT[i] = lut3(imm, RB[i], RA[i], RT[i]) 
  
-bits 21..22 may be used to specify a mode, such as treating the whole integer zero/nonzero and putting 1/0 in the result, rather than bitwise test.
-
-## ternlog
-
-a 5 operand variant which becomes more along the lines of an FPGA,
-this is very expensive: 4 in and 1 out  and is not recommended.
-
-| 0.5|6.10|11.15|16.20|21.25| 26...30  |31|
-| -- | -- | --- | --- | --- | -------- |--|
-| NN | RT | RA  | RB  | RC  | mode  01 |1 |
-
-    for i in range(64):
-        j = (i//8)*8 # 0,8,16,24,..,56
-        lookup = RC[j:j+8]
-        RT[i] = lut3(lookup, RT[i], RA[i], RB[i])
-
-mode (3 bit) may be used to do inversion of ordering, similar to carryless mul,
-3 modes.
-
  ## ternlogv
  
  also, another possible variant involving swizzle-like selection
-and masking, this only requires 2 64 bit registers (RA, RS) and
-only up to 16 LUT3s.
+and masking, this only requires 3 64 bit registers (RA, RS, RB) and
+only 16 LUT3s.
  
  Note however that unless XLEN matches sz, this instruction
  is a Read-Modify-Write: RS must be read as a second operand
@@ -200,23 +178,22 @@ and all unmodified bits preserved.  SVP64 may provide limited
  alternative destination for RS from RS-as-source, but again
  all unmodified bits must still be copied.
  
-| 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
-| -- | -- | --- | ----- | ---- | ----- |--|
-| NN | RS | RA  | idx0-3| mask | sz 01   |0 |
+| 0.5|6.10|11.15|16.20|21.28 | 29.30 |31|
+| -- | -- | --- | --- | ---- | ----- |--|
+| NN | RS | RA  | RB  |idx0-3|  01   |sz|
  
      SZ = (1+sz) * 8 # 8 or 16
      raoff = MIN(XLEN, idx0 * SZ)
      rboff = MIN(XLEN, idx1 * SZ)
      rcoff = MIN(XLEN, idx2 * SZ)
-    imoff = MIN(XLEN, idx3 * SZ)
-    imm = RA[imoff:imoff+SZ]
+    rsoff = MIN(XLEN, idx3 * SZ)
+    imm = RB[0:8]
      for i in range(MIN(XLEN, SZ)):
          ra = RA[raoff+i]
          rb = RA[rboff+i]
          rc = RA[rcoff+i]
          res = lut3(imm, ra, rb, rc)
-        for j in range(MIN(XLEN//8, 4)):
-             if mask[j]: RS[i+j*SZ] = res
+        RS[rsoff+i] = res
  
  ## ternlogcr
  
@@ -224,7 +201,7 @@ another mode selection would be CRs not Ints.
  
  | 0.5|6.8 | 9.11|12.14|15.17|18.20|21.28 | 29.30|31|
  | -- | -- | --- | --- | --- |-----|----- | -----|--|
-| NN | BT | BA  | BB  | BC  |m0-3 | imm  |  10  |m4|
+| NN | BT | BA  | BB  | BC  |m0-2 | imm  |  10  |m3|
  
      mask = m0-2,m3
      for i in range(4):
@@ -292,6 +269,20 @@ Immediate-variant is an overwrite form:
  | NN | RS | RB  | sh  | SH | itype | 1000 110 |Rc| bm*i |
  
  ```
+def MASK(x, y):
+     if x < y:
+         x = x+1
+         mask_a = ((1 << x) - 1) & ((1 << 64) - 1)
+         mask_b = ((1 << y) - 1) & ((1 << 64) - 1)
+     elif x == y:
+         return 1 << x
+     else:
+         x = x+1
+         mask_a = ((1 << x) - 1) & ((1 << 64) - 1)
+         mask_b = (~((1 << y) - 1)) & ((1 << 64) - 1)
+     return mask_a ^ mask_b
+
+
  uint_xlen_t bmset(RS, RB, sh)
  {
      int shamt = RB & (XLEN - 1);
@@ -321,18 +312,22 @@ uint_xlen_t bmext(RS, RB, sh)
  }
  ```
  
-bitmask extract with reverse.  can be done by bitinverting all of RA and getting bits of RA from the opposite end.
+bitmask extract with reverse.  can be done by bit-order-inverting all of RB and getting bits of RB from the opposite end.
+
+when RA is zero, no shift occurs. this makes bmextrev useful for
+simply reversing all bits of a register.
  
  ```
-msb = rb[5:0];
-rev[0:msb] = ra[msb:0];
+msb = ra[5:0];
+rev[0:msb] = rb[msb:0];
  rt = ZE(rev[msb:0]);
  
  uint_xlen_t bmextrev(RA, RB, sh)
  {
-    int shamt = (RB & (XLEN - 1));
+    int shamt = XLEN-1;
+    if (RA != 0) shamt = (GPR(RA) & (XLEN - 1));
      shamt = (XLEN-1)-shamt;  # shift other end
-    bra = bitreverse(RA)     # swap LSB-MSB
+    bra = bitreverse(RB)     # swap LSB-MSB
      mask = (2<<sh)-1;
      return mask & (bra >> shamt);
  }
@@ -346,19 +341,55 @@ uint_xlen_t bmextrev(RA, RB, sh)
  # grevlut
  
  generalised reverse combined with a pair of LUT2s and allowing
-zero when RA=0 provides a wide range of instructions
+a constant `0b0101...0101` when RA=0, and an option to invert
+(including when RA=0, giving a constant 0b1010...1010 as the
+initial value) provides a wide range of instructions
  and a means to set regular 64 bit patterns in one
  32 bit instruction.
  
  the two LUT2s are applied left-half (when not swapping)
  and right-half (when swapping) so as to allow a wider
-range of options
+range of options.
+
+<img src="/openpower/sv/grevlut2x2.jpg" width=700 />
+
+* A value of `0b11001010` for the immediate provides
+the functionality of a standard "grev".  
+* `0b11101110` provides gorc
  
  grevlut should be arranged so as to produce the constants
  needed to put into bext (bitextract) so as in turn to
-be able to emulate x86 pmovmask instructions <https://www.felixcloutier.com/x86/pmovmskb>
+be able to emulate x86 pmovmask instructions <https://www.felixcloutier.com/x86/pmovmskb>.
+This only requires 2 instructions (grevlut, bext).
  
-<img src="/openpower/sv/grevlut2x2.jpg" width=700 />
+Note that if the mask is required to be placed
+directly into CR Fields (for use as CR Predicate
+masks rather than an integer mask) then sv.ori
+may be used instead, bearing in mind that sv.ori
+is a 64-bit instruction, and `VL` must have been
+set to the required length:
+
+    sv.ori./elwid=8 r10.v, r10.v, 0
+
+The following settings provide the required mask constants:
+
+| RA       | RB      | imm        | iv | result        |
+| -------  | ------- | ---------- | -- | ----------    |
+| 0x555..  | 0b10    | 0b01101100 | 0  | 0x111111...   |
+| 0x555..  | 0b110   | 0b01101100 | 0  | 0x010101...   |
+| 0x555..  | 0b1110  | 0b01101100 | 0  | 0x00010001...   |
+| 0x555..  | 0b10    | 0b11000110 | 1  | 0x88888...   |
+| 0x555..  | 0b110   | 0b11000110 | 1  | 0x808080...   |
+| 0x555..  | 0b1110  | 0b11000110 | 1  | 0x80008000...   |
+
+Better diagram showing the correct ordering of shamt (RB).  A LUT2
+is applied to all locations marked in red using the first 4
+bits of the immediate, and a separate LUT2 applied to all
+locations in green using the upper 4 bits of the immediate.
+
+<img src="/openpower/sv/grevlut.png" width=700 />
+
+demo code [[openpower/sv/grevlut.py]]
  
  ```
  lut2(imm, a, b):
@@ -374,9 +405,11 @@ dorow(imm8, step_i, chunksize):
          step_o[j] = lut2(imm, step_i[j], step_i[j ^ chunk_size])
      return step_o
  
-uint64_t grevlut64(uint64_t RA, uint64_t RB, uint8 imm)
+uint64_t grevlut64(uint64_t RA, uint64_t RB, uint8 imm, bool iv)
  {
-    uint64_t x = RA;
+    uint64_t x = 0x5555_5555_5555_5555;
+    if (RA != 0) x = GPR(RA);
+    if (iv) x = ~x;
      int shamt = RB & 63;
      for i in 0 to 6
          step = 1<<i
@@ -386,6 +419,12 @@ uint64_t grevlut64(uint64_t RA, uint64_t RB, uint8 imm)
  
  ```
  
+| 0.5|6.10|11.15|16.20 |21..25   | 26....30    |31| name |
+| -- | -- | --- | ---  | -----   | --------    |--| ------ |
+| NN | RT | RA  | s0-4 | im0-4   | im5-7  1 iv |s5| grevlogi |
+| NN | RT | RA  | RB   | im0-4   | im5-7  00   |1 | grevlog |
+
+
  # grev
  
  based on RV bitmanip, this is also known as a butterfly network. however
@@ -418,86 +457,37 @@ uint64_t grev64(uint64_t RA, uint64_t RB)
  
  ```
  
-# shuffle / unshuffle
+# xperm
  
-based on RV bitmanip
+based on RV bitmanip.
  
-```
-uint32_t shfl32(uint32_t RA, uint32_t RB)
-{
-    uint32_t x = RA;
-    int shamt = RB & 15;
-    if (shamt & 8) x  = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
-    if (shamt & 4) x  = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
-    if (shamt & 2) x  = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
-    if (shamt & 1) x  = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
-    return x;
-}
-uint32_t unshfl32(uint32_t RA, uint32_t RB)
-{
-    uint32_t x = RA;
-    int shamt = RB & 15;
-    if (shamt & 1) x  = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
-    if (shamt & 2) x  = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
-    if (shamt & 4) x  = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
-    if (shamt & 8) x  = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
-    return x;
-}
+RA contains a vector of indices to select parts of RB to be
+copied to RT.  The immediate-variant allows up to an 8 bit
+pattern (repeated) to be targeted at different parts of RT.
  
-uint64_t shuffle64_stage(uint64_t src, uint64_t maskL, uint64_t maskR, int N)
-{
-    uint64_t x = src & ~(maskL | maskR);
-    x |= ((src << N) & maskL) | ((src >> N) & maskR);
-    return x;
-}
-uint64_t shfl64(uint64_t RA, uint64_t RB)
-{
-    uint64_t x = RA;
-    int shamt = RB & 31;
-    if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
-                                           0x00000000ffff0000LL, 16);
-    if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
-                                           0x0000ff000000ff00LL, 8);
-    if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
-                                           0x00f000f000f000f0LL, 4);
-    if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
-                                           0x0c0c0c0c0c0c0c0cLL, 2);
-    if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
-                                           0x2222222222222222LL, 1);
-    return x;
-}
-uint64_t unshfl64(uint64_t RA, uint64_t RB)
+```
+uint_xlen_t xpermi(uint8_t imm8, uint_xlen_t RB, int sz_log2)
  {
-    uint64_t x = RA;
-    int shamt = RB & 31;
-    if (shamt &  1) x = shuffle64_stage(x, 0x4444444444444444LL,
-                                           0x2222222222222222LL, 1);
-    if (shamt &  2) x = shuffle64_stage(x, 0x3030303030303030LL,
-                                           0x0c0c0c0c0c0c0c0cLL, 2);
-    if (shamt &  4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
-                                           0x00f000f000f000f0LL, 4);
-    if (shamt &  8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
-                                           0x0000ff000000ff00LL, 8);
-    if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
-                                           0x00000000ffff0000LL, 16);
-    return x;
+    uint_xlen_t r = 0;
+    uint_xlen_t sz = 1LL << sz_log2;
+    uint_xlen_t mask = (1LL << sz) - 1;
+    uint_xlen_t RA = imm8 | imm8<<8 | ... | imm8<<56;
+    for (int i = 0; i < XLEN; i += sz) {
+        uint_xlen_t pos = ((RA >> i) & mask) << sz_log2;
+        if (pos < XLEN)
+            r |= ((RB >> pos) & mask) << i;
+    }
+    return r;
  }
-```
-
-# xperm
-
-based on RV bitmanip
-
-```
  uint_xlen_t xperm(uint_xlen_t RA, uint_xlen_t RB, int sz_log2)
  {
      uint_xlen_t r = 0;
      uint_xlen_t sz = 1LL << sz_log2;
      uint_xlen_t mask = (1LL << sz) - 1;
      for (int i = 0; i < XLEN; i += sz) {
-        uint_xlen_t pos = ((RB >> i) & mask) << sz_log2;
+        uint_xlen_t pos = ((RA >> i) & mask) << sz_log2;
          if (pos < XLEN)
-            r |= ((RA >> pos) & mask) << i;
+            r |= ((RB >> pos) & mask) << i;
      }
      return r;
  }
@@ -556,23 +546,7 @@ instruction is not provided since the `xor[i]` instruction can be used instead.
  These are operations on polynomials with coefficients in `GF(2)`, with the
  polynomial's coefficients packed into integers with the following algorithm:
  
-```python
-def pack_poly(poly):
-    """`poly` is a list where `poly[i]` is the coefficient for `x ** i`"""
-    retval = 0
-    for i, v in enumerate(poly):
-        retval |= v << i
-    return retval
-
-def unpack_poly(v):
-    """returns a list `poly`, where `poly[i]` is the coefficient for `x ** i`.
-    """
-    poly = []
-    while v != 0:
-        poly.append(v & 1)
-        v >>= 1
-    return poly
-```
+[[!inline pagenames="openpower/sv/bitmanip/pack_poly.py" raw="true" feeds="no" actions="yes"]]
  
  ## Carry-less Multiply Instructions
  
@@ -586,45 +560,18 @@ They are worth adding as their own non-overwrite operations
  
  ### `clmul` Carry-less Multiply
  
-```c
-uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
-{
-    uint_xlen_t x = 0;
-    for (int i = 0; i < XLEN; i++)
-        if ((RB >> i) & 1)
-            x ^= RA << i;
-    return x;
-}
-```
+[[!inline pagenames="openpower/sv/bitmanip/clmul.py" raw="true" feeds="no" actions="yes"]]
  
  ### `clmulh` Carry-less Multiply High
  
-```c
-uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)
-{
-    uint_xlen_t x = 0;
-    for (int i = 1; i < XLEN; i++)
-        if ((RB >> i) & 1)
-            x ^= RA >> (XLEN-i);
-    return x;
-}
-```
+[[!inline pagenames="openpower/sv/bitmanip/clmulh.py" raw="true" feeds="no" actions="yes"]]
  
  ### `clmulr` Carry-less Multiply (Reversed)
  
  Useful for CRCs. Equivalent to bit-reversing the result of `clmul` on
  bit-reversed inputs.
  
-```c
-uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)
-{
-    uint_xlen_t x = 0;
-    for (int i = 0; i < XLEN; i++)
-        if ((RB >> i) & 1)
-            x ^= RA >> (XLEN-i-1);
-    return x;
-}
-```
+[[!inline pagenames="openpower/sv/bitmanip/clmulr.py" raw="true" feeds="no" actions="yes"]]
  
  ## `clmadd` Carry-less Multiply-Add
  
@@ -644,11 +591,17 @@ cltmadd RT, RA, RB, RC
  
  TODO: add link to explanation for where `RS` comes from.
  
+``` 
+(RT) = RC ^ clmul((RA), (RB))
+(RS) = RA ^ RC
  ```
-temp = clmul((RA), (RB)) ^ (RC)
-(RT) = temp
-(RS) = temp
-```
+
+## `cldivrem` Carry-less Division and Remainder
+
+`cldivrem` isn't an actual instruction, but is just used in the pseudo-code
+for other instructions.
+
+[[!inline pagenames="openpower/sv/bitmanip/cldivrem.py" raw="true" feeds="no" actions="yes"]]
  
  ## `cldiv` Carry-less Division
  
@@ -656,10 +609,11 @@ temp = clmul((RA), (RB)) ^ (RC)
  cldiv RT, RA, RB
  ```
  
-TODO: decide what happens on division by zero
-
  ```
-(RT) = cldiv((RA), (RB))
+n = (RA)
+d = (RB)
+q, r = cldivrem(n, d, width=XLEN)
+(RT) = q
  ```
  
  ## `clrem` Carry-less Remainder
@@ -668,10 +622,11 @@ TODO: decide what happens on division by zero
  clrem RT, RA, RB
  ```
  
-TODO: decide what happens on division by zero
-
  ```
-(RT) = clrem((RA), (RB))
+n = (RA)
+d = (RB)
+q, r = cldivrem(n, d, width=XLEN)
+(RT) = r
  ```
  
  # Instructions for Binary Galois Fields `GF(2^m)`
@@ -701,40 +656,21 @@ the LSB set, since otherwise it would be divisible by the polynomial `x`,
  making it reducible, making whatever we're working on no longer a Field.
  Therefore, we can reuse the LSB to indicate `degree == XLEN`.
  
-```python
-def decode_reducing_polynomial(GFBREDPOLY, XLEN):
-    """returns the decoded coefficient list in LSB to MSB order,
-        len(retval) == degree + 1"""
-    v = GFBREDPOLY & ((1 << XLEN) - 1) # mask to XLEN bits
-    if v == 0 or v == 2: # GF(2)
-        return [0, 1] # degree = 1, poly = x
-    if v & 1:
-        degree = floor_log2(v)
-    else:
-        # all reducing polynomials of degree > 1 must have the LSB set,
-        # because they must be irreducible polynomials (meaning they
-        # can't be factored), if the LSB was clear, then they would
-        # have `x` as a factor. Therefore, we can reuse the LSB clear
-        # to instead mean the polynomial has degree XLEN.
-        degree = XLEN
-        v |= 1 << XLEN
-        v |= 1 # LSB must be set
-    return [(v >> i) & 1 for i in range(1 + degree)]
-```
+[[!inline pagenames="openpower/sv/bitmanip/decode_reducing_polynomial.py" raw="true" feeds="no" actions="yes"]]
  
  ## `gfbredpoly` -- Set the Reducing Polynomial SPR `GFBREDPOLY`
  
  unless this is an immediate op, `mtspr` is completely sufficient.
  
+[[!inline pagenames="openpower/sv/bitmanip/gfbredpoly.py" raw="true" feeds="no" actions="yes"]]
+
  ## `gfbmul` -- Binary Galois Field `GF(2^m)` Multiplication
  
  ```
  gfbmul RT, RA, RB
  ```
  
-```
-(RT) = gfbmul((RA), (RB))
-```
+[[!inline pagenames="openpower/sv/bitmanip/gfbmul.py" raw="true" feeds="no" actions="yes"]]
  
  ## `gfbmadd` -- Binary Galois Field `GF(2^m)` Multiply-Add
  
@@ -742,9 +678,7 @@ gfbmul RT, RA, RB
  gfbmadd RT, RA, RB, RC
  ```
  
-```
-(RT) = gfbadd(gfbmul((RA), (RB)), (RC))
-```
+[[!inline pagenames="openpower/sv/bitmanip/gfbmadd.py" raw="true" feeds="no" actions="yes"]]
  
  ## `gfbtmadd` -- Binary Galois Field `GF(2^m)` Twin Multiply-Add (for FFT)
  
@@ -754,10 +688,9 @@ gfbtmadd RT, RA, RB, RC
  
  TODO: add link to explanation for where `RS` comes from.
  
-```
-temp = gfbadd(gfbmul((RA), (RB)), (RC))
-(RT) = temp
-(RS) = temp
+``` 
+(RT) = gfbmadd((RA), (RB), (RC))
+(RS) = RA ^ RC
  ```
  
  ## `gfbinv` -- Binary Galois Field `GF(2^m)` Inverse
@@ -766,9 +699,7 @@ temp = gfbadd(gfbmul((RA), (RB)), (RC))
  gfbinv RT, RA
  ```
  
-```
-(RT) = gfbinv((RA))
-```
+[[!inline pagenames="openpower/sv/bitmanip/gfbinv.py" raw="true" feeds="no" actions="yes"]]
  
  # Instructions for Prime Galois Fields `GF(p)`
  
@@ -975,118 +906,6 @@ if __name__ == "__main__":
      print("{:02x}".format(multGF2(0b10000000, 0b10000011)))
  ```
  
-## GF(2^M) Inverse
-
-```
-# https://bugs.libre-soc.org/show_bug.cgi?id=782#c33
-# https://ftp.libre-soc.org/ARITH18_Kobayashi.pdf
-def gf_invert(a) :
-
-    s = getGF2() # get the full polynomial (including the MSB)
-    r = a
-    v = 0
-    u = 1
-    j = 0
-
-    for i in range(1, 2*degree+1):
-        # could use count-trailing-1s here to skip ahead
-        if r & mask1:          # test MSB of r
-            if s & mask1:      # test MSB of s
-                s ^= r
-                v ^= u
-            s <<= 1            # shift left 1
-            if j == 0:
-                r, s = s, r    # swap r,s
-                u, v = v<<1, u # shift v and swap
-                j = 1
-            else:
-                u >>= 1        # right shift left
-                j -= 1
-        else:
-            r <<= 1            # shift left 1
-            u <<= 1            # shift left 1
-            j += 1
-
-    return u
-```
-
-# GF2 (Carryless)
-
-## GF2 (carryless) div and mod
-
-```
-def gf_degree(a) :
-  res = 0
-  a >>= 1
-  while (a != 0) :
-    a >>= 1;
-    res += 1;
-  return res
-
-def FullDivision(self, f, v):
-        """
-        Takes two arguments, f, v
-        fDegree and vDegree are the degrees of the field elements
-        f and v represented as a polynomials.
-        This method returns the field elements a and b such that
-
-            f(x) = a(x) * v(x) + b(x).  
-
-        That is, a is the divisor and b is the remainder, or in
-        other words a is like floor(f/v) and b is like f modulo v.
-        """
-
-        fDegree, vDegree = gf_degree(f), gf_degree(v)
-        res, rem = 0, f
-        for i in reversed(range(vDegree, fDegree+1):
-            if ((rem >> i) & 1): # check bit
-                res ^= (1 << (i - vDegree))
-                rem ^= ( v << (i - vDegree)))
-        return (res, rem)
-```
-
-| 0.5|6.10|11.15|16.20| 21 | 22.23 | 24....30 |31| name |
-| -- | -- | --- | --- | -- | ----- | -------- |--| ---- |
-| NN | RT | RA  | RB  | 1  |  00   | 0001 110 |Rc| cldiv |
-| NN | RT | RA  | RB  | 1  |  01   | 0001 110 |Rc| clmod |
-
-## GF2 carryless mul
-
-based on RV bitmanip
-see <https://en.wikipedia.org/wiki/CLMUL_instruction_set> and
-<https://www.felixcloutier.com/x86/pclmulqdq> and
-<https://en.m.wikipedia.org/wiki/Carry-less_product>
-
-these are GF2 operations with the modulo set to 2^degree.
-they are worth adding as their own non-overwrite operations
-(in the same pipeline).
-
-```
-uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
-{
-    uint_xlen_t x = 0;
-    for (int i = 0; i < XLEN; i++)
-        if ((RB >> i) & 1)
-            x ^= RA << i;
-    return x;
-}
-uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)
-{
-    uint_xlen_t x = 0;
-    for (int i = 1; i < XLEN; i++)
-        if ((RB >> i) & 1)
-            x ^= RA >> (XLEN-i);
-    return x;
-}
-uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)
-{
-    uint_xlen_t x = 0;
-    for (int i = 0; i < XLEN; i++)
-        if ((RB >> i) & 1)
-            x ^= RA >> (XLEN-i-1);
-    return x;
-}
-```
  ## carryless Twin Butterfly (Tukey-Cooley) Mul-add-sub
  
  used in combination with SV FFT REMAP to perform
@@ -1164,7 +983,7 @@ if((RS)i=1) then break end end count ← count + 1
  RA ← EXTZ64(count)
  ```
  
-##  bit deposit
+## bit deposit
  
  vpdepd VRT,VRA,VRB, identical to RV bitmanip bdep, found already in v3.1 p106