sync_up: Updated my section

[libreriscv.git] / openpower / sv / bitmanip.mdwn
diff --git a/openpower/sv/bitmanip.mdwn b/openpower/sv/bitmanip.mdwn

index 5ad241f3f25b01488dd178b3cbf53ac74239f352..5c84e83e5912844d771488ac875921f787accf40 100644 (file)
--- a/openpower/sv/bitmanip.mdwn
+++ b/openpower/sv/bitmanip.mdwn
@@ -19,8 +19,8 @@ pseudocode: [[openpower/isa/bitmanip]]
  this extension amalgamates bitmanipulation primitives from many sources,
  including RISC-V bitmanip, Packed SIMD, AVX-512 and OpenPOWER VSX.
  Also included are DSP/Multimedia operations suitable for Audio/Video.
-Vectorisation and SIMD are removed: these are straight scalar (element)
-operations making them suitable for embedded applications.  Vectorisation
+Vectorization and SIMD are removed: these are straight scalar (element)
+operations making them suitable for embedded applications.  Vectorization
  Context is provided by [[openpower/sv]].
  
  When combined with SV, scalar variants of bitmanip operations found in
@@ -109,7 +109,7 @@ For bincrlut, `BFA` selects the 4-bit CR Field as the LUT2:
      for i in range(64): 
          RT[i] = lut2(CRs{BFA}, RB[i], RA[i]) 
  
-When Vectorised with SVP64, as usual both source and destination may be
+When Vectorized with SVP64, as usual both source and destination may be
  Vector or Scalar.
  
  *Programmer's note: a dynamic ternary lookup may be synthesised from
@@ -159,7 +159,7 @@ CRB-Form:
          a,b = CRs[BF][i], CRs[BF][i]
          if msk[i] CRs[BF][i] = lut2(CRs[BFB], a, b)
  
-When SVP64 Vectorised any of the 4 operands may be Scalar or
+When SVP64 Vectorized any of the 4 operands may be Scalar or
  Vector, including `BFB` meaning that multiple different dynamic
  lookups may be performed with a single instruction.  Note that
  this instruction is deliberately an overwrite in order to reduce
@@ -178,21 +178,20 @@ writing back to non-masked-out bits of `BF`.
  
  required for the [[sv/av_opcodes]]
  
-signed and unsigned min/max for integer.  this is sort-of partly
-synthesiseable in [[sv/svp64]] with pred-result as long as the dest reg
-is one of the sources, but not both signed and unsigned.  when the dest
-is also one of the srces and the mv fails due to the CR bittest failing
-this will only overwrite the dest where the src is greater (or less).
+signed and unsigned min/max for integer.
  
  signed/unsigned min/max gives more flexibility.
  
+\[un]signed min/max instructions are specifically needed for vector reduce min/max operations, which are pretty common.
+
  X-Form
  
-* XO=0001001110, itype=0b00 min, unsigned
-* XO=0101001110, itype=0b01 min, signed
-* XO=0011001110, itype=0b10 max, unsigned
-* XO=0111001110, itype=0b11 max, signed
+* PO=19, XO=----000011 `minmax RT, RA, RB, MMM`
+* PO=19, XO=----000011 `minmax. RT, RA, RB, MMM`
+
+see [[openpower/sv/rfc/ls013]] for `MMM` definition and pseudo-code.
  
+implements all of (and more):
  
  ```
  uint_xlen_t mins(uint_xlen_t rs1, uint_xlen_t rs2)
@@ -264,16 +263,21 @@ Replaces a pair of explicit instructions in hot-loops.
  
  Pseudo-code (shadd):
  
-    shift <- shift + 1                 # Shift is between 1-4
-    sum[0:63] <- ((RB) << shift) + (RA) # Shift RB, add RA
-    RT <- sum                          # Result stored in RT
+    n <- (RB)
+    m <- sm + 1
+    RT <- (n[m:XLEN-1] || [0]*m) + (RA)
+
+Pseudo-code (shaddw):
+
+    shift <- sm + 1                # Shift is between 1-4
+    n <- EXTS((RB)[XLEN/2:XLEN-1]) # Only use lower XLEN/2 bits of RB
+    RT <- (n << shift) + (RA)      # Shift n, add RA
  
  Pseudo-code (shadduw):
  
-    shift <- shift + 1                 # Shift is between 1-4
-    n <- (RB)[XLEN/2:XLEN-1]           # Limit RB to upper word (32-bits)
-    sum[0:63] <- (n << shift) + (RA)    # Shift n, add RA
-    RT <- sum                          # Result stored in RT
+    n <- ([0]*(XLEN/2)) || (RB)[XLEN/2:XLEN-1]
+    m <- sm + 1
+    RT <- (n[m:XLEN-1] || [0]*m) + (RA)
  
  ```
  uint_xlen_t shadd(uint_xlen_t RA, uint_xlen_t RB, uint8_t sm) {
@@ -281,6 +285,12 @@ uint_xlen_t shadd(uint_xlen_t RA, uint_xlen_t RB, uint8_t sm) {
      return (RB << (sm+1)) + RA;
  }
  
+uint_xlen_t shaddw(uint_xlen_t RA, uint_xlen_t RB, uint8_t sm) {
+    uint_xlen_t n = (int_xlen_t)(RB << XLEN / 2) >> XLEN / 2;
+    sm = sm & 0x3;
+    return (n << (sm+1)) + RA;
+}
+
  uint_xlen_t shadduw(uint_xlen_t RA, uint_xlen_t RB, uint8_t sm) {
      uint_xlen_t n = RB & 0xFFFFFFFF;
      sm = sm & 0x3;
@@ -449,30 +459,38 @@ locations in green using the upper 4 bits of the immediate.
  demo code [[openpower/sv/grevlut.py]]
  
  ```
-lut2(imm, a, b):
+def lut2(imm, a, b):
      idx = b << 1 | a
-    return imm[idx] # idx by LSB0 order
-
-dorow(imm8, step_i, chunksize, us32b):
-    for j in 0 to 31 if is32b else 63:
-        if (j&chunk_size) == 0
-           imm = imm8[0..3]
-        else
-           imm = imm8[4..7]
-        step_o[j] = lut2(imm, step_i[j], step_i[j ^ chunk_size])
+    return (imm>>idx) & 1
+
+def dorow(imm8, step_i, chunk_size):
+    step_o = 0
+    for j in range(64):
+        if (j&chunk_size) == 0:
+           imm = (imm8 & 0b1111)
+        else:
+           imm = (imm8>>4)
+        a = (step_i>>j)&1
+        b = (step_i>>(j ^ chunk_size))&1
+        res = lut2(imm, a, b)
+        #print(j, bin(imm), a, b, res)
+        step_o |= (res<<j)
+    #print ("  ", chunk_size, bin(step_o))
      return step_o
  
-uint64_t grevlut(uint64_t RA, uint64_t RB, uint8 imm, bool iv, bool is32b)
-{
-    uint64_t x = 0x5555_5555_5555_5555;
-    if (RA != 0) x = GPR(RA);
-    if (iv) x = ~x;
-    int shamt = RB & 31 if is32b else 63
-    for i in 0 to (6-is32b)
+def grevlut64(RA, RB, imm, iv):
+    x = 0
+    if RA is None: # RA=0
+        x = 0x5555555555555555
+    else:
+        x = RA
+    if (iv): x = ~x;
+    shamt = RB & 63;
+    for i in range(6):
          step = 1<<i
-        if (shamt & step) x = dorow(imm, x, step, is32b)
-    return x;
-}
+        if (shamt & step):
+            x = dorow(imm, x, step)
+    return x & ((1<<64)-1)
  ```
  
  A variant may specify different LUT-pairs per row,