* <https://bugs.libre-soc.org/show_bug.cgi?id=571>
* <https://llvm.org/devmtg/2016-11/Slides/Emerson-ScalableVectorizationinLLVMIR.pdf>
* <https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#vector-loads-and-stores>
+* [[simple_v_extension/specification/ld.x]]
Vectorisation of Load and Store requires the creation, from the scalar
operations, of a number of different modes:
-* fixed stride (contiguous sequence with no gaps)
+* fixed stride (contiguous sequence with no gaps) aka "unit" stride
* element strided (sequential but regularly offset, with gaps)
* vector indexed (vector of base addresses and vector of offsets)
* fail-first on the same (where it makes sense to do so)
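As an illustrative (non-normative) sketch of the first three modes above, the addresses generated for VL elements can be modelled in plain Python. `effective_addrs` and its parameter names are hypothetical; unit stride includes the immediate as a byte offset, as in the pseudocode that follows:

```python
# Hypothetical sketch (not the spec pseudocode): effective addresses
# generated by each LD mode, for element index i and immediate `imm`.
def effective_addrs(mode, base, imm, op_width, VL, offsets=None):
    """Return the list of addresses a vectorised LD would access.
    `offsets` is only used by the indexed mode (a vector of offsets)."""
    addrs = []
    for i in range(VL):
        if mode == "unit":            # contiguous sequence, no gaps
            addrs.append(base + imm + i * op_width)
        elif mode == "element":       # sequential but regularly offset
            addrs.append(base + i * imm)
        elif mode == "indexed":       # vector of offsets added to base
            addrs.append(base + offsets[i])
    return addrs
```

For example, `effective_addrs("unit", 0x1000, 0, 4, 4)` yields four contiguous word addresses, whereas element mode with `imm=8` leaves a 4-byte gap between each 4-byte access.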
# LD not VLD! format - ldop RT, immed(RA)
# op_width: lb=1, lh=2, lw=4, ld=8
- op_load(RT, RA, op_width, immed, svctx, RAupdate):
+ op_load(RT, RA, RC, op_width, immed, svctx, RAupdate):
ps = get_pred_val(FALSE, RA); # predication on src
pd = get_pred_val(FALSE, RT); # ... AND on dest
for (i=0, j=0, u=0; i < VL && j < VL;):
if (RA.isvec) while (!(ps & 1<<i)) i++;
if (RAupdate.isvec) while (!(ps & 1<<u)) u++;
if (RT.isvec) while (!(pd & 1<<j)) j++;
- if svctx.ldstmode == elementstride:
+ if svctx.ldstmode == shifted: # for FFT/DCT
+ # FFT/DCT shifted mode
+ if (RA.isvec)
+ srcbase = ireg[RA+i]
+ else
+ srcbase = ireg[RA]
+ offs = (i * immed) << RC
+ elif svctx.ldstmode == elementstride:
# element stride mode
srcbase = ireg[RA]
- offs = i * immed
+ offs = i * immed # j*immed for a ST
elif svctx.ldstmode == unitstride:
# unit stride mode
srcbase = ireg[RA]
- offs = i * op_width
+ offs = immed + (i * op_width) # j*op_width for ST
elif RA.isvec:
# quirky Vector indexed mode but with an immediate
srcbase = ireg[RA+i]
if (RAupdate.isvec) u++;
if (RT.isvec) j++;
+ # reverses the bitorder up to "width" bits
+ def bitrev(val, VL):
+ width = log2(VL) # assumes VL is a power of 2
+ result = 0
+ for _ in range(width):
+ result = (result << 1) | (val & 1)
+ val >>= 1
+ return result
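As a runnable sanity-check of `bitrev` (plain Python, substituting `math.log2` for the spec's `log2`, and assuming VL is a power of two), the function produces the familiar FFT input-reordering:

```python
import math

def bitrev(val, VL):
    # reverses the bit order of `val`, over log2(VL) bits
    # (assumes VL is a power of two)
    width = int(math.log2(VL))
    result = 0
    for _ in range(width):
        result = (result << 1) | (val & 1)
        val >>= 1
    return result

# classic FFT input reordering for 8 elements: 0,4,2,6,1,5,3,7
order = [bitrev(i, 8) for i in range(8)]
```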
+
Indexed LD is:
# format: ldop RT, RA, RB
strided fail-first (by creating contiguous sequential LDs) does not.
In addition, reduce mode makes no sense, and for LD/ST with immediates
- Vector source RA makes no sense either. Realistically we need
+ a Vector source RA makes no sense either (or, is a quirk). Realistically we need
an alternative table meaning for [[sv/svp64]] mode. The following modes make sense:
* saturation
* predicate-result (mostly for cache-inhibited LD/ST)
* normal
-* fail-first, where vector source on RA or RB is banned
-
-The table for [[sv/svp64]] for immed(RA) is:
-
-| 0-1 | 2 | 3 4 | description |
-| --- | --- |---------|-------------------------- |
-| 00 | str | sz dz | normal mode |
-| 01 | inv | CR-bit | Rc=1: ffirst CR sel |
-| 01 | inv | els RC1 | Rc=0: ffirst z/nonz |
-| 10 | N | sz str | sat mode: N=0/1 u/s |
-| 11 | inv | CR-bit | Rc=1: pred-result CR sel |
-| 11 | inv | els RC1 | Rc=0: pred-result z/nonz |
+* fail-first, where a vector source on RA or RB is banned
+
+Also, given that FFT, DCT and other related algorithms
+are of such high importance in so many areas of Computer
+Science, a special "shift" mode has been added which
+allows part of the immediate field to be used instead as RC,
+a register by which the immediate is shifted: `DS << GPR(RC)`.
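A hedged sketch of the per-element offsets in shift mode, following `offs = (i * immed) << RC` from the pseudocode above (`shifted_offsets` is a hypothetical helper name):

```python
def shifted_offsets(immed, RC_val, VL):
    # shift mode: each element's offset is (i * immed) << GPR(RC),
    # extending the reach of the small DS immediate field
    return [(i * immed) << RC_val for i in range(VL)]
```

For example, `immed=1` with `GPR(RC)=3` gives offsets stepping by 8 bytes per element, without needing a larger immediate encoding.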
+
+The table for [[sv/svp64]] for `immed(RA)` is:
+
+| 0-1 | 2 | 3 4 | description |
+| --- | --- |---------|--------------------------- |
+| 00 | 0 | dz els | normal mode |
+| 00 | 1 | dz shf | shift mode |
+| 01 | inv | CR-bit | Rc=1: ffirst CR sel |
+| 01 | inv | els RC1 | Rc=0: ffirst z/nonz |
+| 10 | N | dz els | sat mode: N=0/1 u/s |
+| 11 | inv | CR-bit | Rc=1: pred-result CR sel |
+| 11 | inv | els RC1 | Rc=0: pred-result z/nonz |
The `els` bit is only relevant when `RA.isvec` is clear: this indicates
whether stride is unit or element:
- if RA.isvec:
+ if shifted: # shift mode (shf bit set)
+ svctx.ldstmode = shifted
+ elif RA.isvec:
svctx.ldstmode = indexed
elif els == 0:
svctx.ldstmode = unitstride
- else:
+ elif immediate != 0:
svctx.ldstmode = elementstride
-The modes for RA+RB indexed version are slightly different:
+An immediate of zero is a safety-valve to allow `LD-VSPLAT`:
+in effect, multiplying the element index by a zero immediate-offset
+results in reading from the exact same memory location each time.
+
+For `LD-VSPLAT`, on non-cache-inhibited Loads, the read can occur
+just the once and be copied, rather than hitting the Data Cache
+multiple times with the same memory read at the same location.
+(Cache-inhibited `LD-VSPLAT`, by contrast, deliberately performs the
+read multiple times: this allows memory-mapped peripherals to supply
+multiple data values in quick succession, stored in sequentially
+numbered registers.)
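A sketch of the non-cache-inhibited `LD-VSPLAT` effect under the above assumptions (hypothetical helper name, dict-based memory and register file): with a scalar RA and zero immediate, every element address is identical, so an implementation may read once and broadcast:

```python
def ld_vsplat(regs, mem, RT, RA, VL):
    # immediate of zero + scalar RA: offs = i * 0 is always 0,
    # so read the location once and broadcast to VL dest registers
    value = mem[regs[RA]]
    for j in range(VL):
        regs[RT + j] = value
```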
+
+For non-cache-inhibited ST from a vector source onto a scalar
+destination: with the Vector
+loop effectively creating multiple memory writes to the same location,
+we can deduce that the last of these will be the "successful" one. Thus,
+implementations are free and clear to optimise out the overwriting STs,
+leaving just the last one as the "winner". Bear in mind that predicate
+masks will skip some elements (in source non-zeroing mode).
+Cache-inhibited ST operations on the other hand **MUST** write out
+a Vector source multiple successive times to the exact same Scalar
+destination.
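The last-write-wins deduction can be sketched as follows (hypothetical helper name, dict-based memory; the predicate, in source non-zeroing mode, simply skips elements). A naive implementation writes every unmasked element; optimising out all but the final write is observationally equivalent:

```python
def st_vector_to_scalar(regs, mem, RS, RA, VL, pred):
    # non-cache-inhibited ST, vector source, scalar dest address:
    # every unmasked element writes the same location, so only the
    # last unmasked element's value is observable afterwards
    for i in range(VL):
        if pred & (1 << i):
            mem[regs[RA]] = regs[RS + i]
```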
+
+Note that there are no immediate versions of cache-inhibited LD/ST.
+
+The modes for `RA+RB` indexed version are slightly different:
| 0-1 | 2 | 3 4 | description |
| --- | --- |---------|-------------------------- |
-| 00 | 0 | sz dz | normal mode |
-| 00 | rsv | rsvd | reserved |
+| 00 | 0 | dz sz | normal mode |
+| 00 | 1 | rsvd | reserved |
| 01 | inv | CR-bit | Rc=1: ffirst CR sel |
-| 01 | inv | sz RC1 | Rc=0: ffirst z/nonz |
-| 10 | N | sz dz | sat mode: N=0/1 u/s |
+| 01 | inv | dz RC1 | Rc=0: ffirst z/nonz |
+| 10 | N | dz sz | sat mode: N=0/1 u/s |
| 11 | inv | CR-bit | Rc=1: pred-result CR sel |
-| 11 | inv | sz RC1 | Rc=0: pred-result z/nonz |
+| 11 | inv | dz RC1 | Rc=0: pred-result z/nonz |
A summary of the effect of Vectorisation of src or dest:
imm(RA) RT.v RA.v no stride allowed
- imm(RA) RY.s RA.v no stride allowed
- imm(RA) RT.v RA.s stride-select needed
+ imm(RA) RT.s RA.v no stride allowed
+ imm(RA) RT.v RA.s stride-select allowed
imm(RA) RT.s RA.s not vectorised
RA,RB RT.v RA/RB.v ffirst banned
RA,RB RT.s RA/RB.v ffirst banned
RA,RB RT.s RA/RB.s not vectorised
Note that cache-inhibited LD/ST (`ldcix`) when VSPLAT is activated will perform **multiple** LD/ST operations, sequentially. `ldcix` even with a scalar src will read the same memory location *multiple times*, storing the result in successive Vector destination registers. This is because the cache-inhibit instructions are used to read and write memory-mapped peripherals.
-If a genuine VSPLAT is required then a scalar cache-inhibited LD should be performed, followed by a VSPLAT-augmented mv.
+If a genuine cache-inhibited LD-VSPLAT is required then a *scalar*
+cache-inhibited LD should be performed, followed by a VSPLAT-augmented mv.
-# LOAD/STORE Elwidths <a name="ldst"></a>
+## LD/ST ffirst
+
+ffirst LD/ST to multiple pages via a Vectorised base is considered a security
+risk, because it could be abused to probe multiple pages in rapid succession
+and gain feedback on which pages would fail. Therefore, in these special
+circumstances, requesting ffirst with a vector base is instead interpreted as
+element-strided LD/ST. See <https://bugs.libre-soc.org/show_bug.cgi?id=561>
+
+# LOAD/STORE Elwidths <a name="elwidth"></a>
Loads and Stores are almost unique in that the OpenPOWER Scalar ISA
provides a width for the operation (lb, lh, lw, ld). Only `extsb` and
augmentation. This is primarily down to quirks surrounding LE/BE and
byte-reversal in OpenPOWER.
+It is unfortunately possible to request an elwidth override on the memory side
+which does not mesh with the operation width: these result in `UNDEFINED`
+behaviour. The reason is that attempting a 64-bit `sv.ld` with a source
+elwidth override of 8/16/32 would result in overlapping memory requests,
+particularly on unit and element strided operations. Thus it is `UNDEFINED`
+when the elwidth is smaller than the memory operation width. An example is
+`sv.lw/sw=16/els`, which requests (overlapping) 4-byte memory reads offset
+from each other at 2-byte intervals. Store is likewise `UNDEFINED` where the
+dest elwidth override is less than the operation width.
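To illustrate why this is `UNDEFINED`, a small sketch (hypothetical helper name) computes the byte spans touched by `sv.lw/sw=16/els`: 4-byte reads at 2-byte element strides overlap their successors:

```python
def els_read_spans(base, elwidth_bytes, op_width_bytes, VL):
    # element-strided reads: element i starts at base + i*elwidth,
    # but each read is op_width bytes wide
    return [(base + i * elwidth_bytes,
             base + i * elwidth_bytes + op_width_bytes)
            for i in range(VL)]

# sv.lw/sw=16/els: 4-byte reads, 2-byte stride -> overlapping spans
spans = els_read_spans(0, 2, 4, 3)
overlap = any(spans[i][1] > spans[i + 1][0] for i in range(len(spans) - 1))
```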
+
Note the following regarding the pseudocode to follow:
* `scalar identity behaviour` SV Context parameter conditions turn this
# notes from lxo
+This section covers assembly notation for the immediate and indexed LD/ST.
+The summary is that, in immediate mode for LD, it is not clear from the
+syntax that when the destination register is Vectorised (`RT.v`) but the
+source `imm(RA)` is scalar, the memory read is *still a vector load*,
+using unit or element stride.
+
+This anomaly is made clear with the following notation:
+
+ sv.ld RT.v, imm(RA).v
+
+The following notation, although technically correct due to being implicitly identical to the above, is prohibited and is a syntax error:
+
+ sv.ld RT.v, imm(RA)
+
+Notes taken from an IRC conversation:
+
<lxo> sv.ld r#.v, ofst(r#).v -> the whole vector is at ofst+r#
<lxo> sv.ld r#.v, ofst(r#.v) -> r# is a vector of addresses
<lxo> similarly sv.ldx r#.v, r#, r#.v -> whole vector at r#+r#
Permutations of vector selection, to identify the above asm-syntax:
- imm(RA) RT.v RA.v no stride allowed
+ imm(RA) RT.v RA.v nonstrided
sv.ld r#.v, ofst(r#2.v) -> r#2 is a vector of addresses
- imm(RA) RT.s RA.v no stride allowed
+ mem@ 0+r#2 offs+(r#2+1) offs+(r#2+2)
+ destreg r# r#+1 r#+2
+ imm(RA) RT.s RA.v nonstrided
sv.ld r#, ofst(r#2.v) -> r#2 is a vector of addresses
- imm(RA) RT.v RA.s stride-select needed
- sv.ld r#.v, ofst(r#2).v -> the whole vector is at ofst+r#2
+ (dest r# is scalar) -> VSELECT mode
+ imm(RA) RT.v RA.s fixed stride: unit or element
+ sv.ld r#.v, ofst(r#2).v -> whole vector is at ofst+r#2
+ mem@r#2 +0 +1 +2
+ destreg r# r#+1 r#+2
+ sv.ld/els r#.v, ofst(r#2).v -> vector at ofst*elidx+r#2
+ mem@r#2 +0 ... +offs ... +offs*2
+ destreg r# r#+1 r#+2
imm(RA) RT.s RA.s not vectorised
sv.ld r#, ofst(r#2)
-TODO: indexed mode
+indexed mode:
RA,RB RT.v RA.v RB.v
sv.ldx r#.v, r#2, r#3.v -> whole vector at r#2+r#3
RA,RB RT.v RA.v RB.s
sv.ldx r#.v, r#2.v, r#3 -> vector of addresses
RA,RB RT.v RA.s RB.s
+ sv.ldx r#.v, r#2, r#3 -> VSPLAT mode
RA,RB RT.s RA.v RB.v
RA,RB RT.s RA.s RB.v
RA,RB RT.s RA.v RB.s