(no commit message)

[libreriscv.git] / openpower / sv / ldst.mdwn
diff --git a/openpower/sv/ldst.mdwn b/openpower/sv/ldst.mdwn

index b5bd169032cbb099422d76c01dfd4d583e2d1a6c..a34025f54f10eaeb151b581fa94c1e58dca7eb6b 100644 (file)
--- a/openpower/sv/ldst.mdwn
+++ b/openpower/sv/ldst.mdwn
@@ -9,14 +9,16 @@ Links:
  * <https://bugs.libre-soc.org/show_bug.cgi?id=571>
  * <https://llvm.org/devmtg/2016-11/Slides/Emerson-ScalableVectorizationinLLVMIR.pdf>
  * <https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#vector-loads-and-stores>
+* [[simple_v_extension/specification/ld.x]]
  
  Vectorisation of Load and Store requires creation, from scalar operations,
-a number of different types:
+of a number of different modes:
  
-* fixed stride (contiguous sequence with no gaps)
+* fixed stride (contiguous sequence with no gaps) aka "unit" stride
  * element strided (sequential but regularly offset, with gaps)
  * vector indexed (vector of base addresses and vector of offsets)
  * fail-first on the same (where it makes sense to do so)
+* Structure Packing (covered in SV by [[sv/remap]]).
  
  OpenPOWER Load/Store operations may be seen from [[isa/fixedload]] and
  [[isa/fixedstore]] pseudocode to be of the form:
@@ -37,12 +39,11 @@ example only the one source and one dest may be marked as scalar or
  vector.
  
  Thus we can see that Vector Indexed may be covered, and, as demonstrated
-with the pseudocode below, the immediate can be set to the element width
-in order to give unit or element stride.  With there being no way to tell which from the Scalar opcode, the choice is provided instead by the SV Context.
+with the pseudocode below, the immediate can be used to give unit stride or element stride.  With there being no way to tell which from the OpenPOWER v3.0B Scalar opcode alone, the choice is provided instead by the SV Context.
  
-    # LD not VLD!
+    # LD not VLD!  format - ldop RT, immed(RA)
      # op_width: lb=1, lh=2, lw=4, ld=8
-    op_load(RT, RA, op_width, immed, svctx, RAupdate):
+    op_load(RT, RA, RC, op_width, immed, svctx, RAupdate):
        ps = get_pred_val(FALSE, RA); # predication on src
        pd = get_pred_val(FALSE, RT); # ... AND on dest
        for (i=0, j=0, u=0; i < VL && j < VL;):
@@ -50,16 +51,23 @@ in order to give unit or element stride.  With there being no way to tell which
          if (RA.isvec) while (!(ps & 1<<i)) i++;
          if (RAupdate.isvec) while (!(ps & 1<<u)) u++;
          if (RT.isvec) while (!(pd & 1<<j)) j++;
-        if svctx.ldstmode == elementstride:
+        if svctx.ldstmode == shifted: # for FFT/DCT
+          # FFT/DCT shifted mode
+          if (RA.isvec)
+            srcbase = ireg[RA+i]
+          else
+            srcbase = ireg[RA]
+          offs = (i * immed) << RC
+        elif svctx.ldstmode == elementstride:
            # element stride mode
            srcbase = ireg[RA]
-          offs = i * immed
+          offs = i * immed              # j*immed for a ST
          elif svctx.ldstmode == unitstride:
            # unit stride mode
            srcbase = ireg[RA]
-          offs = i * op_width
+          offs = immed + (i * op_width) # j*op_width for ST
          elif RA.isvec:
-          # type of indirect (indexed) but with an immediate
+          # quirky Vector indexed mode but with an immediate
            srcbase = ireg[RA+i]
            offs = immed;
          else
@@ -77,10 +85,21 @@ in order to give unit or element stride.  With there being no way to tell which
          if (!RT.isvec)
              break # destination scalar, end now
          if (RA.isvec) i++;
+        if (RAupdate.isvec) u++;
          if (RT.isvec) j++;
  
+    # reverses the bitorder up to "width" bits
+    def bitrev(val, VL):
+      width = log2(VL)
+      result = 0
+      for _ in range(width):
+        result = (result << 1) | (val & 1)
+        val >>= 1
+      return result
+
  Indexed LD is:
   
+    # format: ldop RT, RA, RB
      function op_ldx(RT, RA, RB, RAupdate=False) # LD not VLD!
        ps = get_pred_val(FALSE, RA); # predication on src
        pd = get_pred_val(FALSE, RT); # ... AND on dest
@@ -98,10 +117,11 @@ Indexed LD is:
          if (!RA.isvec && !RB.isvec)
              break # scalar-scalar
          if (RA.isvec) i++;
+        if (RAupdate.isvec) u++;
          if (RB.isvec) k++;
          if (RT.isvec) j++;
  
-Note in both cases that [[sv/svp64]] allows RA in "update" mode (`ldux`) to be effectively a completely different register from RA-as-a-source.  This because there is room in svp64 to extend RA-as-src as well as RA-as-dest, both independently as scalar or vector *and* independently extending their range.
+Note in both cases that [[sv/svp64]] allows RA-as-a-dest in "update" mode (`ldux`) to be effectively a *completely different* register from RA-as-a-source.  This is because there is room in svp64 to extend RA-as-src as well as RA-as-dest, both independently as scalar or vector *and* independently extending their range.
  
  # Determining the LD/ST Modes
  
@@ -111,65 +131,101 @@ sense or are considered a security risk.  Fail-first on Vector Indexed
  allows attackers to probe large numbers of pages from userspace, where
  strided fail-first (by creating contiguous sequential LDs) does not.
  
-In addition, even in other modes, Vector source RA makes no sense for
-computing offsets, and reduce mode even less.  Realistically we need
-an alternative table meaning for [[sv/svp64]] mode.
-
-TODO
-
-    in all cases:
-     - vector immed(RA) nonsense.
-     - unit-stride/el-stride needed on immed(RA)
-
-    modes for immed(RA) version:
-
-    * saturation
-    * predicate-result?
-    * normal
-    * fail-first
-      - vector RA is "banned"
-
-| 0-1 |  2  |  3   4  |  description              |
-| --- | --- |---------|-------------------------- |
-| 00  | str |  sz  dz | normal mode               |
-| 01  | inv | CR-bit  | Rc=1: ffirst CR sel       |
-| 01  | inv | str RC1 |  Rc=0: ffirst z/nonz |
-| 10  |   N | sz  str |  sat mode: N=0/1 u/s |
-| 11  | inv | CR-bit  |  Rc=1: pred-result CR sel |
-| 11  | inv | str RC1 |  Rc=0: pred-result z/nonz |
-
-The `str` bit is only relevant when `RA.isvec` is clear: this indicates 
-
-
-    modes for RA+RB indexed version:
-
-    * saturation
-    * predicate-result
-    * normal
-    * fail-first
-      - vector RA or RB is "banned"
-
+In addition, reduce mode makes no sense, and for LD/ST with immediates
+ Vector source RA makes no sense either (or, is a quirk). Realistically we need
+an alternative table meaning for [[sv/svp64]] mode.  The following modes make sense:
+
+* saturation
+* predicate-result (mostly for cache-inhibited LD/ST)
+* normal
+* fail-first, where a vector source on RA or RB is banned
+
+Also, given that FFT, DCT and other related algorithms
+are of such high importance in so many areas of Computer
+Science, a special "shift" mode has been added which
+allows part of the immediate to be used instead as RC, a register
+which shifts the immediate `DS << GPR(RC)`.
+
+The table for [[sv/svp64]] for `immed(RA)` is:
+
+| 0-1 |  2  |  3   4  |  description               |
+| --- | --- |---------|--------------------------- |
+| 00  | 0   |  dz els | normal mode                |
+| 00  | 1   |  dz shf | shift mode                 |
+| 01  | inv | CR-bit  | Rc=1: ffirst CR sel        |
+| 01  | inv | els RC1 |  Rc=0: ffirst z/nonz       |
+| 10  |   N | dz  els |  sat mode: N=0/1 u/s       |
+| 11  | inv | CR-bit  |  Rc=1: pred-result CR sel  |
+| 11  | inv | els RC1 |  Rc=0: pred-result z/nonz  |
+
+The `els` bit is only relevant when `RA.isvec` is clear: this indicates
+whether stride is unit or element:
+
+    if shifted:
+        svctx.ldstmode = shifted
+    elif RA.isvec:
+        svctx.ldstmode = indexed
+    elif els == 0:
+        svctx.ldstmode = unitstride
+    elif immediate != 0:
+        svctx.ldstmode = elementstride
+
+An immediate of zero is a safety-valve to allow `LD-VSPLAT`:
+in effect the multiplication of the immediate-offset by zero results
+in reading from the exact same memory location.
+
+For `LD-VSPLAT`, on non-cache-inhibited Loads, the read can occur
+just the once and be copied, rather than hitting the Data Cache
+multiple times with the same memory read at the same location.
+This would allow for memory-mapped peripherals to have multiple
+data values read in quick succession and stored in sequentially
+numbered registers.
+
+For non-cache-inhibited ST from a vector source onto a scalar
+destination: with the Vector
+loop effectively creating multiple memory writes to the same location,
+we can deduce that the last of these will be the "successful" one. Thus,
+implementations are free and clear to optimise out the overwriting STs,
+leaving just the last one as the "winner".  Bear in mind that predicate
+masks will skip some elements (in source non-zeroing mode).
+Cache-inhibited ST operations on the other hand **MUST** write out
+a Vector source multiple successive times to the exact same Scalar
+destination.
+
+Note that there are no immediate versions of cache-inhibited LD/ST.
+
+The modes for `RA+RB` indexed version are slightly different:
  
  | 0-1 |  2  |  3   4  |  description              |
  | --- | --- |---------|-------------------------- |
-| 00  |   0 |  sz  dz | normal mode                      |
-| 00  | rsv |  rsvd   | reserved                     |
+| 00  |   0 |  dz  sz | normal mode                      |
+| 00  |   1 |  rsvd   | reserved                     |
  | 01  | inv | CR-bit  | Rc=1: ffirst CR sel              |
-| 01  | inv | sz  RC1 |  Rc=0: ffirst z/nonz |
-| 10  |   N | sz   dz |  sat mode: N=0/1 u/s |
+| 01  | inv | dz  RC1 |  Rc=0: ffirst z/nonz |
+| 10  |   N | dz   sz |  sat mode: N=0/1 u/s |
  | 11  | inv | CR-bit  |  Rc=1: pred-result CR sel |
-| 11  | inv | sz  RC1 |  Rc=0: pred-result z/nonz |
+| 11  | inv | dz  RC1 |  Rc=0: pred-result z/nonz |
  
+A summary of the effect of Vectorisation of src or dest:
+ 
       imm(RA)  RT.v   RA.v   no stride allowed
-     imm(RA)  RY.s   RA.v   no stride allowed
-     imm(RA)  RT.v   RA.s   stride-select needed
+     imm(RA)  RT.s   RA.v   no stride allowed
+     imm(RA)  RT.v   RA.s   stride-select allowed
       imm(RA)  RT.s   RA.s   not vectorised
       RA,RB    RT.v  RA/RB.v ffirst banned
       RA,RB    RT.s  RA/RB.v ffirst banned
-     RA,RB    RT.v  RA/RB.s vsplat activated
-     RA,RB    RT.s  RA/RB.s not vectirised
+     RA,RB    RT.v  RA/RB.s VSPLAT possible
+     RA,RB    RT.s  RA/RB.s not vectorised
+
+Note that cache-inhibited LD/ST (`ldcix`) when VSPLAT is activated will perform **multiple** LD/ST operations, sequentially.  `ldcix` even with scalar src will read the same memory location *multiple times*, storing the result in successive Vector destination registers.  This is because the cache-inhibit instructions are used to read and write memory-mapped peripherals.
+If a genuine cache-inhibited LD-VSPLAT is required then a *scalar*
+cache-inhibited LD should be performed, followed by a VSPLAT-augmented mv.
+
+## LD/ST ffirst
  
-# LOAD/STORE Elwidths <a name="ldst"></a>
+ffirst LD/ST to multiple pages via a Vectorised base is considered a security risk due to the abuse of probing multiple pages in rapid succession and getting feedback on which pages would fail.  Therefore in these special circumstances requesting ffirst with a vector base is instead interpreted as element-strided LD/ST.  See <https://bugs.libre-soc.org/show_bug.cgi?id=561>
+
+# LOAD/STORE Elwidths <a name="elwidth"></a>
  
  Loads and Stores are almost unique in that the OpenPOWER Scalar ISA
  provides a width for the operation (lb, lh, lw, ld).  Only `extsb` and
@@ -177,7 +233,7 @@ others like it provide an explicit operation width.  There are therefore
  *three* widths involved:
  
  * operation width (lb=8, lh=16, lw=32, ld=64)
-s src elelent width override
+* src element width override
  * destination element width override
  
  Some care is therefore needed to express and make clear the transformations, 
@@ -197,6 +253,17 @@ is treated effectively as completely separate and distinct from SV
  augmentation.  This is primarily down to quirks surrounding LE/BE and
  byte-reversal in OpenPOWER.
  
+It is unfortunately possible to request an elwidth override on the memory side which
+does not mesh with the operation width: these result in `UNDEFINED`
+behaviour.  The reason is that the effect of attempting a 64-bit `sv.ld`
+operation with a source elwidth override of 8/16/32 would result in
+overlapping memory requests, particularly on unit and element strided
+operations.  Thus it is `UNDEFINED` when the elwidth is smaller than
+the memory operation width. Examples include `sv.lw/sw=16/els` which
+requests (overlapping) 4-byte memory reads offset from
+each other at 2-byte intervals.  Store likewise is also `UNDEFINED`
+where the dest elwidth override is less than the operation width.
+
  Note the following regarding the pseudocode to follow:
  
  * `scalar identity behaviour` SV Context parameter conditions turn this
@@ -277,3 +344,62 @@ min/max Vectorised instructions as post-processing stages.
  Thus we do not need to provide specialist LD/ST "Structure Packed" opcodes
  because the generic abstracted concept of "Remapping", when applied to
  LD/ST, will give that same capability, with far more flexibility.
+
+# notes from lxo
+
+This section covers assembly notation for the immediate and indexed LD/ST.
+In summary: in immediate mode for LD, when the destination register is
+Vectorised (`RT.v`) but the source `imm(RA)` is scalar, it is not obvious that
+the memory being read is *still a vector load*, using what is known as "unit or element strides".
+
+This anomaly is made clear with the following notation:
+
+    sv.ld RT.v, imm(RA).v
+
+The following notation, although technically correct due to being implicitly identical to the above, is prohibited and is a syntax error:
+ 
+    sv.ld RT.v, imm(RA)
+
+Notes taken from IRC conversation
+
+    <lxo> sv.ld r#.v, ofst(r#).v -> the whole vector is at ofst+r#
+    <lxo> sv.ld r#.v, ofst(r#.v) -> r# is a vector of addresses
+    <lxo> similarly sv.ldx r#.v, r#, r#.v -> whole vector at r#+r#
+    <lxo> whereas sv.ldx r#.v, r#.v, r# -> vector of addresses
+    <lxo> point being, you take an operand with the "m" constraint (or other memory-operand constraints), append .v to it and you're done addressing the in-memory vector
+    <lxo> as in asm ("sv.ld1 %0.v, %1.v" : "=r"(vec_in_reg) : "m"(vec_in_mem));
+    <lxo> (and ld%U1 got mangled into underline; %U expands to x if the address is a sum of registers
+
+permutations of vector selection, to identify above asm-syntax:
+
+     imm(RA)  RT.v   RA.v   nonstrided
+         sv.ld r#.v, ofst(r#2.v) -> r#2 is a vector of addresses
+           mem@     0+r#2   offs+(r#2+1)  offs+(r#2+2)
+           destreg  r#      r#+1          r#+2
+     imm(RA)  RT.s   RA.v   nonstrided
+         sv.ld r#, ofst(r#2.v) -> r#2 is a vector of addresses
+           (dest r# is scalar) -> VSELECT mode
+     imm(RA)  RT.v   RA.s   fixed stride: unit or element
+         sv.ld r#.v, ofst(r#2).v -> whole vector is at ofst+r#2
+           mem@r#2  +0   +1   +2
+           destreg  r#   r#+1 r#+2
+         sv.ld/els r#.v, ofst(r#2).v -> vector at ofst*elidx+r#2
+           mem@r#2  +0 ...   +offs ...  +offs*2
+           destreg  r#       r#+1       r#+2
+     imm(RA)  RT.s   RA.s   not vectorised
+         sv.ld r#, ofst(r#2)
+
+indexed mode:
+
+     RA,RB    RT.v  RA.v  RB.v
+        sv.ldx r#.v, r#2.v, r#3.v -> whole vector at r#2+r#3
+     RA,RB    RT.v  RA.s  RB.v
+        sv.ldx r#.v, r#2, r#3.v -> whole vector at r#2+r#3
+     RA,RB    RT.v  RA.v  RB.s
+        sv.ldx r#.v, r#2.v, r#3 -> vector of addresses
+     RA,RB    RT.v  RA.s  RB.s
+        sv.ldx r#.v, r#2, r#3 -> VSPLAT mode
+     RA,RB    RT.s  RA.v  RB.v
+     RA,RB    RT.s  RA.s  RB.v
+     RA,RB    RT.s  RA.v  RB.s
+     RA,RB    RT.s  RA.s  RB.s not vectorised