(no commit message)

[libreriscv.git] / openpower / sv / setvl.mdwn
diff --git a/openpower/sv/setvl.mdwn b/openpower/sv/setvl.mdwn

index 12f7cb52e0d33387658989e9409e797fef65ce0a..c6233de43177857e4c2a67ee81ba8849c128d736 100644 (file)
--- a/openpower/sv/setvl.mdwn
+++ b/openpower/sv/setvl.mdwn
@@ -8,9 +8,10 @@ See links:
  * <https://bugs.libre-soc.org/show_bug.cgi?id=535>
  * <https://bugs.libre-soc.org/show_bug.cgi?id=587>
  * <https://bugs.libre-soc.org/show_bug.cgi?id=568> TODO
+* <https://bugs.libre-soc.org/show_bug.cgi?id=862> VF Predication
  * <https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#vsetvlivsetvl-instructions>
-* old page [[simple_v_extension/specification/sv.setvl]]
  * [[sv/svstep]]
+* pseudocode [[openpower/isa/simplev]]
  
  Use of setvl results in changes to the MVL, VL and STATE SPRs. See [[sv/sprs]]
  
@@ -78,9 +79,9 @@ using EXT22 temporarily and fitting into the
  
  Form: SVL-Form (see [[isatables/fields.text]])
  
-| 0.5|6.10|11.15|16..21| 22...25    | 26.30 |31|  name   |
+| 0.5|6.10|11.15|16..22| 23...25    | 26.30 |31|  name   |
  | -- | -- | --- | ---- |----------- | ----- |--| ------- |
-|OPCD| RT | RA  | SVi  |cv ms vs vf | 11110 |Rc| setvl   |
+|OPCD| RT | RA  | SVi  |   ms vs vf | 11011 |Rc| setvl   |
  
  Instruction format:
  
@@ -89,7 +90,6 @@ Instruction format:
  
  Note that the immediate (`SVi`) spans 7 bits (16 to 22)
  
-* `cv` - bit 22 - reads CTR instead of RA 
  * `ms` - bit 23 - allows for setting of MVL.
  * `vs` - bit 24 - allows for setting of VL.
  * `vf` - bit 25 - sets "Vertical First Mode".
@@ -125,7 +125,7 @@ the same instruction.  That would require two instructions.
  Vertical First is effectively like an implicit single bit predicate
  applied to every SVP64 instruction.  **ONLY** one element in each
  SVP64 Vector instruction is executed; srcstep and dststep do **not**
-increment, and the Program Counter progresses **immediately* to
+increment, and the Program Counter progresses **immediately** to
  the next instruction just as it would for any standard scalar v3.0B
  instruction.
  
@@ -162,12 +162,17 @@ Testing any end condition of any loop of any REMAP state allows branches to be u
  Nested looping with different schedules is perfectly possible, as is
  calling of functions, however SVSTATE (and any associated SVSTATE) should be stored on the stack.*
  
+**SUBVL**
+
+Sub-vector elements are not to be considered "Vertical". The vec2/3/4
+is to be considered as if it were the "single element".  Caveats exist for
+[[sv/mv.swizzle]] and [[sv/mv.vec]] when Pack/Unpack is enabled.
+
  # Pseudocode
  
      // instruction fields:
      rd = get_rt_field();         // bits 6..10
      ra = get_ra_field();         // bits 11..15
-    vc = get_vc_field();         // bit 22
      vf = get_vf_field();         // bit 25
      vs = get_vs_field();         // bit 24
      ms = get_ms_field();         // bit 23
@@ -176,7 +181,7 @@ calling of functions, however SVSTATE (and any associated SVSTATE) should be sto
      if vf and not vs and not ms {
          // increment src/dest step mode
          // NOTE! this is in no way complete! predication is not included
-        // and neither is SUB-VL mode
+        // and neither is SUBVL mode
          srcstep = SPR[SV].srcstep
          dststep = SPR[SV].dststep
          VL = SPR[SV].VL
@@ -193,7 +198,7 @@ calling of functions, however SVSTATE (and any associated SVSTATE) should be sto
  
          // write CR? helps for doing Vertical loops, detects end
          // of Vector Elements
-        if Rc {
+        if Rc = 1 {
              // update CR to indicate that srcstep/dststep "rolled over"
              CR0.eq = rollover
          }
@@ -205,9 +210,7 @@ calling of functions, however SVSTATE (and any associated SVSTATE) should be sto
          // 3 options: from SPR, from immed, from ra
          if vs {
             // VL to be sourced from fields/regs
-           if vc {
-               VL = CTR
-           } else if ra != 0 {
+           if ra != 0 {
                 VL = GPR[ra]
             } else {
                 VL = vlimmed
@@ -240,7 +243,7 @@ calling of functions, however SVSTATE (and any associated SVSTATE) should be sto
              regs[rt] = VL;
          }
          // write CR?
-        if Rc {
+        if Rc = 1 {
              // update CR from VL (not rt)
              CR0.eq = (VL == 0)
              ...
@@ -279,55 +282,3 @@ loop:
      end:
        blr
  
-## setmvlhi double loop
-
-Two elements per inner loop are executed per instruction. This assumes
-that underlying hardware, when `setmvlhi` requests a parallelism hint of 2
-actually sets a parallelism hint of 2.
-
-This example, in c, would be:
-
-```
-long *r4;
-for (i=0; i < CTR; i++)
-{
-    r4[i+2] += r4[i]
-}
-```
-
-where, clearly, it is not possible to do more
-than 2 elements in parallel at a time: attempting
-to do so would result in data corruption. The compiler
-may be able to determine memory aliases and inform
-hardware at runtime of the maximum safe parallelism
-limit.
-
-Whilst this example could be simplified to simply set VL=2,
-or exploit the fact that overlapping adds have well-defined
-behaviour, this has not been done, here, for illustrative purposes
-in order to demonstrate setmvhli and Vertical-First Mode.
-
-Note, crucially, how r4, r32 and r20 are **NOT** incremented
-inside the inner loop.  The MAXVL reservation is still 8,
-i.e. as srcstep and dststep advance (by 2 elements at a time)
-registers r20-r27 will be used for the first LD, and
-registers r32-39 for the second LD.  `r4+srcstep*8` will be used
-as the elstrided offset for LDs.
-
-```
-   setmvlhi  8, 2 # MVL=8, VFHint=2
-loop:
-    setvl  r1, CTR, vf=1 # VL=r1=MAX(MVL, CTR), VF=1
-    mulli  r1, r1, 8     # multiply by int width
-loopinner:
-    sv.ld r20.v, r4(0) # load VLhint elements (max 2)
-    addi r2, r4, 16    # 2 elements ahead
-    sv.ld r32.v, r2(0) # load VLhint elements (max 2)
-    sv.add r32.v, r20.v, r32.v # x[i+2] += x[i]
-    sv.st r32.v, r2(0) # store VLhint elements
-    svstep.            # srcstep += VLhint
-    bnz loopinner      # repeat until srcstep=VL
-    # now done VL elements, move to next batch
-    add r4, r4, r1     # move r4 pointer forward
-    sv.bnz/ctr loop    # decrement CTR by VL
-```