From 96bdd481d17ef2986cf8754e04d63901113abf54 Mon Sep 17 00:00:00 2001
From: lkcl <lkcl@web>
Date: Mon, 16 Aug 2021 12:19:24 +0100
Subject: [PATCH]

---
 openpower/sv/setvl.mdwn | 47 +++++++++++++++++++++++++++++++++++------
 1 file changed, 41 insertions(+), 6 deletions(-)

diff --git a/openpower/sv/setvl.mdwn b/openpower/sv/setvl.mdwn
index 1391cf8e3..6bd770b5f 100644
--- a/openpower/sv/setvl.mdwn
+++ b/openpower/sv/setvl.mdwn
@@ -318,18 +318,53 @@ loop:
 
 ## setmvlhi double loop
 
+Each instruction in the inner loop processes two elements. This assumes
+that the underlying hardware, when `setmvlhi` requests a parallelism hint
+of 2, actually sets a parallelism hint of 2.
+
+This example, in C, would be:
+
+```
+long *r4;
+for (i=0; i < CTR; i++)
+{
+    r4[i+2] += r4[i];
+}
+```
+
+where, clearly, it is not possible to do more
+than 2 elements in parallel at a time: attempting
+to do so would result in data corruption. The compiler
+may be able to determine memory aliases and inform
+hardware at runtime of the maximum safe parallelism
+limit.
+
+Whilst this example could be simplified by just setting VL=2,
+or by exploiting the fact that overlapping adds have well-defined
+behaviour, this has deliberately not been done here, for illustrative
+purposes, in order to demonstrate `setmvlhi` and Vertical-First Mode.
+
+Note, crucially, how r4, r32 and r20 are **NOT** incremented
+inside the inner loop.  The MAXVL reservation is still 8,
+i.e. as srcstep and dststep advance (by 2 elements at a time)
+registers r20-r27 will be used for the first LD, and
+registers r32-r39 for the second LD.  `r4+srcstep*8` will be used
+as the elstrided offset for LDs.
+
 ```
    setmvlhi  8, 2 # MVL=8, VFHint=2
 loop:
-    setvl  r5, r3 # VL=r5=MAX(MVL, r3)
+    setvl  r1, CTR, vf=1 # VL=r1=MIN(MVL, CTR), VF=1
+    mulli  r1, r1, 8     # multiply by int width
 loopinner:
     sv.ld r20.v, r4(0) # load VLhint elements (max 2)
-    sv.addi r20.v, r20.v, 55 # add 55 to 2 elements
-    sv.st r20.v, r4(0) # store VLhint elements
+    addi r2, r4, 16    # 2 elements ahead
+    sv.ld r32.v, r2(0) # load VLhint elements (max 2)
+    sv.add r32.v, r20.v, r32.v # x[i+2] += x[i]
+    sv.st r32.v, r2(0) # store VLhint elements
     svstep.            # srcstep += VLhint
     bnz loopinner      # repeat until srcstep=VL
     # now done VL elements, move to next batch
-    add r4, r4, r5 # move r4 pointer forward
-    sub. r3, r3, r5 # decrement total count by VL
-    bnz loop
+    add r4, r4, r1     # move r4 pointer forward
+    sv.bnz/ctr loop    # decrement CTR by VL
 ```
-- 
2.30.2