From 9a82e8461226106f36e9bc5d664eb7ae5764680a Mon Sep 17 00:00:00 2001
From: lkcl <lkcl@web>
Date: Tue, 4 Oct 2022 14:38:06 +0100
Subject: [PATCH]

---
 openpower/sv/svp64/discussion.mdwn | 63 ++++++++++--------------------
 1 file changed, 21 insertions(+), 42 deletions(-)

diff --git a/openpower/sv/svp64/discussion.mdwn b/openpower/sv/svp64/discussion.mdwn
index 51e56d3f3..48ba0cdb8 100644
--- a/openpower/sv/svp64/discussion.mdwn
+++ b/openpower/sv/svp64/discussion.mdwn
@@ -372,45 +372,24 @@ a need for merging (ORing) all bits into a single alternative predicate mask
 ## fast traditional packed SIMD
 
 A major motivation for changing SVP64 with all isvec=0 to temporarily
-override VL to 1 is to allow supporting traditional SIMD that has
-constantly varying element sizes (and therefore vector lengths too)
-without needing setvl every few instructions, by using SUBVL and
-elwidth overrides.
-
-Examples of use cases:
-
-* WebAssembly's [128-bit packed SIMD extension](https://github.com/WebAssembly/spec/blob/8a352708cffeb71206ca49a0f743bdc57269fb1a/proposals/simd/SIMD.md) (which is becoming a de-facto standard for WebAssembly on the Web and on Servers)
-* Java/C#/JavaScript/etc. 128-bit packed SIMD
-* Cross-compiling x86 SSE2/AVX2 or ARM NEON or VSX/VMX code to SVP64.
-
-Implementing 128-bit packed SIMD can be done without constantly needing
-`setvl` instructions by:
-
-Setting VL=4 on entry to the code.
-
-Then, all 128-bit packed SIMD types can be emulated without additional
-`setvl` instructions:
-
-| 128-bit SIMD type            | SVP64 vector add                                            |
-|------------------------------|-------------------------------------------------------------|
-| `u8x16`/`i8x16`              | sv.add/subvl=4/elwid=8 RT.vector, RA.vector, RB.vector      |
-| `u16x8`/`i16x8`              | sv.add/subvl=2/elwid=16 RT.vector, RA.vector, RB.vector     |
-| `u32x4`/`i32x4`              | sv.add/elwid=32 RT.vector, RA.vector, RB.vector             |
-| `u64x2`/`i64x2`              | sv.add/subvl=2 RT.scalar, RA.scalar, RB.scalar              |
-| `bf16x8` (not in base SVP64) | sv.fadd/subvl=2/elwid=8 FRT.vector, FRA.vector, FRB.vector  |
-| `f16x8`                      | sv.fadd/subvl=2/elwid=16 FRT.vector, FRA.vector, FRB.vector |
-| `f32x4`                      | sv.fadd/elwid=32 FRT.vector, FRA.vector, FRB.vector         |
-| `f64x2`                      | sv.fadd/subvl=2 FRT.scalar, FRA.scalar, FRB.scalar          |
-
-Likewise, all 256-bit packed SIMD types can be emulated without additional `setvl` instructions by setting VL=8 on entry to the code:
-
-| 256-bit SIMD type             | SVP64 vector add                                            |
-|-------------------------------|-------------------------------------------------------------|
-| `u8x32`/`i8x32`               | sv.add/subvl=4/elwid=8 RT.vector, RA.vector, RB.vector      |
-| `u16x16`/`i16x16`             | sv.add/subvl=2/elwid=16 RT.vector, RA.vector, RB.vector     |
-| `u32x8`/`i32x8`               | sv.add/elwid=32 RT.vector, RA.vector, RB.vector             |
-| `u64x4`/`i64x4`               | sv.add/subvl=4 RT.scalar, RA.scalar, RB.scalar              |
-| `bf16x16` (not in base SVP64) | sv.fadd/subvl=2/elwid=8 FRT.vector, FRA.vector, FRB.vector  |
-| `f16x16`                      | sv.fadd/subvl=2/elwid=16 FRT.vector, FRA.vector, FRB.vector |
-| `f32x8`                       | sv.fadd/elwid=32 FRT.vector, FRA.vector, FRB.vector         |
-| `f64x4`                       | sv.fadd/subvl=4 FRT.scalar, FRA.scalar, FRB.scalar          |
+override VL to 1 is to allow easy interleaving of Scalar instructions
+accessing the otherwise-inaccessible registers and CR Fields numbered
+32-127, without the need to change VL. Given that VL may have been
+set to any value between 0 and MAXVL on a Fail-First, and given
+that VL=0 turns Vector instructions into `nop`, this is quite important.
+
+An example, emulating PackedSIMD ISA behaviour, of the `setvl` switching that would otherwise be needed:
+
+    # set VL=4 to be able to do the next add as `u8x16`/`i8x16`
+    setvl MAXVL=VL=4
+    sv.add/subvl=4/elwid=8 *RT, *RA, *RB
+    # now set VL=1 to get scalar register access
+    setvl VL=1
+    sv.add/subvl=4/elwid=8 RT, RA, RB
+    # set VL=4 again.  repeat, repeat, repeat
+    setvl MAXVL=VL=4
+
+This applies equally to loops when VL is dynamically set (from RA or CTR),
+reducing instruction count in hot-loops, as it does when setting VL to
+static quantities as a method of emulating PackedSIMD ISA behaviour.
+
-- 
2.30.2