From 9a82e8461226106f36e9bc5d664eb7ae5764680a Mon Sep 17 00:00:00 2001 From: lkcl Date: Tue, 4 Oct 2022 14:38:06 +0100 Subject: [PATCH] --- openpower/sv/svp64/discussion.mdwn | 63 ++++++++++-------------------- 1 file changed, 21 insertions(+), 42 deletions(-) diff --git a/openpower/sv/svp64/discussion.mdwn b/openpower/sv/svp64/discussion.mdwn index 51e56d3f3..48ba0cdb8 100644 --- a/openpower/sv/svp64/discussion.mdwn +++ b/openpower/sv/svp64/discussion.mdwn @@ -372,45 +372,24 @@ a need for merging (ORing) all bits into a single alternative predicate mask ## fast traditional packed SIMD A major motivation for changing SVP64 with all isvec=0 to temporarily -override VL to 1 is to allow supporting traditional SIMD that has -constantly varying element sizes (and therefore vector lengths too) -without needing setvl every few instructions, by using SUBVL and -elwidth overrides. - -Examples of use cases: - -* WebAssembly's [128-bit packed SIMD extension](https://github.com/WebAssembly/spec/blob/8a352708cffeb71206ca49a0f743bdc57269fb1a/proposals/simd/SIMD.md) (which is becoming a de-facto standard for WebAssembly on the Web and on Servers) -* Java/C#/JavaScript/etc. 128-bit packed SIMD -* Cross-compiling x86 SSE2/AVX2 or ARM NEON or VSX/VMX code to SVP64. - -Implementing 128-bit packed SIMD can be done without constantly needing -`setvl` instructions by: - -Setting VL=4 on entry to the code. 
- -Then, all 128-bit packed SIMD types can be emulated without additional -`setvl` instructions: - -| 128-bit SIMD type | SVP64 vector add | -|------------------------------|-------------------------------------------------------------| -| `u8x16`/`i8x16` | sv.add/subvl=4/elwid=8 RT.vector, RA.vector, RB.vector | -| `u16x8`/`i16x8` | sv.add/subvl=2/elwid=16 RT.vector, RA.vector, RB.vector | -| `u32x4`/`i32x4` | sv.add/elwid=32 RT.vector, RA.vector, RB.vector | -| `u64x2`/`i64x2` | sv.add/subvl=2 RT.scalar, RA.scalar, RB.scalar | -| `bf16x8` (not in base SVP64) | sv.fadd/subvl=2/elwid=8 FRT.vector, FRA.vector, FRB.vector | -| `f16x8` | sv.fadd/subvl=2/elwid=16 FRT.vector, FRA.vector, FRB.vector | -| `f32x4` | sv.fadd/elwid=32 FRT.vector, FRA.vector, FRB.vector | -| `f64x2` | sv.fadd/subvl=2 FRT.scalar, FRA.scalar, FRB.scalar | - -Likewise, all 256-bit packed SIMD types can be emulated without additional `setvl` instructions by setting VL=8 on entry to the code: - -| 256-bit SIMD type | SVP64 vector add | -|-------------------------------|-------------------------------------------------------------| -| `u8x32`/`i8x32` | sv.add/subvl=4/elwid=8 RT.vector, RA.vector, RB.vector | -| `u16x16`/`i16x16` | sv.add/subvl=2/elwid=16 RT.vector, RA.vector, RB.vector | -| `u32x8`/`i32x8` | sv.add/elwid=32 RT.vector, RA.vector, RB.vector | -| `u64x4`/`i64x4` | sv.add/subvl=4 RT.scalar, RA.scalar, RB.scalar | -| `bf16x16` (not in base SVP64) | sv.fadd/subvl=2/elwid=8 FRT.vector, FRA.vector, FRB.vector | -| `f16x16` | sv.fadd/subvl=2/elwid=16 FRT.vector, FRA.vector, FRB.vector | -| `f32x8` | sv.fadd/elwid=32 FRT.vector, FRA.vector, FRB.vector | -| `f64x4` | sv.fadd/subvl=4 FRT.scalar, FRA.scalar, FRB.scalar | +override VL to 1 is to allow easy interleaving of Scalar instructions +accessing the otherwise-inaccessible registers and CR Fields numbered +32-127, without the need to change VL. 
Given that VL may have been +set to any value between 0 and MAXVL on a Fail-First, and given +that VL=0 turns Vector instructions into `nop`, this is quite important. + +An example, emulating PackedSIMD ISA behaviour: + + # set VL=4 to be able to do the next add as `u8x16`/`i8x16` + setvl MAXVL=VL=4 + sv.add/subvl=4/elwid=8 *RT, *RA, *RB + # now set VL=1 to get scalar register access + setvl VL=1 + sv.add/subvl=4/elwid=8 RT, RA, RB + # set VL=4 again. repeat, repeat, repeat + setvl MAXVL=VL=4 + +This applies equally to loops when VL is dynamically set (from RA or CTR), +reducing instruction count in hot-loops, as it does when setting VL to +static quantities as a method of emulating PackedSIMD ISA behaviour. + -- 2.30.2