From a393c854c825c5a775545344d580a0b9ca84dd20 Mon Sep 17 00:00:00 2001 From: Luke Kenneth Casson Leighton Date: Fri, 9 Sep 2022 01:21:14 +0100 Subject: [PATCH] use-cases --- openpower/sv/rfc/ls001.mdwn | 74 ++++++++++++++++++++++++++++++++++--- 1 file changed, 68 insertions(+), 6 deletions(-) diff --git a/openpower/sv/rfc/ls001.mdwn b/openpower/sv/rfc/ls001.mdwn index 460098ab4..74c6fb1dd 100644 --- a/openpower/sv/rfc/ls001.mdwn +++ b/openpower/sv/rfc/ls001.mdwn @@ -275,9 +275,6 @@ in 7 instructions. Expanding this to 16-wide is a matter of setting `svshape 16`. Lee Composition may be deployed to construct non-power-two DCTs. The cosine table may be computed (once) with 18 Vector instructions. - - - ``` 1014 def test_sv_remap_fpmadds_ldbrev_idct_8_mode_4(self): 1015 """>>> lst = [# LOAD bit-reversed with half-swap @@ -293,6 +290,8 @@ DCTs. The cosine table may be computed (once) with 18 Vector instructions. 1025 "sv.ffmadds *0, *0, *0, *8" ``` + + # Use case: Matrix Multiply Matrix Multiply of any size (non-power-2) up to a total of 127 operations @@ -301,7 +300,6 @@ ISA at least one source requires Transposition and often massive rolling repetition of data is required. These 3 instructions may be used as the "inner triple-loop kernel" of the usual 6-loop Massive Matrix Multiply. - ``` 28 def test_sv_remap1(self): 29 """>>> lst = ["svshape 2, 2, 3, 0, 0", @@ -310,6 +308,8 @@ repetition of data is required. These 3 instructions may be used as the 32 ] ``` + + # Use case: Parallel Reduction Parallel (Horizontal) Reduction is often deeply problematic in SIMD and @@ -317,8 +317,6 @@ Vector ISAs. Parallel Reduction is Fully Deterministic in Simple-V and thus may even usefully be deployed on non-associative and non-commutative operations. - - ``` 75 def test_sv_remap2(self): 76 """>>> lst = ["svshape 7, 0, 0, 7, 0", @@ -329,6 +327,8 @@ operations. 
81 left/right due to subf ``` + + # Use case: LD/ST-Multi Context-switching saving and restoring of registers on the stack often @@ -343,6 +343,68 @@ runtime-configurable LD/ST-Multi is achievable with 2 instructions. setvli 64 sv.ld/sm=EQ *rt,0(ra) ``` +\newpage{} + +# Use case: Twin-Predication, re-entrant + +This example demonstrates two key concepts: firstly Twin-Predication +(separate source predicate mask from destination predicate mask) and +secondly that sufficient state is stored within the Vector Context SPR, SVSTATE, +for full re-entrancy on a Context Switch or function call *even if +in the middle of executing a loop*. It also demonstrates that it is +permissible for a programmer to write **directly** to the SVSTATE +SPR, and still expect Deterministic Behaviour. + +``` + 292 # checks that we are able to resume in the middle of a VL loop, + 293 # after an interrupt, or after the user has updated src/dst step + 294 # let's assume the user has prepared src/dst step before running this + 295 # vector instruction + 296 def test_intpred_reentrant(self): + 297 # reg num 0 1 2 3 4 5 6 7 8 9 10 11 12 + 298 # srcstep=1 v + 299 # src r3=0b0101 Y N Y N + 300 # : | + 301 # + - - + | + 302 # : +-------+ + 303 # : | + 304 # dest ~r3=0b1010 N Y N Y + 305 # dststep=2 ^ + 306 + 307 isa = SVP64Asm(['sv.extsb/sm=r3/dm=~r3 *5, *9']) +``` + + + +# Use case: 3D GPU style "Branch Conditional" + +(*Note: the Specification is ready; the Simulator is still under development +and does not yet implement the full specification*) +This example demonstrates a 2-long Vector Branch-Conditional only +succeeding if *all* elements in the Vector are successful. This +avoids the need for additional instructions that would need to +perform a Parallel Reduction of a Vector of Condition Register +tests down to a single value, on which a Scalar Branch-Conditional +could then be performed. 
Full Rationale at + + + +``` + 80 def test_sv_branch_cond_all(self): + 81 for i in [7, 8, 9]: + 82 lst = SVP64Asm( + 83 [f"addi 1, 0, {i+1}", # set r1 to i+1 + 84 f"addi 2, 0, {i}", # set r2 to i + 85 "cmpi cr0, 1, 1, 8", # compare r1 with 8 and store to cr0 + 86 "cmpi cr1, 1, 2, 8", # compare r2 with 8 and store to cr1 + 87 "sv.bc/all 12, *1, 0xc", # bgt 0xc - branch if BOTH + 88 # r1 AND r2 greater 8 to the nop below + 89 "addi 3, 0, 0x1234", # skipped when the branch is taken + 90 "or 0, 0, 0"] # branch target + 91 ) +``` + + [[!tag opf_rfc]] -- 2.30.2