From 96bdd481d17ef2986cf8754e04d63901113abf54 Mon Sep 17 00:00:00 2001 From: lkcl Date: Mon, 16 Aug 2021 12:19:24 +0100 Subject: [PATCH] --- openpower/sv/setvl.mdwn | 47 +++++++++++++++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 6 deletions(-) diff --git a/openpower/sv/setvl.mdwn b/openpower/sv/setvl.mdwn index 1391cf8e3..6bd770b5f 100644 --- a/openpower/sv/setvl.mdwn +++ b/openpower/sv/setvl.mdwn @@ -318,18 +318,53 @@ loop: ## setmvlhi double loop +Each instruction in the inner loop processes two elements at a time. This assumes +that the underlying hardware, when `setmvlhi` requests a parallelism hint of 2, +actually sets a parallelism hint of 2. + +This example, in C, would be: + +``` +long *r4; +for (i=0; i < CTR; i++) +{ + r4[i+2] += r4[i]; +} +``` + +where, clearly, it is not possible to do more +than 2 elements in parallel at a time: attempting +to do so would result in data corruption. The compiler +may be able to determine memory aliases and inform +hardware at runtime of the maximum safe parallelism +limit. + +Whilst this example could be simplified to just set VL=2, +or exploit the fact that overlapping adds have well-defined +behaviour, this has not been done here, for illustrative purposes: +the aim is to demonstrate `setmvlhi` and Vertical-First Mode. + +Note, crucially, how r4, r32 and r20 are **NOT** incremented +inside the inner loop. The MAXVL reservation is still 8, +i.e. as srcstep and dststep advance (by 2 elements at a time), +registers r20-r27 will be used for the first LD, and +registers r32-r39 for the second LD. `r4+srcstep*8` will be used +as the elstrided offset for LDs.
+ ``` setmvlhi 8, 2 # MVL=8, VFHint=2 loop: - setvl r5, r3 # VL=r5=MAX(MVL, r3) + setvl r1, CTR, vf=1 # VL=r1=MIN(MVL, CTR), VF=1 + mulli r1, r1, 8 # r1 = VL*8: batch size in bytes (8-byte elements) loopinner: sv.ld r20.v, r4(0) # load VLhint elements (max 2) - sv.addi r20.v, r20.v, 55 # add 55 to 2 elements - sv.st r20.v, r4(0) # store VLhint elements + addi r2, r4, 16 # r2 = r4 + 2 elements (2*8 bytes) ahead + sv.ld r32.v, r2(0) # load VLhint elements (max 2) + sv.add r32.v, r20.v, r32.v # x[i+2] += x[i] + sv.st r32.v, r2(0) # store VLhint elements svstep. # srcstep += VLhint bnz loopinner # repeat until srcstep=VL # now done VL elements, move to next batch - add r4, r4, r5 # move r4 pointer forward - sub. r3, r3, r5 # decrement total count by VL - bnz loop + add r4, r4, r1 # move r4 pointer forward by VL*8 bytes + sv.bnz/ctr loop # decrement CTR by VL ``` -- 2.30.2