From 6cac17aa4a16c30c337a26cb083138d5fbc0ef74 Mon Sep 17 00:00:00 2001 From: lkcl Date: Thu, 16 Jun 2022 23:02:13 +0100 Subject: [PATCH] --- openpower/sv/setvl.mdwn | 62 ++--------------------------------------- 1 file changed, 3 insertions(+), 59 deletions(-) diff --git a/openpower/sv/setvl.mdwn b/openpower/sv/setvl.mdwn index 150d58774..384d2c0d7 100644 --- a/openpower/sv/setvl.mdwn +++ b/openpower/sv/setvl.mdwn @@ -79,9 +79,9 @@ using EXT22 temporarily and fitting into the Form: SVL-Form (see [[isatables/fields.text]]) -| 0.5|6.10|11.15|16..21| 22...25 | 26.30 |31| name | +| 0.5|6.10|11.15|16..22| 23...25 | 26.30 |31| name | | -- | -- | --- | ---- |----------- | ----- |--| ------- | -|OPCD| RT | RA | SVi |cv ms vs vf | 11011 |Rc| setvl | +|OPCD| RT | RA | SVi | ms vs vf | 11011 |Rc| setvl | Instruction format: @@ -90,7 +90,6 @@ Instruction format: Note that the immediate (`SVi`) spans 7 bits (16 to 22) -* `cv` - bit 22 - reads CTR instead of RA * `ms` - bit 23 - allows for setting of MVL. * `vs` - bit 24 - allows for setting of VL. * `vf` - bit 25 - sets "Vertical First Mode". @@ -194,7 +193,6 @@ is `UNDEFINED`. // instruction fields: rd = get_rt_field(); // bits 6..10 ra = get_ra_field(); // bits 11..15 - vc = get_vc_field(); // bit 22 vf = get_vf_field(); // bit 23 vs = get_vs_field(); // bit 24 ms = get_ms_field(); // bit 25 @@ -232,9 +230,7 @@ is `UNDEFINED`. // 4 options: from SPR, from immed, from ra, from CTR if vs { // VL to be sourced from fields/regs - if vc { - VL = CTR - } else if ra != 0 { + if ra != 0 { VL = GPR[ra] } else { VL = vlimmed @@ -306,55 +302,3 @@ loop: end: blr -## setmvlhi double loop - -Two elements per inner loop are executed per instruction. This assumes -that underlying hardware, when `setmvlhi` requests a parallelism hint of 2 -actually sets a parallelism hint of 2. - -This example, in c, would be: - -``` -long *r4; -for (i=0; i < CTR; i++) -{ - r4[i+2] += r4[i] -} -``` - -where, clearly, it is not possible to do more -than 2 elements in parallel at a time: attempting -to do so would result in data corruption. The compiler -may be able to determine memory aliases and inform -hardware at runtime of the maximum safe parallelism -limit. - -Whilst this example could be simplified to simply set VL=2, -or exploit the fact that overlapping adds have well-defined -behaviour, this has not been done, here, for illustrative purposes -in order to demonstrate setmvhli and Vertical-First Mode. - -Note, crucially, how r4, r32 and r20 are **NOT** incremented -inside the inner loop. The MAXVL reservation is still 8, -i.e. as srcstep and dststep advance (by 2 elements at a time) -registers r20-r27 will be used for the first LD, and -registers r32-39 for the second LD. `r4+srcstep*8` will be used -as the elstrided offset for LDs. - -``` - setmvlhi 8, 2 # MVL=8, VFHint=2 -loop: - setvl r1, CTR, vf=1 # VL=r1=MAX(MVL, CTR), VF=1 - mulli r1, r1, 8 # multiply by int width -loopinner: - sv.ld r20.v, r4(0) # load VLhint elements (max 2) - addi r2, r4, 16 # 2 elements ahead - sv.ld r32.v, r2(0) # load VLhint elements (max 2) - sv.add r32.v, r20.v, r32.v # x[i+2] += x[i] - sv.st r32.v, r2(0) # store VLhint elements - svstep. # srcstep += VLhint - bnz loopinner # repeat until srcstep=VL - # now done VL elements, move to next batch - add r4, r4, r1 # move r4 pointer forward - sv.bnz/ctr loop # decrement CTR by VL -``` -- 2.30.2