# c code
```
- void daxpy(size_t n, double a, const double x[], double y[])
- {
- for (size_t i = 0; i < n; i++) {
- y[i] = a*x[i] + y[i];
- }
+ void daxpy(size_t n, double a, const double x[], double y[]) {
+ for (size_t i = 0; i < n; i++)
+ y[i] = a*x[i] + y[i];
}
```
+Summary
+
+| ISA | total | loop | words | notes |
+|-----|-------|------|-------|-------|
+| SVP64 | 9 | 7 | 14 | 5 64-bit, 4 32-bit |
+| RVV | 13 | 11 | 9.5 | 7 32-bit, 5 16-bit |
+| SVE | 12 | 7 | 12 | all 32-bit |
+
# SVP64 Power ISA version
-```
+This version relies on post-increment addressing, on x and y not
+overlapping in memory, and critically on y being overwritten in
+place. x is post-incremented when read, while y is post-incremented
+on write. Element-Strided mode ensures that the Immediate (8)
+produces a contiguous LD (or ST) even though RA is marked Scalar
+(*without* modifying RA, in the `sv.lfd/els` case). For `sv.lfdup`,
+RA is Scalar so that only one LD/ST-with-Update "wins": the last
+write to RA becomes the base address for the next block.
- # r5: n count
- # r6: x ptr
- # r7: y ptr
- # fp1: a mul-scalar
- mtctr 5 # move n to CTR
- addi r10,r6,0 # copy y-ptr into r10
-.L2
- setvl MAXVL=32,VL=CTR # could do more
- sv.lfdup/els *32,8(6) # load from x into fp32-63
- sv.lfdup/els *64,8(7) # load from y into fp64-95
- sv.fmadd *64,*64,1,*32 # (*y) = (*y) * (*x) + fp1
- stfdup/els *64,8(10) # store y-copy
- sv.bc/ctr .L2 # decrement VL by CTR
- blr # return
+```
+ # r5: n count; r6: x ptr; r7: y ptr; fp1: a
+ 1 addi r3,r7,0 # return result
+ 2 mtctr 5 # move n to CTR
+ 3 .L2
+ 4 setvl MAXVL=32,VL=CTR # actually VL=MIN(MAXVL,CTR)
+ 5 sv.lfdup/els *32,8(6) # load x into fp32-63, incr x
+ 6 sv.lfd/els *64,8(7) # load y into fp64-95, NO INC
+ 7 sv.fmadd *64,*32,1,*64 # (*y) = a*(*x) + (*y)
+ 8 sv.stfdup/els *64,8(7) # store at y, incr y
+ 9 sv.bc/ctr .L2 # decr CTR by VL, jump !zero
+ 10 blr # return
```
# RVV version
+
```
# a0 is n, a1 is pointer to x[0], a2 is pointer to y[0], fa0 is a
li t0, 2<<25