simple_v_extension/daxpy_example.mdwn

   1 ```
   2     # C code
   3     void daxpy(size_t n, double a, const double x[], double y[])  /* y[i] = a*x[i] + y[i] for i in [0, n), updating y in place */
   4     {
   5      for (size_t i = 0; i < n; i++) {   /* body never runs when n == 0 */
   6        y[i] = a*x[i] + y[i];            /* x is only read; y[i] is read, then overwritten */
   7      }
   8     }
   9
  10
  11     # SVP64 Power ISA version: y[i] = a*x[i] + y[i], up to MAXVL elements per pass
  12     # r5: n
  13     # r6: x
  14     # r7: y
  15     # fp1: a
  16     mtctr 5                 # move n to CTR
  17     addi r10,r7,0           # copy y-ptr into r10 (separate pointer for the store)
  18 .L2:
  19     setvl MAXVL=32,VL=CTR   # VL = CTR capped at MAXVL (MAXVL could be larger)
  20     sv.lfdup/els *32,8(6)   # load from x (r6) into fp32 onwards
  21     sv.lfdup/els *64,8(7)   # load from y (r7) into fp64 onwards
  22     sv.fmadd *64,*32,1,*64  # y = x*a + y (FRT = FRA*FRC + FRB)
  23     sv.stfdup/els *64,8(10) # store result to y via the r10 copy
  24     sv.bc/ctr .L2           # decrement CTR by VL, loop while CTR != 0
  25     blr                     # return
  26
  27     # SV Version
  28     # a0 is n, a1 is ptr to x[0], a2 is ptr to y[0], fa0 is a (scalar)
  29       VBLK.REG[0] = {type: F, isvec: 1, regkey: a3, regidx: a3, elwidth: dflt}   # tag a3 as an FP vector (used for x)
  30       VBLK.REG[1] = {type: F, isvec: 1, regkey: a7, regidx: a7, elwidth: dflt}   # tag a7 as an FP vector (used for y)
  31     loop:
  32       VBLK.SETVL  t0, a0, #4   # MVL=4, vl = t0 = min(a0, MVL)
  33       c.ld     a3, a1          # load x into the vector at a3 (up to 4 registers)
  34       c.slli   t1, t0, 3       # t1 = vl * 8 (in bytes: FP is double)
  35       c.ld     a7, a2          # load y into the vector at a7 (up to 4 registers)
  36       c.add    a1, a1, t1      # increment pointer to x by vl*8
  37       fmadd  a7, a3, fa0, a7   # a7 = a3 * fa0 + a7 (y = a * x + y)
  38       c.sub    a0, a0, t0      # n -= vl (t0)
  39       c.st     a7, a2          # store the vector at a7 back to y
  40       c.add    a2, a2, t1      # increment pointer to y by vl*8
  41       c.bnez   a0, loop        # repeat if n != 0
  42       c.ret                    # return
  43
  44     # RVV version
  45     # a0 is n, a1 is pointer to x[0], a2 is pointer to y[0], fa0 is a
  46       li t0, 2<<25            # configuration value handed to vsetdcfg below
  47       vsetdcfg t0             # enable 2 64-bit floating-point vector registers
  48     loop:
  49       setvl  t0, a0           # vl = t0 = min(mvl, n)
  50       vld    v0, a1           # load vl elements of x into v0
  51       c.slli   t1, t0, 3      # t1 = vl * 8 (in bytes)
  52       vld    v1, a2           # load vl elements of y into v1
  53       c.add    a1, a1, t1     # increment pointer to x by vl*8
  54       vfmadd v1, v0, fa0, v1  # v1 += v0 * fa0 (y = a * x + y)
  55       c.sub    a0, a0, t0     # n -= vl (t0)
  56       vst    v1, a2           # store v1 back to y
  57       c.add    a2, a2, t1     # increment pointer to y by vl*8
  58       c.bnez   a0, loop       # repeat if n != 0
  59       c.ret                   # return
  60 ```