benchmarks/vec-cmplxmult/vec_cmplxmult_asm.S

   1 # See LICENSE for license details.
   2
   3 #*****************************************************************************
   4 # cmplxmult function (assembly version)
   5 #-----------------------------------------------------------------------------
   6
   7
   8 #--------------------------------------------------------------------------
   9 # Headers and Defines
  10 #--------------------------------------------------------------------------
  11
  12 # Here are some defines that make writing assembly code easier.
  13
  14 # I'm using the knowledge that rN will be placed in register a0, rA will be
  15 # placed into register a1, etc., based on the calling convention for functions.
  16
  17
  18 #define rN      a0
  19 #define rA      a1
  20 #define rB      a2
  21 #define rC      a3
  22 # In vt_cmplxmult_asm: rVlen is written by vsetvl, rStride is loaded with 8.
  23 #define rVlen   a6
  24 #define rStride a7
  25 # In vt_cmplxmult_asm: rAI, rBI, rCI are set to rA+4, rB+4, rC+4.
  26 #define rAI t0
  27 #define rBI t1
  28 #define rCI t2
  29
  30 # WARNING: do not write to the s0,...,s9 registers without first saving them to
  31 # the stack.
  32
  33 #--------------------------------------------------------------------------
  34 # void scalar_cmplxmult_asm( int n, float a[], float b[], float c[] )
  35 #--------------------------------------------------------------------------
  36
  37         .text
  38         .align 2
  39         .globl scalar_cmplxmult_asm
  40         .type  scalar_cmplxmult_asm,@function
  41
  42 scalar_cmplxmult_asm:
  43         # c[k] = a[k] * b[k] for n interleaved (real, imaginary) float pairs.
  44         # *****   Scalar Example   *****
  45         # Only the FP temporaries ft0-ft5 are written (raw f8/f9 are fs0/fs1).
  46         blez rN, done    # exit early if n <= 0
  47
  48 loop:
  49       # The following code is a naive implementation...
  50       # Re-ordering instructions may increase performance, also,
  51       # RISC-V supports fused instructions such as "fmadd" and "fmsub".
  52       # fmsub.s fa2,fa4,fa3,ft1
  53       # Finally, unrolling and other fun transformations can also provide
  54       # performance gains.
  55
  56         flw  ft0, 0(rA)            # ft0 = Re(a)
  57         flw  ft1, 4(rA)            # ft1 = Im(a)
  58         flw  ft2, 0(rB)            # ft2 = Re(b)
  59         flw  ft3, 4(rB)            # ft3 = Im(b)
  60         fmul.s ft4, ft0, ft2       # Re(a)*Re(b)
  61         fmul.s ft5, ft1, ft3       # Im(a)*Im(b)
  62         fsub.s ft4, ft4, ft5       # Re(c) = Re(a)*Re(b) - Im(a)*Im(b)
  63         fmul.s ft5, ft1, ft2       # Im(a)*Re(b)
  64         fmul.s ft0, ft0, ft3       # Re(a)*Im(b); Re(a) is not needed afterwards
  65         fadd.s ft5, ft5, ft0       # Im(c) = Im(a)*Re(b) + Re(a)*Im(b)
  66         fsw  ft4, 0(rC)
  67         fsw  ft5, 4(rC)
  68         addi rN, rN, -1            # one complex element done
  69         addi rA, rA, 8             # advance each pointer by one (re, im) pair
  70         addi rB, rB, 8
  71         addi rC, rC, 8
  72         bne  rN, zero, loop
  73 done:
  74         ret
  75
  76
  77 #--------------------------------------------------------------------------
  78 # void vt_cmplxmult_asm( int n, float a[], float b[], float c[] )
  79 #--------------------------------------------------------------------------
  80
  81
  82         # ***** Vector-Thread Example *****
  83
  84         .globl vt_cmplxmult_asm
  85         .type  vt_cmplxmult_asm,@function
  86
  87         # HINT: because you are dealing with an array of structures, a regular,
  88         # vanilla vector-load/vector-store will not work here!  Strided accesses
  89         # with an 8-byte stride split the pairs into real and imaginary vectors.
  90 vt_cmplxmult_asm:
  91
  92         blez rN, cpdone            # nothing to compute when n <= 0
  93         la a4, vtcode              # a4 = address of the vector-fetch block
  94         li rStride, 8              # bytes between consecutive (re, im) pairs
  95
  96         vvcfgivl rVlen, rN, 1, 7   # configure the vector unit; TODO confirm the 1, 7 register counts
  97
  98 stripmineloop:
  99
 100          # Step 0: set the vector length
 101         vsetvl rVlen, rN   # set the vector length
 102                            # rN is the desired (application) vector length
 103                            # rVlen is what vector length we were given
 104          # Step 1: perform your vector loads
 105         vflstw vf2, rA, rStride       # real number vector load of A
 106         addi rAI, rA, 4
 107         vflstw vf4, rB, rStride  # real number vector load of B
 108         addi rBI, rB, 4
 109         vflstw vf3, rAI, rStride # imaginary number vector load of A
 110         vflstw vf5, rBI, rStride # imaginary number vector load of B
 111          # Step 2: jump to the vector-fetch code to perform the calculation
 112         vf 0(a4)           # jump to vector-fetch code
 113          # Step 3: perform the vector store
 114         vfsstw vf0, rC, rStride       # real number vector store C
 115         addi rCI, rC, 4
 116         vfsstw vf1, rCI, rStride # imaginary number vector store C
 117          # Step 4: book keeping, update the pointers, etc.
 118         slli a5, rVlen, 3  # a5 = rVlen * 8 = bytes covered by this strip
 119         sub rN, rN, rVlen  # elements still to process
 120         add rA, rA, a5
 121         add rB, rB, a5
 122         add rC, rC, a5
 123         bne rN, zero, stripmineloop
 124
 125 cpdone:
 126         fence.v.l          # NOTE(review): presumably orders the vector stores before returning; confirm
 127         ret
 128
 129         # Vector-fetch block, fetched by the vf instruction in the strip-mine loop.
 130         # Inputs:  f2 = Re(a), f3 = Im(a), f4 = Re(b), f5 = Im(b)
 131         # Outputs: f0 = Re(c) = Re(a)*Re(b) - Im(a)*Im(b)
 132         #          f1 = Im(c) = Re(a)*Im(b) + Im(a)*Re(b)
 133         # fmsub.s rd, rs1, rs2, rs3 computes rs1*rs2 - rs3, so the Im*Im product
 134         # has to be the rs3 operand; the opposite order negates the real part.
 135 vtcode:
 136         fmul.s f0, f3, f5              # f0 = Im(a)*Im(b)
 137         fmsub.s f0, f2, f4, f0         # f0 = Re(a)*Re(b) - Im(a)*Im(b)
 138
 139         fmul.s f1, f2, f5              # f1 = Re(a)*Im(b)
 140         fmadd.s f1, f3, f4, f1         # f1 = Im(a)*Re(b) + Re(a)*Im(b)
 141         stop
 142
 143         # NOTE(review): the ret below follows stop and is presumably never
 144         # reached.  The C code uses a jalr instruction to call this function;
 145         # the return to that caller is the ret under cpdone.
 146
 147         ret
 148