dff057efea6c2cfc8d759a11163bdb7408a1e718
[riscv-tests.git] / benchmarks / vec-cmplxmult / vec_cmplxmult_asm.S
1 # See LICENSE for license details.
2
3 #*****************************************************************************
4 # cmplxmult function (assembly version)
5 #-----------------------------------------------------------------------------
6
7
8 #--------------------------------------------------------------------------
9 # Headers and Defines
10 #--------------------------------------------------------------------------
11
12 # Here are some defines that make writing assembly code easier.
13
14 # Per the calling convention the arguments arrive in a0-a3 (n in a0, a in a1,
15 # b in a2, c in a3); the aliases below give those registers readable names.
16
17
18 #define rN a0
19 #define rA a1
20 #define rB a2
21 #define rC a3
22
23 #define rVlen a6
24 #define rStride a7
25
26 #define rAI t0
27 #define rBI t1
28 #define rCI t2
29
30 # WARNING: the s0-s11 and fs0-fs11 registers are callee-saved in the standard
31 # RISC-V calling convention; save them to the stack before writing to them.
32
33 #--------------------------------------------------------------------------
34 # void scalar_cmplxmult_asm( int n, float a[], float b[], float c[] )
35 #--------------------------------------------------------------------------
36
37 .text
38 .align 2
39 .globl scalar_cmplxmult_asm
40 .type scalar_cmplxmult_asm,@function
41
42 scalar_cmplxmult_asm:
43
44 # ***** Scalar Example *****
45
46 blez rN, done # exit early if n < 0
47
48 loop:
49 # The following code is a naive implementation...
50 # Re-ordering instructions may increase performance, also,
51 # RISC-V supports instrucitons such as the "fmuladd" and "fmulsub".
52 # fmsub.s fa2,fa4,fa3,ft1
53 # Finally, unrolling and other fun transformations can also provide
54 # performance gains.
55
56 flw f2, 0(rA)
57 flw f3, 4(rA)
58 flw f4, 0(rB)
59 flw f5, 4(rB)
60 fmul.s f6, f2, f4
61 fmul.s f7, f3, f5
62 fmul.s f8, f3, f4
63 fmul.s f9, f2, f5
64 fsub.s f10, f6, f7
65 fadd.s f11, f8, f9
66 fsw f10, 0(rC)
67 fsw f11, 4(rC)
68 addi rN, rN, -1
69 addi rA, rA, 8
70 addi rB, rB, 8
71 addi rC, rC, 8
72 bne rN, zero, loop
73 done:
74 ret
75
76
77 #--------------------------------------------------------------------------
78 # void vt_cmplxmult_asm( int n, float a[], float b[], float c[] )
79 #--------------------------------------------------------------------------
80
81
82 # ***** Vector-Thread Example *****
83
84 .globl vt_cmplxmult_asm
85 .type vt_cmplxmult_asm,@function
86
87 # HINT: because you are dealing with an array of structures, a regular,
88 # vanilla vector-load/vector-store won't work here!
89
90 vt_cmplxmult_asm:
91
92 blez rN, cpdone
93 la a4, vtcode
94 li rStride, 8
95
96 vvcfgivl rVlen, rN, 1, 7
97
98 stripmineloop:
99
100 # ADD YOUR CODE HERE....
101 vsetvl rVlen, rN # set the vector length
102 # rN is the desired (application) vector length
103 # rVLen is what vector length we were given
104
105 vflstw vf2, rA, rStride # real number vector load of A
106 addi rAI, rA, 4
107 vflstw vf4, rB, rStride # real number vector load of B
108 addi rBI, rB, 4
109 vflstw vf3, rAI, rStride #imaginary number vector load of A
110 vflstw vf5, rBI, rStride #imaginary vector number load of B
111
112 vf 0(a4) # jump to vector-fetch code
113
114 vfsstw vf0, rC, rStride # real number vector store C
115 addi rCI, rC, 4
116 vfsstw vf1, rCI, rStride # imaginary
117
118 slli a5, rVlen, 3
119 sub rN, rN, rVlen # book keeping
120 add rA, rA, a5
121 add rB, rB, a5
122 add rC, rC, a5
123 bne rN, zero, stripmineloop
124 # Step 0: set the vector length
125 # Step 1: perform your vector loads
126 # Step 2: jump to the vector-fetch code to perform the calculation
127 # Step 3: perform the vector store
128 # Step 4: book keeping, update the pointers, etc.
129
130 cpdone:
131 fence.v.l
132 ret
133
# Vector-fetch block executed by "vf 0(a4)" in vt_cmplxmult_asm.
#
# In:  f2 = a.re, f3 = a.im, f4 = b.re, f5 = b.im
# Out: f0 = c.re = a.re*b.re - a.im*b.im
#      f1 = c.im = a.re*b.im + a.im*b.re
#
# fmsub.s rd, rs1, rs2, rs3 computes rs1*rs2 - rs3, so the a.im*b.im product
# has to be the subtracted operand (rs3); the other way round the real part
# comes out negated.
#
# The block ends with "stop". It is not entered by a scalar call, so no
# "ret" is needed here.

vtcode:
        fmul.s  f0, f3, f5              # f0 = a.im * b.im
        fmsub.s f0, f2, f4, f0          # f0 = a.re * b.re - f0

        fmul.s  f1, f2, f5              # f1 = a.re * b.im
        fmadd.s f1, f3, f4, f1          # f1 = a.im * b.re + f1
        stop
148