Clean up benchmarks; support uarch-specific counters
[riscv-tests.git] / benchmarks / vec-cmplxmult / vec_cmplxmult_asm.S
1 #*****************************************************************************
2 # cmplxmult function (assembly version)
3 #-----------------------------------------------------------------------------
4
5
6 #--------------------------------------------------------------------------
7 # Headers and Defines
8 #--------------------------------------------------------------------------
9
10 # Here are some defines that make writing assembly code easier.
11
12 # I'm using the knowledge that rN will be placed in register a0, rA will be
13 # placed into register a1, etc., based on the calling convention for functions.
14
15
/* Incoming arguments, in calling-convention order. */
#define rN       a0     /* n: number of complex elements still to process */
#define rA       a1     /* a[]: first input, interleaved (real, imag) floats */
#define rB       a2     /* b[]: second input, same layout */
#define rC       a3     /* c[]: output, same layout */

/* Vector-unit bookkeeping. */
#define rVlen    a6     /* vector length granted for the current strip */
#define rStride  a7     /* byte stride used by the strided vector loads/stores */

/* Pointers to the imaginary component (base pointer + 4 bytes). */
#define rAI      t0
#define rBI      t1
#define rCI      t2
27
28 # WARNING: do not write to the s0,...,s9 registers without first saving them to
29 # the stack.
30
31 #--------------------------------------------------------------------------
32 # void scalar_cmplxmult_asm( int n, float a[], float b[], float c[] )
33 #--------------------------------------------------------------------------
34
35 .text
36 .align 2
37 .globl scalar_cmplxmult_asm
38 .type scalar_cmplxmult_asm,@function
39
40 scalar_cmplxmult_asm:
41
42 # ***** Scalar Example *****
43
44 blez rN, done # exit early if n < 0
45
46 loop:
47 # The following code is a naive implementation...
48 # Re-ordering instructions may increase performance, also,
49 # RISC-V supports instrucitons such as the "fmuladd" and "fmulsub".
50 # fmsub.s fa2,fa4,fa3,ft1
51 # Finally, unrolling and other fun transformations can also provide
52 # performance gains.
53
54 flw f2, 0(rA)
55 flw f3, 4(rA)
56 flw f4, 0(rB)
57 flw f5, 4(rB)
58 fmul.s f6, f2, f4
59 fmul.s f7, f3, f5
60 fmul.s f8, f3, f4
61 fmul.s f9, f2, f5
62 fsub.s f10, f6, f7
63 fadd.s f11, f8, f9
64 fsw f10, 0(rC)
65 fsw f11, 4(rC)
66 addi rN, rN, -1
67 addi rA, rA, 8
68 addi rB, rB, 8
69 addi rC, rC, 8
70 bne rN, zero, loop
71 done:
72 ret
73
74
75 #--------------------------------------------------------------------------
76 # void vt_cmplxmult_asm( int n, float a[], float b[], float c[] )
77 #--------------------------------------------------------------------------
78
79
80 # ***** Vector-Thread Example *****
81
82 .globl vt_cmplxmult_asm
83 .type vt_cmplxmult_asm,@function
84
85 # HINT: because you are dealing with an array of structures, a regular,
86 # vanilla vector-load/vector-store won't work here!
87
88 vt_cmplxmult_asm:
89
90 blez rN, cpdone
91 la a4, vtcode
92 li rStride, 8
93
94 vvcfgivl rVlen, rN, 1, 7
95
96 stripmineloop:
97
98 # ADD YOUR CODE HERE....
99 vsetvl rVlen, rN # set the vector length
100 # rN is the desired (application) vector length
101 # rVLen is what vector length we were given
102
103 vflstw vf2, rA, rStride # real number vector load of A
104 addi rAI, rA, 4
105 vflstw vf4, rB, rStride # real number vector load of B
106 addi rBI, rB, 4
107 vflstw vf3, rAI, rStride #imaginary number vector load of A
108 vflstw vf5, rBI, rStride #imaginary vector number load of B
109
110 vf 0(a4) # jump to vector-fetch code
111
112 vfsstw vf0, rC, rStride # real number vector store C
113 addi rCI, rC, 4
114 vfsstw vf1, rCI, rStride # imaginary
115
116 slli a5, rVlen, 3
117 sub rN, rN, rVlen # book keeping
118 add rA, rA, a5
119 add rB, rB, a5
120 add rC, rC, a5
121 bne rN, zero, stripmineloop
122 # Step 0: set the vector length
123 # Step 1: perform your vector loads
124 # Step 2: jump to the vector-fetch code to perform the calculation
125 # Step 3: perform the vector store
126 # Step 4: book keeping, update the pointers, etc.
127
128 cpdone:
129 fence.v.l
130 ret
131
vtcode:
        # Vector-element code, started by the "vf 0(a4)" in
        # vt_cmplxmult_asm.  Register use, as set up by that function:
        #
        #   f2 = ar   f3 = ai   f4 = br   f5 = bi      (inputs)
        #   f0 = real part of the product              (output)
        #   f1 = imaginary part of the product         (output)
        #
        # fmsub.s rd, rs1, rs2, rs3 computes rs1*rs2 - rs3, so the
        # product that is SUBTRACTED (ai*bi) has to be formed first.
        # This matches the sign used by scalar_cmplxmult_asm.

        fmul.s  f0, f3, f5              # f0 = ai*bi
        fmsub.s f0, f2, f4, f0          # f0 = ar*br - ai*bi

        fmul.s  f1, f2, f5              # f1 = ar*bi
        fmadd.s f1, f3, f4, f1          # f1 = ai*br + ar*bi

        stop                            # end of the vector-fetch block
146