Add a top-level make clean target.
[riscv-tests.git] / benchmarks / vec-fft / vec-vfft.S
1 # See LICENSE for license details.
2
3 .text
4 .align 2
5
6 #include "fft_const.h"
7
8 #if defined(FFT_FIXED) /* fixed point: 32-bit words held in integer registers */
9 #define PTR_SHIFT 2 /* log2(PTR_SIZE): turns an element position into a byte offset */
10 #define PTR_SIZE 4 /* bytes per element */
11
12 #define DATA_LOAD lw
13 #define DATA_STORE sw
14
15 #define FFT_MUL mul
16 #define FFT_ADD add
17 #define FFT_SUB sub
18
19 #define REG0 x3
20 #define REG1 x4
21 #define REG2 x5
22 #define REG3 x6
23 #define REG4 x7
24 #define REG5 x8
25 #elif defined(FFT_FLOATING) /* floating point: element width chosen by FP_HALF, FP_SINGLE or FP_DOUBLE */
26 #if defined(FP_HALF)
27 #define PTR_SHIFT 1
28 #define PTR_SIZE 2
29
30 #define DATA_LOAD flh
31 #define DATA_STORE fsh
32
33 #define FFT_MUL fmul.h
34 #define FFT_ADD fadd.h
35 #define FFT_SUB fsub.h
36 #elif defined(FP_SINGLE)
37 #define PTR_SHIFT 2
38 #define PTR_SIZE 4
39
40 #define DATA_LOAD flw
41 #define DATA_STORE fsw
42
43 #define FFT_MUL fmul.s
44 #define FFT_ADD fadd.s
45 #define FFT_SUB fsub.s
46 #elif defined(FP_DOUBLE)
47 #define PTR_SHIFT 3
48 #define PTR_SIZE 8
49
50 #define DATA_LOAD fld
51 #define DATA_STORE fsd
52
53 #define FFT_MUL fmul.d
54 #define FFT_ADD fadd.d
55 #define FFT_SUB fsub.d
56 #else /* no precision selected: fail here instead of leaving the macros above undefined */
57 #error FP_HALF, FP_SINGLE or FP_DOUBLE not defined
58 #endif
59 #define REG0 f0
60 #define REG1 f1
61 #define REG2 f2
62 #define REG3 f3
63 #define REG4 f4
64 #define REG5 f5
65 #else
66 #error FFT_FIXED or FFT_FLOATING not defined
67 #endif
68 .globl vf_test
69 vf_test: # x1 <= 2 * (x1 + utidx); x2 is left holding utidx
70 utidx x2 # x2 <= utidx
71 add x1, x1, x2 # x1 <= i_x1 + utidx
72 slli x1, x1, 1 # x1 <= x1 << 1, the same value as x1 + x1
73 stop
74
75 .globl vf_fft_init
76 vf_fft_init: # turn each element index into operand and twiddle factor positions
77 # IN:
78 # x1: op index belonging to utidx=0 of this strip (start position after stripmining)
79 # x2: bit mask selecting the FFT block bits of the op index
80 # x3: bit mask selecting the operand-within-block bits of the op index
81 # x4: left shift applied to the twiddle factor pos (once marked REMOVED, but the sll below still reads it)
82 # x5: half the current FFT size (distance from the first to the second operand)
83 # OUT (x6 is scratch and is left holding opid = i_x1 + utidx):
84 # x1: first operand pos = ((opid & i_x2) << 1) + (opid & i_x3)
85 # x2: second operand pos = o_x1 + i_x5
86 # x3: twiddle factor pos = (opid & i_x3) << i_x4
87 utidx x6 # x6 <= utidx
88 add x6, x1, x6 # x6 <= opid
89 and x3, x3, x6 # x3 <= opid & i_x3 (operand bits)
90 and x2, x2, x6 # x2 <= opid & i_x2 (block bits)
91 slli x2, x2, 1 # x2 <= (opid & i_x2) << 1
92
93 add x1, x2, x3 # o_x1; reads x3 before it gets shifted
94 sll x3, x3, x4 # o_x3
95 add x2, x1, x5 # o_x2
96
97 stop
98
99 .globl vf_fft_scale
100 vf_fft_scale: # scale = twiddle factor * second operand (complex product)
101 # IN:
102 # x1: first operand pos (reused)
103 # x2: second operand pos (reused)
104 # x3: twiddle factor pos (reused)
105 # x4: tf real ptr
106 # x5: tf imag ptr
107 # x6: workspace real ptr
108 # x7: workspace imag ptr
109 # x8: fixed point shift; read as REG5, and only when FFT_FIXED is defined
110 # OUT (REG0..REG5 are x3..x8 under FFT_FIXED and f0..f5 under FFT_FLOATING):
111 # x1: first operand byte offset = i_x1 << PTR_SHIFT
112 # x2: second operand byte offset = i_x2 << PTR_SHIFT
113 # REG0: scale factor real
114 # REG1: scale factor imag
115 # Positions become byte offsets from the start of each table
116 slli x3, x3, PTR_SHIFT # x3 <= i_x3 << PTR_SHIFT (tf offset)
117 slli x2, x2, PTR_SHIFT # x2 <= i_x2 << PTR_SHIFT (proper result)
118 slli x1, x1, PTR_SHIFT # x1 <= i_x1 << PTR_SHIFT (proper result, not read again here)
119
120 # Twiddle factor: form both addresses, then load
121 add x4, x4, x3 # x4 <= load address for tf real
122 add x5, x5, x3 # x5 <= load address for tf imag
123 DATA_LOAD REG1, 0(x4) # REG1 <= a (tf real)
124 DATA_LOAD REG2, 0(x5) # REG2 <= b (tf imag)
125
126 # Second operand: form both addresses, then load
127 add x6, x6, x2 # x6 <= load address for op2 real
128 add x7, x7, x2 # x7 <= load address for op2 imag
129 DATA_LOAD REG3, 0(x6) # REG3 <= c (op2 real)
130 DATA_LOAD REG4, 0(x7) # REG4 <= d (op2 imag)
131
132 # (a+bi)(c+di) using 3 multiplies, one product at a time
133 FFT_ADD REG0, REG1, REG2 # REG0 <= a + b (before REG2 is overwritten)
134 FFT_MUL REG0, REG0, REG4 # REG0 <= (a+b)d
135 #ifdef FFT_FIXED
136 sra REG0, REG0, REG5 # fixed point only: shift the product back
137 #endif
138 FFT_SUB REG2, REG2, REG1 # REG2 <= b - a
139 FFT_MUL REG2, REG2, REG3 # REG2 <= (b-a)c
140 #ifdef FFT_FIXED
141 sra REG2, REG2, REG5 # fixed point only: shift the product back
142 #endif
143 FFT_ADD REG3, REG3, REG4 # REG3 <= c + d
144 FFT_MUL REG4, REG1, REG3 # REG4 <= a(c+d)
145 #ifdef FFT_FIXED
146 sra REG4, REG4, REG5 # fixed point only: shift the product back
147 #endif
148
149 # Combine: real = ac - bd, imag = ad + bc
150 FFT_ADD REG1, REG4, REG2 # REG1 <= a(c+d) + (b-a)c (scale imag)
151 FFT_SUB REG0, REG4, REG0 # REG0 <= a(c+d) - (a+b)d (scale real)
152
153 stop
154 /*
155 # Four multiply version
156 # Do the multiplications (a+bi)(c+di) needs ac ad bc bd
157 mul x3, x4, x6 # x3 <= ac
158 mul x4, x4, x7 # x4 <= adi
159 mul x6, x5, x6 # x6 <= bc
160 mul x5, x5, x7 # x5 <= bdi
161 sra x3, x3, x8 # These 4 shifts make sure the fixed pt properly aligned
162 sra x4, x4, x8
163 sra x5, x5, x8
164 sra x6, x6, x8
165
166 # Do the additions (ac - bd) and (bc + ad)
167 sub x3, x3, x5 # x3 <= ac - bd (proper result)
168 add x4, x4, x6 # x4 <= bc + ad (proper result)
169 */
170
171 .globl vf_fft_exec
172 vf_fft_exec: # res1 = op1 + scale, res2 = op1 - scale
173 # IN:
174 # x1: first operand byte offset (reused)
175 # x2: second operand byte offset (reused, untouched here)
176 # REG0: scale factor real (reused; x3 or f0)
177 # REG1: scale factor imag (reused; x4 or f1)
178 # x5: workspace real ptr
179 # x6: workspace imag ptr
180 # OUT (REG2..REG5 are x5..x8 under FFT_FIXED and f2..f5 under FFT_FLOATING):
181 # x1: first operand byte offset (carry)
182 # x2: second operand byte offset (carry)
183 # REG2: first result real
184 # REG3: first result imag
185 # REG4: second result real
186 # REG5: second result imag
187 # First operand, real part: address, then load
188 add x5, x5, x1 # x5 <= load address for op1 real
189 DATA_LOAD REG2, 0(x5) # REG2 <= op1 real
190
191 # First operand, imag part: address, then load
192 add x6, x6, x1 # x6 <= load address for op1 imag
193 DATA_LOAD REG3, 0(x6) # REG3 <= op1 imag
194
195 # Each difference is taken before the matching sum overwrites op1
196 FFT_SUB REG4, REG2, REG0 # res2 real = op1 real - scale real
197 FFT_ADD REG2, REG2, REG0 # res1 real = op1 real + scale real
198 FFT_SUB REG5, REG3, REG1 # res2 imag = op1 imag - scale imag
199 FFT_ADD REG3, REG3, REG1 # res1 imag = op1 imag + scale imag
200
201 stop
202
203 .globl vf_fft_store1
204 vf_fft_store1: # write the first result into the workspace
205 # IN:
206 # x1: first operand byte offset (reused)
207 # x2: second operand byte offset (reused, untouched here)
208 # x3: workspace real ptr
209 # x4: workspace imag ptr
210 # REG2: first result real (reused; x5 or f2)
211 # REG3: first result imag (reused; x6 or f3)
212 # REG4: second result real (reused, untouched here)
213 # REG5: second result imag (reused, untouched here)
214 # OUT:
215 # x2: second operand byte offset (carry)
216 # REG4: second result real (carry)
217 # REG5: second result imag (carry)
218 # Real part: address, then store
219 add x3, x3, x1 # x3 <= store address for res1 real
220 DATA_STORE REG2, 0(x3)
221
222 # Imag part: address, then store
223 add x4, x4, x1 # x4 <= store address for res1 imag
224 DATA_STORE REG3, 0(x4)
225
226 stop
227
228 .globl vf_fft_store2
229 vf_fft_store2: # write the second result into the workspace
230 # IN:
231 # x2: second operand byte offset
232 # x3: workspace real ptr
233 # x4: workspace imag ptr
234 # REG4: second result real (x7 or f4)
235 # REG5: second result imag (x8 or f5)
236 # OUT: (none)
237 # Real part: address, then store
238 add x3, x3, x2 # x3 <= store address for res2 real
239 DATA_STORE REG4, 0(x3)
240
241 # Imag part: address, then store
242 add x4, x4, x2 # x4 <= store address for res2 imag
243 DATA_STORE REG5, 0(x4)
244
245 stop