Merge branch 'master' of github.com:ucb-bar/riscv-tests
[riscv-tests.git] / benchmarks / vec-fft / vec-vfft.S
1 .text
2 .align 2
3
4 #include "fft_const.h"
5
/*
 * Element-type configuration for the FFT vector-fetch blocks in this file.
 *
 *   PTR_SHIFT / PTR_SIZE    log2 of, and size in bytes of, one data element
 *   DATA_LOAD / DATA_STORE  load / store mnemonic for one data element
 *   FFT_MUL / FFT_ADD / FFT_SUB
 *                           arithmetic mnemonics for one data element
 *   REG0 .. REG5            data registers: x3 .. x8 for FFT_FIXED,
 *                           f0 .. f5 for FFT_FLOATING
 *
 * FFT_FIXED takes precedence when both FFT_FIXED and FFT_FLOATING are
 * defined.  FFT_FLOATING additionally requires one of FP_HALF, FP_SINGLE
 * or FP_DOUBLE; a missing precision is rejected here instead of surfacing
 * later as unexpanded DATA_LOAD / FFT_* mnemonics in the assembler.
 */
#if defined(FFT_FIXED)

#define PTR_SHIFT 2
#define PTR_SIZE 4

#define DATA_LOAD lw
#define DATA_STORE sw

#define FFT_MUL mul
#define FFT_ADD add
#define FFT_SUB sub

#define REG0 x3
#define REG1 x4
#define REG2 x5
#define REG3 x6
#define REG4 x7
#define REG5 x8

#elif defined(FFT_FLOATING)

#if defined(FP_HALF)
#define PTR_SHIFT 1
#define PTR_SIZE 2

#define DATA_LOAD flh
#define DATA_STORE fsh

#define FFT_MUL fmul.h
#define FFT_ADD fadd.h
#define FFT_SUB fsub.h
#elif defined(FP_SINGLE)
#define PTR_SHIFT 2
#define PTR_SIZE 4

#define DATA_LOAD flw
#define DATA_STORE fsw

#define FFT_MUL fmul.s
#define FFT_ADD fadd.s
#define FFT_SUB fsub.s
#elif defined(FP_DOUBLE)
#define PTR_SHIFT 3
#define PTR_SIZE 8

#define DATA_LOAD fld
#define DATA_STORE fsd

#define FFT_MUL fmul.d
#define FFT_ADD fadd.d
#define FFT_SUB fsub.d
#else
#error FFT_FLOATING requires FP_HALF, FP_SINGLE or FP_DOUBLE to be defined
#endif

#define REG0 f0
#define REG1 f1
#define REG2 f2
#define REG3 f3
#define REG4 f4
#define REG5 f5

#else
#error FFT_FIXED or FFT_FLOATING not defined
#endif
65
.globl vf_test
vf_test:
    # Minimal test block.
    # IN:
    #   x1: base value
    # OUT:
    #   x1: 2 * (i_x1 + utidx)
    #   x2: clobbered, holds the value written by utidx
    utidx   x2                      # x2 <= utidx
    add     x1, x2, x1              # x1 <= i_x1 + utidx
    add     x1, x1, x1              # x1 <= twice that sum

    stop
72
.globl vf_fft_init
vf_fft_init:
    # Derive the two butterfly operand positions and the twiddle factor
    # position from the operation index opid = i_x1 + utidx.
    # IN:
    #   x1: lane start (utidx=0 actually has this pos due to stripmining)
    #   x2: bit mask to select FFT block from op idx
    #   x3: bit mask to select operand in FFT block from op idx
    #   x4: shift amount applied to the twiddle factor pos
    #       NOTE(review): this input was tagged "( REMOVED )" in the
    #       previous header, yet the sll below still reads x4 -- confirm
    #       against the host-side caller.
    #   x5: half the current FFT size (add to get the second op)
    # OUT:
    #   x1: first operand pos   = ((opid & i_x2) << 1) + (opid & i_x3)
    #   x2: second operand pos  = o_x1 + i_x5
    #   x3: twiddle factor pos  = (opid & i_x3) << i_x4
    #   x6: clobbered, holds opid
    utidx   x6
    add     x6, x6, x1              # x6 <= opid

    and     x3, x3, x6              # x3 <= opid & i_x3  (operand in block)
    and     x2, x2, x6              # x2 <= opid & i_x2  (block select)
    slli    x2, x2, 1               # x2 <= (opid & i_x2) << 1

    add     x1, x3, x2              # x1 final: first operand pos
    sll     x3, x3, x4              # x3 final: twiddle factor pos
    add     x2, x5, x1              # x2 final: second operand pos

    stop
96
.globl vf_fft_scale
vf_fft_scale:
    # Multiply the twiddle factor (a + bi) by the second operand (c + di)
    # and convert the element positions into byte offsets.
    # IN:
    #   x1: first operand pos
    #   x2: second operand pos
    #   x3: twiddle factor pos
    #   x4: tf real ptr
    #   x5: tf imag ptr
    #   x6: workspace real ptr
    #   x7: workspace imag ptr
    #   x8: fixed point shift, read through REG5 in the FFT_FIXED build
    #       NOTE(review): this input was tagged "( REMOVED )" in the
    #       previous header, yet the FFT_FIXED shifts below still read
    #       it -- confirm against the host-side caller.
    # OUT:
    #   x1: first operand offset  = i_x1 << PTR_SHIFT
    #   x2: second operand offset = i_x2 << PTR_SHIFT
    #   REG0: scale factor real (x3 if FFT_FIXED, f0 if FFT_FLOATING)
    #   REG1: scale factor imag (x4 if FFT_FIXED, f1 if FFT_FLOATING)
    # x3 .. x7 are overwritten along the way.

    # Positions -> byte offsets from the start of each table
    slli    x3, x3, PTR_SHIFT       # x3 <= tf offset
    slli    x2, x2, PTR_SHIFT       # x2 <= second operand offset (final)
    slli    x1, x1, PTR_SHIFT       # x1 <= first operand offset (final)

    # Byte offsets -> load addresses
    add     x5, x5, x3              # x5 <= address of tf imag
    add     x4, x4, x3              # x4 <= address of tf real
    add     x7, x7, x2              # x7 <= address of op2 imag
    add     x6, x6, x2              # x6 <= address of op2 real

    # Fetch the four inputs
    DATA_LOAD REG1, 0(x4)           # a : tf real
    DATA_LOAD REG2, 0(x5)           # b : tf imag
    DATA_LOAD REG3, 0(x6)           # c : op2 real
    DATA_LOAD REG4, 0(x7)           # d : op2 imag

    # Three-multiply complex product:
    #   real = a(c+d) - (a+b)d = ac - bd
    #   imag = a(c+d) + (b-a)c = ad + bc
    FFT_ADD REG0, REG2, REG1        # REG0 <= a + b
    FFT_SUB REG2, REG2, REG1        # REG2 <= b - a
    FFT_MUL REG2, REG3, REG2        # REG2 <= (b-a)c
    FFT_MUL REG0, REG4, REG0        # REG0 <= (a+b)d
    FFT_ADD REG3, REG4, REG3        # REG3 <= c + d
    FFT_MUL REG4, REG3, REG1        # REG4 <= a(c+d)
#ifdef FFT_FIXED
    # Fixed point only: realign each product (never shift for floating)
    sra     REG0, REG0, REG5
    sra     REG2, REG2, REG5
    sra     REG4, REG4, REG5
#endif

    # Combine into the final scale factor
    FFT_SUB REG0, REG4, REG0        # REG0 <= a(c+d) - (a+b)d  (scale real)
    FFT_ADD REG1, REG2, REG4        # REG1 <= a(c+d) + (b-a)c  (scale imag)

    stop
/*
    Reference only (not assembled): the four multiply version.

    # Four multiply version
    # Do the multiplications (a+bi)(c+di) needs ac ad bc bd
    mul x3, x4, x6 # x3 <= ac
    mul x4, x4, x7 # x4 <= adi
    mul x6, x5, x6 # x6 <= bc
    mul x5, x5, x7 # x5 <= bdi
    sra x3, x3, x8 # These 4 shifts make sure the fixed pt properly aligned
    sra x4, x4, x8
    sra x5, x5, x8
    sra x6, x6, x8

    # Do the additions (ac - bd) and (bc + ad)
    sub x3, x3, x5 # x3 <= ac - bd (proper result)
    add x4, x4, x6 # x4 <= bc + ad (proper result)
*/
168
.globl vf_fft_exec
vf_fft_exec:
    # Butterfly step: res1 = op1 + scale, res2 = op1 - scale.
    # Data values live in REG0 .. REG5, which are x3 .. x8 in the
    # FFT_FIXED build and f0 .. f5 in the FFT_FLOATING build.
    # IN:
    #   x1: first operand offset
    #   x2: second operand offset
    #   REG0: scale factor real
    #   REG1: scale factor imag
    #   x5: workspace real ptr
    #   x6: workspace imag ptr
    # OUT:
    #   x1: first operand offset (carry)
    #   x2: second operand offset (carry)
    #   REG2: first result real
    #   REG3: first result imag
    #   REG4: second result real
    #   REG5: second result imag
    # x5 and x6 are overwritten with the op1 load addresses.

    # Real part of op1: locate, then load
    add     x5, x5, x1              # x5 <= address of op1 real
    DATA_LOAD REG2, 0(x5)           # REG2 <= op1 real

    # Imaginary part of op1: locate, then load
    add     x6, x6, x1              # x6 <= address of op1 imag
    DATA_LOAD REG3, 0(x6)           # REG3 <= op1 imag

    # Real results (the difference first, since the sum overwrites op1)
    FFT_SUB REG4, REG2, REG0        # res2 real = op1 real - scale real
    FFT_ADD REG2, REG0, REG2        # res1 real = op1 real + scale real

    # Imaginary results, same ordering
    FFT_SUB REG5, REG3, REG1        # res2 imag = op1 imag - scale imag
    FFT_ADD REG3, REG1, REG3        # res1 imag = op1 imag + scale imag

    stop
200
.globl vf_fft_store1
vf_fft_store1:
    # Write the first butterfly result back to the workspace.
    # Data values live in REG2 .. REG5, which are x5 .. x8 in the
    # FFT_FIXED build and f2 .. f5 in the FFT_FLOATING build.
    # IN:
    #   x1: first operand offset
    #   x2: second operand offset
    #   x3: workspace real ptr
    #   x4: workspace imag ptr
    #   REG2: first result real
    #   REG3: first result imag
    #   REG4: second result real
    #   REG5: second result imag
    # OUT:
    #   x2: second operand offset (carry)
    #   REG4: second result real (carry)
    #   REG5: second result imag (carry)
    # x3 and x4 are overwritten with the store addresses.

    # Real part: locate the slot, then write it
    add     x3, x1, x3              # x3 <= address of result 1 real
    DATA_STORE REG2, 0(x3)

    # Imaginary part: locate the slot, then write it
    add     x4, x1, x4              # x4 <= address of result 1 imag
    DATA_STORE REG3, 0(x4)

    stop
225
.globl vf_fft_store2
vf_fft_store2:
    # Write the second butterfly result back to the workspace.
    # Data values live in REG4 / REG5, which are x7 / x8 in the
    # FFT_FIXED build and f4 / f5 in the FFT_FLOATING build.
    # IN:
    #   x2: second operand offset
    #   x3: workspace real ptr
    #   x4: workspace imag ptr
    #   REG4: second result real
    #   REG5: second result imag
    # OUT: (none)
    # x3 and x4 are overwritten with the store addresses.

    # Real part: locate the slot, then write it
    add     x3, x2, x3              # x3 <= address of result 2 real
    DATA_STORE REG4, 0(x3)

    # Imaginary part: locate the slot, then write it
    add     x4, x2, x4              # x4 <= address of result 2 imag
    DATA_STORE REG5, 0(x4)

    stop