Add a top-level make clean target.
[riscv-tests.git] / benchmarks / vec-fft / vec-fft.c
1 // See LICENSE for license details.
2
3 // *************************************************************************
4 // FFT driver function (C with Hwacha inline assembly)
5 // -------------------------------------------------------------------------
6 #include "vec-fft.h"
7 #include "fft_const.h"
8
// Returns floor(log2(in)) for in > 0, i.e. the index of the highest set bit.
// Returns -1 when in is zero or negative (the loop body never runs).
int log2down(int in)
{
  int highest_bit;

  // Each right shift that still finds a positive value accounts for one more bit.
  for (highest_bit = -1; in > 0; in >>= 1) {
    highest_bit++;
  }

  return highest_bit;
}
15
16 void fft(fftval_t workspace_real[], fftval_t workspace_imag[],
17 const fftval_t tf_real[], const fftval_t tf_imag[]) //size is FFT_SIZE
18 {
19 const int num_stage_ops = FFT_SIZE >> 1;
20 const int logfftsize = log2down(FFT_SIZE);
21
22 int given_vl;
23 // First, setup hwacha to what we need:
24 #if defined(FFT_FIXED)
25 // num_stage_ops VL, 9 x-reg (1 zero, 2 ctrl, 4 data, 2 scratch), 1 fpu (avert bug)
26 asm volatile ("vsetcfg 9, 1");
27 #elif defined(FFT_FLOATING)
28 asm volatile ("vsetcfg 8, 6");
29 #if defined(FP_SINGLE)
30 // asm volatile ("vsetprec 32");
31 #elif defined(FP_HALF)
32 // asm volatile ("vsetprec 16");
33 #elif defined(FP_DOUBLE)
34 #else
35 #error wat
36 #endif
37 #endif
38 asm volatile ("vsetvl %[gvl], %[nvl]" : [gvl]"=r"(given_vl) : [nvl]"r"(num_stage_ops));
39 asm volatile ("fence"); // Make sure prefilling of workspace is complete
40
41 for(int stage = 0; stage < logfftsize; stage++)
42 {
43 const int half_cur_fft_size = (1 << stage);
44 const int sel_block_op = half_cur_fft_size-1;
45 const int sel_block = ~sel_block_op;
46 const int tf_scale = logfftsize - stage - 1;
47
48 // Stripmining loop
49 for(int lane_start = 0; lane_start < num_stage_ops; lane_start += given_vl)
50 {
51 // Setup new vector length for this stripmining pass
52 const int needed_vl = num_stage_ops - lane_start;
53 asm volatile ("vsetvl %[gvl], %[nvl]" : [gvl]"=r"(given_vl) : [nvl]"r"(needed_vl));
54
55 #if defined(FFT_FIXED)
56 // First VF block to have vector unit determine what op it is doing
57 asm volatile (R"(
58 vmsv vx1, %[lane_start]
59 vmsv vx2, %[sel_block]
60 vmsv vx3, %[sel_block_op]
61 vmsv vx4, %[tf_scale]
62 vmsv vx5, %[half_cfs]
63 vf 0(%[vf_ptr])
64 )": // no output registers
65 : [lane_start]"r"(lane_start),
66 [sel_block]"r"(sel_block),
67 [sel_block_op]"r"(sel_block_op),
68 [tf_scale]"r"(tf_scale),
69 [half_cfs]"r"(half_cur_fft_size),
70 [vf_ptr]"r"(&vf_fft_init)
71 : // no clobber
72 );
73
74 // Second VF block loads tf and op2 then calculates scale factor
75 asm volatile (R"(
76 vmsv vx4, %[tf_real]
77 vmsv vx5, %[tf_imag]
78 vmsv vx6, %[workspace_real]
79 vmsv vx7, %[workspace_imag]
80 vmsv vx8, %[fix_pt]
81 vf 0(%[vf_ptr])
82 )": // no output registers
83 : [tf_real]"r"(tf_real),
84 [tf_imag]"r"(tf_imag),
85 [workspace_real]"r"(workspace_real),
86 [workspace_imag]"r"(workspace_imag),
87 [fix_pt]"r"(FIX_PT),
88 [vf_ptr]"r"(&vf_fft_scale)
89 : // no clobber
90 );
91 #elif defined(FFT_FLOATING)
92 // First VF block to have vector unit determine what op it is doing
93 asm volatile (R"(
94 vmsv vx1, %[lane_start]
95 vmsv vx2, %[sel_block]
96 vmsv vx3, %[sel_block_op]
97 vmsv vx4, %[tf_scale]
98 vmsv vx5, %[half_cfs]
99 vf 0(%[vf_ptr])
100 )": // no output registers
101 : [lane_start]"r"(lane_start),
102 [sel_block]"r"(sel_block),
103 [sel_block_op]"r"(sel_block_op),
104 [tf_scale]"r"(tf_scale),
105 [half_cfs]"r"(half_cur_fft_size),
106 [vf_ptr]"r"(&vf_fft_init)
107 : // no clobber
108 );
109
110 // Second VF block loads tf and op2 then calculates scale factor
111 asm volatile (R"(
112 vmsv vx4, %[tf_real]
113 vmsv vx5, %[tf_imag]
114 vmsv vx6, %[workspace_real]
115 vmsv vx7, %[workspace_imag]
116 vf 0(%[vf_ptr])
117 )": // no output registers
118 : [tf_real]"r"(tf_real),
119 [tf_imag]"r"(tf_imag),
120 [workspace_real]"r"(workspace_real),
121 [workspace_imag]"r"(workspace_imag),
122 [vf_ptr]"r"(&vf_fft_scale)
123 : // no clobber
124 );
125 #else
126 #error no mode selected in vec-fft/vec-fft.c
127 #endif
128
129 // Third VF block actually calculates the results
130 asm volatile (R"(
131 vmsv vx5, %[workspace_real]
132 vmsv vx6, %[workspace_imag]
133 vf 0(%[vf_ptr])
134 )": // no output registers
135 : [workspace_real]"r"(workspace_real),
136 [workspace_imag]"r"(workspace_imag),
137 [vf_ptr]"r"(&vf_fft_exec)
138 : // no clobber
139 );
140
141 // Fourth VF block stores first result
142 asm volatile (R"(
143 vmsv vx3, %[workspace_real]
144 vmsv vx4, %[workspace_imag]
145 vf 0(%[vf_ptr])
146 )": // no output registers
147 : [workspace_real]"r"(workspace_real),
148 [workspace_imag]"r"(workspace_imag),
149 [vf_ptr]"r"(&vf_fft_store1)
150 : "memory"
151 );
152
153 // Fifth VF block stores second result
154 asm volatile (R"(
155 vmsv vx3, %[workspace_real]
156 vmsv vx4, %[workspace_imag]
157 vf 0(%[vf_ptr])
158 )": // no output registers
159 : [workspace_real]"r"(workspace_real),
160 [workspace_imag]"r"(workspace_imag),
161 [vf_ptr]"r"(&vf_fft_store2)
162 : "memory"
163 );
164
165 }
166 }
167
168 asm volatile ("fence"); // Make sure all that work from vector unit is visible to CPU
169
170 return;
171 }