47d13cd1f998822e0edce6ac2240f2b1c3c58a12
[riscv-tests.git] / benchmarks / vec-fft / vec-fft.c
1 // *************************************************************************
// fft function (c driver for the hwacha vector kernels)
3 // -------------------------------------------------------------------------
4 #include "vec-fft.h"
5 #include "fft_const.h"
6
// Returns the index of the highest set bit of a positive value, i.e.
// floor(log2(in)). Zero and negative inputs never enter the loop and
// yield -1.
int log2down(int in)
{
  int top_bit = -1;
  for (; in > 0; in >>= 1) {
    ++top_bit;
  }
  return top_bit;
}
13
14 void fft(fftval_t workspace_real[], fftval_t workspace_imag[],
15 const fftval_t tf_real[], const fftval_t tf_imag[]) //size is FFT_SIZE
16 {
17 const int num_stage_ops = FFT_SIZE >> 1;
18 const int logfftsize = log2down(FFT_SIZE);
19
20 int given_vl;
21 // First, setup hwacha to what we need:
22 #if defined(FFT_FIXED)
23 // num_stage_ops VL, 9 x-reg (1 zero, 2 ctrl, 4 data, 2 scratch), 1 fpu (avert bug)
24 asm volatile ("vsetcfg 9, 1");
25 #elif defined(FFT_FLOATING)
26 asm volatile ("vsetcfg 8, 6");
27 #if defined(FP_SINGLE)
28 // asm volatile ("vsetprec 32");
29 #elif defined(FP_HALF)
30 // asm volatile ("vsetprec 16");
31 #elif defined(FP_DOUBLE)
32 #else
33 #error wat
34 #endif
35 #endif
36 asm volatile ("vsetvl %[gvl], %[nvl]" : [gvl]"=r"(given_vl) : [nvl]"r"(num_stage_ops));
37 asm volatile ("fence"); // Make sure prefilling of workspace is complete
38
39 for(int stage = 0; stage < logfftsize; stage++)
40 {
41 const int half_cur_fft_size = (1 << stage);
42 const int sel_block_op = half_cur_fft_size-1;
43 const int sel_block = ~sel_block_op;
44 const int tf_scale = logfftsize - stage - 1;
45
46 // Stripmining loop
47 for(int lane_start = 0; lane_start < num_stage_ops; lane_start += given_vl)
48 {
49 // Setup new vector length for this stripmining pass
50 const int needed_vl = num_stage_ops - lane_start;
51 asm volatile ("vsetvl %[gvl], %[nvl]" : [gvl]"=r"(given_vl) : [nvl]"r"(needed_vl));
52
53 #if defined(FFT_FIXED)
54 // First VF block to have vector unit determine what op it is doing
55 asm volatile (R"(
56 vmsv vx1, %[lane_start]
57 vmsv vx2, %[sel_block]
58 vmsv vx3, %[sel_block_op]
59 vmsv vx4, %[tf_scale]
60 vmsv vx5, %[half_cfs]
61 vf 0(%[vf_ptr])
62 )": // no output registers
63 : [lane_start]"r"(lane_start),
64 [sel_block]"r"(sel_block),
65 [sel_block_op]"r"(sel_block_op),
66 [tf_scale]"r"(tf_scale),
67 [half_cfs]"r"(half_cur_fft_size),
68 [vf_ptr]"r"(&vf_fft_init)
69 : // no clobber
70 );
71
72 // Second VF block loads tf and op2 then calculates scale factor
73 asm volatile (R"(
74 vmsv vx4, %[tf_real]
75 vmsv vx5, %[tf_imag]
76 vmsv vx6, %[workspace_real]
77 vmsv vx7, %[workspace_imag]
78 vmsv vx8, %[fix_pt]
79 vf 0(%[vf_ptr])
80 )": // no output registers
81 : [tf_real]"r"(tf_real),
82 [tf_imag]"r"(tf_imag),
83 [workspace_real]"r"(workspace_real),
84 [workspace_imag]"r"(workspace_imag),
85 [fix_pt]"r"(FIX_PT),
86 [vf_ptr]"r"(&vf_fft_scale)
87 : // no clobber
88 );
89 #elif defined(FFT_FLOATING)
90 // First VF block to have vector unit determine what op it is doing
91 asm volatile (R"(
92 vmsv vx1, %[lane_start]
93 vmsv vx2, %[sel_block]
94 vmsv vx3, %[sel_block_op]
95 vmsv vx4, %[tf_scale]
96 vmsv vx5, %[half_cfs]
97 vf 0(%[vf_ptr])
98 )": // no output registers
99 : [lane_start]"r"(lane_start),
100 [sel_block]"r"(sel_block),
101 [sel_block_op]"r"(sel_block_op),
102 [tf_scale]"r"(tf_scale),
103 [half_cfs]"r"(half_cur_fft_size),
104 [vf_ptr]"r"(&vf_fft_init)
105 : // no clobber
106 );
107
108 // Second VF block loads tf and op2 then calculates scale factor
109 asm volatile (R"(
110 vmsv vx4, %[tf_real]
111 vmsv vx5, %[tf_imag]
112 vmsv vx6, %[workspace_real]
113 vmsv vx7, %[workspace_imag]
114 vf 0(%[vf_ptr])
115 )": // no output registers
116 : [tf_real]"r"(tf_real),
117 [tf_imag]"r"(tf_imag),
118 [workspace_real]"r"(workspace_real),
119 [workspace_imag]"r"(workspace_imag),
120 [vf_ptr]"r"(&vf_fft_scale)
121 : // no clobber
122 );
123 #else
124 #error no mode selected in vec-fft/vec-fft.c
125 #endif
126
127 // Third VF block actually calculates the results
128 asm volatile (R"(
129 vmsv vx5, %[workspace_real]
130 vmsv vx6, %[workspace_imag]
131 vf 0(%[vf_ptr])
132 )": // no output registers
133 : [workspace_real]"r"(workspace_real),
134 [workspace_imag]"r"(workspace_imag),
135 [vf_ptr]"r"(&vf_fft_exec)
136 : // no clobber
137 );
138
139 // Fourth VF block stores first result
140 asm volatile (R"(
141 vmsv vx3, %[workspace_real]
142 vmsv vx4, %[workspace_imag]
143 vf 0(%[vf_ptr])
144 )": // no output registers
145 : [workspace_real]"r"(workspace_real),
146 [workspace_imag]"r"(workspace_imag),
147 [vf_ptr]"r"(&vf_fft_store1)
148 : "memory"
149 );
150
151 // Fifth VF block stores second result
152 asm volatile (R"(
153 vmsv vx3, %[workspace_real]
154 vmsv vx4, %[workspace_imag]
155 vf 0(%[vf_ptr])
156 )": // no output registers
157 : [workspace_real]"r"(workspace_real),
158 [workspace_imag]"r"(workspace_imag),
159 [vf_ptr]"r"(&vf_fft_store2)
160 : "memory"
161 );
162
163 }
164 }
165
166 asm volatile ("fence"); // Make sure all that work from vector unit is visible to CPU
167
168 return;
169 }