1 // See LICENSE for license details.
3 // *************************************************************************
4 // FFT function (C version using Hwacha inline assembly)
5 // -------------------------------------------------------------------------
12 while(in
> 0) { counter
++; in
= in
>> 1; }
16 void fft(fftval_t workspace_real
[], fftval_t workspace_imag
[],
17 const fftval_t tf_real
[], const fftval_t tf_imag
[]) //size is FFT_SIZE
19 const int num_stage_ops
= FFT_SIZE
>> 1;
20 const int logfftsize
= log2down(FFT_SIZE
);
23 // First, set up Hwacha with the configuration we need:
24 #if defined(FFT_FIXED)
25 // num_stage_ops VL, 9 x-reg (1 zero, 2 ctrl, 4 data, 2 scratch), 1 fpu (avert bug)
26 asm volatile ("vsetcfg 9, 1");
27 #elif defined(FFT_FLOATING)
28 asm volatile ("vsetcfg 8, 6");
29 #if defined(FP_SINGLE)
30 // asm volatile ("vsetprec 32");
31 #elif defined(FP_HALF)
32 // asm volatile ("vsetprec 16");
33 #elif defined(FP_DOUBLE)
38 asm volatile ("vsetvl %[gvl], %[nvl]" : [gvl
]"=r"(given_vl
) : [nvl
]"r"(num_stage_ops
));
39 asm volatile ("fence"); // Make sure prefilling of workspace is complete
41 for(int stage
= 0; stage
< logfftsize
; stage
++)
43 const int half_cur_fft_size
= (1 << stage
);
44 const int sel_block_op
= half_cur_fft_size
-1;
45 const int sel_block
= ~sel_block_op
;
46 const int tf_scale
= logfftsize
- stage
- 1;
49 for(int lane_start
= 0; lane_start
< num_stage_ops
; lane_start
+= given_vl
)
51 // Setup new vector length for this stripmining pass
52 const int needed_vl
= num_stage_ops
- lane_start
;
53 asm volatile ("vsetvl %[gvl], %[nvl]" : [gvl
]"=r"(given_vl
) : [nvl
]"r"(needed_vl
));
55 #if defined(FFT_FIXED)
56 // First VF block to have vector unit determine what op it is doing
58 vmsv vx1, %[lane_start]
59 vmsv vx2, %[sel_block]
60 vmsv vx3, %[sel_block_op]
64 )": // no output registers
65 : [lane_start
]"r"(lane_start
),
66 [sel_block
]"r"(sel_block
),
67 [sel_block_op
]"r"(sel_block_op
),
68 [tf_scale
]"r"(tf_scale
),
69 [half_cfs
]"r"(half_cur_fft_size
),
70 [vf_ptr
]"r"(&vf_fft_init
)
74 // Second VF block loads tf and op2 then calculates scale factor
78 vmsv vx6, %[workspace_real]
79 vmsv vx7, %[workspace_imag]
82 )": // no output registers
83 : [tf_real
]"r"(tf_real
),
84 [tf_imag
]"r"(tf_imag
),
85 [workspace_real
]"r"(workspace_real
),
86 [workspace_imag
]"r"(workspace_imag
),
88 [vf_ptr
]"r"(&vf_fft_scale
)
91 #elif defined(FFT_FLOATING)
92 // First VF block to have vector unit determine what op it is doing
94 vmsv vx1, %[lane_start]
95 vmsv vx2, %[sel_block]
96 vmsv vx3, %[sel_block_op]
100 )": // no output registers
101 : [lane_start
]"r"(lane_start
),
102 [sel_block
]"r"(sel_block
),
103 [sel_block_op
]"r"(sel_block_op
),
104 [tf_scale
]"r"(tf_scale
),
105 [half_cfs
]"r"(half_cur_fft_size
),
106 [vf_ptr
]"r"(&vf_fft_init
)
110 // Second VF block loads tf and op2 then calculates scale factor
114 vmsv vx6, %[workspace_real]
115 vmsv vx7, %[workspace_imag]
117 )": // no output registers
118 : [tf_real
]"r"(tf_real
),
119 [tf_imag
]"r"(tf_imag
),
120 [workspace_real
]"r"(workspace_real
),
121 [workspace_imag
]"r"(workspace_imag
),
122 [vf_ptr
]"r"(&vf_fft_scale
)
126 #error no mode selected in vec-fft/vec-fft.c
129 // Third VF block actually calculates the results
131 vmsv vx5, %[workspace_real]
132 vmsv vx6, %[workspace_imag]
134 )": // no output registers
135 : [workspace_real
]"r"(workspace_real
),
136 [workspace_imag
]"r"(workspace_imag
),
137 [vf_ptr
]"r"(&vf_fft_exec
)
141 // Fourth VF block stores first result
143 vmsv vx3, %[workspace_real]
144 vmsv vx4, %[workspace_imag]
146 )": // no output registers
147 : [workspace_real
]"r"(workspace_real
),
148 [workspace_imag
]"r"(workspace_imag
),
149 [vf_ptr
]"r"(&vf_fft_store1
)
153 // Fifth VF block stores second result
155 vmsv vx3, %[workspace_real]
156 vmsv vx4, %[workspace_imag]
158 )": // no output registers
159 : [workspace_real
]"r"(workspace_real
),
160 [workspace_imag
]"r"(workspace_imag
),
161 [vf_ptr
]"r"(&vf_fft_store2
)
168 asm volatile ("fence"); // Make sure all that work from vector unit is visible to CPU