1 // See LICENSE for license details.
3 // *************************************************************************
4 // multiply filter benchmark
5 // -------------------------------------------------------------------------
7 // This benchmark tests the software multiply implementation. The
8 // input data (and reference data) should be generated using the
9 // multiply_gendata.pl perl script and dumped to a file named
10 // dataset1.h You should not change anything except the
11 // HOST_DEBUG and VERIFY macros for your timing run.
15 //--------------------------------------------------------------------------
18 // Set HOST_DEBUG to 1 if you are going to compile this for a host
19 // machine (ie Athena/Linux) for debug purposes and set HOST_DEBUG
20 // to 0 if you are compiling with the smips-gcc toolchain.
26 // Set PREALLOCATE to 1 if you want to preallocate the benchmark
27 // function before starting stats. If you have instruction/data
28 // caches and you don't want to count the overhead of misses, then
29 // you will need to use preallocation.
35 // Set VERIFY to 1 if you want the program to check that the sort
36 // function returns the right answer. When you are doing your
37 // benchmarking you should set this to 0 so that the verification
38 // is not included in your timing.
44 // Set SET_STATS to 1 if you want to carve out the piece that actually
45 // does the computation.
51 // Set MINIMAL to 1 if you want to run the core FFT kernel without
52 // any instrumentation or warm-up.
57 //--------------------------------------------------------------------------
58 // Platform Specific Includes
// Prototype only: printstr is defined outside the visible part of this file.
// It is called below with string literals to report the test verdict
// (presumably writes the string to the platform console -- confirm).
64 void printstr(const char*);
69 //--------------------------------------------------------------------------
70 // Input/Reference Data
72 #include "fft_const.h"
74 //--------------------------------------------------------------------------
79 void setup_input(int n
, fftval_t in_real
[], fftval_t in_imag
[])
82 for(i
=0; i
< n
; i
++) {
83 in_real
[i
] = input_data_real
[i
];
84 in_imag
[i
] = input_data_imag
[i
];
87 void setup_warm_tf(int n
, fftval_t in_real
[], fftval_t in_imag
[])
90 for(i
=0; i
< n
; i
++) {
91 in_real
[i
] = tf_real
[i
];
92 in_imag
[i
] = tf_imag
[i
];
// Compare a computed FFT result against the reference output and track the
// largest per-element squared error.
//
// For every index i in [0, n) the squared error is
//   (test_real[i] - output_data_real[i])^2 + (test_imag[i] - output_data_imag[i])^2
// and the running maximum is kept in current_max.
//
//   n          number of complex elements to compare
//   test_real  real parts of the result under test
//   test_imag  imaginary parts of the result under test
//
// The function is declared to return fftval_t; the return statement is not
// among the lines visible here (presumably it returns current_max -- confirm).
96 fftval_t
calculate_error( int n
, const fftval_t test_real
[], const fftval_t test_imag
[])
98 fftval_t current_max
= 0;
// Column header for the per-element dump; the trailing %d only prints a literal 0.
99 printf("idx, real expected, real observed, imag expected, imag observed %d\n", 0);
// Fixed-point build: both differences are divided by 2^FIX_PT before squaring.
101 #if defined(FFT_FIXED)
102 for(int i
= 0; i
< n
; i
++)
104 const double scale
= 1 << FIX_PT
;
105 const double real_diff
= (test_real
[i
] - output_data_real
[i
])/scale
;
106 const double imag_diff
= (test_imag
[i
] - output_data_imag
[i
])/scale
;
108 const double i_sq_error
= real_diff
*real_diff
+ imag_diff
*imag_diff
;
// NOTE(review): the message below prints current_max *before* it is updated,
// i.e. the previous maximum, and passes a long to a %d conversion (%ld expected).
// NOTE(review): i_sq_error is a double stored into an fftval_t; if fftval_t is
// an integer type in this build the value is truncated -- confirm.
109 if(i_sq_error
> current_max
) {
110 printf("i = %d, current error: %d\n", i
, (long)current_max
);
111 current_max
= i_sq_error
;
// Floating-point build: differences are used unscaled; real_expect/imag_expect
// remember the reference value at the index with the largest error so far.
114 #elif defined(FFT_FLOATING)
115 fftval_t real_expect
= 0.0;
116 fftval_t imag_expect
= 0.0;
117 for(int i
= 0; i
< n
; i
++)
119 /* TODO: Fix error calculation for half precision */
120 const fftval_t real_diff
= (test_real
[i
] - output_data_real
[i
]);
121 const fftval_t imag_diff
= (test_imag
[i
] - output_data_imag
[i
]);
122 fftval_t i_sq_error
= real_diff
*real_diff
+ imag_diff
*imag_diff
;
// tr, ti, er, ei are declared more than once below; presumably each group sits
// in its own preprocessor branch whose directives are not visible here.
// Variant 1: values multiplied by 1e9 and truncated to long for printing.
125 long tr
= (long)(test_real
[i
] * 1000000000);
126 long ti
= (long)(test_imag
[i
] * 1000000000);
127 long er
= (long)(output_data_real
[i
] * 1000000000);
128 long ei
= (long)(output_data_imag
[i
] * 1000000000);
// NOTE(review): the argument list of this printf is not visible; if it passes
// the long values above, the %d conversions should be %ld -- confirm.
130 printf("i = %d, expected (%d,%d) and got (%d,%d), diff (%d,%d)\n",
// Variant 2: values held as fftbit_t.
138 fftbit_t tr
, ti
, er
, ei
;
142 er
= output_data_real
[i
];
143 ei
= output_data_imag
[i
];
// Variant 3: each value is written to bits.v and read back through bits.u
// (presumably a union used to obtain the raw bit pattern -- confirm; the
// declaration of `bits` is not visible here).
149 bits
.v
= test_real
[i
]; tr
= bits
.u
;
150 bits
.v
= test_imag
[i
]; ti
= bits
.u
;
151 bits
.v
= output_data_real
[i
]; er
= bits
.u
;
152 bits
.v
= output_data_imag
[i
]; ei
= bits
.u
;
// Prints: index, expected real, observed real, expected imag, observed imag.
// NOTE(review): %d is only correct if fftbit_t promotes to int -- confirm.
154 printf("%d: %d %d %d %d\n", i
, er
, tr
, ei
, ti
);
155 // printf("%4d\t" FFT_PRI "\t" FFT_PRI "\t" FFT_PRI "\t" FFT_PRI "\n",
156 // i, er, tr, ei, ti);
// NOTE(review): as in the fixed-point branch, the message prints current_max
// before it is replaced, so it reports the previous maximum.
160 if(i_sq_error
> current_max
) {
161 printf("i = %d, max error (ppb): %ld\n", i
, (long)(current_max
* 1000000000));
162 current_max
= i_sq_error
;
163 real_expect
= output_data_real
[i
];
164 imag_expect
= output_data_imag
[i
];
// Reference value at the worst index, truncated to long.
// NOTE(review): a long is passed to %d in both calls below (%ld expected).
169 printf("real expected: %d\n", (long)(real_expect));
170 printf("imag expected: %d\n", (long)(imag_expect));
// Report the benchmark verdict and the measured counters.
//
//   max_sq_error  squared error to judge; the run passes when it is below
//                 10e-8 (i.e. 1e-7)
//   num_cycles    value printed as num_cycles
//   num_retired   value printed as num_inst_retired
177 void finishTest( double max_sq_error
, long long num_cycles
, long long num_retired
)
179 int passed
= max_sq_error
< 10e-8;
181 if( passed
) printstr("*** PASSED ***");
182 else printstr("*** FAILED ***");
// NOTE(review): both arguments are long long but the format uses %ld; %lld is
// the matching conversion (the mismatch matters where long is 32 bits).
184 printf(" (num_cycles = %ld, num_inst_retired = %ld)\n", num_cycles
, num_retired
);
// Remap the boolean to a status code: 1 on pass, 2 on failure. The statement
// that consumes `passed` afterwards is not visible here.
186 passed
= passed
? 1 : 2; // if it passed, return 1
// Hook for switching statistics collection on or off around the timed region.
//
//   enable  requested state; currently unused
//
// The only statement in the body is commented out, so this function is a
// no-op in every build configuration.
void setStats( int enable )
{
#if ( !HOST_DEBUG && SET_STATS )
  //asm( "mtpcr %0, cr10" : : "r" (enable) );
#endif
}
// Return the current cycle count.
//
// When built for the target with statistics enabled (HOST_DEBUG == 0 and
// SET_STATS != 0) the value comes from the `rdcycle` instruction. In every
// other configuration the placeholder 1337 is returned.
//
// NOTE(review): the asm writes one register into a long long; on a 32-bit
// target that would cover only part of the value -- confirm the target width.
long long getCycles(void)
{
  long long cycles = 1337;
#if ( !HOST_DEBUG && SET_STATS )
  __asm__ __volatile__( "rdcycle %0" : "=r" (cycles) );
#endif
  return cycles;
}
// Return the number of instructions retired so far.
//
// When built for the target with statistics enabled (HOST_DEBUG == 0 and
// SET_STATS != 0) the value comes from the `rdinstret` instruction. In every
// other configuration the placeholder 1338 is returned.
//
// NOTE(review): the asm writes one register into a long long; on a 32-bit
// target that would cover only part of the value -- confirm the target width.
long long getInstRetired(void)
{
  long long inst_retired = 1338;
#if ( !HOST_DEBUG && SET_STATS )
  __asm__ __volatile__( "rdinstret %0" : "=r" (inst_retired) );
#endif
  return inst_retired;
}
216 #endif /* !MINIMAL */
218 //--------------------------------------------------------------------------
// Radix used by permute(): index bits are handled in groups ("digits") of
// log2down(HWACHA_RADIX) bits.
220 #define HWACHA_RADIX 2
222 #ifdef DATA_IN_UNPERMUTED
// Reorder the FFT_SIZE-element workspace in place by swapping each element
// with the one at its digit-reversed index.
//
//   workspace_real  real parts, permuted in place
//   workspace_imag  imaginary parts, permuted in place
223 void permute(fftval_t workspace_real
[], fftval_t workspace_imag
[])
// logradix : bits per digit
// term_mask: mask selecting the lowest digit
// num_term : digits per index; not referenced in the lines visible here
225 const int logradix
= log2down(HWACHA_RADIX
);
226 const int term_mask
= HWACHA_RADIX
-1;
227 const int num_term
= log2down(FFT_SIZE
)/logradix
;
228 for(int i
= 0; i
< FFT_SIZE
; i
++)
// The declarations of i_left and permuted are not visible here (presumably
// initialised to i and 0 respectively -- confirm).
230 // Get permuted address
// One iteration per digit: cur_fft_size grows from HWACHA_RADIX by a factor of
// the radix each time until it exceeds FFT_SIZE.
233 for(int cur_fft_size
=HWACHA_RADIX
; cur_fft_size
<= FFT_SIZE
; cur_fft_size
= cur_fft_size
<< logradix
)
// Move the lowest remaining digit of i_left onto the low end of permuted, so
// the digits of the index end up in reverse order.
235 permuted
= (permuted
<< logradix
) | (i_left
& term_mask
);
236 i_left
= i_left
>> logradix
;
// The guarding condition itself is not visible; the comment below describes it.
238 // If addresses are different and i < permuted (so we only do permutation once)
// Swap elements i and permuted in both arrays.
241 fftval_t t
= workspace_real
[i
];
242 fftval_t u
= workspace_imag
[i
];
243 workspace_real
[i
] = workspace_real
[permuted
];
244 workspace_imag
[i
] = workspace_imag
[permuted
];
245 workspace_real
[permuted
] = t
;
246 workspace_imag
[permuted
] = u
;
250 #endif /* DATA_IN_UNPERMUTED */
// Un-instrumented path (presumably the MINIMAL build -- the enclosing function
// header and preprocessor guard are not visible here). It runs the FFT directly
// on the input_data arrays, in place, with no warm-up, timing or verification.
// When DATA_IN_UNPERMUTED is defined the input is reordered first.
256 #ifdef DATA_IN_UNPERMUTED
257 permute(input_data_real
, input_data_imag
);
259 fft(input_data_real
, input_data_imag
, tf_real
, tf_imag
);
260 // calculate_error(FFT_SIZE, input_data_real, input_data_imag);
// Instrumented path (the enclosing function header is not visible here).
// Static buffers: a working copy of the input and a copy of the twiddle tables.
268 static fftval_t workspace_real
[FFT_SIZE
];
269 static fftval_t workspace_imag
[FFT_SIZE
];
270 static fftval_t warm_tf_real
[FFT_SIZE
];
271 static fftval_t warm_tf_imag
[FFT_SIZE
];
272 setup_input(FFT_SIZE
, workspace_real
, workspace_imag
);
273 setup_warm_tf(FFT_SIZE
, warm_tf_real
, warm_tf_imag
);
// Untimed run; its output is discarded because the workspace is reloaded right
// after it. (Presumably guarded by PREALLOCATE -- the directive is not visible.)
276 fft(workspace_real
, workspace_imag
, warm_tf_real
, warm_tf_imag
);
277 setup_input(FFT_SIZE
, workspace_real
, workspace_imag
);
// Timed region: sample both counters, run the kernel, sample again.
280 long long start_cycles
, start_retired
, stop_cycles
, stop_retired
;
281 start_cycles
= getCycles();
282 start_retired
= getInstRetired();
// The input reordering, when enabled, is inside the timed region.
284 #ifdef DATA_IN_UNPERMUTED
285 permute(workspace_real
, workspace_imag
);
288 fft(workspace_real
, workspace_imag
, warm_tf_real
, warm_tf_imag
);
291 stop_cycles
= getCycles();
292 stop_retired
= getInstRetired();
293 long long num_cycles
= stop_cycles
- start_cycles
;
294 long long num_retired
= stop_retired
- start_retired
;
// Verification happens after the counters are read, so it is not timed. The
// fftval_t result is converted to double here.
296 const double max_sq_error
= calculate_error(FFT_SIZE
, workspace_real
, workspace_imag
);
299 finishTest(max_sq_error
, num_cycles
, num_retired
);