1 // *************************************************************************
2 // multiply filter benchmark
3 // -------------------------------------------------------------------------
5 // This benchmark tests the software multiply implementation. The
6 // input data (and reference data) should be generated using the
7 // multiply_gendata.pl perl script and dumped to a file named
8 // dataset1.h You should not change anything except the
9 // HOST_DEBUG and VERIFY macros for your timing run.
13 //--------------------------------------------------------------------------
16 // Set HOST_DEBUG to 1 if you are going to compile this for a host
17 // machine (ie Athena/Linux) for debug purposes and set HOST_DEBUG
18 // to 0 if you are compiling with the smips-gcc toolchain.
24 // Set PREALLOCATE to 1 if you want to preallocate the benchmark
25 // function before starting stats. If you have instruction/data
26 // caches and you don't want to count the overhead of misses, then
27 // you will need to use preallocation.
33 // Set VERIFY to 1 if you want the program to check that the FFT
34 // function returns the right answer. When you are doing your
35 // benchmarking you should set this to 0 so that the verification
36 // is not included in your timing.
42 // Set SET_STATS to 1 if you want to carve out the piece that actually
43 // does the computation.
49 // Set MINIMAL to 1 if you want to run the core FFT kernel without
50 // any instrumentation or warm-up.
55 //--------------------------------------------------------------------------
56 // Platform Specific Includes
62 void printstr(const char*);
67 //--------------------------------------------------------------------------
68 // Input/Reference Data
70 #include "fft_const.h"
72 //--------------------------------------------------------------------------
// setup_input: load the benchmark input into caller-supplied buffers.
//   n        number of complex samples to copy
//   in_real  destination for the real parts      (written at indices 0..n-1)
//   in_imag  destination for the imaginary parts (written at indices 0..n-1)
// Source data comes from input_data_real[] / input_data_imag[], which are
// defined outside this function (presumably via fft_const.h -- confirm).
// NOTE(review): assumes every array involved holds at least n elements.
77 void setup_input(int n
, fftval_t in_real
[], fftval_t in_imag
[])
// Element-wise copy; real and imaginary parts live in separate arrays.
80 for(i
=0; i
< n
; i
++) {
81 in_real
[i
] = input_data_real
[i
];
82 in_imag
[i
] = input_data_imag
[i
];
// setup_warm_tf: copy the tf_real[] / tf_imag[] tables into caller-supplied
// buffers (the caller passes warm_tf_real / warm_tf_imag, which are then
// handed to fft()).
//   n        number of entries to copy
//   in_real  destination for tf_real[0..n-1]
//   in_imag  destination for tf_imag[0..n-1]
// NOTE(review): "tf" presumably stands for twiddle factors -- confirm against
// the header that defines tf_real / tf_imag.
85 void setup_warm_tf(int n
, fftval_t in_real
[], fftval_t in_imag
[])
// Same element-wise copy pattern as setup_input, but reading the tf tables.
88 for(i
=0; i
< n
; i
++) {
89 in_real
[i
] = tf_real
[i
];
90 in_imag
[i
] = tf_imag
[i
];
// calculate_error: compare a computed result against the reference output
// (output_data_real[] / output_data_imag[]) and track the largest squared
// error, i.e. real_diff^2 + imag_diff^2, over indices 0..n-1.
//   n          number of complex samples to compare
//   test_real  computed real parts
//   test_imag  computed imaginary parts
// The running maximum is kept in current_max; the caller stores this
// function's result in a variable named max_sq_error, so the function
// presumably returns current_max -- confirm.
// Diagnostics are printed while scanning. The comparison code is selected at
// compile time by FFT_FIXED / FFT_FLOATING.
94 fftval_t
calculate_error( int n
, const fftval_t test_real
[], const fftval_t test_imag
[])
96 fftval_t current_max
= 0;
// Column header for the per-element dump. The trailing "%d" is fed a literal 0.
97 printf("idx, real expected, real observed, imag expected, imag observed %d\n", 0);
// ---- Fixed-point variant -------------------------------------------------
99 #if defined(FFT_FIXED)
100 for(int i
= 0; i
< n
; i
++)
// Dividing by 2^FIX_PT rescales the raw fixed-point difference.
// NOTE(review): FIX_PT is presumably the number of fractional bits -- confirm.
102 const double scale
= 1 << FIX_PT
;
103 const double real_diff
= (test_real
[i
] - output_data_real
[i
])/scale
;
104 const double imag_diff
= (test_imag
[i
] - output_data_imag
[i
])/scale
;
// Squared magnitude of the complex error at index i.
106 const double i_sq_error
= real_diff
*real_diff
+ imag_diff
*imag_diff
;
107 if(i_sq_error
> current_max
) {
// Prints the PREVIOUS maximum: current_max is only updated on the next
// statement.
// NOTE(review): "%d" is paired with a (long) argument -- format mismatch.
108 printf("i = %d, current error: %d\n", i
, (long)current_max
);
109 current_max
= i_sq_error
;
// ---- Floating-point variant ----------------------------------------------
112 #elif defined(FFT_FLOATING)
// Reference values at the index where the largest error was seen.
113 fftval_t real_expect
= 0.0;
114 fftval_t imag_expect
= 0.0;
115 for(int i
= 0; i
< n
; i
++)
117 /* TODO: Fix error calculation for half precision */
118 const fftval_t real_diff
= (test_real
[i
] - output_data_real
[i
]);
119 const fftval_t imag_diff
= (test_imag
[i
] - output_data_imag
[i
]);
120 fftval_t i_sq_error
= real_diff
*real_diff
+ imag_diff
*imag_diff
;
// Values scaled by 1e9 and truncated to long so they can be printed as
// integers (t* = test/observed, e* = expected/reference).
123 long tr
= (long)(test_real
[i
] * 1000000000);
124 long ti
= (long)(test_imag
[i
] * 1000000000);
125 long er
= (long)(output_data_real
[i
] * 1000000000);
126 long ei
= (long)(output_data_imag
[i
] * 1000000000);
128 printf("i = %d, expected (%d,%d) and got (%d,%d), diff (%d,%d)\n",
// NOTE(review): tr/ti/er/ei are declared a second time here with type
// fftbit_t; this is presumably an alternative, conditionally compiled dump
// path -- confirm which preprocessor condition selects it.
136 fftbit_t tr
, ti
, er
, ei
;
140 er
= output_data_real
[i
];
141 ei
= output_data_imag
[i
];
// Store each value through bits.v and read it back through bits.u.
// NOTE(review): bits looks like a union used to obtain the raw bit pattern
// of an fftval_t; its declaration is not shown here -- confirm.
147 bits
.v
= test_real
[i
]; tr
= bits
.u
;
148 bits
.v
= test_imag
[i
]; ti
= bits
.u
;
149 bits
.v
= output_data_real
[i
]; er
= bits
.u
;
150 bits
.v
= output_data_imag
[i
]; ei
= bits
.u
;
// Row order matches the header printed at the top:
// index, real expected, real observed, imag expected, imag observed.
152 printf("%d: %d %d %d %d\n", i
, er
, tr
, ei
, ti
);
153 // printf("%4d\t" FFT_PRI "\t" FFT_PRI "\t" FFT_PRI "\t" FFT_PRI "\n",
154 // i, er, tr, ei, ti);
158 if(i_sq_error
> current_max
) {
// As in the fixed-point path, the value printed is the previous maximum
// (scaled by 1e9), because the update happens afterwards.
159 printf("i = %d, max error (ppb): %ld\n", i
, (long)(current_max
* 1000000000));
160 current_max
= i_sq_error
;
161 real_expect
= output_data_real
[i
];
162 imag_expect
= output_data_imag
[i
];
// Report the reference value at the worst index.
// NOTE(review): "%d" is paired with a (long) argument in both calls below.
167 printf("real expected: %d\n", (long)(real_expect));
168 printf("imag expected: %d\n", (long)(imag_expect));
// finishTest: report the outcome of a benchmark run.
//   max_sq_error  largest squared error found by the verification step
//   num_cycles    cycle-count delta measured around the kernel
//   num_retired   retired-instruction delta measured around the kernel
// The run passes when max_sq_error is below 10e-8 (= 1e-7).
175 void finishTest( double max_sq_error
, long long num_cycles
, long long num_retired
)
177 int passed
= max_sq_error
< 10e-8;
179 if( passed
) printstr("*** PASSED ***");
180 else printstr("*** FAILED ***");
// NOTE(review): both arguments are long long but the format uses "%ld";
// this only matches where long and long long have the same width.
182 printf(" (num_cycles = %ld, num_inst_retired = %ld)\n", num_cycles
, num_retired
);
// Remap the boolean to a status code: 1 = pass, 2 = fail.
// NOTE(review): how the code is consumed afterwards is not shown here.
184 passed
= passed
? 1 : 2; // if it passed, return 1
// setStats: hook for switching statistics collection on or off.
//   enable  requested state (non-zero presumably means "on" -- confirm)
// The only statement visible for the (!HOST_DEBUG && SET_STATS) case is a
// commented-out mtpcr instruction, so the function appears to do nothing.
189 void setStats( int enable
)
191 #if ( !HOST_DEBUG && SET_STATS )
192 //asm( "mtpcr %0, cr10" : : "r" (enable) );
// getCycles: sample the cycle counter.
// cycles starts at the placeholder value 1337. When built with
// !HOST_DEBUG && SET_STATS it is overwritten by the result of the `rdcycle`
// instruction; otherwise the placeholder is left in place.
// NOTE(review): presumably returns cycles -- the return statement is not
// shown here.
196 long long getCycles()
198 long long cycles
= 1337;
199 #if ( !HOST_DEBUG && SET_STATS )
// volatile keeps the compiler from removing or merging the counter read.
200 __asm__
__volatile__( "rdcycle %0" : "=r" (cycles
) );
// getInstRetired: sample the retired-instruction counter.
// inst_retired starts at the placeholder value 1338. When built with
// !HOST_DEBUG && SET_STATS it is overwritten by the result of the
// `rdinstret` instruction; otherwise the placeholder is left in place.
// NOTE(review): presumably returns inst_retired -- the return statement is
// not shown here.
205 long long getInstRetired()
207 long long inst_retired
= 1338;
208 #if ( !HOST_DEBUG && SET_STATS )
// volatile keeps the compiler from removing or merging the counter read.
209 __asm__
__volatile__( "rdinstret %0" : "=r" (inst_retired
) );
214 #endif /* !MINIMAL */
216 //--------------------------------------------------------------------------
218 #define HWACHA_RADIX 2
220 #ifdef DATA_IN_UNPERMUTED
// permute: reorder the workspace in place before the FFT runs (only built
// when DATA_IN_UNPERMUTED is defined).
//   workspace_real  real parts, FFT_SIZE entries, modified in place
//   workspace_imag  imaginary parts, FFT_SIZE entries, modified in place
// For each index i a partner index `permuted` is built by repeatedly taking
// the lowest logradix bits of i_left and appending them to permuted, which
// reverses the order of i's base-HWACHA_RADIX digits.
// NOTE(review): assumes permuted starts at 0 and i_left starts at i; their
// declarations are not shown here -- confirm.
221 void permute(fftval_t workspace_real
[], fftval_t workspace_imag
[])
// Bits per digit and the mask selecting one digit.
// NOTE(review): term_mask = HWACHA_RADIX-1 is only a valid bit mask when
// HWACHA_RADIX is a power of two (it is #defined as 2 in this file).
223 const int logradix
= log2down(HWACHA_RADIX
);
224 const int term_mask
= HWACHA_RADIX
-1;
// Number of digits in an index.
// NOTE(review): num_term is not referenced by any statement shown here.
225 const int num_term
= log2down(FFT_SIZE
)/logradix
;
226 for(int i
= 0; i
< FFT_SIZE
; i
++)
228 // Get permuted address
// One iteration per digit: cur_fft_size grows by a factor of HWACHA_RADIX
// each pass until it exceeds FFT_SIZE.
231 for(int cur_fft_size
=HWACHA_RADIX
; cur_fft_size
<= FFT_SIZE
; cur_fft_size
= cur_fft_size
<< logradix
)
// Move the lowest remaining digit of i_left onto the low end of permuted.
233 permuted
= (permuted
<< logradix
) | (i_left
& term_mask
);
234 i_left
= i_left
>> logradix
;
236 // If addresses are different and i < permuted (so we only do permutation once)
// Swap entries i and permuted in both arrays, using t / u as temporaries.
239 fftval_t t
= workspace_real
[i
];
240 fftval_t u
= workspace_imag
[i
];
241 workspace_real
[i
] = workspace_real
[permuted
];
242 workspace_imag
[i
] = workspace_imag
[permuted
];
243 workspace_real
[permuted
] = t
;
244 workspace_imag
[permuted
] = u
;
248 #endif /* DATA_IN_UNPERMUTED */
// Bare kernel run: operates directly on the global input_data_* arrays with
// the tf_* tables. No warm-up pass, no counter reads and no verification
// (the calculate_error call below is commented out).
// NOTE(review): presumably this is the body used when MINIMAL is set; the
// enclosing function header and preprocessor guard are not shown here.
254 #ifdef DATA_IN_UNPERMUTED
// Reorder the input in place before the transform.
255 permute(input_data_real
, input_data_imag
);
257 fft(input_data_real
, input_data_imag
, tf_real
, tf_imag
);
258 // calculate_error(FFT_SIZE, input_data_real, input_data_imag);
// Instrumented benchmark run: warm-up pass, timed pass, verification, report.
// NOTE(review): the enclosing function header is not shown here.
//
// Working copies of the input and the tf tables. They are static, so they
// do not live on the stack.
266 static fftval_t workspace_real
[FFT_SIZE
];
267 static fftval_t workspace_imag
[FFT_SIZE
];
268 static fftval_t warm_tf_real
[FFT_SIZE
];
269 static fftval_t warm_tf_imag
[FFT_SIZE
];
270 setup_input(FFT_SIZE
, workspace_real
, workspace_imag
);
271 setup_warm_tf(FFT_SIZE
, warm_tf_real
, warm_tf_imag
);
// Warm-up pass; its result is discarded. The workspace is reloaded
// afterwards because fft leaves its output in workspace_real/imag (the same
// buffers are later handed to calculate_error as the computed result).
// NOTE(review): presumably exists to warm caches, per the PREALLOCATE
// description in the file header -- confirm.
274 fft(workspace_real
, workspace_imag
, warm_tf_real
, warm_tf_imag
);
275 setup_input(FFT_SIZE
, workspace_real
, workspace_imag
);
// ---- Timed region begins ----
278 long long start_cycles
, start_retired
, stop_cycles
, stop_retired
;
279 start_cycles
= getCycles();
280 start_retired
= getInstRetired();
// When DATA_IN_UNPERMUTED is defined, the reordering step runs between the
// counter reads and is therefore included in the measurement.
282 #ifdef DATA_IN_UNPERMUTED
283 permute(workspace_real
, workspace_imag
);
286 fft(workspace_real
, workspace_imag
, warm_tf_real
, warm_tf_imag
);
// ---- Timed region ends ----
289 stop_cycles
= getCycles();
290 stop_retired
= getInstRetired();
291 long long num_cycles
= stop_cycles
- start_cycles
;
292 long long num_retired
= stop_retired
- start_retired
;
// Verification runs after the counters are read, so it is not measured.
// The fftval_t result of calculate_error is converted to double here.
294 const double max_sq_error
= calculate_error(FFT_SIZE
, workspace_real
, workspace_imag
);
// Print PASSED/FAILED together with the measured counts.
297 finishTest(max_sq_error
, num_cycles
, num_retired
);