benchmarks/vec-fft/vec-fft_main.c

   1 // *************************************************************************
   2 // multiply filter benchmark
   3 // -------------------------------------------------------------------------
   4 //
   5 // This benchmark tests the software multiply implementation. The
   6 // input data (and reference data) should be generated using the
   7 // multiply_gendata.pl perl script and dumped to a file named
   8 // dataset1.h. You should not change anything except the
   9 // HOST_DEBUG and VERIFY macros for your timing run.
  10
  11 #include "vec-fft.h"
  12
  13 //--------------------------------------------------------------------------
  14 // Macros
  15
  16 // Set HOST_DEBUG to 1 if you are going to compile this for a host
  17 // machine (ie Athena/Linux) for debug purposes and set HOST_DEBUG
  18 // to 0 if you are compiling with the smips-gcc toolchain.
  19
  20 #ifndef HOST_DEBUG
  21 #define HOST_DEBUG 0
  22 #endif
  23
  24 // Set PREALLOCATE to 1 if you want to preallocate the benchmark
  25 // function before starting stats. If you have instruction/data
  26 // caches and you don't want to count the overhead of misses, then
  27 // you will need to use preallocation.
  28
  29 #ifndef PREALLOCATE
  30 #define PREALLOCATE 0
  31 #endif
  32
  33 // Set VERIFY to 1 if you want the program to check that the sort
  34 // function returns the right answer. When you are doing your
  35 // benchmarking you should set this to 0 so that the verification
  36 // is not included in your timing.
  37
  38 #ifndef VERIFY
  39 #define VERIFY     1
  40 #endif
  41
  42 // Set SET_STATS to 1 if you want to carve out the piece that actually
  43 // does the computation.
  44
  45 #ifndef SET_STATS
  46 #define SET_STATS 0
  47 #endif
  48
  49 // Set MINIMAL to 1 if you want to run the core FFT kernel without
  50 // any instrumentation or warm-up.
  51 #ifndef MINIMAL
  52 #define MINIMAL 1
  53 #endif
  54
  55 //--------------------------------------------------------------------------
  56 // Platform Specific Includes
  57
  58 #if HOST_DEBUG
  59    #include <stdio.h>
  60    #include <stdlib.h>
  61 #else
  62 void printstr(const char*);
  63 void exit();
  64 #endif
  65
  66
  67 //--------------------------------------------------------------------------
  68 // Input/Reference Data
  69
  70 #include "fft_const.h"
  71
  72 //--------------------------------------------------------------------------
  73 // Helper functions
  74
  75 #if !MINIMAL
  76
  77 void setup_input(int n, fftval_t in_real[], fftval_t in_imag[])
  78 {
  79   int i;
  80   for(i=0; i < n; i++) {
  81     in_real[i] = input_data_real[i];
  82     in_imag[i] = input_data_imag[i];
  83   }
  84 }
  85 void setup_warm_tf(int n, fftval_t in_real[], fftval_t in_imag[])
  86 {
  87   int i;
  88   for(i=0; i < n; i++) {
  89     in_real[i] = tf_real[i];
  90     in_imag[i] = tf_imag[i];
  91   }
  92 }
  93
  94 fftval_t calculate_error( int n, const fftval_t test_real[], const fftval_t test_imag[])
  95 {
  96   fftval_t current_max = 0;
  97   printf("idx, real expected, real observed, imag expected, imag observed %d\n", 0);
  98
  99 #if defined(FFT_FIXED)
 100   for(int i = 0; i < n; i++)
 101   {
 102     const double scale = 1 << FIX_PT;
 103     const double real_diff = (test_real[i] - output_data_real[i])/scale;
 104     const double imag_diff = (test_imag[i] - output_data_imag[i])/scale;
 105
 106     const double i_sq_error = real_diff*real_diff + imag_diff*imag_diff;
 107     if(i_sq_error > current_max) {
 108       printf("i = %d, current error: %d\n", i, (long)current_max);
 109       current_max = i_sq_error;
 110     }
 111   }
 112 #elif defined(FFT_FLOATING)
 113   fftval_t real_expect = 0.0;
 114   fftval_t imag_expect = 0.0;
 115   for(int i = 0; i < n; i++)
 116   {
 117     /* TODO: Fix error caculation for half precision */
 118     const fftval_t real_diff = (test_real[i] - output_data_real[i]);
 119     const fftval_t imag_diff = (test_imag[i] - output_data_imag[i]);
 120     fftval_t i_sq_error = real_diff*real_diff + imag_diff*imag_diff;
 121
 122 #if 0
 123     long tr = (long)(test_real[i] * 1000000000);
 124     long ti = (long)(test_imag[i] * 1000000000);
 125     long er = (long)(output_data_real[i] * 1000000000);
 126     long ei = (long)(output_data_imag[i] * 1000000000);
 127
 128     printf("i = %d, expected (%d,%d) and got (%d,%d), diff (%d,%d)\n",
 129            i,
 130            er, ei,
 131            tr, ti,
 132            er-tr, ei-ti);
 133 #endif
 134
 135 #if 1
 136     fftbit_t tr, ti, er, ei;
 137 #ifdef FP_HALF
 138     tr = test_real[i];
 139     ti = test_imag[i];
 140     er = output_data_real[i];
 141     ei = output_data_imag[i];
 142 #else
 143     union bits {
 144       fftval_t v;
 145       fftbit_t u;
 146     } bits;
 147     bits.v = test_real[i]; tr = bits.u;
 148     bits.v = test_imag[i]; ti = bits.u;
 149     bits.v = output_data_real[i]; er = bits.u;
 150     bits.v = output_data_imag[i]; ei = bits.u;
 151 #endif
 152     printf("%d: %d %d %d %d\n", i, er, tr, ei, ti);
 153     // printf("%4d\t" FFT_PRI "\t" FFT_PRI "\t" FFT_PRI "\t" FFT_PRI "\n",
 154     //       i, er, tr, ei, ti);
 155 #endif
 156
 157 #if 0
 158     if(i_sq_error > current_max) {
 159       printf("i = %d, max error (ppb): %ld\n", i, (long)(current_max * 1000000000));
 160       current_max = i_sq_error;
 161       real_expect = output_data_real[i];
 162       imag_expect = output_data_imag[i];
 163     }
 164 #endif
 165   }
 166 /*
 167   printf("real expected: %d\n", (long)(real_expect));
 168   printf("imag expected: %d\n", (long)(imag_expect));
 169 */
 170 #endif
 171
 172   return current_max;
 173 }
 174
 175 void finishTest( double max_sq_error, long long num_cycles, long long num_retired)
 176 {
 177   int passed = max_sq_error < 10e-8;
 178
 179   if( passed ) printstr("*** PASSED ***");
 180   else printstr("*** FAILED ***");
 181
 182   printf(" (num_cycles = %ld, num_inst_retired = %ld)\n", num_cycles, num_retired);
 183
 184   passed = passed ? 1 : 2; // if it passed, return 1
 185
 186   exit();
 187 }
 188
 189 void setStats( int enable )
 190 {
 191 #if ( !HOST_DEBUG && SET_STATS )
 192   //asm( "mtpcr %0, cr10" : : "r" (enable) );
 193 #endif
 194 }
 195
 196 long long getCycles()
 197 {
 198    long long cycles = 1337;
 199 #if ( !HOST_DEBUG && SET_STATS )
 200   __asm__ __volatile__( "rdcycle %0" : "=r" (cycles) );
 201 #endif
 202   return cycles;
 203 }
 204
 205 long long getInstRetired()
 206 {
 207    long long inst_retired = 1338;
 208 #if ( !HOST_DEBUG && SET_STATS )
 209   __asm__ __volatile__( "rdinstret %0" : "=r" (inst_retired) );
 210 #endif
 211   return inst_retired;
 212 }
 213
 214 #endif /* !MINIMAL */
 215
 216 //--------------------------------------------------------------------------
 217 // Main
 218 #define HWACHA_RADIX 2
 219
 220 #ifdef DATA_IN_UNPERMUTED
 221 void permute(fftval_t workspace_real[], fftval_t workspace_imag[])
 222 {
 223   const int logradix = log2down(HWACHA_RADIX);
 224   const int term_mask = HWACHA_RADIX-1;
 225   const int num_term = log2down(FFT_SIZE)/logradix;
 226   for(int i = 0; i < FFT_SIZE; i++)
 227   {
 228     // Get permuted address
 229     int i_left = i;
 230     int permuted = 0;
 231     for(int cur_fft_size=HWACHA_RADIX; cur_fft_size <= FFT_SIZE; cur_fft_size = cur_fft_size << logradix)
 232     {
 233       permuted = (permuted << logradix) | (i_left & term_mask);
 234       i_left = i_left >> logradix;
 235     }
 236     // If addresses are different and i < permuted (so we only do permutation once)
 237     if(i < permuted)
 238     {
 239       fftval_t t = workspace_real[i];
 240       fftval_t u = workspace_imag[i];
 241       workspace_real[i] = workspace_real[permuted];
 242       workspace_imag[i] = workspace_imag[permuted];
 243       workspace_real[permuted] = t;
 244       workspace_imag[permuted] = u;
 245     }
 246   }
 247 }
 248 #endif /* DATA_IN_UNPERMUTED */
 249
 250 #if MINIMAL
 251
 252 int main(void)
 253 {
 254 #ifdef DATA_IN_UNPERMUTED
 255   permute(input_data_real, input_data_imag);
 256 #endif
 257   fft(input_data_real, input_data_imag, tf_real, tf_imag);
 258 //  calculate_error(FFT_SIZE, input_data_real, input_data_imag);
 259   exit();
 260 }
 261
 262 #else /* !MINIMAL */
 263
 264 int main(void)
 265 {
 266   static fftval_t workspace_real[FFT_SIZE];
 267   static fftval_t workspace_imag[FFT_SIZE];
 268   static fftval_t warm_tf_real[FFT_SIZE];
 269   static fftval_t warm_tf_imag[FFT_SIZE];
 270   setup_input(FFT_SIZE, workspace_real, workspace_imag);
 271   setup_warm_tf(FFT_SIZE, warm_tf_real, warm_tf_imag);
 272
 273 #if PREALLOCATE
 274   fft(workspace_real, workspace_imag, warm_tf_real, warm_tf_imag);
 275   setup_input(FFT_SIZE, workspace_real, workspace_imag);
 276 #endif
 277
 278   long long start_cycles, start_retired, stop_cycles, stop_retired;
 279   start_cycles = getCycles();
 280   start_retired = getInstRetired();
 281
 282 #ifdef DATA_IN_UNPERMUTED
 283   permute(workspace_real, workspace_imag);
 284 #endif
 285   setStats(1);
 286   fft(workspace_real, workspace_imag, warm_tf_real, warm_tf_imag);
 287   setStats(0);
 288
 289   stop_cycles = getCycles();
 290   stop_retired = getInstRetired();
 291   long long num_cycles = stop_cycles - start_cycles;
 292   long long num_retired = stop_retired - start_retired;
 293
 294   const double max_sq_error = calculate_error(FFT_SIZE, workspace_real, workspace_imag);
 295
 296   // Check the results
 297   finishTest(max_sq_error, num_cycles, num_retired);
 298 }
 299
 300 #endif /* MINIMAL */