benchmarks/vec-fft/vec-fft.c

   1 // *************************************************************************
   2 // vector FFT function (c version)
   3 // -------------------------------------------------------------------------
   4 #include "vec-fft.h"
   5 #include "fft_const.h"
   6
   7 int log2down(int in)
   8 {
   9   int counter = -1;
  10   while(in > 0) { counter++; in = in >> 1; }
  11   return counter;
  12 }
  13
  14 void fft(fftval_t workspace_real[], fftval_t workspace_imag[],
  15          const fftval_t tf_real[],  const fftval_t tf_imag[]) //size is FFT_SIZE
  16 {
  17   const int num_stage_ops = FFT_SIZE >> 1;
  18   const int logfftsize = log2down(FFT_SIZE);
  19
  20   int given_vl;
  21   // First, setup hwacha to what we need:
  22 #if defined(FFT_FIXED)
  23   //   num_stage_ops VL, 9 x-reg (1 zero, 2 ctrl, 4 data, 2 scratch), 1 fpu (avert bug)
  24   asm volatile ("vsetcfg 9, 1");
  25 #elif defined(FFT_FLOATING)
  26   asm volatile ("vsetcfg 8, 6");
  27   #if defined(FP_SINGLE)
  28 //    asm volatile ("vsetprec 32");
  29   #elif defined(FP_HALF)
  30 //    asm volatile ("vsetprec 16");
  31   #elif defined(FP_DOUBLE)
  32   #else
  33     #error wat
  34   #endif
  35 #endif
  36   asm volatile ("vsetvl %[gvl], %[nvl]" : [gvl]"=r"(given_vl) : [nvl]"r"(num_stage_ops));
  37   asm volatile ("fence"); // Make sure prefilling of workspace is complete
  38
  39   for(int stage = 0; stage < logfftsize; stage++)
  40   {
  41     const int half_cur_fft_size = (1 << stage);
  42     const int sel_block_op = half_cur_fft_size-1;
  43     const int sel_block = ~sel_block_op;
  44     const int tf_scale = logfftsize - stage - 1;
  45
  46     // Stripmining loop
  47     for(int lane_start = 0; lane_start < num_stage_ops; lane_start += given_vl)
  48     {
  49       // Setup new vector length for this stripmining pass
  50       const int needed_vl = num_stage_ops - lane_start;
  51       asm volatile ("vsetvl %[gvl], %[nvl]" : [gvl]"=r"(given_vl) : [nvl]"r"(needed_vl));
  52
  53 #if defined(FFT_FIXED)
  54       // First VF block to have vector unit determine what op it is doing
  55       asm volatile (R"(
  56           vmsv vx1, %[lane_start]
  57           vmsv vx2, %[sel_block]
  58           vmsv vx3, %[sel_block_op]
  59           vmsv vx4, %[tf_scale]
  60           vmsv vx5, %[half_cfs]
  61           vf 0(%[vf_ptr])
  62       )": // no output registers
  63         : [lane_start]"r"(lane_start),
  64           [sel_block]"r"(sel_block),
  65           [sel_block_op]"r"(sel_block_op),
  66           [tf_scale]"r"(tf_scale),
  67           [half_cfs]"r"(half_cur_fft_size),
  68           [vf_ptr]"r"(&vf_fft_init)
  69         : // no clobber
  70       );
  71
  72       // Second VF block loads tf and op2 then calculates scale factor
  73       asm volatile (R"(
  74           vmsv vx4, %[tf_real]
  75           vmsv vx5, %[tf_imag]
  76           vmsv vx6, %[workspace_real]
  77           vmsv vx7, %[workspace_imag]
  78           vmsv vx8, %[fix_pt]
  79           vf 0(%[vf_ptr])
  80       )": // no output registers
  81         : [tf_real]"r"(tf_real),
  82           [tf_imag]"r"(tf_imag),
  83           [workspace_real]"r"(workspace_real),
  84           [workspace_imag]"r"(workspace_imag),
  85           [fix_pt]"r"(FIX_PT),
  86           [vf_ptr]"r"(&vf_fft_scale)
  87         : // no clobber
  88       );
  89 #elif defined(FFT_FLOATING)
  90       // First VF block to have vector unit determine what op it is doing
  91       asm volatile (R"(
  92           vmsv vx1, %[lane_start]
  93           vmsv vx2, %[sel_block]
  94           vmsv vx3, %[sel_block_op]
  95           vmsv vx4, %[tf_scale]
  96           vmsv vx5, %[half_cfs]
  97           vf 0(%[vf_ptr])
  98       )": // no output registers
  99         : [lane_start]"r"(lane_start),
 100           [sel_block]"r"(sel_block),
 101           [sel_block_op]"r"(sel_block_op),
 102           [tf_scale]"r"(tf_scale),
 103           [half_cfs]"r"(half_cur_fft_size),
 104           [vf_ptr]"r"(&vf_fft_init)
 105         : // no clobber
 106       );
 107
 108       // Second VF block loads tf and op2 then calculates scale factor
 109       asm volatile (R"(
 110           vmsv vx4, %[tf_real]
 111           vmsv vx5, %[tf_imag]
 112           vmsv vx6, %[workspace_real]
 113           vmsv vx7, %[workspace_imag]
 114           vf 0(%[vf_ptr])
 115       )": // no output registers
 116         : [tf_real]"r"(tf_real),
 117           [tf_imag]"r"(tf_imag),
 118           [workspace_real]"r"(workspace_real),
 119           [workspace_imag]"r"(workspace_imag),
 120           [vf_ptr]"r"(&vf_fft_scale)
 121         : // no clobber
 122       );
 123 #else
 124   #error no mode selected in vec-fft/vec-fft.c
 125 #endif
 126
 127       // Third VF block actually calculates the results
 128       asm volatile (R"(
 129           vmsv vx5, %[workspace_real]
 130           vmsv vx6, %[workspace_imag]
 131           vf 0(%[vf_ptr])
 132       )": // no output registers
 133         : [workspace_real]"r"(workspace_real),
 134           [workspace_imag]"r"(workspace_imag),
 135           [vf_ptr]"r"(&vf_fft_exec)
 136         : // no clobber
 137       );
 138
 139       // Fourth VF block stores first result
 140       asm volatile (R"(
 141           vmsv vx3, %[workspace_real]
 142           vmsv vx4, %[workspace_imag]
 143           vf 0(%[vf_ptr])
 144       )": // no output registers
 145         : [workspace_real]"r"(workspace_real),
 146           [workspace_imag]"r"(workspace_imag),
 147           [vf_ptr]"r"(&vf_fft_store1)
 148         : "memory"
 149       );
 150
 151       // Fifth VF block stores second result
 152       asm volatile (R"(
 153           vmsv vx3, %[workspace_real]
 154           vmsv vx4, %[workspace_imag]
 155           vf 0(%[vf_ptr])
 156       )": // no output registers
 157         : [workspace_real]"r"(workspace_real),
 158           [workspace_imag]"r"(workspace_imag),
 159           [vf_ptr]"r"(&vf_fft_store2)
 160         : "memory"
 161       );
 162
 163     }
 164   }
 165
 166   asm volatile ("fence"); // Make sure all that work from vector unit is visible to CPU
 167
 168   return;
 169 }