benchmarks/vec-fft/vec-fft.c

   1 // See LICENSE for license details.
   2
   3 // *************************************************************************
   4 // FFT function (C version, drives the Hwacha vector-fetch blocks)
   5 // -------------------------------------------------------------------------
   6 #include "vec-fft.h"
   7 #include "fft_const.h"
   8
   9 int log2down(int in)
  10 {
  11   int counter = -1;
  12   while(in > 0) { counter++; in = in >> 1; }
  13   return counter;
  14 }
  15
  16 void fft(fftval_t workspace_real[], fftval_t workspace_imag[],
  17          const fftval_t tf_real[],  const fftval_t tf_imag[]) //size is FFT_SIZE
  18 {
  19   const int num_stage_ops = FFT_SIZE >> 1;
  20   const int logfftsize = log2down(FFT_SIZE);
  21
  22   int given_vl;
  23   // First, setup hwacha to what we need:
  24 #if defined(FFT_FIXED)
  25   //   num_stage_ops VL, 9 x-reg (1 zero, 2 ctrl, 4 data, 2 scratch), 1 fpu (avert bug)
  26   asm volatile ("vsetcfg 9, 1");
  27 #elif defined(FFT_FLOATING)
  28   asm volatile ("vsetcfg 8, 6");
  29   #if defined(FP_SINGLE)
  30 //    asm volatile ("vsetprec 32");
  31   #elif defined(FP_HALF)
  32 //    asm volatile ("vsetprec 16");
  33   #elif defined(FP_DOUBLE)
  34   #else
  35     #error wat
  36   #endif
  37 #endif
  38   asm volatile ("vsetvl %[gvl], %[nvl]" : [gvl]"=r"(given_vl) : [nvl]"r"(num_stage_ops));
  39   asm volatile ("fence"); // Make sure prefilling of workspace is complete
  40
  41   for(int stage = 0; stage < logfftsize; stage++)
  42   {
  43     const int half_cur_fft_size = (1 << stage);
  44     const int sel_block_op = half_cur_fft_size-1;
  45     const int sel_block = ~sel_block_op;
  46     const int tf_scale = logfftsize - stage - 1;
  47
  48     // Stripmining loop
  49     for(int lane_start = 0; lane_start < num_stage_ops; lane_start += given_vl)
  50     {
  51       // Setup new vector length for this stripmining pass
  52       const int needed_vl = num_stage_ops - lane_start;
  53       asm volatile ("vsetvl %[gvl], %[nvl]" : [gvl]"=r"(given_vl) : [nvl]"r"(needed_vl));
  54
  55 #if defined(FFT_FIXED)
  56       // First VF block to have vector unit determine what op it is doing
  57       asm volatile (R"(
  58           vmsv vx1, %[lane_start]
  59           vmsv vx2, %[sel_block]
  60           vmsv vx3, %[sel_block_op]
  61           vmsv vx4, %[tf_scale]
  62           vmsv vx5, %[half_cfs]
  63           vf 0(%[vf_ptr])
  64       )": // no output registers
  65         : [lane_start]"r"(lane_start),
  66           [sel_block]"r"(sel_block),
  67           [sel_block_op]"r"(sel_block_op),
  68           [tf_scale]"r"(tf_scale),
  69           [half_cfs]"r"(half_cur_fft_size),
  70           [vf_ptr]"r"(&vf_fft_init)
  71         : // no clobber
  72       );
  73
  74       // Second VF block loads tf and op2 then calculates scale factor
  75       asm volatile (R"(
  76           vmsv vx4, %[tf_real]
  77           vmsv vx5, %[tf_imag]
  78           vmsv vx6, %[workspace_real]
  79           vmsv vx7, %[workspace_imag]
  80           vmsv vx8, %[fix_pt]
  81           vf 0(%[vf_ptr])
  82       )": // no output registers
  83         : [tf_real]"r"(tf_real),
  84           [tf_imag]"r"(tf_imag),
  85           [workspace_real]"r"(workspace_real),
  86           [workspace_imag]"r"(workspace_imag),
  87           [fix_pt]"r"(FIX_PT),
  88           [vf_ptr]"r"(&vf_fft_scale)
  89         : // no clobber
  90       );
  91 #elif defined(FFT_FLOATING)
  92       // First VF block to have vector unit determine what op it is doing
  93       asm volatile (R"(
  94           vmsv vx1, %[lane_start]
  95           vmsv vx2, %[sel_block]
  96           vmsv vx3, %[sel_block_op]
  97           vmsv vx4, %[tf_scale]
  98           vmsv vx5, %[half_cfs]
  99           vf 0(%[vf_ptr])
 100       )": // no output registers
 101         : [lane_start]"r"(lane_start),
 102           [sel_block]"r"(sel_block),
 103           [sel_block_op]"r"(sel_block_op),
 104           [tf_scale]"r"(tf_scale),
 105           [half_cfs]"r"(half_cur_fft_size),
 106           [vf_ptr]"r"(&vf_fft_init)
 107         : // no clobber
 108       );
 109
 110       // Second VF block loads tf and op2 then calculates scale factor
 111       asm volatile (R"(
 112           vmsv vx4, %[tf_real]
 113           vmsv vx5, %[tf_imag]
 114           vmsv vx6, %[workspace_real]
 115           vmsv vx7, %[workspace_imag]
 116           vf 0(%[vf_ptr])
 117       )": // no output registers
 118         : [tf_real]"r"(tf_real),
 119           [tf_imag]"r"(tf_imag),
 120           [workspace_real]"r"(workspace_real),
 121           [workspace_imag]"r"(workspace_imag),
 122           [vf_ptr]"r"(&vf_fft_scale)
 123         : // no clobber
 124       );
 125 #else
 126   #error no mode selected in vec-fft/vec-fft.c
 127 #endif
 128
 129       // Third VF block actually calculates the results
 130       asm volatile (R"(
 131           vmsv vx5, %[workspace_real]
 132           vmsv vx6, %[workspace_imag]
 133           vf 0(%[vf_ptr])
 134       )": // no output registers
 135         : [workspace_real]"r"(workspace_real),
 136           [workspace_imag]"r"(workspace_imag),
 137           [vf_ptr]"r"(&vf_fft_exec)
 138         : // no clobber
 139       );
 140
 141       // Fourth VF block stores first result
 142       asm volatile (R"(
 143           vmsv vx3, %[workspace_real]
 144           vmsv vx4, %[workspace_imag]
 145           vf 0(%[vf_ptr])
 146       )": // no output registers
 147         : [workspace_real]"r"(workspace_real),
 148           [workspace_imag]"r"(workspace_imag),
 149           [vf_ptr]"r"(&vf_fft_store1)
 150         : "memory"
 151       );
 152
 153       // Fifth VF block stores second result
 154       asm volatile (R"(
 155           vmsv vx3, %[workspace_real]
 156           vmsv vx4, %[workspace_imag]
 157           vf 0(%[vf_ptr])
 158       )": // no output registers
 159         : [workspace_real]"r"(workspace_real),
 160           [workspace_imag]"r"(workspace_imag),
 161           [vf_ptr]"r"(&vf_fft_store2)
 162         : "memory"
 163       );
 164
 165     }
 166   }
 167
 168   asm volatile ("fence"); // Make sure all that work from vector unit is visible to CPU
 169
 170   return;
 171 }