benchmarks/vec-fft/vec-vfft.S

   1 # See LICENSE for license details.
   2
   3         .text
   4         .align 2
   5
   6 #include "fft_const.h"
   7
   8 /* Element-type configuration: FFT_FIXED (checked first) or FFT_FLOATING with one of
   9  * FP_HALF, FP_SINGLE, FP_DOUBLE picks the opcodes, element size and registers. */
  10 #if defined(FFT_FIXED)
  11   #define PTR_SHIFT 2   /* log2(PTR_SIZE): turns a position into a byte offset */
  12   #define PTR_SIZE  4   /* element size in bytes, matching lw/sw */
  13   #define DATA_LOAD lw
  14   #define DATA_STORE sw
  15
  16   #define FFT_MUL mul
  17   #define FFT_ADD add
  18   #define FFT_SUB sub
  19
  20   #define REG0 x3       /* integer builds work in x3..x8, the same registers */
  21   #define REG1 x4       /* the kernels below also receive as arguments */
  22   #define REG2 x5
  23   #define REG3 x6
  24   #define REG4 x7
  25   #define REG5 x8
  26 #elif defined(FFT_FLOATING)
  27   #if defined(FP_HALF)
  28     #define PTR_SHIFT 1 /* 2-byte elements */
  29     #define PTR_SIZE  2
  30     #define DATA_LOAD flh
  31     #define DATA_STORE fsh
  32
  33     #define FFT_MUL fmul.h
  34     #define FFT_ADD fadd.h
  35     #define FFT_SUB fsub.h
  36   #elif defined(FP_SINGLE)
  37     #define PTR_SHIFT 2 /* 4-byte elements */
  38     #define PTR_SIZE  4
  39     #define DATA_LOAD flw
  40     #define DATA_STORE fsw
  41
  42     #define FFT_MUL fmul.s
  43     #define FFT_ADD fadd.s
  44     #define FFT_SUB fsub.s
  45   #elif defined(FP_DOUBLE)
  46     #define PTR_SHIFT 3 /* 8-byte elements */
  47     #define PTR_SIZE  8
  48     #define DATA_LOAD fld
  49     #define DATA_STORE fsd
  50
  51     #define FFT_MUL fmul.d
  52     #define FFT_ADD fadd.d
  53     #define FFT_SUB fsub.d
  54   #else
  55     #error FFT_FLOATING requires FP_HALF, FP_SINGLE or FP_DOUBLE to be defined
  56   #endif
  57
  58   #define REG0 f0       /* floating builds work in f0..f5 for every precision */
  59   #define REG1 f1
  60   #define REG2 f2
  61   #define REG3 f3
  62   #define REG4 f4
  63   #define REG5 f5
  64 #else
  65   #error FFT_FIXED or FFT_FLOATING not defined
  66 #endif
  67
  68         .globl vf_test
  69 vf_test: # IN x1: base value. OUT x1 = 2 * (i_x1 + utidx). x2 is overwritten.
  70         utidx x2       # x2 <= micro-thread index of this element
  71         add x1, x2, x1 # x1 <= i_x1 + utidx
  72         slli x1, x1, 1 # x1 <= 2 * (i_x1 + utidx), same value as x1 + x1
  73         stop
  74
  75         .globl vf_fft_init
  76 vf_fft_init:
  77 # Maps each element index (opid) to its two operand positions and twiddle position.
  78 # IN   x1: opid of the element with utidx=0 (start of the current strip)
  79 #      x2: bit mask selecting the FFT block bits of opid
  80 #      x3: bit mask selecting the operand-in-block bits of opid
  81 #      x4: twiddle shift amount; still read by the sll below (was noted REMOVED)
  82 #      x5: half the current FFT size (distance from first to second operand)
  83 # OUT  x1: first operand pos  = ((opid & i_x2) << 1) + (opid & i_x3)
  84 #      x2: second operand pos = o_x1 + i_x5
  85 #      x3: twiddle factor pos = (opid & i_x3) << i_x4
  86 # TMP  x6: overwritten, ends up holding opid
  87         utidx x6
  88         add x6, x6, x1 # x6 <= opid = i_x1 + utidx
  89         and x3, x6, x3 # x3 <= opid & i_x3 (index inside the block)
  90         and x2, x6, x2 # x2 <= opid & i_x2 (block bits)
  91         slli x2, x2, 1 # x2 <= block bits doubled
  92
  93         add x1, x3, x2 # x1 final; must read x3 before it is shifted
  94         sll x3, x3, x4 # x3 final
  95         add x2, x5, x1 # x2 final
  96
  97         stop
  98
  99         .globl vf_fft_scale
 100 vf_fft_scale:
 101 # Computes scale = twiddle factor * second operand (complex product) per element.
 102 # IN   x1: first operand pos (reused)
 103 #      x2: second operand pos (reused)
 104 #      x3: twiddle factor pos (reused)
 105 #      x4: tf real ptr
 106 #      x5: tf imag ptr
 107 #      x6: workspace real ptr
 108 #      x7: workspace imag ptr
 109 #      x8: fixed point shift; read as REG5 under FFT_FIXED only (was noted REMOVED)
 110 # OUT  x1: first operand byte offset  = i_x1 << PTR_SHIFT
 111 #      x2: second operand byte offset = i_x2 << PTR_SHIFT
 112 #      REG0: scale factor real (x3 in fixed builds, f0 in floating builds)
 113 #      REG1: scale factor imag (x4 in fixed builds, f1 in floating builds)
 114 # x3..x7 are overwritten with offset and load addresses; REG2..REG4 are scratch.
 115         # Scale positions by the element size to get byte offsets from table start
 116         slli x1, x1, PTR_SHIFT # x1 <= i_x1 << PTR_SHIFT (final result)
 117         slli x2, x2, PTR_SHIFT # x2 <= i_x2 << PTR_SHIFT (final result)
 118         slli x3, x3, PTR_SHIFT # x3 <= i_x3 << PTR_SHIFT (tf byte offset)
 119
 120         # Turn the four table pointers into load addresses
 121         add x4, x4, x3 # x4 <= load address for tf real
 122         add x5, x5, x3 # x5 <= load address for tf imag
 123         add x6, x6, x2 # x6 <= load address for op2 real
 124         add x7, x7, x2 # x7 <= load address for op2 imag
 125
 126         # Load operands; in fixed builds each load replaces its own address register
 127         DATA_LOAD REG1, 0(x4) # tf real (a)
 128         DATA_LOAD REG2, 0(x5) # tf imag (b)
 129         DATA_LOAD REG3, 0(x6) # op2 real (c)
 130         DATA_LOAD REG4, 0(x7) # op2 imag (d)
 131
 132         # Complex product (a+bi)(c+di) with 3 multiplies: a(c+d), (a+b)d, (b-a)c
 133         FFT_ADD REG0, REG1, REG2 # REG0 <= a + b
 134         FFT_SUB REG2, REG2, REG1 # REG2 <= b - a
 135         FFT_MUL REG0, REG0, REG4 # REG0 <= (a+b)d
 136 #ifdef FFT_FIXED
 137         sra REG0, REG0, REG5 # fixed point only: rescale product by the shift in REG5
 138 #endif
 139         FFT_MUL REG2, REG2, REG3 # REG2 <= (b-a)c
 140 #ifdef FFT_FIXED
 141         sra REG2, REG2, REG5 # fixed point only: rescale product by the shift in REG5
 142 #endif
 143         FFT_ADD REG3, REG3, REG4 # REG3 <= c + d
 144         FFT_MUL REG4, REG1, REG3 # REG4 <= a(c+d)
 145 #ifdef FFT_FIXED
 146         sra REG4, REG4, REG5 # fixed point only: rescale product by the shift in REG5
 147 #endif
 148
 149         # Combine: real = a(c+d) - (a+b)d = ac - bd, imag = a(c+d) + (b-a)c = ad + bc
 150         FFT_SUB REG0, REG4, REG0 # REG0 <= a(c+d) - (a+b)d (scale real)
 151         FFT_ADD REG1, REG4, REG2 # REG1 <= a(c+d) + (b-a)c (scale imag)
 152
 153         stop
 154 /*
 155         # Four multiply version
 156         # Do the multiplications (a+bi)(c+di) needs ac ad bc bd
 157         mul x3, x4, x6 # x3 <= ac
 158         mul x4, x4, x7 # x4 <= adi
 159         mul x6, x5, x6 # x6 <= bc
 160         mul x5, x5, x7 # x5 <= bdi
 161         sra x3, x3, x8 # These 4 shifts make sure the fixed pt properly aligned
 162         sra x4, x4, x8
 163         sra x5, x5, x8
 164         sra x6, x6, x8
 165
 166         # Do the additions (ac - bd) and (bc + ad)
 167         sub x3, x3, x5 # x3 <= ac - bd (proper result)
 168         add x4, x4, x6 # x4 <= bc + ad (proper result)
 169 */
 170
 171         .globl vf_fft_exec
 172 vf_fft_exec:
 173 # FFT butterfly on complex values: res1 = op1 + scale, res2 = op1 - scale.
 174 # IN   x1: first operand offset (added to both workspace ptrs; kept)
 175 #      x2: second operand offset (not read here; kept)
 176 #      REG0: scale factor real (x3 in fixed builds, f0 in floating builds)
 177 #      REG1: scale factor imag (x4 in fixed builds, f1 in floating builds)
 178 #      x5: workspace real ptr
 179 #      x6: workspace imag ptr
 180 # OUT  x1, x2: unchanged
 181 #      REG2: first result real  (x5 fixed, f2 floating)
 182 #      REG3: first result imag  (x6 fixed, f3 floating)
 183 #      REG4: second result real (x7 fixed, f4 floating)
 184 #      REG5: second result imag (x8 fixed, f5 floating)
 185 # x5 and x6 are turned into the op1 load addresses, so the incoming
 186 # workspace ptrs do not survive this block.
 187         # Point x5/x6 at op1 inside the real and imag workspaces
 188         add x5, x1, x5 # x5 <= load address for op1 real
 189         add x6, x1, x6 # x6 <= load address for op1 imag
 190
 191         # Fetch op1
 192         DATA_LOAD REG2, 0(x5) # op1 real
 193         DATA_LOAD REG3, 0(x6) # op1 imag
 194
 195         # Each difference is taken before the sum overwrites its op1 input
 196         FFT_SUB REG4, REG2, REG0 # res2 real = op1 real - scale real
 197         FFT_ADD REG2, REG2, REG0 # res1 real = op1 real + scale real
 198         FFT_SUB REG5, REG3, REG1 # res2 imag = op1 imag - scale imag
 199         FFT_ADD REG3, REG3, REG1 # res1 imag = op1 imag + scale imag
 200
 201         stop
 202
 203         .globl vf_fft_store1
 204 vf_fft_store1:
 205 # Writes the first butterfly result (res1) back into the workspace arrays.
 206 # IN   x1: first operand offset (byte offset added to both ptrs)
 207 #      x2: second operand offset (not read here; carried)
 208 #      x3: workspace real ptr
 209 #      x4: workspace imag ptr
 210 #      REG2: first result real  (x5 in fixed builds, f2 in floating builds)
 211 #      REG3: first result imag  (x6 in fixed builds, f3 in floating builds)
 212 #      REG4: second result real (not read here; carried)
 213 #      REG5: second result imag (not read here; carried)
 214 # OUT  x2, REG4, REG5: untouched, as vf_fft_store2 expects them
 215 #      x3: address that received the real part
 216 #      x4: address that received the imag part
 217 # No other register is written.
 218         # Real part: form the address, then store
 219         add x3, x1, x3         # x3 <= store address for res1 real
 220         DATA_STORE REG2, 0(x3)
 221
 222         # Imag part: form the address, then store
 223         add x4, x1, x4         # x4 <= store address for res1 imag
 224         DATA_STORE REG3, 0(x4)
 225
 226         stop
 227
 228         .globl vf_fft_store2
 229 vf_fft_store2:
 230 # Writes the second butterfly result (res2) back into the workspace arrays.
 231 # IN   x2: second operand offset (byte offset added to both ptrs)
 232 #      x3: workspace real ptr
 233 #      x4: workspace imag ptr
 234 #      REG4: second result real (x7 in fixed builds, f4 in floating builds)
 235 #      REG5: second result imag (x8 in fixed builds, f5 in floating builds)
 236 # OUT  none (x3 and x4 end up holding the two store addresses)
 237         # Real part: form the address, then store
 238         add x3, x2, x3         # x3 <= store address for res2 real
 239         DATA_STORE REG4, 0(x3)
 240
 241         # Imag part: form the address, then store
 242         add x4, x2, x4         # x4 <= store address for res2 imag
 243         DATA_STORE REG5, 0(x4)
 244
 245         stop