benchmarks/vec-fft/vec-vfft.S

   1         .text # everything below is emitted into the text (code) section
   2         .align 2 # NOTE(review): presumably the power-of-two form, i.e. 4-byte alignment -- confirm for this assembler
   3
   4 #include "fft_const.h"
   5
   6 #if defined(FFT_FIXED) /* fixed point build: data is held in integer registers */
   7   #define PTR_SHIFT 2 /* shift that turns an element position into a byte offset */
   8   #define PTR_SIZE  4 /* element size in bytes (1 << PTR_SHIFT) */
   9
  10   #define DATA_LOAD lw
  11   #define DATA_STORE sw
  12
  13   #define FFT_MUL mul
  14   #define FFT_ADD add
  15   #define FFT_SUB sub
  16
  17   #define REG0 x3 /* REG0..REG5 alias x3..x8, the same registers the blocks receive arguments in */
  18   #define REG1 x4
  19   #define REG2 x5
  20   #define REG3 x6
  21   #define REG4 x7
  22   #define REG5 x8 /* also read as the shift amount by the sra lines in vf_fft_scale */
  23 #elif defined(FFT_FLOATING) /* floating point build: exactly one precision macro must be set */
  24   #if defined(FP_HALF)
  25     #define PTR_SHIFT 1
  26     #define PTR_SIZE  2
  27
  28     #define DATA_LOAD flh
  29     #define DATA_STORE fsh
  30
  31     #define FFT_MUL fmul.h
  32     #define FFT_ADD fadd.h
  33     #define FFT_SUB fsub.h
  34   #elif defined(FP_SINGLE)
  35     #define PTR_SHIFT 2
  36     #define PTR_SIZE  4
  37
  38     #define DATA_LOAD flw
  39     #define DATA_STORE fsw
  40
  41     #define FFT_MUL fmul.s
  42     #define FFT_ADD fadd.s
  43     #define FFT_SUB fsub.s
  44   #elif defined(FP_DOUBLE)
  45     #define PTR_SHIFT 3
  46     #define PTR_SIZE  8
  47
  48     #define DATA_LOAD fld
  49     #define DATA_STORE fsd
  50
  51     #define FFT_MUL fmul.d
  52     #define FFT_ADD fadd.d
  53     #define FFT_SUB fsub.d
       #else /* without this guard the macros above stay undefined and the build fails later with unrelated assembler errors */
         #error FP_HALF, FP_SINGLE or FP_DOUBLE not defined
  54   #endif
  55
  56   #define REG0 f0 /* REG0..REG5 alias f0..f5; the x registers keep holding positions and pointers */
  57   #define REG1 f1
  58   #define REG2 f2
  59   #define REG3 f3
  60   #define REG4 f4
  61   #define REG5 f5
  62 #else
  63   #error FFT_FIXED or FFT_FLOATING not defined
  64 #endif
  65
  66         .globl vf_test
  67 vf_test: # test block: x1 <= 2 * (i_x1 + utidx)
  68         utidx x2 # x2 <= utidx (per-element index, see the lane start note in vf_fft_init); x2 is overwritten
  69         add x1, x1, x2 # x1 <= i_x1 + utidx
  70         add x1, x1, x1 # x1 <= 2 * (i_x1 + utidx)
  71         stop # NOTE(review): presumably ends the block; every routine in this file finishes with it
  72
  73         .globl vf_fft_init
  74 vf_fft_init: # from the op index, derive both operand positions and the twiddle factor (TF) position
  75 # IN:
  76 #   x1: lane start (utidx=0 actually has this pos due to stripmining)
  77 #   x2: bit mask to select FFT block from op idx
  78 #   x3: bit mask to select operand in FFT block from op idx
  79 #   x4: shift applied to the TF pos (was marked REMOVED, but the sll below still reads it -- NOTE(review): confirm callers set it)
  80 #   x5: half the current FFT size (add to get the second op)
  81 # OUT:
  82 #   x1: Has the first operand pos = ((opid & i_x2) << 1) + (opid & i_x3)
  83 #   x2: Has the second operand pos = o_x1 + i_x5
  84 #   x3: Has the twiddle factor pos = (opid & i_x3) << i_x4
  85         utidx x6 # x6 <= utidx; x6 is scratch and is overwritten here
  86         add x6, x1, x6 # x6 <= opid = lane start + utidx
  87         and x2, x2, x6 # x2 <= opid & i_x2
  88         and x3, x3, x6 # x3 <= opid & i_x3
  89         slli x2, x2, 1 # x2 <= (opid & i_x2) << 1
  90
  91         add x1, x2, x3 # x1 <= first operand pos (final result)
  92         add x2, x1, x5 # x2 <= second operand pos (final result), uses the new x1
  93         sll x3, x3, x4 # x3 <= TF pos (final result); shift amount comes from x4
  94
  95         stop
  96
  97         .globl vf_fft_scale
  98 vf_fft_scale: # turns positions into byte offsets, then computes scale = tf * op2 (complex product)
  99 # IN:
 100 #   x1: Has the first operand pos (reused)
 101 #   x2: Has the second operand pos (reused)
 102 #   x3: Has the twiddle factor pos (reused)
 103 #   x4: Has the tf real ptr
 104 #   x5: Has the tf imag ptr
 105 #   x6: Has the workspace real ptr
 106 #   x7: Has the workspace imag ptr
 107 #   x8: Has the fixed point shift (FFT_FIXED only, where REG5 is x8; was marked REMOVED but the sra lines still read it)
 108 # OUT:
 109 #   x1: Has the first operand offset = i_x1 << PTR_SHIFT
 110 #   x2: Has the second operand offset = i_x2 << PTR_SHIFT
 111 #   REG0: Has the scale factor real (x3 when FFT_FIXED, f0 when FFT_FLOATING)
 112 #   REG1: Has the scale factor imag (x4 when FFT_FIXED, f1 when FFT_FLOATING)
 113         # Convert positions into actual memory offsets from table start
 114         slli x1, x1, PTR_SHIFT # x1 <= i_x1 << PTR_SHIFT (proper result)
 115         slli x2, x2, PTR_SHIFT # x2 <= i_x2 << PTR_SHIFT (proper result)
 116         slli x3, x3, PTR_SHIFT # x3 <= i_x3 << PTR_SHIFT (tf offset)
 117
 118         # Compute memory locations
 119         add x4, x4, x3 # x4 <= load address for tf real
 120         add x5, x5, x3 # x5 <= load address for tf imag
 121         add x6, x6, x2 # x6 <= load address for op2 real
 122         add x7, x7, x2 # x7 <= load address for op2 imag
 123
 124         # Actually read memory (when FFT_FIXED, REG1..REG4 are x4..x7, so each load replaces its own address)
 125         DATA_LOAD REG1, 0(x4) # tf real (a)
 126         DATA_LOAD REG2, 0(x5) # tf imag (bi)
 127         DATA_LOAD REG3, 0(x6) # op2 real (c)
 128         DATA_LOAD REG4, 0(x7) # op2 imag (di)
 129
 130         # Do the math using 3 multiplies: (a+bi)(c+di) = (ac - bd) + (ad + bc)i
 131         FFT_ADD REG0, REG1, REG2 # REG0 <= a + b
 132         FFT_SUB REG2, REG2, REG1 # REG2 <= b - a
 133         FFT_MUL REG0, REG0, REG4 # REG0 <= (a+b)d
 134 #ifdef FFT_FIXED
 135         sra REG0, REG0, REG5 # shift the product right by REG5 (x8); DO NOT SHIFT FOR FLOATING
 136 #endif
 137         FFT_MUL REG2, REG2, REG3 # REG2 <= (b-a)c
 138 #ifdef FFT_FIXED
 139         sra REG2, REG2, REG5 # shift the product right by REG5 (x8); DO NOT SHIFT FOR FLOATING
 140 #endif
 141         FFT_ADD REG3, REG3, REG4 # REG3 <= c + d
 142         FFT_MUL REG4, REG1, REG3 # REG4 <= a(c+d)
 143 #ifdef FFT_FIXED
 144         sra REG4, REG4, REG5 # shift the product right by REG5 (x8); DO NOT SHIFT FOR FLOATING
 145 #endif
 146
 147         # Prepare final result
 148         FFT_SUB REG0, REG4, REG0 # REG0 <= a(c+d) - (a+b)d = ac - bd (scale real)
 149         FFT_ADD REG1, REG4, REG2 # REG1 <= a(c+d) + (b-a)c = ad + bc (scale imag)
 150
 151         stop
 152 /*
 153         # Four multiply version
 154         # Do the multiplications (a+bi)(c+di) needs ac ad bc bd
 155         mul x3, x4, x6 # x3 <= ac
 156         mul x4, x4, x7 # x4 <= adi
 157         mul x6, x5, x6 # x6 <= bc
 158         mul x5, x5, x7 # x5 <= bdi
 159         sra x3, x3, x8 # These 4 shifts make sure the fixed pt properly aligned
 160         sra x4, x4, x8
 161         sra x5, x5, x8
 162         sra x6, x6, x8
 163
 164         # Do the additions (ac - bd) and (bc + ad)
 165         sub x3, x3, x5 # x3 <= ac - bd (proper result)
 166         add x4, x4, x6 # x4 <= bc + ad (proper result)
 167 */
 168
 169         .globl vf_fft_exec
 170 vf_fft_exec: # loads op1 and forms res1 = op1 + scale, res2 = op1 - scale
 171 # IN:
 172 #   x1: Has the first operand offset (reused)
 173 #   x2: Has the second operand offset (reused)
 174 #   REG0: Has the scale factor real (reused; x3 when FFT_FIXED, f0 when FFT_FLOATING)
 175 #   REG1: Has the scale factor imag (reused; x4 when FFT_FIXED, f1 when FFT_FLOATING)
 176 #   x5: Has the workspace real ptr
 177 #   x6: Has the workspace imag ptr
 178 # OUT:
 179 #   x1: Has the first operand offset (carry)
 180 #   x2: Has the second operand offset (carry)
 181 #   REG2: Has the first result real (x5 when FFT_FIXED, f2 when FFT_FLOATING)
 182 #   REG3: Has the first result imag (x6 when FFT_FIXED, f3 when FFT_FLOATING)
 183 #   REG4: Has the second result real (x7 when FFT_FIXED, f4 when FFT_FLOATING)
 184 #   REG5: Has the second result imag (x8 when FFT_FIXED, f5 when FFT_FLOATING)
 185         # Compute first operand memory locations
 186         add x5, x5, x1 # x5 <= load address for op1 real
 187         add x6, x6, x1 # x6 <= load address for op1 imag
 188
 189         # Actually read memory (when FFT_FIXED, REG2/REG3 are x5/x6, so each load replaces its own address)
 190         DATA_LOAD REG2, 0(x5) # op1 real
 191         DATA_LOAD REG3, 0(x6) # op1 imag
 192
 193         # Do the add/subs (res1=op1+scale), (res2=op1-scale); res2 goes first because res1 overwrites op1
 194         FFT_SUB REG4, REG2, REG0 # res2 real
 195         FFT_SUB REG5, REG3, REG1 # res2 imag
 196         FFT_ADD REG2, REG2, REG0 # res1 real
 197         FFT_ADD REG3, REG3, REG1 # res1 imag
 198
 199         stop
 200
 201         .globl vf_fft_store1
 202 vf_fft_store1: # writes the first result (REG2, REG3) to the workspace at the first operand offset
 203 # IN:
 204 #   x1: Has the first operand offset (reused)
 205 #   x2: Has the second operand offset (reused)
 206 #   x3: Has the workspace real ptr
 207 #   x4: Has the workspace imag ptr
 208 #   REG2: Has the first result real (reused; x5 when FFT_FIXED, f2 when FFT_FLOATING)
 209 #   REG3: Has the first result imag (reused; x6 when FFT_FIXED, f3 when FFT_FLOATING)
 210 #   REG4: Has the second result real (reused; x7 when FFT_FIXED, f4 when FFT_FLOATING)
 211 #   REG5: Has the second result imag (reused; x8 when FFT_FIXED, f5 when FFT_FLOATING)
 212 # OUT:
 213 #   x2: Has the second operand offset (carry)
 214 #   REG4: Has the second result real (carry)
 215 #   REG5: Has the second result imag (carry)
 216         # Compute first result memory locations
 217         add x3, x3, x1 # x3 <= store address for res1 real
 218         add x4, x4, x1 # x4 <= store address for res1 imag
 219
 220         # actually write memory
 221         DATA_STORE REG2, 0(x3) # res1 real
 222         DATA_STORE REG3, 0(x4) # res1 imag
 223
 224         stop
 225
 226         .globl vf_fft_store2
 227 vf_fft_store2: # writes the second result (REG4, REG5) to the workspace at the second operand offset
 228 # IN:
 229 #   x2: Has the second operand offset
 230 #   x3: Has the workspace real ptr
 231 #   x4: Has the workspace imag ptr
 232 #   REG4: Has the second result real (x7 when FFT_FIXED, f4 when FFT_FLOATING)
 233 #   REG5: Has the second result imag (x8 when FFT_FIXED, f5 when FFT_FLOATING)
 234 # OUT: (none)
 235         # Compute second result memory locations
 236         add x3, x3, x2 # x3 <= store address for res2 real
 237         add x4, x4, x2 # x4 <= store address for res2 imag
 238
 239         # actually write memory
 240         DATA_STORE REG4, 0(x3) # res2 real
 241         DATA_STORE REG5, 0(x4) # res2 imag
 242
 243         stop