# See LICENSE for license details.
#
# FFT butterfly work, split into small blocks exported under vf_* labels.
# Every block ends with "stop", and values are handed from one block to the
# next in registers (marked "reused" / "carry" in the per-block headers).
#
# The element type is chosen at preprocessing time:
#   FFT_FIXED                      32-bit integers, data values live in x3..x8
#   FFT_FLOATING + FP_HALF         16-bit floats,   data values live in f0..f5
#   FFT_FLOATING + FP_SINGLE       32-bit floats,   data values live in f0..f5
#   FFT_FLOATING + FP_DOUBLE       64-bit floats,   data values live in f0..f5
#
# The per-block register headers below name the x registers, which is the
# FFT_FIXED mapping (REG0..REG5 = x3..x8). In FFT_FLOATING builds the data
# values are held in f0..f5 instead, while positions, offsets and pointers
# remain in x registers.

    .text
    .align 2

#include "fft_const.h"

#if defined(FFT_FIXED)

#define PTR_SHIFT   2
#define PTR_SIZE    4
#define DATA_LOAD   lw
#define DATA_STORE  sw
#define FFT_MUL     mul
#define FFT_ADD     add
#define FFT_SUB     sub

#define REG0 x3
#define REG1 x4
#define REG2 x5
#define REG3 x6
#define REG4 x7
#define REG5 x8

#elif defined(FFT_FLOATING)

#if defined(FP_HALF)
#define PTR_SHIFT   1
#define PTR_SIZE    2
#define DATA_LOAD   flh
#define DATA_STORE  fsh
#define FFT_MUL     fmul.h
#define FFT_ADD     fadd.h
#define FFT_SUB     fsub.h
#elif defined(FP_SINGLE)
#define PTR_SHIFT   2
#define PTR_SIZE    4
#define DATA_LOAD   flw
#define DATA_STORE  fsw
#define FFT_MUL     fmul.s
#define FFT_ADD     fadd.s
#define FFT_SUB     fsub.s
#elif defined(FP_DOUBLE)
#define PTR_SHIFT   3
#define PTR_SIZE    8
#define DATA_LOAD   fld
#define DATA_STORE  fsd
#define FFT_MUL     fmul.d
#define FFT_ADD     fadd.d
#define FFT_SUB     fsub.d
#else
#error FP_HALF, FP_SINGLE or FP_DOUBLE not defined
#endif

#define REG0 f0
#define REG1 f1
#define REG2 f2
#define REG3 f3
#define REG4 f4
#define REG5 f5

#else
#error FFT_FIXED or FFT_FLOATING not defined
#endif


    .globl vf_test
vf_test:
    # IN:
    #   x1: base value
    # OUT:
    #   x1: 2 * (i_x1 + utidx)
    #   x2: the value produced by utidx
    utidx x2
    add x1, x1, x2
    add x1, x1, x1
    stop


    .globl vf_fft_init
vf_fft_init:
    # IN:
    #   x1: lane start (utidx=0 actually has this pos due to stripmining)
    #   x2: bit mask to select FFT block from op idx
    #   x3: bit mask to select operand in FFT block from op idx
    #   x4: necessary shift to adjust TF appropriately
    #       (was marked REMOVED, but the final sll below still reads it)
    #   x5: half the current FFT size (add to get the second op)
    # OUT:
    #   x1: Has the first operand pos  = ((opid & i_x2) << 1) + (opid & i_x3)
    #   x2: Has the second operand pos = o_x1 + i_x5
    #   x3: Has the twiddle factor pos = (opid & i_x3) << i_x4
    # CLOBBERS:
    #   x6 (holds opid)
    utidx x6
    add x6, x1, x6          # x6 <= opid
    and x2, x2, x6          # x2 <= opid & i_x2
    and x3, x3, x6          # x3 <= opid & i_x3
    slli x2, x2, 1          # x2 <= (opid & i_x2) << 1
    add x1, x2, x3          # x1 is now the proper result
    add x2, x1, x5          # x2 is now the proper result
    sll x3, x3, x4          # x3 is now the proper result
    stop


    .globl vf_fft_scale
vf_fft_scale:
    # Multiplies the second operand (c + di) by the twiddle factor (a + bi).
    #
    # IN:
    #   x1: Has the first operand pos (reused)
    #   x2: Has the second operand pos (reused)
    #   x3: Has the twiddle factor pos (reused)
    #   x4: Has the tf real ptr
    #   x5: Has the tf imag ptr
    #   x6: Has the workspace real ptr
    #   x7: Has the workspace imag ptr
    #   x8: Has the fixed point shift; read (as REG5) by the sra instructions
    #       in FFT_FIXED builds only, never read in FFT_FLOATING builds
    #       (was marked REMOVED, but the fixed build still uses it)
    # OUT:
    #   x1: Has the first operand offset  = i_x1 << PTR_SHIFT
    #   x2: Has the second operand offset = i_x2 << PTR_SHIFT
    #   REG0 (x3 fixed / f0 float): Has the scale factor real
    #   REG1 (x4 fixed / f1 float): Has the scale factor imag

    # Convert positions into actual memory offsets from table start
    slli x1, x1, PTR_SHIFT  # x1 <= i_x1 << PTR_SHIFT (proper result)
    slli x2, x2, PTR_SHIFT  # x2 <= i_x2 << PTR_SHIFT (proper result)
    slli x3, x3, PTR_SHIFT  # x3 <= i_x3 << PTR_SHIFT (tf offset)

    # Compute memory locations
    add x4, x4, x3          # x4 <= load address for tf real
    add x5, x5, x3          # x5 <= load address for tf imag
    add x6, x6, x2          # x6 <= load address for op2 real
    add x7, x7, x2          # x7 <= load address for op2 imag

    # Actually read memory
    DATA_LOAD REG1, 0(x4)   # tf real  (a)
    DATA_LOAD REG2, 0(x5)   # tf imag  (bi)
    DATA_LOAD REG3, 0(x6)   # op2 real (c)
    DATA_LOAD REG4, 0(x7)   # op2 imag (di)

    # Do the math using 3 multiplies:
    #   real = a(c+d) - (a+b)d = ac - bd
    #   imag = a(c+d) + (b-a)c = ad + bc
    FFT_ADD REG0, REG1, REG2    # REG0 <= a + b
    FFT_SUB REG2, REG2, REG1    # REG2 <= b - a
    FFT_MUL REG0, REG0, REG4    # REG0 <= (a+b)d
#ifdef FFT_FIXED
    sra REG0, REG0, REG5        # DO NOT SHIFT FOR FLOATING
#endif
    FFT_MUL REG2, REG2, REG3    # REG2 <= (b-a)c
#ifdef FFT_FIXED
    sra REG2, REG2, REG5        # DO NOT SHIFT FOR FLOATING
#endif
    FFT_ADD REG3, REG3, REG4    # REG3 <= c + d
    FFT_MUL REG4, REG1, REG3    # REG4 <= a(c+d)
#ifdef FFT_FIXED
    sra REG4, REG4, REG5        # DO NOT SHIFT FOR FLOATING
#endif

    # Prepare final result
    FFT_SUB REG0, REG4, REG0    # REG0 <= a(c+d) - (a+b)d (scale real)
    FFT_ADD REG1, REG4, REG2    # REG1 <= a(c+d) + (b-a)c (scale imag)
    stop

/*
    # Four multiply version
    # Do the multiplications (a+bi)(c+di) needs ac ad bc bd
    mul x3, x4, x6  # x3 <= ac
    mul x4, x4, x7  # x4 <= adi
    mul x6, x5, x6  # x6 <= bc
    mul x5, x5, x7  # x5 <= bdi

    sra x3, x3, x8  # These 4 shifts make sure the fixed pt properly aligned
    sra x4, x4, x8
    sra x5, x5, x8
    sra x6, x6, x8

    # Do the additions (ac - bd) and (bc + ad)
    sub x3, x3, x5  # x3 <= ac - bd (proper result)
    add x4, x4, x6  # x4 <= bc + ad (proper result)
*/


    .globl vf_fft_exec
vf_fft_exec:
    # Loads the first operand and forms both butterfly outputs.
    #
    # IN:
    #   x1: Has the first operand offset (reused)
    #   x2: Has the second operand offset (reused)
    #   REG0 (x3 fixed / f0 float): Has the scale factor real (reused)
    #   REG1 (x4 fixed / f1 float): Has the scale factor imag (reused)
    #   x5: Has the workspace real ptr
    #   x6: Has the workspace imag ptr
    # OUT:
    #   x1: Has the first operand offset (carry)
    #   x2: Has the second operand offset (carry)
    #   REG2 (x5 fixed / f2 float): Has the first result real
    #   REG3 (x6 fixed / f3 float): Has the first result imag
    #   REG4 (x7 fixed / f4 float): Has the second result real
    #   REG5 (x8 fixed / f5 float): Has the second result imag

    # Compute first operand memory locations
    add x5, x5, x1          # x5 <= load address for op1 real
    add x6, x6, x1          # x6 <= load address for op1 imag

    # Actually read memory
    DATA_LOAD REG2, 0(x5)   # op1 real
    DATA_LOAD REG3, 0(x6)   # op1 imag

    # Do the add/subs (res1=op1+scale), (res2=op1-scale).
    # The subtractions come first because the additions overwrite op1.
    FFT_SUB REG4, REG2, REG0    # res2 real
    FFT_SUB REG5, REG3, REG1    # res2 imag
    FFT_ADD REG2, REG2, REG0    # res1 real
    FFT_ADD REG3, REG3, REG1    # res1 imag
    stop


    .globl vf_fft_store1
vf_fft_store1:
    # Writes the first result back at the first operand offset.
    #
    # IN:
    #   x1: Has the first operand offset (reused)
    #   x2: Has the second operand offset (reused)
    #   x3: Has the workspace real ptr
    #   x4: Has the workspace imag ptr
    #   REG2 (x5 fixed / f2 float): Has the first result real (reused)
    #   REG3 (x6 fixed / f3 float): Has the first result imag (reused)
    #   REG4 (x7 fixed / f4 float): Has the second result real (reused)
    #   REG5 (x8 fixed / f5 float): Has the second result imag (reused)
    # OUT:
    #   x2: Has the second operand offset (carry)
    #   REG4: Has the second result real (carry)
    #   REG5: Has the second result imag (carry)

    # Compute first result memory locations
    add x3, x3, x1          # x3 <= store address for res1 real
    add x4, x4, x1          # x4 <= store address for res1 imag

    # Actually write memory
    DATA_STORE REG2, 0(x3)
    DATA_STORE REG3, 0(x4)
    stop


    .globl vf_fft_store2
vf_fft_store2:
    # Writes the second result back at the second operand offset.
    #
    # IN:
    #   x2: Has the second operand offset
    #   x3: Has the workspace real ptr
    #   x4: Has the workspace imag ptr
    #   REG4 (x7 fixed / f4 float): Has the second result real
    #   REG5 (x8 fixed / f5 float): Has the second result imag
    # OUT: (none)

    # Compute second result memory locations
    add x3, x3, x2          # x3 <= store address for res2 real
    add x4, x4, x2          # x4 <= store address for res2 imag

    # Actually write memory
    DATA_STORE REG4, 0(x3)
    DATA_STORE REG5, 0(x4)
    stop