#*****************************************************************************
# cmplxmult function (assembly version)
#-----------------------------------------------------------------------------


#--------------------------------------------------------------------------
# Headers and Defines
#--------------------------------------------------------------------------
 
# Here are some defines that make writing assembly code easier.

# I'm using the knowledge that rN will be placed in register a0, rA will be
# placed into register a1, etc., based on the calling convention for functions.
 

# Function arguments: element count n and the a[], b[], c[] array pointers.
#define rN      a0
#define rA      a1
#define rB      a2
#define rC      a3

# Vector length written by the vector configuration instructions, and the
# byte stride passed to the strided vector loads and stores.
#define rVlen   a6
#define rStride a7

# Scratch pointers holding an array pointer plus 4 bytes, used to reach the
# imaginary half of each complex element.
#define rAI t0
#define rBI t1
#define rCI t2

# WARNING: do not write to the s0,...,s9 registers without first saving them to
# the stack.  

#--------------------------------------------------------------------------
# void scalar_cmplxmult_asm( int n, float a[], float b[], float c[] )
#--------------------------------------------------------------------------

        .text
        .align 2
        .globl scalar_cmplxmult_asm
        .type  scalar_cmplxmult_asm,@function

        # *****   Scalar Example   *****
        #
        # Scalar reference version.  For each of the n complex elements it
        # computes c = a * b, where every array holds interleaved
        # (real, imaginary) single-precision pairs, 8 bytes per element:
        #
        #     c.re = a.re*b.re - a.im*b.im
        #     c.im = a.im*b.re + a.re*b.im
        #
        # Register use:
        #     rN          complex elements still to process
        #     rA, rB      read pointers into a[] and b[]
        #     rC          write pointer into c[]
        #     f2-f11      scratch, overwritten and not restored
        #
        # NOTE(review): f2-f11 are assumed to be caller-saved in the target
        # ABI; confirm against the calling convention in use.
        #
        # This is a deliberately naive implementation.  Possible speedups:
        # re-ordering instructions, using the fused multiply-add/subtract
        # instructions (e.g. fmsub.s fa2,fa4,fa3,ft1), and loop unrolling.

scalar_cmplxmult_asm:

        blez rN, done            # nothing to do when n <= 0

loop:
        # Load both operands first so that c[] may alias a[] or b[].
        flw  f2, 0(rA)           # f2 = a.re
        flw  f3, 4(rA)           # f3 = a.im
        addi rA, rA, 8           # advance a[] to the next element
        flw  f4, 0(rB)           # f4 = b.re
        flw  f5, 4(rB)           # f5 = b.im
        addi rB, rB, 8           # advance b[] to the next element
        addi rN, rN, -1          # one fewer element remaining

        # Real part.
        fmul.s f6, f2, f4        # f6  = a.re * b.re
        fmul.s f7, f3, f5        # f7  = a.im * b.im
        fsub.s f10, f6, f7       # f10 = c.re

        # Imaginary part.
        fmul.s f8, f3, f4        # f8  = a.im * b.re
        fmul.s f9, f2, f5        # f9  = a.re * b.im
        fadd.s f11, f8, f9       # f11 = c.im

        fsw  f10, 0(rC)
        fsw  f11, 4(rC)
        addi rC, rC, 8           # advance c[] to the next element

        bne  rN, zero, loop      # repeat until every element is done
done:
        ret

 
#--------------------------------------------------------------------------
# void vt_cmplxmult_asm( int n, float a[], float b[], float c[] )
#--------------------------------------------------------------------------
 

        # ***** Vector-Thread Example *****

        .globl vt_cmplxmult_asm
        .type  vt_cmplxmult_asm,@function

        # Vector-thread version of the complex multiply.  The arrays are
        # arrays of (real, imaginary) structures, so a plain unit-stride
        # vector load/store cannot be used.  Every vector access below is
        # strided by rStride = 8 bytes, and the imaginary halves are reached
        # through base pointers offset by 4 bytes.
        #
        # Vector register roles shared with the vtcode block:
        #     vf2 = a.re    vf3 = a.im    vf4 = b.re    vf5 = b.im
        #     vf0 = c.re    vf1 = c.im
        #
        # Scalar registers written: rN, rA, rB, rC, rVlen, rStride,
        # rAI, rBI, rCI, a4, a5.

vt_cmplxmult_asm:

        blez rN, cpdone               # no elements: go straight to the exit
        li rStride, 8                 # bytes between consecutive complex elements
        la a4, vtcode                 # a4 = address of the vector-fetch block

        # Configure the vector unit.  NOTE(review): the trailing 1, 7 are
        # presumably the per-element integer and FP register counts; confirm
        # against the vector ISA reference.
        vvcfgivl rVlen, rN, 1, 7

stripmineloop:

        # Step 0: set the vector length.
        # rN is the desired (application) vector length;
        # rVlen receives the vector length actually granted.
        vsetvl rVlen, rN

        # Step 1: strided vector loads of the four operand vectors.
        addi rAI, rA, 4               # address of a[].im
        addi rBI, rB, 4               # address of b[].im
        vflstw vf2, rA, rStride       # real parts of A
        vflstw vf3, rAI, rStride      # imaginary parts of A
        vflstw vf4, rB, rStride       # real parts of B
        vflstw vf5, rBI, rStride      # imaginary parts of B

        # Step 2: run the vector-fetch code on the loaded elements.
        vf 0(a4)

        # Step 3: strided vector stores of the result.
        addi rCI, rC, 4               # address of c[].im
        vfsstw vf0, rC, rStride       # real parts of C
        vfsstw vf1, rCI, rStride      # imaginary parts of C

        # Step 4: book keeping.  rVlen elements of 8 bytes each were
        # consumed, so advance every pointer by rVlen * 8 bytes.
        sub rN, rN, rVlen
        slli a5, rVlen, 3
        add rA, rA, a5
        add rB, rB, a5
        add rC, rC, a5
        bne rN, zero, stripmineloop

cpdone:    
        # NOTE(review): fence.v.l presumably makes the vector stores visible
        # to the scalar code that runs after the return; confirm against the
        # vector ISA reference.
        fence.v.l 
        ret

vtcode:
        # Vector-fetch block: executed for each element of the current
        # vector.  It multiplies one pair of complex numbers.
        #
        # Inputs  (loaded by vt_cmplxmult_asm):
        #     f2 = a.re    f3 = a.im    f4 = b.re    f5 = b.im
        # Outputs (stored by vt_cmplxmult_asm):
        #     f0 = c.re = a.re*b.re - a.im*b.im
        #     f1 = c.im = a.im*b.re + a.re*b.im
        #
        # fmsub.s rd, rs1, rs2, rs3 computes rs1*rs2 - rs3, so the term
        # that is subtracted (a.im*b.im) has to be formed first and passed
        # as rs3.  Forming a.re*b.re first would yield the negated real part.

        # Real part.
        fmul.s f0, f3, f5             # f0 = a.im * b.im
        fmsub.s f0, f2, f4, f0        # f0 = a.re*b.re - a.im*b.im

        # Imaginary part.
        fmul.s f1, f2, f5             # f1 = a.re * b.im
        fmadd.s f1, f3, f4, f1        # f1 = a.im*b.re + a.re*b.im
        stop
        
        # NOTE(review): this ret follows the stop above, so it does not
        # appear to be reachable from the vector-fetch block; it is kept
        # from the original file.  The C code uses a jalr instruction to
        # call the functions in this file, so a jr (also known as "ret",
        # for "return") is what returns to the caller.

        ret