1 #*****************************************************************************
2 # matmul function (assembly version)
3 #-----------------------------------------------------------------------------
6 #--------------------------------------------------------------------------
8 #--------------------------------------------------------------------------
10 # Here are some defines that make writing assembly code easier.
12 # I'm using the knowledge that rLda will be placed in register a0, rA will be
13 # placed into register a1, etc., based on the calling convention for functions.
27 # address of VT function
31 # desired app vector length (number of elements to vectorize)
44 # WARNING: do not write to the s0,...,s9 registers without first saving them to
47 #--------------------------------------------------------------------------
48 # void scalar_matmul_asm( int n, float a[], float b[], float c[] )
49 #--------------------------------------------------------------------------
# Scalar version of the matmul routine. The operands are referred to through
# the register aliases (rLda, rA, rB, rC, rI, rJ, rK, ...) set up by the
# defines mentioned at the top of the file.
# NOTE(review): loop labels, index updates and the store of the result are
# not visible in this excerpt; the comments below describe only the
# instructions that are shown.
53 .globl scalar_matmul_asm
54 .type scalar_matmul_asm,@function
58 # ***** Scalar Example *****
60 blez rLda, done # exit early if lda <= 0 (blez branches on zero as well as on negative values)
# Address of the A element: rATemp = rA + 4 * (rK + rJ * rLda).
# The shift by 2 scales the element index to a byte offset (4-byte floats).
68 mul rTemp0, rJ, rLda # rTemp0 = rJ * rLda; recomputed on every pass (deliberately naive)
69 add rATemp, rK, rTemp0 # element index = rK + rTemp0; could be strength-reduced
70 slli rATemp, rATemp, 2
71 add rATemp, rA, rATemp
# Address of the B element: rBTemp = rB + 4 * (rI + rTemp0).
# NOTE(review): rTemp0 may be recomputed on lines not shown here - confirm.
74 add rBTemp, rI, rTemp0
75 slli rBTemp, rBTemp, 2
76 add rBTemp, rB, rBTemp
# Address of the C element: rCTemp = rC + 4 * (rI + rTemp0).
# NOTE(review): rTemp0 may be recomputed on lines not shown here - confirm.
79 add rCTemp, rI, rTemp0
80 slli rCTemp, rCTemp, 2
81 add rCTemp, rC, rCTemp
83 flw f2, 0(rATemp) # f2 = single-precision word at rATemp; reloaded each pass
84 # This load can be lifted out of the inner loop.
103 #--------------------------------------------------------------------------
104 # void vt_matmul_asm( int n, float a[], float b[], float c[] )
105 #--------------------------------------------------------------------------
# Vector-thread version of the matmul routine. The visible code configures
# the vector unit, then runs three nested loops (vec_loopj / vec_loopi /
# vec_loopk) whose bodies issue strided vector loads from A; the k-loop body
# is written out four times with byte offsets 0, 4, 8 and 12.
# NOTE(review): loop labels, index increments, the B/C vector loads and
# stores, and the vector-fetch instructions are not visible in this excerpt.
# NOTE(review): rLda4, rI4 and rK4 look like rLda, rI and rK scaled to byte
# offsets (x4) - confirm against the setup code that is not shown.
108 # ***** Vector-Thread Example *****
111 .type vt_matmul_asm,@function
120 # turn on vector unit
125 blez rLda, cpdone # exit early if lda <= 0 (blez branches on zero as well as on negative values)
131 # For starters ask for all the registers. We shouldn't need this many,
132 # but we'll trim the request once we have correctness in hand.
# NOTE(review): the two immediates presumably select how many registers each
# element gets - confirm their meaning against the ISA documentation.
133 vvcfgivl rVlen, rNum, 1, 5
142 sub rNum, rN, rI # rNum = rN - rI
143 vsetvl rVlen, rNum # set the vector length
144 # rNum is the desired (application) vector length passed to vsetvl
145 # rVlen is what vector length we were given
147 #####################################
149 #####################################
# C pointer for this (i, j) position: rCTemp = rC + rI4 + rJ * rLda4,
# then advanced by one further rLda4 step.
150 mul rTemp0, rJ, rLda4
151 add rCTemp, rI4, rTemp0
153 add rCTemp, rC, rCTemp
156 add rCTemp, rCTemp, rLda4
160 #################################
161 # address calculation lifts #
162 #################################
# Hoisted base for the A accesses below: rATmp2 = rA + rJ * rLda4.
163 mul rTemp0, rJ, rLda4
164 add rATmp2, rA, rTemp0
171 #####################################
173 #####################################
# k-step 0: load from rATmp2 + rK4 into vf0, and from one rLda4 step further
# into vf3. The stride operand is the zero register.
# NOTE(review): a zero stride presumably gives every element the same word
# (a broadcast) - confirm against the ISA documentation.
175 add rATemp, rK4, rATmp2
176 vflstw vf0, rATemp, zero
178 add rATemp, rATemp, rLda4
179 vflstw vf3, rATemp, zero
182 #####################################
184 #####################################
# B pointer for this k: rBTemp = rBTmp2 + rK * rLda4.
# rBTmp2 is set up on lines not shown here.
185 mul rTemp0, rK, rLda4
186 add rBTemp, rBTmp2, rTemp0
190 #####################################
192 #####################################
# k-step 1: same pair of A loads, 4 bytes further along.
193 add rATemp, rK4, rATmp2
194 addi rATemp, rATemp, 4
195 vflstw vf0, rATemp, zero
197 add rATemp, rATemp, rLda4
198 vflstw vf3, rATemp, zero
201 #####################################
203 #####################################
# Advance the B pointer by one rLda4 step.
204 add rBTemp, rBTemp, rLda4
208 #####################################
210 #####################################
# k-step 2: same pair of A loads, 8 bytes further along.
211 add rATemp, rK4, rATmp2
212 addi rATemp, rATemp, 8
213 vflstw vf0, rATemp, zero
215 add rATemp, rATemp, rLda4
216 vflstw vf3, rATemp, zero
219 #####################################
221 #####################################
# Advance the B pointer by one rLda4 step.
222 add rBTemp, rBTemp, rLda4
227 #####################################
229 #####################################
# k-step 3: same pair of A loads, 12 bytes further along.
230 add rATemp, rK4, rATmp2
231 addi rATemp, rATemp, 12
232 vflstw vf0, rATemp, zero
234 add rATemp, rATemp, rLda4
235 vflstw vf3, rATemp, zero
238 #####################################
240 #####################################
# Advance the B pointer by one rLda4 step.
241 add rBTemp, rBTemp, rLda4
# Repeat the k-loop body while rK < rLda (signed compare).
# NOTE(review): the rK / rK4 increments are not visible; with four unrolled
# steps they presumably advance by 4 elements per pass - confirm.
247 blt rK, rLda, vec_loopk
250 #####################################
252 #####################################
# Step rCTemp back by one rLda4, undoing the extra step added before the k loop.
254 sub rCTemp, rCTemp, rLda4
# Repeat the i loop while rI < rLda, then the j loop while rJ < rLda.
258 blt rI, rLda, vec_loopi
262 blt rJ, rLda, vec_loopj
276 # Vector-element code for the vector-thread matmul.
278 # The former TODO is done: the accumulation uses fused multiply-add.
# NOTE(review): f0/f3 presumably correspond to the vf0/vf3 vector loads and
# f1 to a B operand loaded on lines not shown here - confirm.
280 fmadd.s f2, f0, f1, f2 # f2 = f0 * f1 + f2
281 fmadd.s f4, f3, f1, f4 # f4 = f3 * f1 + f4
# Two further accumulators, currently disabled:
282 #fmadd.s f6, f5, f1, f6
283 #fmadd.s f8, f7, f1, f8
# NOTE(review): the entry label of this routine is not visible in this
# excerpt. It shares the cpdone exit label with the vector-thread code above.
291 # turn on vector unit
297 blez rLda, cpdone # exit early if lda <= 0 (blez branches on zero as well as on negative values)
# NOTE(review): the two immediates presumably select how many registers each
# element gets - confirm their meaning against the ISA documentation.
298 vvcfgivl rVlen, rNum, 1, 1
# rATemp = rA + 4 * (rI + rTemp0); rTemp0 is computed on lines not shown here.
309 add rATemp, rI, rTemp0
310 slli rATemp, rATemp, 2
311 add rATemp, rA, rATemp
# rBTemp = rB + 4 * (rJ + rTemp0).
# NOTE(review): rTemp0 may be recomputed on lines not shown here - confirm.
314 add rBTemp, rJ, rTemp0
315 slli rBTemp, rBTemp, 2
316 add rBTemp, rB, rBTemp
# Strided vector load into vf0, starting at rBTemp with stride rLda4.
# NOTE(review): rLda4 presumably is the row pitch in bytes, which would make
# this a walk down one column - confirm.
320 vflstw vf0, rBTemp, rLda4
332 # The C code uses a jalr instruction to call this function,
333 # so we can use a jr to return back to where the function
334 # was called. This is also known as "ret", for "return".
340 #####################################
341 # NOPS TO AVOID OVERPREFETCH #
342 #####################################
343 # srli rTemp0, rLda, 4
344 #nop_lp: addi rTemp0, rTemp0, -1
345 # bgez rTemp0, nop_lp