1 #*****************************************************************************
2 # matmul function (assembly version)
3 #-----------------------------------------------------------------------------
6 #--------------------------------------------------------------------------
8 #--------------------------------------------------------------------------
12 # Here are some defines that make writing assembly code easier.
14 # I'm using the knowledge that rLda will be placed in register a0, rA will be
15 # placed into register a1, etc., based on the calling convention for functions.
29 # address of VT function
33 # desired app vector length (number of elements to vectorize)
46 # WARNING: do not write to the s0,...,s9 registers without first saving them to
49 #--------------------------------------------------------------------------
50 # void scalar_matmul_asm( int n, float a[], float b[], float c[] )
51 #--------------------------------------------------------------------------
55 .globl scalar_matmul_asm
56 .type scalar_matmul_asm,@function
60 # ***** Scalar Example *****
62 blez rLda, done # exit early if lda <= 0 (blez also branches on zero)
70 mul rTemp0, rJ, rLda # calculate indices... I'm being SUPER naive here:
71 add rATemp, rK, rTemp0 # this could be a lot more clever!
72 slli rATemp, rATemp, 2 # element index -> byte offset (x4)
73 add rATemp, rA, rATemp # rATemp = rA + 4*(rK + rJ*rLda)
76 add rBTemp, rI, rTemp0
77 slli rBTemp, rBTemp, 2 # element index -> byte offset (x4)
78 add rBTemp, rB, rBTemp # rBTemp = rB + 4*(rI + rTemp0)
81 add rCTemp, rI, rTemp0
82 slli rCTemp, rCTemp, 2 # element index -> byte offset (x4)
83 add rCTemp, rC, rCTemp # rCTemp = rC + 4*(rI + rTemp0)
85 flw f2, 0(rATemp) # again, I'm being very lazy...
86 # I can lift this out of the inner loop!
105 #--------------------------------------------------------------------------
106 # void vt_matmul_asm( int n, float a[], float b[], float c[] )
107 #--------------------------------------------------------------------------
110 # ***** Vector-Thread Example *****
113 .type vt_matmul_asm,@function
122 # turn on vector unit
125 blez rLda, cpdone # exit early if lda <= 0 (blez also branches on zero)
131 #for starters ask for all the registers. We shouldn't need this many
132 #but we'll trim it when we have correctness in hand
133 vvcfgivl rVlen, rNum, 1, 5
142 sub rNum, rN, rI # book keeping: rNum = rN - rI
143 vsetvl rVlen, rNum # set the vector length
144 # rNum is the desired (application) vector length
145 # rVlen is what vector length we were given
147 #####################################
149 #####################################
150 mul rTemp0, rJ, rLda4 # rTemp0 = rJ * rLda4
151 add rCTemp, rI4, rTemp0
153 add rCTemp, rC, rCTemp # rCTemp now includes the rC base
156 add rCTemp, rCTemp, rLda4 # step rCTemp forward by rLda4 (undone by the sub further down)
160 #################################
161 # address calculation lifts #
162 #################################
163 mul rTemp0, rJ, rLda4
164 add rATmp2, rA, rTemp0 # rATmp2 = rA + rJ*rLda4
171 #####################################
173 #####################################
175 add rATemp, rK4, rATmp2 # unrolled step 1 of 4: offset +0
176 vflstw vf0, rATemp, zero # NOTE(review): stride operand is zero - presumably a broadcast; confirm
178 add rATemp, rATemp, rLda4 # same offset, rLda4 bytes further on
179 vflstw vf3, rATemp, zero
182 #####################################
184 #####################################
185 mul rTemp0, rK, rLda4
186 add rBTemp, rBTmp2, rTemp0 # rBTemp = rBTmp2 + rK*rLda4
190 #####################################
192 #####################################
193 add rATemp, rK4, rATmp2
194 addi rATemp, rATemp, 4 # unrolled step 2 of 4: offset +4
195 vflstw vf0, rATemp, zero
197 add rATemp, rATemp, rLda4
198 vflstw vf3, rATemp, zero
201 #####################################
203 #####################################
204 add rBTemp, rBTemp, rLda4 # advance rBTemp by rLda4
208 #####################################
210 #####################################
211 add rATemp, rK4, rATmp2
212 addi rATemp, rATemp, 8 # unrolled step 3 of 4: offset +8
213 vflstw vf0, rATemp, zero
215 add rATemp, rATemp, rLda4
216 vflstw vf3, rATemp, zero
219 #####################################
221 #####################################
222 add rBTemp, rBTemp, rLda4 # advance rBTemp by rLda4
227 #####################################
229 #####################################
230 add rATemp, rK4, rATmp2
231 addi rATemp, rATemp, 12 # unrolled step 4 of 4: offset +12
232 vflstw vf0, rATemp, zero
234 add rATemp, rATemp, rLda4
235 vflstw vf3, rATemp, zero
238 #####################################
240 #####################################
241 add rBTemp, rBTemp, rLda4 # advance rBTemp by rLda4
247 blt rK, rLda, vec_loopk # repeat while rK < rLda
250 #####################################
252 #####################################
254 sub rCTemp, rCTemp, rLda4 # step rCTemp back by rLda4
258 blt rI, rLda, vec_loopi # repeat while rI < rLda
262 blt rJ, rLda, vec_loopj # repeat while rJ < rLda
276 # ADD YOUR VECTOR-ELEMENT CODE HERE ...
278 # Fused multiply-add is already used here (this was previously a TODO).
280 fmadd.s f2, f0, f1, f2 # f2 = f0*f1 + f2
281 fmadd.s f4, f3, f1, f4 # f4 = f3*f1 + f4
282 #fmadd.s f6, f5, f1, f6
283 #fmadd.s f8, f7, f1, f8
291 # turn on vector unit
295 blez rLda, cpdone # exit early if lda <= 0 (blez also branches on zero)
296 vvcfgivl rVlen, rNum, 1, 1
307 add rATemp, rI, rTemp0
308 slli rATemp, rATemp, 2 # element index -> byte offset (x4)
309 add rATemp, rA, rATemp # rATemp = rA + 4*(rI + rTemp0)
312 add rBTemp, rJ, rTemp0
313 slli rBTemp, rBTemp, 2 # element index -> byte offset (x4)
314 add rBTemp, rB, rBTemp # rBTemp = rB + 4*(rJ + rTemp0)
318 vflstw vf0, rBTemp, rLda4 # NOTE(review): third operand rLda4 is presumably the byte stride - confirm
330 # The C code uses a jalr instruction to call this function
331 # so we can use a jr to return back to where the function
332 # was called. Also known as "ret", for "return".
338 #####################################
339 # NOPS TO AVOID OVERPREFETCH #
340 #####################################
341 # srli rTemp0, rLda, 4
342 #nop_lp: addi rTemp0, rTemp0, -1
343 # bgez rTemp0, nop_lp