1 # See LICENSE for license details.
3 #*****************************************************************************
4 # matmul function (assembly version)
5 #-----------------------------------------------------------------------------
8 #--------------------------------------------------------------------------
10 #--------------------------------------------------------------------------
14 # Here are some defines that make writing assembly code easier.
16 # I'm using the knowledge that rLda will be placed in register a0, rA will be
17 # placed into register a1, etc., based on the calling convention for functions.
31 # address of VT function
35 # desired app vector length (number of elements to vectorize)
48 # WARNING: do not write to the s0,...,s9 registers without first saving them to
51 #--------------------------------------------------------------------------
52 # void scalar_matmul_asm( int n, float a[], float b[], float c[] )
53 #--------------------------------------------------------------------------
57 .globl scalar_matmul_asm
58 .type scalar_matmul_asm,@function
62 # ***** Scalar Example *****
# Each operand address below is formed the same way: add an index register
# to rTemp0, shift left by 2 (multiply by 4, the size of one float word),
# then add the array base register.
64 blez rLda, done # exit early if lda <= 0 (blez also branches on zero)
72 mul rTemp0, rJ, rLda # rTemp0 = rJ * rLda; calculate indices... I'm being SUPER naive here:
73 add rATemp, rK, rTemp0 # this could be a lot more clever!
74 slli rATemp, rATemp, 2 # scale the element index to a byte offset (x4)
75 add rATemp, rA, rATemp # rATemp = rA + 4 * (rK + rJ * rLda)
78 add rBTemp, rI, rTemp0 # NOTE(review): rTemp0 is presumably recomputed for B before this point -- confirm
79 slli rBTemp, rBTemp, 2 # scale the element index to a byte offset (x4)
80 add rBTemp, rB, rBTemp # rBTemp = rB + 4 * (rI + rTemp0)
83 add rCTemp, rI, rTemp0 # NOTE(review): rTemp0 is presumably recomputed for C before this point -- confirm
84 slli rCTemp, rCTemp, 2 # scale the element index to a byte offset (x4)
85 add rCTemp, rC, rCTemp # rCTemp = rC + 4 * (rI + rTemp0)
87 flw f2, 0(rATemp) # load the single-precision word at rATemp into f2; again, I'm being very lazy...
88 # I can lift this out of the inner loop!
107 #--------------------------------------------------------------------------
108 # void vt_matmul_asm( int n, float a[], float b[], float c[] )
109 #--------------------------------------------------------------------------
112 # ***** Vector-Thread Example *****
115 .type vt_matmul_asm,@function
# Control-thread side of the vector matmul: configures the vector unit, then
# computes A/B/C addresses and issues vflstw loads inside three nested loops
# (vec_loopj / vec_loopi / vec_loopk), each of which repeats while its
# counter is below rLda.
# NOTE(review): rLda4, rI4 and rK4 look like byte-scaled (x4) copies of rLda,
# rI and rK, judging by how they are added straight onto base addresses --
# confirm against their definitions.
124 # turn on vector unit
127 blez rLda, cpdone # exit early if lda <= 0 (blez also branches on zero)
133 # For starters, ask for all the registers. We shouldn't need this many,
134 # but we'll trim it once we have correctness in hand.
135 vvcfgivl rVlen, rNum, 1, 5
144 sub rNum, rN, rI # book keeping: rNum = rN - rI
145 vsetvl rVlen, rNum # set the vector length
146 # rNum is the requested (application) vector length passed to vsetvl
147 # rVlen is what vector length we were given
149 #####################################
151 #####################################
152 mul rTemp0, rJ, rLda4 # rTemp0 = rJ * rLda4
153 add rCTemp, rI4, rTemp0
155 add rCTemp, rC, rCTemp # rCTemp = rC + rI4 + rJ * rLda4
158 add rCTemp, rCTemp, rLda4 # step rCTemp forward by rLda4; a matching sub appears after the k loop
162 #################################
163 # address calculation lifts #
164 #################################
165 mul rTemp0, rJ, rLda4 # rTemp0 = rJ * rLda4
166 add rATmp2, rA, rTemp0 # rATmp2 = rA + rJ * rLda4; reused by every rATemp computation below
# The k-loop body visible below repeats the same A-load / B-advance pattern
# four times, with rATemp offset by +0, +4, +8 and +12 bytes.
173 #####################################
175 #####################################
177 add rATemp, rK4, rATmp2 # rATemp = rATmp2 + rK4 (offset +0)
178 vflstw vf0, rATemp, zero # load into vf0; third operand is the zero register (presumably a stride of 0 -- confirm)
180 add rATemp, rATemp, rLda4 # advance rATemp by rLda4
181 vflstw vf3, rATemp, zero # same form of load, into vf3
184 #####################################
186 #####################################
187 mul rTemp0, rK, rLda4 # rTemp0 = rK * rLda4
188 add rBTemp, rBTmp2, rTemp0 # rBTemp = rBTmp2 + rK * rLda4
192 #####################################
194 #####################################
195 add rATemp, rK4, rATmp2 # rATemp = rATmp2 + rK4
196 addi rATemp, rATemp, 4 # offset +4 bytes
197 vflstw vf0, rATemp, zero
199 add rATemp, rATemp, rLda4 # advance rATemp by rLda4
200 vflstw vf3, rATemp, zero
203 #####################################
205 #####################################
206 add rBTemp, rBTemp, rLda4 # advance rBTemp by rLda4
210 #####################################
212 #####################################
213 add rATemp, rK4, rATmp2 # rATemp = rATmp2 + rK4
214 addi rATemp, rATemp, 8 # offset +8 bytes
215 vflstw vf0, rATemp, zero
217 add rATemp, rATemp, rLda4 # advance rATemp by rLda4
218 vflstw vf3, rATemp, zero
221 #####################################
223 #####################################
224 add rBTemp, rBTemp, rLda4 # advance rBTemp by rLda4
229 #####################################
231 #####################################
232 add rATemp, rK4, rATmp2 # rATemp = rATmp2 + rK4
233 addi rATemp, rATemp, 12 # offset +12 bytes
234 vflstw vf0, rATemp, zero
236 add rATemp, rATemp, rLda4 # advance rATemp by rLda4
237 vflstw vf3, rATemp, zero
240 #####################################
242 #####################################
243 add rBTemp, rBTemp, rLda4 # advance rBTemp by rLda4
249 blt rK, rLda, vec_loopk # repeat the k loop while rK < rLda (signed compare)
252 #####################################
254 #####################################
256 sub rCTemp, rCTemp, rLda4 # step rCTemp back by rLda4
260 blt rI, rLda, vec_loopi # repeat the i loop while rI < rLda (signed compare)
264 blt rJ, rLda, vec_loopj # repeat the j loop while rJ < rLda (signed compare)
278 # ADD YOUR VECTOR-ELEMENT CODE HERE ...
280 # NOTE(review): this used to be a TODO asking for a fused multiply-add; the code below already uses fmadd.s.
# Two accumulators are updated, both multiplying by f1.
# NOTE(review): f0/f3 presumably hold the values loaded into vf0/vf3 by the
# control thread -- confirm.
282 fmadd.s f2, f0, f1, f2 # f2 = f0 * f1 + f2
283 fmadd.s f4, f3, f1, f4 # f4 = f3 * f1 + f4
# Disabled: the same pattern for two further accumulators (f6, f8).
284 #fmadd.s f6, f5, f1, f6
285 #fmadd.s f8, f7, f1, f8
# Second vector setup / address-calculation fragment. The A and B addresses
# are formed as base + 4 * (index + rTemp0): add, shift left by 2, add base.
293 # turn on vector unit
297 blez rLda, cpdone # exit early if lda <= 0 (blez also branches on zero)
298 vvcfgivl rVlen, rNum, 1, 1
309 add rATemp, rI, rTemp0 # NOTE(review): the instruction that sets rTemp0 is not visible here -- confirm its value
310 slli rATemp, rATemp, 2 # scale the element index to a byte offset (x4)
311 add rATemp, rA, rATemp # rATemp = rA + 4 * (rI + rTemp0)
314 add rBTemp, rJ, rTemp0 # NOTE(review): rTemp0 is presumably recomputed for B before this point -- confirm
315 slli rBTemp, rBTemp, 2 # scale the element index to a byte offset (x4)
316 add rBTemp, rB, rBTemp # rBTemp = rB + 4 * (rJ + rTemp0)
320 vflstw vf0, rBTemp, rLda4 # load into vf0 from rBTemp; third operand is rLda4 (presumably the byte stride between elements -- confirm)
332 # The C code uses a jalr instruction to call this function
333 # so we can use a jr to return back to where the function
334 # was called. Also known as "ret", for "return".
340 #####################################
341 # NOPS TO AVOID OVERPREFETCH #
342 #####################################
343 # srli rTemp0, rLda, 4
344 #nop_lp: addi rTemp0, rTemp0, -1
345 # bgez rTemp0, nop_lp