minor mt updates
[riscv-tests.git] / benchmarks / vec-matmul / vec_matmul_asm.S
1 #*****************************************************************************
2 # matmul function (assembly version)
3 #-----------------------------------------------------------------------------
4
5
6 #--------------------------------------------------------------------------
7 # Headers and Defines
8 #--------------------------------------------------------------------------
9
10 #include "pcr.h"
11
12 # Here are some defines that make writing assembly code easier.
13
14 # I'm using the knowledge that rLda will be placed in register a0, rA will be
15 # placed into register a1, etc., based on the calling convention for functions.
16
17 #define rN a0
18 #define rLda a0
19 #define rA a1
20 #define rB a2
21 #define rC a3
22 #define rATmp2 v0
23 #define rBTmp2 s0
24
25
26 # given vector-length
27 #define rVlen a7
28
29 # address of VT function
30 #define rVTAddr v1
31 #define rTemp0 t0
32
33 # desired app vector length (number of elements to vectorize)
34 #define rNum t1
35
36 #define rATemp t2
37 #define rBTemp t3
38 #define rCTemp t4
39 #define rI t5
40 #define rJ s1
41 #define rK s2
42 #define rLda4 a4
43 #define rK4 a5
44 #define rI4 a6
45
46 # WARNING: do not write to the s0,...,s9 registers without first saving them to
47 # the stack!
48
#--------------------------------------------------------------------------
# void scalar_matmul_asm( int n, float a[], float b[], float c[] )
#
# Scalar reference kernel. For every j, i, k in [0, n):
#     c[j*n + i] += a[j*n + k] * b[k*n + i]
# The product is accumulated into c; c is never cleared here.
#
# In:     a0 = n (rN / rLda), a1 = a (rA), a2 = b (rB), a3 = c (rC)
# Writes: t0, t2-t5, f2-f4.
#         rJ (s1) and rK (s2) are callee-saved, so they are spilled to a
#         16-byte stack frame for the duration of the loops.
#--------------------------------------------------------------------------

        .text
        .align  2
        .globl  scalar_matmul_asm
        .type   scalar_matmul_asm,@function

scalar_matmul_asm:

        # ***** Scalar Example *****

        blez    rLda, done              # n <= 0: nothing to do (no frame allocated yet)

        addi    sp, sp, -16             # frame for the two callee-saved loop counters
        sd      s1, 0(sp)               # s1 == rJ
        sd      s2, 8(sp)               # s2 == rK

        move    rJ, zero
loopj:
        move    rI, zero
loopi:
        move    rK, zero
loopk:
        # The index arithmetic is deliberately naive: every address is
        # recomputed from scratch on each trip through the inner loop.
        mul     rTemp0, rJ, rLda        # rATemp = &a[j*n + k]
        add     rATemp, rK, rTemp0
        slli    rATemp, rATemp, 2       # element index -> byte offset (4-byte floats)
        add     rATemp, rA, rATemp

        mul     rTemp0, rK, rLda        # rBTemp = &b[k*n + i]
        add     rBTemp, rI, rTemp0
        slli    rBTemp, rBTemp, 2
        add     rBTemp, rB, rBTemp

        mul     rTemp0, rJ, rLda        # rCTemp = &c[j*n + i]
        add     rCTemp, rI, rTemp0
        slli    rCTemp, rCTemp, 2
        add     rCTemp, rC, rCTemp

        flw     f2, 0(rATemp)           # a[j][k]
        flw     f3, 0(rBTemp)           # b[k][i]
        flw     f4, 0(rCTemp)           # c[j][i]; could be kept in a register across k
        fmul.s  f3, f2, f3
        fadd.s  f4, f4, f3
        fsw     f4, 0(rCTemp)           # c[j][i] += a[j][k] * b[k][i]
endk:
        addi    rK, rK, 1
        blt     rK, rLda, loopk
endi:
        addi    rI, rI, 1
        blt     rI, rLda, loopi
endj:
        addi    rJ, rJ, 1
        blt     rJ, rLda, loopj

        ld      s1, 0(sp)               # restore the caller's s1/s2 and drop the frame
        ld      s2, 8(sp)
        addi    sp, sp, 16
done:
        ret
103
104
#--------------------------------------------------------------------------
# void vt_matmul_asm( int n, float a[], float b[], float c[] )
#
# Vector-thread matrix multiply, accumulating into c:
#     c[j*n + i] += a[j*n + k] * b[k*n + i]
# The i dimension is vectorised (one vector element per column of c), the
# j loop handles two rows of c per trip, and the k loop is unrolled by four.
#
# In:     a0 = n (rN / rLda), a1 = a (rA), a2 = b (rB), a3 = c (rC)
# Writes: v0, v1, t0-t5, a4-a7, vector registers vf0-vf4.
#         s0-s2 are callee-saved and are spilled to the stack frame.
#
# NOTE(review): the unrolling accesses rows j and j+1 and columns/rows
# k..k+3 unconditionally, so n is assumed to be a multiple of 4 -- confirm
# against the callers before using other sizes.
#--------------------------------------------------------------------------


# ***** Vector-Thread Example *****

        .globl  vt_matmul_asm
        .type   vt_matmul_asm,@function

vt_matmul_asm:
        addi    sp, sp, -32             # 3 saved registers; 32 keeps sp 16-byte aligned
        sd      s0, 0(sp)               # s0 == rBTmp2
        sd      s1, 8(sp)               # s1 == rJ
        sd      s2, 16(sp)              # s2 == rK

        # turn on vector unit
        setpcr  status, SR_EV

        blez    rLda, cpdone            # n <= 0: skip straight to the epilogue

        la      rVTAddr, vtcode         # address of the vector-fetch block
        slli    rLda4, rLda, 2          # row pitch in bytes (n * sizeof(float))

        # For starters ask for all the registers. We shouldn't need this many
        # but we'll trim it when we have correctness in hand.
        # rNum has not been written yet at this point; the vector length that
        # is actually used is set by the vsetvl at the top of vec_loopi.
        vvcfgivl rVlen, rNum, 1, 5

        move    rJ, zero
vec_loopj:
        move    rI, zero
vec_loopi:
        slli    rI4, rI, 2              # byte offset of column i

        sub     rNum, rN, rI            # columns still to be processed in this row pair
        vsetvl  rVlen, rNum             # set the vector length
                                        # rNum is the desired (application) vector length
                                        # rVlen is what vector length we were given

        #################################
        #   address calculation lifts   #
        #################################
        mul     rTemp0, rJ, rLda4       # byte offset of row j, shared by a and c
        add     rATmp2, rA, rTemp0      # rATmp2 = &a[j][0]
        add     rBTmp2, rI4, rB         # rBTmp2 = &b[0][i]

        #####################################
        #          LOADS FOR C              #
        #####################################
        add     rCTemp, rI4, rTemp0
        add     rCTemp, rC, rCTemp      # rCTemp = &c[j][i]
        vflw    vf2, rCTemp             # vf2 <- c[j][i ...]

        add     rCTemp, rCTemp, rLda4   # rCTemp = &c[j+1][i]; stays here until vec_endi
        vflw    vf4, rCTemp             # vf4 <- c[j+1][i ...]

        move    rK, zero
vec_loopk:
        slli    rK4, rK, 2              # byte offset of column k within a row of a

        #####################################
        #     k + 0 : LOADS FOR A           #
        #####################################
        # The stride operand is the zero register, so every element is
        # loaded from the same address (presumably a scalar broadcast).
        add     rATemp, rK4, rATmp2
        vflstw  vf0, rATemp, zero       # vf0 <- a[j][k]

        add     rATemp, rATemp, rLda4
        vflstw  vf3, rATemp, zero       # vf3 <- a[j+1][k]

        #####################################
        #     k + 0 : LOADS FOR B           #
        #####################################
        mul     rTemp0, rK, rLda4
        add     rBTemp, rBTmp2, rTemp0  # rBTemp = &b[k][i]
        vflw    vf1, rBTemp
        vf      0(rVTAddr)              # run vtcode on every element

        #####################################
        #     k + 1 : LOADS FOR A           #
        #####################################
        add     rATemp, rK4, rATmp2
        addi    rATemp, rATemp, 4
        vflstw  vf0, rATemp, zero       # vf0 <- a[j][k+1]

        add     rATemp, rATemp, rLda4
        vflstw  vf3, rATemp, zero       # vf3 <- a[j+1][k+1]

        #####################################
        #     k + 1 : LOADS FOR B           #
        #####################################
        add     rBTemp, rBTemp, rLda4   # next row of b
        vflw    vf1, rBTemp
        vf      0(rVTAddr)

        #####################################
        #     k + 2 : LOADS FOR A           #
        #####################################
        add     rATemp, rK4, rATmp2
        addi    rATemp, rATemp, 8
        vflstw  vf0, rATemp, zero       # vf0 <- a[j][k+2]

        add     rATemp, rATemp, rLda4
        vflstw  vf3, rATemp, zero       # vf3 <- a[j+1][k+2]

        #####################################
        #     k + 2 : LOADS FOR B           #
        #####################################
        add     rBTemp, rBTemp, rLda4
        vflw    vf1, rBTemp
        vf      0(rVTAddr)

        #####################################
        #     k + 3 : LOADS FOR A           #
        #####################################
        add     rATemp, rK4, rATmp2
        addi    rATemp, rATemp, 12
        vflstw  vf0, rATemp, zero       # vf0 <- a[j][k+3]

        add     rATemp, rATemp, rLda4
        vflstw  vf3, rATemp, zero       # vf3 <- a[j+1][k+3]

        #####################################
        #     k + 3 : LOADS FOR B           #
        #####################################
        add     rBTemp, rBTemp, rLda4
        vflw    vf1, rBTemp
        vf      0(rVTAddr)

vec_endk:
        addi    rK, rK, 4               # four k steps were consumed above
        blt     rK, rLda, vec_loopk

vec_endi:
        #####################################
        #          STORES FOR C             #
        #####################################
        vfsw    vf4, rCTemp             # rCTemp still points at c[j+1][i]
        sub     rCTemp, rCTemp, rLda4
        vfsw    vf2, rCTemp             # c[j][i ...]

        add     rI, rI, rVlen           # advance by the granted vector length
        blt     rI, rLda, vec_loopi
vec_endj:
        addi    rJ, rJ, 2               # two rows of c per trip
        # fence.v.l                     # (disabled; only the fence at cpdone is issued)
        blt     rJ, rLda, vec_loopj


cpdone:
        fence.v.l
        ld      s0, 0(sp)
        ld      s1, 8(sp)
        ld      s2, 16(sp)
        addi    sp, sp, 32


        ret
274
vtcode:
        # Vector-fetch block, launched by "vf 0(rVTAddr)" in vt_matmul_asm.
        # It performs one multiply-accumulate step for each of the two rows
        # of c handled per trip, using fused multiply-adds.
        #
        # Register roles, as set up by vt_matmul_asm before each fetch
        # (presumably vfN there is seen as fN here -- confirm against the
        # vector unit documentation):
        #   f0 = a[j][k]      f3 = a[j+1][k]      f1 = b[k][i]
        #   f2 = running c[j][i]                  f4 = running c[j+1][i]

        fmadd.s f2, f0, f1, f2          # c[j][i]   += a[j][k]   * b[k][i]
        fmadd.s f4, f3, f1, f4          # c[j+1][i] += a[j+1][k] * b[k][i]

        # Disabled variants kept for reference:
        #   two further accumulator rows
        #fmadd.s f6, f5, f1, f6
        #fmadd.s f8, f7, f1, f8
        #   unfused multiply followed by add
        #fmul.s f1, f0, f1
        #fadd.s f2, f2, f1

        stop                            # end of the vector-fetch block
289
#--------------------------------------------------------------------------
# transpose: a[j*n + i] = b[i*n + j] for all i, j in [0, n)
#
# The i dimension is vectorised: each trip of the inner loop gathers one
# column segment of b with a row-pitch stride and writes it as a
# contiguous segment of row j of a.
#
# In:     a0 = n (rLda), a1 = a (rA, destination), a2 = b (rB, source)
# Writes: t0-t3, t5, a4 (rLda4), a7 (rVlen), vector register vf0.
#         rJ (s1) is callee-saved and is spilled to a 16-byte stack frame.
#--------------------------------------------------------------------------
transpose:
        # turn on vector unit
        setpcr  status, SR_EV

        # Exit locally. The shared cpdone epilogue belongs to vt_matmul_asm
        # and pops a stack frame that this routine never pushed.
        blez    rLda, tdone             # n <= 0: nothing to do

        addi    sp, sp, -16
        sd      s1, 0(sp)               # s1 == rJ

        slli    rLda4, rLda, 2          # row pitch in bytes; used as the gather stride

        # rNum has not been written yet at this point; the vector length that
        # is actually used is set by the vsetvl at the top of tloopi.
        vvcfgivl rVlen, rNum, 1, 1

        move    rI, zero
tloopi:
        sub     rNum, rLda, rI          # elements still to be processed
        vsetvl  rVlen, rNum

        move    rJ, zero
tloopj:

        mul     rTemp0, rJ, rLda        # rATemp = &a[j*n + i]
        add     rATemp, rI, rTemp0
        slli    rATemp, rATemp, 2
        add     rATemp, rA, rATemp

        mul     rTemp0, rI, rLda        # rBTemp = &b[i*n + j]
        add     rBTemp, rJ, rTemp0
        slli    rBTemp, rBTemp, 2
        add     rBTemp, rB, rBTemp

        # Scalar equivalent of the two vector operations below:
        #flw f0, 0(rBTemp)
        #fsw f0, 0(rATemp)
        vflstw  vf0, rBTemp, rLda4      # element e <- b[(i+e)*n + j]
        vfsw    vf0, rATemp             # a[j*n + i + e] <- element e

tendj:
        addi    rJ, rJ, 1
        blt     rJ, rLda, tloopj
tendi:
        #addi rI, rI, 1
        add     rI, rI, rVlen           # advance by the granted vector length
        blt     rI, rLda, tloopi

        ld      s1, 0(sp)               # restore the caller's s1 and drop the frame
        addi    sp, sp, 16
tdone:
        fence.v.l                       # same fence vt_matmul_asm issues before returning

        # The C code uses a jalr instruction to call this function
        # so we can use a jr to return back to where the function
        # was called.  Also known as "ret", for "return".
        ret
335
336
337
338 #####################################
339 # NOPS TO AVOID OVERPREFETCH #
340 #####################################
341 # srli rTemp0, rLda, 4
342 #nop_lp: addi rTemp0, rTemp0, -1
343 # bgez rTemp0, nop_lp