Update to privileged architecture version 1.7
[riscv-tests.git] / benchmarks / vec-matmul / vec_matmul_asm.S
1 # See LICENSE for license details.
2
3 #*****************************************************************************
4 # matmul function (assembly version)
5 #-----------------------------------------------------------------------------
6
7
8 #--------------------------------------------------------------------------
9 # Headers and Defines
10 #--------------------------------------------------------------------------
11
12 #include "pcr.h"
13
# Symbolic register names used by every routine in this file.
#
# The argument aliases depend on the calling convention handing over the
# first four integer/pointer arguments in a0, a1, a2 and a3, in that order.

# ---- incoming arguments --------------------------------------------------
# rN and rLda are two names for the same register (the matrix dimension).
#define rN      a0
#define rLda    a0
#define rA      a1
#define rB      a2
#define rC      a3

# ---- byte-scaled copies of the dimension and loop counters ---------------
# Each of these holds the matching value shifted left by two.
#define rLda4   a4
#define rK4     a5
#define rI4     a6

# ---- vector-unit bookkeeping ---------------------------------------------
# rVlen   : vector length actually granted by the hardware
# rNum    : requested application vector length (elements left to process)
# rVTAddr : address of the vector-fetch code block
#define rVlen   a7
#define rNum    t1
#define rVTAddr v1

# ---- scratch and address temporaries -------------------------------------
# rATmp2 and rBTmp2 keep addresses hoisted out of the innermost vector loop.
#define rTemp0  t0
#define rATemp  t2
#define rBTemp  t3
#define rCTemp  t4
#define rATmp2  v0
#define rBTmp2  s0

# ---- loop counters -------------------------------------------------------
#define rI      t5
#define rJ      s1
#define rK      s2

# WARNING: s0,...,s9 are callee-saved. rBTmp2, rJ and rK map onto them, so a
# routine that writes any of those aliases has to spill the register to the
# stack first and restore it before returning.

50
#--------------------------------------------------------------------------
# void scalar_matmul_asm( int n, float a[], float b[], float c[] )
#
# Scalar reference kernel. For every j, i, k in [0, n) it performs
#     c[j*n + i] += a[j*n + k] * b[k*n + i]
# on single-precision values. Returns immediately when n <= 0.
#
# In:      a0 = n, a1 = a, a2 = b, a3 = c
# Scratch: t0, t2, t3, t4, t5, f2, f3, f4
# Saved:   s1 (rJ) and s2 (rK) are callee-saved, so they are spilled to a
#          16-byte stack frame on entry and restored before returning.
#--------------------------------------------------------------------------

        .text
        .align  2
        .globl  scalar_matmul_asm
        .type   scalar_matmul_asm,@function

scalar_matmul_asm:

        # ***** Scalar Example *****

        blez    rLda, done              # nothing to do when lda <= 0

        # rJ and rK live in s1 and s2: preserve the values of the caller.
        addi    sp, sp, -16
        sd      s1, 0(sp)
        sd      s2, 8(sp)

        move    rJ, zero
loopj:
        move    rI, zero
loopi:
        move    rK, zero
loopk:
        # The index arithmetic is deliberately naive: all three element
        # addresses are rebuilt from scratch on every k iteration.
        mul     rTemp0, rJ, rLda        # &a[j*lda + k]
        add     rATemp, rK, rTemp0
        slli    rATemp, rATemp, 2
        add     rATemp, rA, rATemp

        mul     rTemp0, rK, rLda        # &b[k*lda + i]
        add     rBTemp, rI, rTemp0
        slli    rBTemp, rBTemp, 2
        add     rBTemp, rB, rBTemp

        mul     rTemp0, rJ, rLda        # &c[j*lda + i]
        add     rCTemp, rI, rTemp0
        slli    rCTemp, rCTemp, 2
        add     rCTemp, rC, rCTemp

        flw     f2, 0(rATemp)
        flw     f3, 0(rBTemp)
        flw     f4, 0(rCTemp)           # c is re-read and re-written each k
        fmul.s  f3, f2, f3
        fadd.s  f4, f4, f3
        fsw     f4, 0(rCTemp)
endk:
        addi    rK, rK, 1
        blt     rK, rLda, loopk
endi:
        addi    rI, rI, 1
        blt     rI, rLda, loopi
endj:
        addi    rJ, rJ, 1
        blt     rJ, rLda, loopj

        # Hand s1 and s2 back to the caller untouched.
        ld      s1, 0(sp)
        ld      s2, 8(sp)
        addi    sp, sp, 16
done:
        ret
105
106
#--------------------------------------------------------------------------
# void vt_matmul_asm( int n, float a[], float b[], float c[] )
#
# Vector-thread version of c += a * b on single-precision values stored
# with a row pitch of n elements.
#
# Work decomposition:
#   - rows of c are handled two at a time (rJ advances by 2);
#   - columns are strip-mined by the granted vector length (rI advances
#     by rVlen);
#   - the k loop is unrolled by four (rK advances by 4).
# There is no remainder handling for the row pairing or the k unrolling.
# NOTE(review): n is presumably expected to be a multiple of 4 - confirm
# against the callers.
#
# Vector registers: vf2/vf4 accumulate the two rows of c, vf0/vf3 carry
# one element of each a row replicated with a zero stride, vf1 carries a
# strip of one b row. The arithmetic itself lives in vtcode.
#
# In:      a0 = n, a1 = a, a2 = b, a3 = c
# Scratch: a4-a7, t0-t5, v0, v1
# Saved:   s0, s1, s2 (spilled on entry, restored at cpdone)
#--------------------------------------------------------------------------


        # ***** Vector-Thread Example *****

        .globl  vt_matmul_asm
        .type   vt_matmul_asm,@function

vt_matmul_asm:
        # Three 8-byte slots are needed; the frame is rounded up to 32 bytes
        # so that sp keeps its 16-byte alignment.
        addi    sp, sp, -32
        sd      s0, 0(sp)
        sd      s1, 8(sp)
        sd      s2, 16(sp)

        # turn on vector unit
        setpcr  status, SR_EV

        blez    rLda, cpdone            # nothing to do when lda <= 0

        la      rVTAddr, vtcode
        slli    rLda4, rLda, 2          # row pitch in bytes

        # Configure 1 integer and 5 floating-point registers per element
        # (vtcode uses f0..f4). The instruction also takes a requested
        # length, so rNum is given a defined value first; the real length
        # is set by vsetvl at the top of every strip.
        move    rNum, rN
        vvcfgivl rVlen, rNum, 1, 5

        move    rJ, zero
vec_loopj:
        move    rI, zero
vec_loopi:
        slli    rI4, rI, 2              # column offset in bytes

        sub     rNum, rN, rI            # elements still to do in this row
        vsetvl  rVlen, rNum             # rVlen = vector length granted

        #####################################
        #          LOADS FOR C              #
        #####################################
        mul     rTemp0, rJ, rLda4       # byte offset of row J
        add     rCTemp, rI4, rTemp0
        add     rCTemp, rC, rCTemp
        vflw    vf2, rCTemp             # strip of c row J

        add     rCTemp, rCTemp, rLda4
        vflw    vf4, rCTemp             # strip of c row J+1

        #####################################
        #   ADDRESSES HOISTED OUT OF K      #
        #####################################
        # rTemp0 still holds J * lda4 from the c address computation.
        add     rATmp2, rA, rTemp0      # start of a row J
        add     rBTmp2, rI4, rB         # column I of b row 0

        move    rK, zero
vec_loopk:
        slli    rK4, rK, 2

        # ---- k + 0 --------------------------------------------------------
        add     rATemp, rK4, rATmp2
        vflstw  vf0, rATemp, zero       # a[J][k] in every element
        add     rATemp, rATemp, rLda4
        vflstw  vf3, rATemp, zero       # a[J+1][k] in every element

        mul     rTemp0, rK, rLda4
        add     rBTemp, rBTmp2, rTemp0
        vflw    vf1, rBTemp             # strip of b row k
        vf      0(rVTAddr)

        # ---- k + 1 --------------------------------------------------------
        add     rATemp, rK4, rATmp2
        addi    rATemp, rATemp, 4
        vflstw  vf0, rATemp, zero
        add     rATemp, rATemp, rLda4
        vflstw  vf3, rATemp, zero

        add     rBTemp, rBTemp, rLda4   # next b row
        vflw    vf1, rBTemp
        vf      0(rVTAddr)

        # ---- k + 2 --------------------------------------------------------
        add     rATemp, rK4, rATmp2
        addi    rATemp, rATemp, 8
        vflstw  vf0, rATemp, zero
        add     rATemp, rATemp, rLda4
        vflstw  vf3, rATemp, zero

        add     rBTemp, rBTemp, rLda4
        vflw    vf1, rBTemp
        vf      0(rVTAddr)

        # ---- k + 3 --------------------------------------------------------
        add     rATemp, rK4, rATmp2
        addi    rATemp, rATemp, 12
        vflstw  vf0, rATemp, zero
        add     rATemp, rATemp, rLda4
        vflstw  vf3, rATemp, zero

        add     rBTemp, rBTemp, rLda4
        vflw    vf1, rBTemp
        vf      0(rVTAddr)

vec_endk:
        addi    rK, rK, 4
        blt     rK, rLda, vec_loopk

vec_endi:
        #####################################
        #          STORES FOR C             #
        #####################################
        # rCTemp was left pointing at row J+1, so that row goes out first.
        vfsw    vf4, rCTemp
        sub     rCTemp, rCTemp, rLda4
        vfsw    vf2, rCTemp

        add     rI, rI, rVlen
        blt     rI, rLda, vec_loopi
vec_endj:
        addi    rJ, rJ, 2
        blt     rJ, rLda, vec_loopj


cpdone:
        fence.v.l                       # fence vector memory ops before returning
        ld      s0, 0(sp)
        ld      s1, 8(sp)
        ld      s2, 16(sp)
        addi    sp, sp, 32

        ret
276
vtcode:
        # Vector-fetch block executed for every element each time
        # vt_matmul_asm issues a vf on rVTAddr.
        #
        # Register roles, as set up by the loads in vt_matmul_asm:
        #   f0, f3 : one a element for row J and row J+1
        #   f1     : the b element for this column
        #   f2, f4 : running c sums for row J and row J+1
        #
        # Each update is one fused multiply-add. The two updates write
        # different registers and do not read the result of each other.
        # Two further accumulators (f6, f8) were sketched in an earlier
        # revision but are not enabled.

        fmadd.s f4, f3, f1, f4          # row J+1: f4 += f3 * f1
        fmadd.s f2, f0, f1, f2          # row J  : f2 += f0 * f1

        stop
291
#--------------------------------------------------------------------------
# transpose
#
# Writes the transpose of the lda x lda single-precision matrix b into a:
#     a[j*lda + i] = b[i*lda + j]
# Columns of b are gathered with a strided vector load (stride = one row,
# in bytes) and stored as contiguous strips of a row of a.
#
# In:      a0 = lda, a1 = a (destination), a2 = b (source)
# Scratch: a4, a7, t0-t3, t5
# Saved:   s1 (rJ) is spilled on entry and restored at tdone
#
# This routine has its own exit label. It must not leave through cpdone,
# because that path pops the larger frame owned by vt_matmul_asm.
# It is entered with a jalr from C code, so a plain ret returns to the
# caller.
#--------------------------------------------------------------------------
transpose:
        # rJ lives in callee-saved s1. The frame is 16 bytes so that sp
        # stays 16-byte aligned.
        addi    sp, sp, -16
        sd      s1, 0(sp)

        # turn on vector unit
        setpcr  status, SR_EV

        blez    rLda, tdone             # nothing to do when lda <= 0

        slli    rLda4, rLda, 2          # row pitch in bytes = load stride

        # Give rNum a defined value before it is read by the configuration
        # instruction; the real length is set by vsetvl below.
        move    rNum, rLda
        vvcfgivl rVlen, rNum, 1, 1

        move    rI, zero
tloopi:
        sub     rNum, rLda, rI          # elements still to do
        vsetvl  rVlen, rNum

        move    rJ, zero
tloopj:
        mul     rTemp0, rJ, rLda        # rATemp = &a[j*lda + i]
        add     rATemp, rI, rTemp0
        slli    rATemp, rATemp, 2
        add     rATemp, rA, rATemp

        mul     rTemp0, rI, rLda        # rBTemp = &b[i*lda + j]
        add     rBTemp, rJ, rTemp0
        slli    rBTemp, rBTemp, 2
        add     rBTemp, rB, rBTemp

        # Element e reads b[(i+e)*lda + j] and writes a[j*lda + i + e].
        vflstw  vf0, rBTemp, rLda4
        vfsw    vf0, rATemp

tendj:
        addi    rJ, rJ, 1
        blt     rJ, rLda, tloopj
tendi:
        add     rI, rI, rVlen           # advance by the granted length
        blt     rI, rLda, tloopi

tdone:
        # Same fence that vt_matmul_asm issues before it returns.
        fence.v.l
        ld      s1, 0(sp)
        addi    sp, sp, 16
        ret
337
338
339
340 #####################################
341 # NOPS TO AVOID OVERPREFETCH #
342 #####################################
343 # srli rTemp0, rLda, 4
344 #nop_lp: addi rTemp0, rTemp0, -1
345 # bgez rTemp0, nop_lp