Add FreedomU500 & incorporate feedback
[riscv-tests.git] / mt / cv_matmul.c
1 #include "stdlib.h"
2
3 #include "util.h"
4
5 #include "dataset.h"
6 void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda, const data_t A[], const data_t B[], data_t C[] )
7 {
8 //----------------------------------------------------------------version 2.11 optmize j,use core 1 j from 0 to 15 MSI 98k i = j*lda
9 //----------------------------------------------------------------version 2.12 not use i = j *lda MSI 95k
10 static __thread data_t TempA[8];
11 static __thread data_t TempB[8];
12 static __thread int j,m,n,i,k;
13
14 if(coreid == 1 || ncores == 1)
15 {
16 for ( j = 16; j < 32; j++ )
17 {
18
19 for ( m = 0; m < 4; m++ )
20 {
21
22 TempA[0] = A[j*lda+0+8*m];
23 TempA[1] = A[j*lda+1+8*m];
24 TempA[2] = A[j*lda+2+8*m];
25 TempA[3] = A[j*lda+3+8*m];
26 TempA[4] = A[j*lda+4+8*m];
27 TempA[5] = A[j*lda+5+8*m];
28 TempA[6] = A[j*lda+6+8*m];
29 TempA[7] = A[j*lda+7+8*m];
30
31 for( n = 0; n < 4; n++)
32 {
33 TempB[0] = B[(0+8*m)*lda+0+8*n];
34 TempB[1] = B[(0+8*m)*lda+1+8*n];
35 TempB[2] = B[(0+8*m)*lda+2+8*n];
36 TempB[3] = B[(0+8*m)*lda+3+8*n];
37 TempB[4] = B[(0+8*m)*lda+4+8*n];
38 TempB[5] = B[(0+8*m)*lda+5+8*n];
39 TempB[6] = B[(0+8*m)*lda+6+8*n];
40 TempB[7] = B[(0+8*m)*lda+7+8*n];
41
42 C[0+8*n+j*lda] += TempA[0] * TempB[0];
43 C[1+8*n+j*lda] += TempA[0] * TempB[1];
44 C[2+8*n+j*lda] += TempA[0] * TempB[2];
45 C[3+8*n+j*lda] += TempA[0] * TempB[3];
46 C[4+8*n+j*lda] += TempA[0] * TempB[4];
47 C[5+8*n+j*lda] += TempA[0] * TempB[5];
48 C[6+8*n+j*lda] += TempA[0] * TempB[6];
49 C[7+8*n+j*lda] += TempA[0] * TempB[7];
50
51
52
53 TempB[0] = B[(1+8*m)*lda+0+8*n];
54 TempB[1] = B[(1+8*m)*lda+1+8*n];
55 TempB[2] = B[(1+8*m)*lda+2+8*n];
56 TempB[3] = B[(1+8*m)*lda+3+8*n];
57 TempB[4] = B[(1+8*m)*lda+4+8*n];
58 TempB[5] = B[(1+8*m)*lda+5+8*n];
59 TempB[6] = B[(1+8*m)*lda+6+8*n];
60 TempB[7] = B[(1+8*m)*lda+7+8*n];
61
62 C[0+8*n+j*lda] += TempA[1] * TempB[0];
63 C[1+8*n+j*lda] += TempA[1] * TempB[1];
64 C[2+8*n+j*lda] += TempA[1] * TempB[2];
65 C[3+8*n+j*lda] += TempA[1] * TempB[3];
66 C[4+8*n+j*lda] += TempA[1] * TempB[4];
67 C[5+8*n+j*lda] += TempA[1] * TempB[5];
68 C[6+8*n+j*lda] += TempA[1] * TempB[6];
69 C[7+8*n+j*lda] += TempA[1] * TempB[7];
70
71
72
73 TempB[0] = B[(2+8*m)*lda+0+8*n];
74 TempB[1] = B[(2+8*m)*lda+1+8*n];
75 TempB[2] = B[(2+8*m)*lda+2+8*n];
76 TempB[3] = B[(2+8*m)*lda+3+8*n];
77 TempB[4] = B[(2+8*m)*lda+4+8*n];
78 TempB[5] = B[(2+8*m)*lda+5+8*n];
79 TempB[6] = B[(2+8*m)*lda+6+8*n];
80 TempB[7] = B[(2+8*m)*lda+7+8*n];
81
82 C[0+8*n+j*lda] += TempA[2] * TempB[0];
83 C[1+8*n+j*lda] += TempA[2] * TempB[1];
84 C[2+8*n+j*lda] += TempA[2] * TempB[2];
85 C[3+8*n+j*lda] += TempA[2] * TempB[3];
86 C[4+8*n+j*lda] += TempA[2] * TempB[4];
87 C[5+8*n+j*lda] += TempA[2] * TempB[5];
88 C[6+8*n+j*lda] += TempA[2] * TempB[6];
89 C[7+8*n+j*lda] += TempA[2] * TempB[7];
90
91
92
93 TempB[0] = B[(3+8*m)*lda+0+8*n];
94 TempB[1] = B[(3+8*m)*lda+1+8*n];
95 TempB[2] = B[(3+8*m)*lda+2+8*n];
96 TempB[3] = B[(3+8*m)*lda+3+8*n];
97 TempB[4] = B[(3+8*m)*lda+4+8*n];
98 TempB[5] = B[(3+8*m)*lda+5+8*n];
99 TempB[6] = B[(3+8*m)*lda+6+8*n];
100 TempB[7] = B[(3+8*m)*lda+7+8*n];
101
102 C[0+8*n+j*lda] += TempA[3] * TempB[0];
103 C[1+8*n+j*lda] += TempA[3] * TempB[1];
104 C[2+8*n+j*lda] += TempA[3] * TempB[2];
105 C[3+8*n+j*lda] += TempA[3] * TempB[3];
106 C[4+8*n+j*lda] += TempA[3] * TempB[4];
107 C[5+8*n+j*lda] += TempA[3] * TempB[5];
108 C[6+8*n+j*lda] += TempA[3] * TempB[6];
109 C[7+8*n+j*lda] += TempA[3] * TempB[7];
110
111
112 TempB[0] = B[(4+8*m)*lda+0+8*n];
113 TempB[1] = B[(4+8*m)*lda+1+8*n];
114 TempB[2] = B[(4+8*m)*lda+2+8*n];
115 TempB[3] = B[(4+8*m)*lda+3+8*n];
116 TempB[4] = B[(4+8*m)*lda+4+8*n];
117 TempB[5] = B[(4+8*m)*lda+5+8*n];
118 TempB[6] = B[(4+8*m)*lda+6+8*n];
119 TempB[7] = B[(4+8*m)*lda+7+8*n];
120
121 C[0+8*n+j*lda] += TempA[4] * TempB[0];
122 C[1+8*n+j*lda] += TempA[4] * TempB[1];
123 C[2+8*n+j*lda] += TempA[4] * TempB[2];
124 C[3+8*n+j*lda] += TempA[4] * TempB[3];
125 C[4+8*n+j*lda] += TempA[4] * TempB[4];
126 C[5+8*n+j*lda] += TempA[4] * TempB[5];
127 C[6+8*n+j*lda] += TempA[4] * TempB[6];
128 C[7+8*n+j*lda] += TempA[4] * TempB[7];
129
130
131
132 TempB[0] = B[(5+8*m)*lda+0+8*n];
133 TempB[1] = B[(5+8*m)*lda+1+8*n];
134 TempB[2] = B[(5+8*m)*lda+2+8*n];
135 TempB[3] = B[(5+8*m)*lda+3+8*n];
136 TempB[4] = B[(5+8*m)*lda+4+8*n];
137 TempB[5] = B[(5+8*m)*lda+5+8*n];
138 TempB[6] = B[(5+8*m)*lda+6+8*n];
139 TempB[7] = B[(5+8*m)*lda+7+8*n];
140
141 C[0+8*n+j*lda] += TempA[5] * TempB[0];
142 C[1+8*n+j*lda] += TempA[5] * TempB[1];
143 C[2+8*n+j*lda] += TempA[5] * TempB[2];
144 C[3+8*n+j*lda] += TempA[5] * TempB[3];
145 C[4+8*n+j*lda] += TempA[5] * TempB[4];
146 C[5+8*n+j*lda] += TempA[5] * TempB[5];
147 C[6+8*n+j*lda] += TempA[5] * TempB[6];
148 C[7+8*n+j*lda] += TempA[5] * TempB[7];
149
150
151
152 TempB[0] = B[(6+8*m)*lda+0+8*n];
153 TempB[1] = B[(6+8*m)*lda+1+8*n];
154 TempB[2] = B[(6+8*m)*lda+2+8*n];
155 TempB[3] = B[(6+8*m)*lda+3+8*n];
156 TempB[4] = B[(6+8*m)*lda+4+8*n];
157 TempB[5] = B[(6+8*m)*lda+5+8*n];
158 TempB[6] = B[(6+8*m)*lda+6+8*n];
159 TempB[7] = B[(6+8*m)*lda+7+8*n];
160
161 C[0+8*n+j*lda] += TempA[6] * TempB[0];
162 C[1+8*n+j*lda] += TempA[6] * TempB[1];
163 C[2+8*n+j*lda] += TempA[6] * TempB[2];
164 C[3+8*n+j*lda] += TempA[6] * TempB[3];
165 C[4+8*n+j*lda] += TempA[6] * TempB[4];
166 C[5+8*n+j*lda] += TempA[6] * TempB[5];
167 C[6+8*n+j*lda] += TempA[6] * TempB[6];
168 C[7+8*n+j*lda] += TempA[6] * TempB[7];
169
170
171 TempB[0] = B[(7+8*m)*lda+0+8*n];
172 TempB[1] = B[(7+8*m)*lda+1+8*n];
173 TempB[2] = B[(7+8*m)*lda+2+8*n];
174 TempB[3] = B[(7+8*m)*lda+3+8*n];
175 TempB[4] = B[(7+8*m)*lda+4+8*n];
176 TempB[5] = B[(7+8*m)*lda+5+8*n];
177 TempB[6] = B[(7+8*m)*lda+6+8*n];
178 TempB[7] = B[(7+8*m)*lda+7+8*n];
179
180 C[0+8*n+j*lda] += TempA[7] * TempB[0];
181 C[1+8*n+j*lda] += TempA[7] * TempB[1];
182 C[2+8*n+j*lda] += TempA[7] * TempB[2];
183 C[3+8*n+j*lda] += TempA[7] * TempB[3];
184 C[4+8*n+j*lda] += TempA[7] * TempB[4];
185 C[5+8*n+j*lda] += TempA[7] * TempB[5];
186 C[6+8*n+j*lda] += TempA[7] * TempB[6];
187 C[7+8*n+j*lda] += TempA[7] * TempB[7];
188 }
189
190 }
191 }
192 }
193 if(coreid ==0)
194 {
195 for ( j = 0; j < 16; j++ )
196 {
197
198 for ( m = 0; m < 4; m++ )
199 {
200
201 TempA[0] = A[j*lda+0+8*m];
202 TempA[1] = A[j*lda+1+8*m];
203 TempA[2] = A[j*lda+2+8*m];
204 TempA[3] = A[j*lda+3+8*m];
205 TempA[4] = A[j*lda+4+8*m];
206 TempA[5] = A[j*lda+5+8*m];
207 TempA[6] = A[j*lda+6+8*m];
208 TempA[7] = A[j*lda+7+8*m];
209
210 for( n = 0; n < 4; n++)
211 {
212 TempB[0] = B[(0+8*m)*lda+0+8*n];
213 TempB[1] = B[(0+8*m)*lda+1+8*n];
214 TempB[2] = B[(0+8*m)*lda+2+8*n];
215 TempB[3] = B[(0+8*m)*lda+3+8*n];
216 TempB[4] = B[(0+8*m)*lda+4+8*n];
217 TempB[5] = B[(0+8*m)*lda+5+8*n];
218 TempB[6] = B[(0+8*m)*lda+6+8*n];
219 TempB[7] = B[(0+8*m)*lda+7+8*n];
220
221 C[0+8*n+j*lda] += TempA[0] * TempB[0];
222 C[1+8*n+j*lda] += TempA[0] * TempB[1];
223 C[2+8*n+j*lda] += TempA[0] * TempB[2];
224 C[3+8*n+j*lda] += TempA[0] * TempB[3];
225 C[4+8*n+j*lda] += TempA[0] * TempB[4];
226 C[5+8*n+j*lda] += TempA[0] * TempB[5];
227 C[6+8*n+j*lda] += TempA[0] * TempB[6];
228 C[7+8*n+j*lda] += TempA[0] * TempB[7];
229
230
231
232 TempB[0] = B[(1+8*m)*lda+0+8*n];
233 TempB[1] = B[(1+8*m)*lda+1+8*n];
234 TempB[2] = B[(1+8*m)*lda+2+8*n];
235 TempB[3] = B[(1+8*m)*lda+3+8*n];
236 TempB[4] = B[(1+8*m)*lda+4+8*n];
237 TempB[5] = B[(1+8*m)*lda+5+8*n];
238 TempB[6] = B[(1+8*m)*lda+6+8*n];
239 TempB[7] = B[(1+8*m)*lda+7+8*n];
240
241 C[0+8*n+j*lda] += TempA[1] * TempB[0];
242 C[1+8*n+j*lda] += TempA[1] * TempB[1];
243 C[2+8*n+j*lda] += TempA[1] * TempB[2];
244 C[3+8*n+j*lda] += TempA[1] * TempB[3];
245 C[4+8*n+j*lda] += TempA[1] * TempB[4];
246 C[5+8*n+j*lda] += TempA[1] * TempB[5];
247 C[6+8*n+j*lda] += TempA[1] * TempB[6];
248 C[7+8*n+j*lda] += TempA[1] * TempB[7];
249
250
251
252 TempB[0] = B[(2+8*m)*lda+0+8*n];
253 TempB[1] = B[(2+8*m)*lda+1+8*n];
254 TempB[2] = B[(2+8*m)*lda+2+8*n];
255 TempB[3] = B[(2+8*m)*lda+3+8*n];
256 TempB[4] = B[(2+8*m)*lda+4+8*n];
257 TempB[5] = B[(2+8*m)*lda+5+8*n];
258 TempB[6] = B[(2+8*m)*lda+6+8*n];
259 TempB[7] = B[(2+8*m)*lda+7+8*n];
260
261 C[0+8*n+j*lda] += TempA[2] * TempB[0];
262 C[1+8*n+j*lda] += TempA[2] * TempB[1];
263 C[2+8*n+j*lda] += TempA[2] * TempB[2];
264 C[3+8*n+j*lda] += TempA[2] * TempB[3];
265 C[4+8*n+j*lda] += TempA[2] * TempB[4];
266 C[5+8*n+j*lda] += TempA[2] * TempB[5];
267 C[6+8*n+j*lda] += TempA[2] * TempB[6];
268 C[7+8*n+j*lda] += TempA[2] * TempB[7];
269
270
271
272 TempB[0] = B[(3+8*m)*lda+0+8*n];
273 TempB[1] = B[(3+8*m)*lda+1+8*n];
274 TempB[2] = B[(3+8*m)*lda+2+8*n];
275 TempB[3] = B[(3+8*m)*lda+3+8*n];
276 TempB[4] = B[(3+8*m)*lda+4+8*n];
277 TempB[5] = B[(3+8*m)*lda+5+8*n];
278 TempB[6] = B[(3+8*m)*lda+6+8*n];
279 TempB[7] = B[(3+8*m)*lda+7+8*n];
280
281 C[0+8*n+j*lda] += TempA[3] * TempB[0];
282 C[1+8*n+j*lda] += TempA[3] * TempB[1];
283 C[2+8*n+j*lda] += TempA[3] * TempB[2];
284 C[3+8*n+j*lda] += TempA[3] * TempB[3];
285 C[4+8*n+j*lda] += TempA[3] * TempB[4];
286 C[5+8*n+j*lda] += TempA[3] * TempB[5];
287 C[6+8*n+j*lda] += TempA[3] * TempB[6];
288 C[7+8*n+j*lda] += TempA[3] * TempB[7];
289
290
291 TempB[0] = B[(4+8*m)*lda+0+8*n];
292 TempB[1] = B[(4+8*m)*lda+1+8*n];
293 TempB[2] = B[(4+8*m)*lda+2+8*n];
294 TempB[3] = B[(4+8*m)*lda+3+8*n];
295 TempB[4] = B[(4+8*m)*lda+4+8*n];
296 TempB[5] = B[(4+8*m)*lda+5+8*n];
297 TempB[6] = B[(4+8*m)*lda+6+8*n];
298 TempB[7] = B[(4+8*m)*lda+7+8*n];
299
300 C[0+8*n+j*lda] += TempA[4] * TempB[0];
301 C[1+8*n+j*lda] += TempA[4] * TempB[1];
302 C[2+8*n+j*lda] += TempA[4] * TempB[2];
303 C[3+8*n+j*lda] += TempA[4] * TempB[3];
304 C[4+8*n+j*lda] += TempA[4] * TempB[4];
305 C[5+8*n+j*lda] += TempA[4] * TempB[5];
306 C[6+8*n+j*lda] += TempA[4] * TempB[6];
307 C[7+8*n+j*lda] += TempA[4] * TempB[7];
308
309
310
311 TempB[0] = B[(5+8*m)*lda+0+8*n];
312 TempB[1] = B[(5+8*m)*lda+1+8*n];
313 TempB[2] = B[(5+8*m)*lda+2+8*n];
314 TempB[3] = B[(5+8*m)*lda+3+8*n];
315 TempB[4] = B[(5+8*m)*lda+4+8*n];
316 TempB[5] = B[(5+8*m)*lda+5+8*n];
317 TempB[6] = B[(5+8*m)*lda+6+8*n];
318 TempB[7] = B[(5+8*m)*lda+7+8*n];
319
320 C[0+8*n+j*lda] += TempA[5] * TempB[0];
321 C[1+8*n+j*lda] += TempA[5] * TempB[1];
322 C[2+8*n+j*lda] += TempA[5] * TempB[2];
323 C[3+8*n+j*lda] += TempA[5] * TempB[3];
324 C[4+8*n+j*lda] += TempA[5] * TempB[4];
325 C[5+8*n+j*lda] += TempA[5] * TempB[5];
326 C[6+8*n+j*lda] += TempA[5] * TempB[6];
327 C[7+8*n+j*lda] += TempA[5] * TempB[7];
328
329
330
331 TempB[0] = B[(6+8*m)*lda+0+8*n];
332 TempB[1] = B[(6+8*m)*lda+1+8*n];
333 TempB[2] = B[(6+8*m)*lda+2+8*n];
334 TempB[3] = B[(6+8*m)*lda+3+8*n];
335 TempB[4] = B[(6+8*m)*lda+4+8*n];
336 TempB[5] = B[(6+8*m)*lda+5+8*n];
337 TempB[6] = B[(6+8*m)*lda+6+8*n];
338 TempB[7] = B[(6+8*m)*lda+7+8*n];
339
340 C[0+8*n+j*lda] += TempA[6] * TempB[0];
341 C[1+8*n+j*lda] += TempA[6] * TempB[1];
342 C[2+8*n+j*lda] += TempA[6] * TempB[2];
343 C[3+8*n+j*lda] += TempA[6] * TempB[3];
344 C[4+8*n+j*lda] += TempA[6] * TempB[4];
345 C[5+8*n+j*lda] += TempA[6] * TempB[5];
346 C[6+8*n+j*lda] += TempA[6] * TempB[6];
347 C[7+8*n+j*lda] += TempA[6] * TempB[7];
348
349
350 TempB[0] = B[(7+8*m)*lda+0+8*n];
351 TempB[1] = B[(7+8*m)*lda+1+8*n];
352 TempB[2] = B[(7+8*m)*lda+2+8*n];
353 TempB[3] = B[(7+8*m)*lda+3+8*n];
354 TempB[4] = B[(7+8*m)*lda+4+8*n];
355 TempB[5] = B[(7+8*m)*lda+5+8*n];
356 TempB[6] = B[(7+8*m)*lda+6+8*n];
357 TempB[7] = B[(7+8*m)*lda+7+8*n];
358
359 C[0+8*n+j*lda] += TempA[7] * TempB[0];
360 C[1+8*n+j*lda] += TempA[7] * TempB[1];
361 C[2+8*n+j*lda] += TempA[7] * TempB[2];
362 C[3+8*n+j*lda] += TempA[7] * TempB[3];
363 C[4+8*n+j*lda] += TempA[7] * TempB[4];
364 C[5+8*n+j*lda] += TempA[7] * TempB[5];
365 C[6+8*n+j*lda] += TempA[7] * TempB[6];
366 C[7+8*n+j*lda] += TempA[7] * TempB[7];
367 }
368
369 }
370 }
371 }
372
373
374 }