6 void __attribute__((noinline
)) matmul(const int coreid
, const int ncores
, const int lda
, const data_t A
[], const data_t B
[], data_t C
[] )
8 //----------------------------------------------------------------version 2.11 optmize j,use core 1 j from 0 to 15 MSI 98k i = j*lda
9 //----------------------------------------------------------------version 2.12 not use i = j *lda MSI 95k
10 static __thread data_t TempA
[8];
11 static __thread data_t TempB
[8];
12 static __thread
int j
,m
,n
,i
,k
;
14 if(coreid
== 1 || ncores
== 1)
16 for ( j
= 16; j
< 32; j
++ )
19 for ( m
= 0; m
< 4; m
++ )
22 TempA
[0] = A
[j
*lda
+0+8*m
];
23 TempA
[1] = A
[j
*lda
+1+8*m
];
24 TempA
[2] = A
[j
*lda
+2+8*m
];
25 TempA
[3] = A
[j
*lda
+3+8*m
];
26 TempA
[4] = A
[j
*lda
+4+8*m
];
27 TempA
[5] = A
[j
*lda
+5+8*m
];
28 TempA
[6] = A
[j
*lda
+6+8*m
];
29 TempA
[7] = A
[j
*lda
+7+8*m
];
31 for( n
= 0; n
< 4; n
++)
33 TempB
[0] = B
[(0+8*m
)*lda
+0+8*n
];
34 TempB
[1] = B
[(0+8*m
)*lda
+1+8*n
];
35 TempB
[2] = B
[(0+8*m
)*lda
+2+8*n
];
36 TempB
[3] = B
[(0+8*m
)*lda
+3+8*n
];
37 TempB
[4] = B
[(0+8*m
)*lda
+4+8*n
];
38 TempB
[5] = B
[(0+8*m
)*lda
+5+8*n
];
39 TempB
[6] = B
[(0+8*m
)*lda
+6+8*n
];
40 TempB
[7] = B
[(0+8*m
)*lda
+7+8*n
];
42 C
[0+8*n
+j
*lda
] += TempA
[0] * TempB
[0];
43 C
[1+8*n
+j
*lda
] += TempA
[0] * TempB
[1];
44 C
[2+8*n
+j
*lda
] += TempA
[0] * TempB
[2];
45 C
[3+8*n
+j
*lda
] += TempA
[0] * TempB
[3];
46 C
[4+8*n
+j
*lda
] += TempA
[0] * TempB
[4];
47 C
[5+8*n
+j
*lda
] += TempA
[0] * TempB
[5];
48 C
[6+8*n
+j
*lda
] += TempA
[0] * TempB
[6];
49 C
[7+8*n
+j
*lda
] += TempA
[0] * TempB
[7];
53 TempB
[0] = B
[(1+8*m
)*lda
+0+8*n
];
54 TempB
[1] = B
[(1+8*m
)*lda
+1+8*n
];
55 TempB
[2] = B
[(1+8*m
)*lda
+2+8*n
];
56 TempB
[3] = B
[(1+8*m
)*lda
+3+8*n
];
57 TempB
[4] = B
[(1+8*m
)*lda
+4+8*n
];
58 TempB
[5] = B
[(1+8*m
)*lda
+5+8*n
];
59 TempB
[6] = B
[(1+8*m
)*lda
+6+8*n
];
60 TempB
[7] = B
[(1+8*m
)*lda
+7+8*n
];
62 C
[0+8*n
+j
*lda
] += TempA
[1] * TempB
[0];
63 C
[1+8*n
+j
*lda
] += TempA
[1] * TempB
[1];
64 C
[2+8*n
+j
*lda
] += TempA
[1] * TempB
[2];
65 C
[3+8*n
+j
*lda
] += TempA
[1] * TempB
[3];
66 C
[4+8*n
+j
*lda
] += TempA
[1] * TempB
[4];
67 C
[5+8*n
+j
*lda
] += TempA
[1] * TempB
[5];
68 C
[6+8*n
+j
*lda
] += TempA
[1] * TempB
[6];
69 C
[7+8*n
+j
*lda
] += TempA
[1] * TempB
[7];
73 TempB
[0] = B
[(2+8*m
)*lda
+0+8*n
];
74 TempB
[1] = B
[(2+8*m
)*lda
+1+8*n
];
75 TempB
[2] = B
[(2+8*m
)*lda
+2+8*n
];
76 TempB
[3] = B
[(2+8*m
)*lda
+3+8*n
];
77 TempB
[4] = B
[(2+8*m
)*lda
+4+8*n
];
78 TempB
[5] = B
[(2+8*m
)*lda
+5+8*n
];
79 TempB
[6] = B
[(2+8*m
)*lda
+6+8*n
];
80 TempB
[7] = B
[(2+8*m
)*lda
+7+8*n
];
82 C
[0+8*n
+j
*lda
] += TempA
[2] * TempB
[0];
83 C
[1+8*n
+j
*lda
] += TempA
[2] * TempB
[1];
84 C
[2+8*n
+j
*lda
] += TempA
[2] * TempB
[2];
85 C
[3+8*n
+j
*lda
] += TempA
[2] * TempB
[3];
86 C
[4+8*n
+j
*lda
] += TempA
[2] * TempB
[4];
87 C
[5+8*n
+j
*lda
] += TempA
[2] * TempB
[5];
88 C
[6+8*n
+j
*lda
] += TempA
[2] * TempB
[6];
89 C
[7+8*n
+j
*lda
] += TempA
[2] * TempB
[7];
93 TempB
[0] = B
[(3+8*m
)*lda
+0+8*n
];
94 TempB
[1] = B
[(3+8*m
)*lda
+1+8*n
];
95 TempB
[2] = B
[(3+8*m
)*lda
+2+8*n
];
96 TempB
[3] = B
[(3+8*m
)*lda
+3+8*n
];
97 TempB
[4] = B
[(3+8*m
)*lda
+4+8*n
];
98 TempB
[5] = B
[(3+8*m
)*lda
+5+8*n
];
99 TempB
[6] = B
[(3+8*m
)*lda
+6+8*n
];
100 TempB
[7] = B
[(3+8*m
)*lda
+7+8*n
];
102 C
[0+8*n
+j
*lda
] += TempA
[3] * TempB
[0];
103 C
[1+8*n
+j
*lda
] += TempA
[3] * TempB
[1];
104 C
[2+8*n
+j
*lda
] += TempA
[3] * TempB
[2];
105 C
[3+8*n
+j
*lda
] += TempA
[3] * TempB
[3];
106 C
[4+8*n
+j
*lda
] += TempA
[3] * TempB
[4];
107 C
[5+8*n
+j
*lda
] += TempA
[3] * TempB
[5];
108 C
[6+8*n
+j
*lda
] += TempA
[3] * TempB
[6];
109 C
[7+8*n
+j
*lda
] += TempA
[3] * TempB
[7];
112 TempB
[0] = B
[(4+8*m
)*lda
+0+8*n
];
113 TempB
[1] = B
[(4+8*m
)*lda
+1+8*n
];
114 TempB
[2] = B
[(4+8*m
)*lda
+2+8*n
];
115 TempB
[3] = B
[(4+8*m
)*lda
+3+8*n
];
116 TempB
[4] = B
[(4+8*m
)*lda
+4+8*n
];
117 TempB
[5] = B
[(4+8*m
)*lda
+5+8*n
];
118 TempB
[6] = B
[(4+8*m
)*lda
+6+8*n
];
119 TempB
[7] = B
[(4+8*m
)*lda
+7+8*n
];
121 C
[0+8*n
+j
*lda
] += TempA
[4] * TempB
[0];
122 C
[1+8*n
+j
*lda
] += TempA
[4] * TempB
[1];
123 C
[2+8*n
+j
*lda
] += TempA
[4] * TempB
[2];
124 C
[3+8*n
+j
*lda
] += TempA
[4] * TempB
[3];
125 C
[4+8*n
+j
*lda
] += TempA
[4] * TempB
[4];
126 C
[5+8*n
+j
*lda
] += TempA
[4] * TempB
[5];
127 C
[6+8*n
+j
*lda
] += TempA
[4] * TempB
[6];
128 C
[7+8*n
+j
*lda
] += TempA
[4] * TempB
[7];
132 TempB
[0] = B
[(5+8*m
)*lda
+0+8*n
];
133 TempB
[1] = B
[(5+8*m
)*lda
+1+8*n
];
134 TempB
[2] = B
[(5+8*m
)*lda
+2+8*n
];
135 TempB
[3] = B
[(5+8*m
)*lda
+3+8*n
];
136 TempB
[4] = B
[(5+8*m
)*lda
+4+8*n
];
137 TempB
[5] = B
[(5+8*m
)*lda
+5+8*n
];
138 TempB
[6] = B
[(5+8*m
)*lda
+6+8*n
];
139 TempB
[7] = B
[(5+8*m
)*lda
+7+8*n
];
141 C
[0+8*n
+j
*lda
] += TempA
[5] * TempB
[0];
142 C
[1+8*n
+j
*lda
] += TempA
[5] * TempB
[1];
143 C
[2+8*n
+j
*lda
] += TempA
[5] * TempB
[2];
144 C
[3+8*n
+j
*lda
] += TempA
[5] * TempB
[3];
145 C
[4+8*n
+j
*lda
] += TempA
[5] * TempB
[4];
146 C
[5+8*n
+j
*lda
] += TempA
[5] * TempB
[5];
147 C
[6+8*n
+j
*lda
] += TempA
[5] * TempB
[6];
148 C
[7+8*n
+j
*lda
] += TempA
[5] * TempB
[7];
152 TempB
[0] = B
[(6+8*m
)*lda
+0+8*n
];
153 TempB
[1] = B
[(6+8*m
)*lda
+1+8*n
];
154 TempB
[2] = B
[(6+8*m
)*lda
+2+8*n
];
155 TempB
[3] = B
[(6+8*m
)*lda
+3+8*n
];
156 TempB
[4] = B
[(6+8*m
)*lda
+4+8*n
];
157 TempB
[5] = B
[(6+8*m
)*lda
+5+8*n
];
158 TempB
[6] = B
[(6+8*m
)*lda
+6+8*n
];
159 TempB
[7] = B
[(6+8*m
)*lda
+7+8*n
];
161 C
[0+8*n
+j
*lda
] += TempA
[6] * TempB
[0];
162 C
[1+8*n
+j
*lda
] += TempA
[6] * TempB
[1];
163 C
[2+8*n
+j
*lda
] += TempA
[6] * TempB
[2];
164 C
[3+8*n
+j
*lda
] += TempA
[6] * TempB
[3];
165 C
[4+8*n
+j
*lda
] += TempA
[6] * TempB
[4];
166 C
[5+8*n
+j
*lda
] += TempA
[6] * TempB
[5];
167 C
[6+8*n
+j
*lda
] += TempA
[6] * TempB
[6];
168 C
[7+8*n
+j
*lda
] += TempA
[6] * TempB
[7];
171 TempB
[0] = B
[(7+8*m
)*lda
+0+8*n
];
172 TempB
[1] = B
[(7+8*m
)*lda
+1+8*n
];
173 TempB
[2] = B
[(7+8*m
)*lda
+2+8*n
];
174 TempB
[3] = B
[(7+8*m
)*lda
+3+8*n
];
175 TempB
[4] = B
[(7+8*m
)*lda
+4+8*n
];
176 TempB
[5] = B
[(7+8*m
)*lda
+5+8*n
];
177 TempB
[6] = B
[(7+8*m
)*lda
+6+8*n
];
178 TempB
[7] = B
[(7+8*m
)*lda
+7+8*n
];
180 C
[0+8*n
+j
*lda
] += TempA
[7] * TempB
[0];
181 C
[1+8*n
+j
*lda
] += TempA
[7] * TempB
[1];
182 C
[2+8*n
+j
*lda
] += TempA
[7] * TempB
[2];
183 C
[3+8*n
+j
*lda
] += TempA
[7] * TempB
[3];
184 C
[4+8*n
+j
*lda
] += TempA
[7] * TempB
[4];
185 C
[5+8*n
+j
*lda
] += TempA
[7] * TempB
[5];
186 C
[6+8*n
+j
*lda
] += TempA
[7] * TempB
[6];
187 C
[7+8*n
+j
*lda
] += TempA
[7] * TempB
[7];
195 for ( j
= 0; j
< 16; j
++ )
198 for ( m
= 0; m
< 4; m
++ )
201 TempA
[0] = A
[j
*lda
+0+8*m
];
202 TempA
[1] = A
[j
*lda
+1+8*m
];
203 TempA
[2] = A
[j
*lda
+2+8*m
];
204 TempA
[3] = A
[j
*lda
+3+8*m
];
205 TempA
[4] = A
[j
*lda
+4+8*m
];
206 TempA
[5] = A
[j
*lda
+5+8*m
];
207 TempA
[6] = A
[j
*lda
+6+8*m
];
208 TempA
[7] = A
[j
*lda
+7+8*m
];
210 for( n
= 0; n
< 4; n
++)
212 TempB
[0] = B
[(0+8*m
)*lda
+0+8*n
];
213 TempB
[1] = B
[(0+8*m
)*lda
+1+8*n
];
214 TempB
[2] = B
[(0+8*m
)*lda
+2+8*n
];
215 TempB
[3] = B
[(0+8*m
)*lda
+3+8*n
];
216 TempB
[4] = B
[(0+8*m
)*lda
+4+8*n
];
217 TempB
[5] = B
[(0+8*m
)*lda
+5+8*n
];
218 TempB
[6] = B
[(0+8*m
)*lda
+6+8*n
];
219 TempB
[7] = B
[(0+8*m
)*lda
+7+8*n
];
221 C
[0+8*n
+j
*lda
] += TempA
[0] * TempB
[0];
222 C
[1+8*n
+j
*lda
] += TempA
[0] * TempB
[1];
223 C
[2+8*n
+j
*lda
] += TempA
[0] * TempB
[2];
224 C
[3+8*n
+j
*lda
] += TempA
[0] * TempB
[3];
225 C
[4+8*n
+j
*lda
] += TempA
[0] * TempB
[4];
226 C
[5+8*n
+j
*lda
] += TempA
[0] * TempB
[5];
227 C
[6+8*n
+j
*lda
] += TempA
[0] * TempB
[6];
228 C
[7+8*n
+j
*lda
] += TempA
[0] * TempB
[7];
232 TempB
[0] = B
[(1+8*m
)*lda
+0+8*n
];
233 TempB
[1] = B
[(1+8*m
)*lda
+1+8*n
];
234 TempB
[2] = B
[(1+8*m
)*lda
+2+8*n
];
235 TempB
[3] = B
[(1+8*m
)*lda
+3+8*n
];
236 TempB
[4] = B
[(1+8*m
)*lda
+4+8*n
];
237 TempB
[5] = B
[(1+8*m
)*lda
+5+8*n
];
238 TempB
[6] = B
[(1+8*m
)*lda
+6+8*n
];
239 TempB
[7] = B
[(1+8*m
)*lda
+7+8*n
];
241 C
[0+8*n
+j
*lda
] += TempA
[1] * TempB
[0];
242 C
[1+8*n
+j
*lda
] += TempA
[1] * TempB
[1];
243 C
[2+8*n
+j
*lda
] += TempA
[1] * TempB
[2];
244 C
[3+8*n
+j
*lda
] += TempA
[1] * TempB
[3];
245 C
[4+8*n
+j
*lda
] += TempA
[1] * TempB
[4];
246 C
[5+8*n
+j
*lda
] += TempA
[1] * TempB
[5];
247 C
[6+8*n
+j
*lda
] += TempA
[1] * TempB
[6];
248 C
[7+8*n
+j
*lda
] += TempA
[1] * TempB
[7];
252 TempB
[0] = B
[(2+8*m
)*lda
+0+8*n
];
253 TempB
[1] = B
[(2+8*m
)*lda
+1+8*n
];
254 TempB
[2] = B
[(2+8*m
)*lda
+2+8*n
];
255 TempB
[3] = B
[(2+8*m
)*lda
+3+8*n
];
256 TempB
[4] = B
[(2+8*m
)*lda
+4+8*n
];
257 TempB
[5] = B
[(2+8*m
)*lda
+5+8*n
];
258 TempB
[6] = B
[(2+8*m
)*lda
+6+8*n
];
259 TempB
[7] = B
[(2+8*m
)*lda
+7+8*n
];
261 C
[0+8*n
+j
*lda
] += TempA
[2] * TempB
[0];
262 C
[1+8*n
+j
*lda
] += TempA
[2] * TempB
[1];
263 C
[2+8*n
+j
*lda
] += TempA
[2] * TempB
[2];
264 C
[3+8*n
+j
*lda
] += TempA
[2] * TempB
[3];
265 C
[4+8*n
+j
*lda
] += TempA
[2] * TempB
[4];
266 C
[5+8*n
+j
*lda
] += TempA
[2] * TempB
[5];
267 C
[6+8*n
+j
*lda
] += TempA
[2] * TempB
[6];
268 C
[7+8*n
+j
*lda
] += TempA
[2] * TempB
[7];
272 TempB
[0] = B
[(3+8*m
)*lda
+0+8*n
];
273 TempB
[1] = B
[(3+8*m
)*lda
+1+8*n
];
274 TempB
[2] = B
[(3+8*m
)*lda
+2+8*n
];
275 TempB
[3] = B
[(3+8*m
)*lda
+3+8*n
];
276 TempB
[4] = B
[(3+8*m
)*lda
+4+8*n
];
277 TempB
[5] = B
[(3+8*m
)*lda
+5+8*n
];
278 TempB
[6] = B
[(3+8*m
)*lda
+6+8*n
];
279 TempB
[7] = B
[(3+8*m
)*lda
+7+8*n
];
281 C
[0+8*n
+j
*lda
] += TempA
[3] * TempB
[0];
282 C
[1+8*n
+j
*lda
] += TempA
[3] * TempB
[1];
283 C
[2+8*n
+j
*lda
] += TempA
[3] * TempB
[2];
284 C
[3+8*n
+j
*lda
] += TempA
[3] * TempB
[3];
285 C
[4+8*n
+j
*lda
] += TempA
[3] * TempB
[4];
286 C
[5+8*n
+j
*lda
] += TempA
[3] * TempB
[5];
287 C
[6+8*n
+j
*lda
] += TempA
[3] * TempB
[6];
288 C
[7+8*n
+j
*lda
] += TempA
[3] * TempB
[7];
291 TempB
[0] = B
[(4+8*m
)*lda
+0+8*n
];
292 TempB
[1] = B
[(4+8*m
)*lda
+1+8*n
];
293 TempB
[2] = B
[(4+8*m
)*lda
+2+8*n
];
294 TempB
[3] = B
[(4+8*m
)*lda
+3+8*n
];
295 TempB
[4] = B
[(4+8*m
)*lda
+4+8*n
];
296 TempB
[5] = B
[(4+8*m
)*lda
+5+8*n
];
297 TempB
[6] = B
[(4+8*m
)*lda
+6+8*n
];
298 TempB
[7] = B
[(4+8*m
)*lda
+7+8*n
];
300 C
[0+8*n
+j
*lda
] += TempA
[4] * TempB
[0];
301 C
[1+8*n
+j
*lda
] += TempA
[4] * TempB
[1];
302 C
[2+8*n
+j
*lda
] += TempA
[4] * TempB
[2];
303 C
[3+8*n
+j
*lda
] += TempA
[4] * TempB
[3];
304 C
[4+8*n
+j
*lda
] += TempA
[4] * TempB
[4];
305 C
[5+8*n
+j
*lda
] += TempA
[4] * TempB
[5];
306 C
[6+8*n
+j
*lda
] += TempA
[4] * TempB
[6];
307 C
[7+8*n
+j
*lda
] += TempA
[4] * TempB
[7];
311 TempB
[0] = B
[(5+8*m
)*lda
+0+8*n
];
312 TempB
[1] = B
[(5+8*m
)*lda
+1+8*n
];
313 TempB
[2] = B
[(5+8*m
)*lda
+2+8*n
];
314 TempB
[3] = B
[(5+8*m
)*lda
+3+8*n
];
315 TempB
[4] = B
[(5+8*m
)*lda
+4+8*n
];
316 TempB
[5] = B
[(5+8*m
)*lda
+5+8*n
];
317 TempB
[6] = B
[(5+8*m
)*lda
+6+8*n
];
318 TempB
[7] = B
[(5+8*m
)*lda
+7+8*n
];
320 C
[0+8*n
+j
*lda
] += TempA
[5] * TempB
[0];
321 C
[1+8*n
+j
*lda
] += TempA
[5] * TempB
[1];
322 C
[2+8*n
+j
*lda
] += TempA
[5] * TempB
[2];
323 C
[3+8*n
+j
*lda
] += TempA
[5] * TempB
[3];
324 C
[4+8*n
+j
*lda
] += TempA
[5] * TempB
[4];
325 C
[5+8*n
+j
*lda
] += TempA
[5] * TempB
[5];
326 C
[6+8*n
+j
*lda
] += TempA
[5] * TempB
[6];
327 C
[7+8*n
+j
*lda
] += TempA
[5] * TempB
[7];
331 TempB
[0] = B
[(6+8*m
)*lda
+0+8*n
];
332 TempB
[1] = B
[(6+8*m
)*lda
+1+8*n
];
333 TempB
[2] = B
[(6+8*m
)*lda
+2+8*n
];
334 TempB
[3] = B
[(6+8*m
)*lda
+3+8*n
];
335 TempB
[4] = B
[(6+8*m
)*lda
+4+8*n
];
336 TempB
[5] = B
[(6+8*m
)*lda
+5+8*n
];
337 TempB
[6] = B
[(6+8*m
)*lda
+6+8*n
];
338 TempB
[7] = B
[(6+8*m
)*lda
+7+8*n
];
340 C
[0+8*n
+j
*lda
] += TempA
[6] * TempB
[0];
341 C
[1+8*n
+j
*lda
] += TempA
[6] * TempB
[1];
342 C
[2+8*n
+j
*lda
] += TempA
[6] * TempB
[2];
343 C
[3+8*n
+j
*lda
] += TempA
[6] * TempB
[3];
344 C
[4+8*n
+j
*lda
] += TempA
[6] * TempB
[4];
345 C
[5+8*n
+j
*lda
] += TempA
[6] * TempB
[5];
346 C
[6+8*n
+j
*lda
] += TempA
[6] * TempB
[6];
347 C
[7+8*n
+j
*lda
] += TempA
[6] * TempB
[7];
350 TempB
[0] = B
[(7+8*m
)*lda
+0+8*n
];
351 TempB
[1] = B
[(7+8*m
)*lda
+1+8*n
];
352 TempB
[2] = B
[(7+8*m
)*lda
+2+8*n
];
353 TempB
[3] = B
[(7+8*m
)*lda
+3+8*n
];
354 TempB
[4] = B
[(7+8*m
)*lda
+4+8*n
];
355 TempB
[5] = B
[(7+8*m
)*lda
+5+8*n
];
356 TempB
[6] = B
[(7+8*m
)*lda
+6+8*n
];
357 TempB
[7] = B
[(7+8*m
)*lda
+7+8*n
];
359 C
[0+8*n
+j
*lda
] += TempA
[7] * TempB
[0];
360 C
[1+8*n
+j
*lda
] += TempA
[7] * TempB
[1];
361 C
[2+8*n
+j
*lda
] += TempA
[7] * TempB
[2];
362 C
[3+8*n
+j
*lda
] += TempA
[7] * TempB
[3];
363 C
[4+8*n
+j
*lda
] += TempA
[7] * TempB
[4];
364 C
[5+8*n
+j
*lda
] += TempA
[7] * TempB
[5];
365 C
[6+8*n
+j
*lda
] += TempA
[7] * TempB
[6];
366 C
[7+8*n
+j
*lda
] += TempA
[7] * TempB
[7];