Fix build with riscv-gcc version 4.9
[riscv-tests.git] / mt / av_matmul / av_matmul.c
1 //**************************************************************************
2 // Multi-threaded Matrix Multiply benchmark
3 //--------------------------------------------------------------------------
4 // TA : Christopher Celio
5 // Student:
6 //
7 //
8 // This benchmark multiplies two 2-D arrays together and writes the results to
9 // a third vector. The input data (and reference data) should be generated
10 // using the matmul_gendata.pl perl script and dumped to a file named
11 // dataset.h.
12
13
14 // print out arrays, etc.
15 //#define DEBUG
16
17 //--------------------------------------------------------------------------
18 // Includes
19
20 #include <string.h>
21 #include <stdlib.h>
22 #include <stdio.h>
23
24
25 //--------------------------------------------------------------------------
26 // Input/Reference Data
27
28 typedef float data_t;
29 #include "dataset.h"
30
31
32 //--------------------------------------------------------------------------
33 // Basic Utilities and Multi-thread Support
34
// Core identifier for the running thread. Declared __thread, so each thread
// has its own copy. The helper functions in this file compare it against 0 so
// that only core 0 prints or verifies.
// NOTE(review): it is not assigned anywhere in this part of the file --
// presumably the startup code sets it; confirm.
35 __thread unsigned long coreid;
// Shared by all threads (no __thread qualifier).
// NOTE(review): presumably the number of cores taking part in the run; it is
// neither assigned nor read in this part of the file -- confirm.
36 unsigned long ncores;
37
38 #include "util.h"
39
// Two-level stringification: stringify_1(s) turns its argument into a string
// literal exactly as written, while stringify(s) lets the argument be
// macro-expanded first and only then turns the result into a string.
#define stringify_1(s) #s
#define stringify(s) stringify_1(s)

// stats(code): execute `code` once and, on core 0 only, print
//   - the source text of `code`,
//   - the elapsed cycle count,
//   - cycles per DIM_SIZE^3 iterations, as an integer part and one decimal digit,
//   - cycles per retired instruction (CPI), as an integer part and one decimal digit.
// The counters are read with rdcycle() and rdinstret(). Both counters start
// out as the negated "before" reading, so adding the "after" reading leaves
// the difference (unsigned arithmetic wraps, which is what makes this work).
// All printed values are unsigned long, so the conversions are %lu.
// Relies on rdcycle(), rdinstret(), coreid and DIM_SIZE being visible at the
// point of use.
#define stats(code) do { \
    unsigned long _c = -rdcycle(), _i = -rdinstret(); \
    code; \
    _c += rdcycle(), _i += rdinstret(); \
    if (coreid == 0) \
      printf("%s: %lu cycles, %lu.%lu cycles/iter, %lu.%lu CPI\n", \
             stringify(code), _c, _c/DIM_SIZE/DIM_SIZE/DIM_SIZE, \
             10*_c/DIM_SIZE/DIM_SIZE/DIM_SIZE%10, _c/_i, 10*_c/_i%10); \
  } while(0)
50
51
52 //--------------------------------------------------------------------------
53 // Helper functions
54
55 void printArrayMT( char name[], int n, data_t arr[] )
56 {
57 int i;
58 if (coreid != 0)
59 return;
60
61 printf( " %10s :", name );
62 for ( i = 0; i < n; i++ )
63 printf( " %3ld ", (long) arr[i] );
64 printf( "\n" );
65 }
66
67 void __attribute__((noinline)) verifyMT(size_t n, const data_t* test, const data_t* correct)
68 {
69 if (coreid != 0)
70 return;
71
72 size_t i;
73 for (i = 0; i < n; i++)
74 {
75 if (test[i] != correct[i])
76 {
77 printf("FAILED test[%d]= %3ld, correct[%d]= %3ld\n",
78 i, (long)test[i], i, (long)correct[i]);
79 exit(-1);
80 }
81 }
82
83 return;
84 }
85
86 //--------------------------------------------------------------------------
87 // matmul function
88
89 // single-thread, naive version
90 void __attribute__((noinline)) matmul_naive(const int lda, const data_t A[], const data_t B[], data_t C[] )
91 {
92 int i, j, k;
93
94 if (coreid > 0)
95 return;
96
97 for ( i = 0; i < lda; i++ )
98 for ( j = 0; j < lda; j++ )
99 {
100 for ( k = 0; k < lda; k++ )
101 {
102 C[i + j*lda] += A[j*lda + k] * B[k*lda + i];
103 }
104 }
105
106 }
107
108
109
110 void __attribute__((noinline)) matmul(const int lda, const data_t A[], const data_t B[], data_t C[] )
111 {
112
113 // ***************************** //
114 // **** ADD YOUR CODE HERE ***** //
115 // ***************************** //
116 //
117 // feel free to make a separate function for MI and MSI versions.
118
119 //-------------------------------------------------------------first working version best 500k
120 /*
121 static __thread int i, j, k;
122 if(coreid == 0)
123 {
124 for ( j = 0; j < lda; j+=2 )
125 {
126 for ( k = 0; k < lda; k++ )
127 {
128 for ( i = 0; i < lda; i++)
129 {
130 C[i + j*lda] += A[j*lda + k] * B[k*lda + i];
131 }
132 }
133 }
134 }
135
136 if(coreid ==1)
137 {
138 for ( j = 1; j < lda; j+=2 )
139 {
140 for ( k = 0;k < lda; k++)
141 {
142 for ( i = 0; i < lda; i++)
143 {
144 C[i + j*lda] += A[j*lda + k] * B[k*lda + i];
145
146 }
147 }
148 }
149 }
150 */
151 //-------------------------------------------------------------version1.1, take read out of inner loop,300k
152 /*
153 static __thread int i, j, k;
154 static __thread data_t TempA;
155
156 if(coreid == 0)
157 {
158 for ( j = 0; j < lda; j+=2 )
159 {
160 for ( k = 0; k < lda; k++ )
161 {
162 TempA = A[j*lda + k];
163 for ( i = 0; i < lda; i++)
164 {
165 C[i + j*lda] += TempA* B[k*lda + i];
166 }
167 }
168 }
169 }
170
171 if(coreid ==1)
172 {
173 for ( j = 1; j < lda; j+=2 )
174 {
175 for ( k = 0;k < lda; k++)
176 {
177 TempA = A[j*lda + k];
178 for ( i = 0; i < lda; i++)
179 {
180 C[i + j*lda] += TempA* B[k*lda + i];
181 }
182 }
183 }
184 }
185 */
186 //-------------------------------------------------------------version2.0, read 8 elements in B at one time. 140k mi, MSI117.0k
187 /*
188 static __thread int i, j, k, m, n;
189 static __thread data_t TempA;
190 static __thread data_t TempB[8];
191
192 if(coreid == 0)
193 {
194 for ( j = 0; j < lda; j+=2 )
195 {
196 for ( k = 0; k < lda; k++ )
197 {
198 TempA = A[j*lda + k];
199 for( n = 0; n < 4; n++)
200 {
201
202 TempB[0] = B[k*lda+0+8*n];
203 TempB[1] = B[k*lda+1+8*n];
204 TempB[2] = B[k*lda+2+8*n];
205 TempB[3] = B[k*lda+3+8*n];
206 TempB[4] = B[k*lda+4+8*n];
207 TempB[5] = B[k*lda+5+8*n];
208 TempB[6] = B[k*lda+6+8*n];
209 TempB[7] = B[k*lda+7+8*n];
210
211 C[0+8*n+j*lda] += TempA * TempB[0];
212 C[1+8*n+j*lda] += TempA * TempB[1];
213 C[2+8*n+j*lda] += TempA * TempB[2];
214 C[3+8*n+j*lda] += TempA * TempB[3];
215 C[4+8*n+j*lda] += TempA * TempB[4];
216 C[5+8*n+j*lda] += TempA * TempB[5];
217 C[6+8*n+j*lda] += TempA * TempB[6];
218 C[7+8*n+j*lda] += TempA * TempB[7];
219
220 }
221
222 }
223 }
224 }
225
226 if(coreid == 1)
227 {
228 for ( j = 1; j < lda; j+=2 )
229 {
230 for ( k = 0; k < lda; k++ )
231 {
232 TempA = A[j*lda + k];
233 for( n = 0; n < 4; n++)
234 {
235
236 TempB[0] = B[k*lda+0+8*n];
237 TempB[1] = B[k*lda+1+8*n];
238 TempB[2] = B[k*lda+2+8*n];
239 TempB[3] = B[k*lda+3+8*n];
240 TempB[4] = B[k*lda+4+8*n];
241 TempB[5] = B[k*lda+5+8*n];
242 TempB[6] = B[k*lda+6+8*n];
243 TempB[7] = B[k*lda+7+8*n];
244
245 C[0+8*n+j*lda] += TempA * TempB[0];
246 C[1+8*n+j*lda] += TempA * TempB[1];
247 C[2+8*n+j*lda] += TempA * TempB[2];
248 C[3+8*n+j*lda] += TempA * TempB[3];
249 C[4+8*n+j*lda] += TempA * TempB[4];
250 C[5+8*n+j*lda] += TempA * TempB[5];
251 C[6+8*n+j*lda] += TempA * TempB[6];
252 C[7+8*n+j*lda] += TempA * TempB[7];
253
254 }
255
256 }
257 }
258 }
259 */
260
261 //-------------------------------------------------------------version2.1, optimize k. 700k. bad move to v2.2.
262 //-------------------------------------------------------------version2.9 take off all inner loops for both cores, MSI,109K. MI 182k
263 //-------------------------------------------------------------version2.10 use i= j*lda inside the n loop increase speed. but not out m and n. tried replace first 3, get 104.9k
264 /*
265 static __thread int j, m, i,n;
266 static __thread data_t TempA[8];
267 static __thread data_t TempB[8];
268
269 if(coreid == 1)
270 {
271 for ( j = 1; j < lda; j+=2 )
272 {
273
274 for ( m = 0; m < 4; m++ )
275 {
276
277 TempA[0] = A[j*lda+0+8*m];
278 TempA[1] = A[j*lda+1+8*m];
279 TempA[2] = A[j*lda+2+8*m];
280 TempA[3] = A[j*lda+3+8*m];
281 TempA[4] = A[j*lda+4+8*m];
282 TempA[5] = A[j*lda+5+8*m];
283 TempA[6] = A[j*lda+6+8*m];
284 TempA[7] = A[j*lda+7+8*m];
285
286 for( n = 0; n < 4; n++)
287 {
288 i = j*lda;
289
290 TempB[0] = B[(0+8*m)*lda+0+8*n];
291 TempB[1] = B[(0+8*m)*lda+1+8*n];
292 TempB[2] = B[(0+8*m)*lda+2+8*n];
293 TempB[3] = B[(0+8*m)*lda+3+8*n];
294 TempB[4] = B[(0+8*m)*lda+4+8*n];
295 TempB[5] = B[(0+8*m)*lda+5+8*n];
296 TempB[6] = B[(0+8*m)*lda+6+8*n];
297 TempB[7] = B[(0+8*m)*lda+7+8*n];
298
299 C[0+8*n+i] += TempA[0] * TempB[0];
300 C[1+8*n+i] += TempA[0] * TempB[1];
301 C[2+8*n+i] += TempA[0] * TempB[2];
302 C[3+8*n+i] += TempA[0] * TempB[3];
303 C[4+8*n+i] += TempA[0] * TempB[4];
304 C[5+8*n+i] += TempA[0] * TempB[5];
305 C[6+8*n+i] += TempA[0] * TempB[6];
306 C[7+8*n+i] += TempA[0] * TempB[7];
307
308
309
310 TempB[0] = B[(1+8*m)*lda+0+8*n];
311 TempB[1] = B[(1+8*m)*lda+1+8*n];
312 TempB[2] = B[(1+8*m)*lda+2+8*n];
313 TempB[3] = B[(1+8*m)*lda+3+8*n];
314 TempB[4] = B[(1+8*m)*lda+4+8*n];
315 TempB[5] = B[(1+8*m)*lda+5+8*n];
316 TempB[6] = B[(1+8*m)*lda+6+8*n];
317 TempB[7] = B[(1+8*m)*lda+7+8*n];
318
319 C[0+8*n+i] += TempA[1] * TempB[0];
320 C[1+8*n+i] += TempA[1] * TempB[1];
321 C[2+8*n+i] += TempA[1] * TempB[2];
322 C[3+8*n+i] += TempA[1] * TempB[3];
323 C[4+8*n+i] += TempA[1] * TempB[4];
324 C[5+8*n+i] += TempA[1] * TempB[5];
325 C[6+8*n+i] += TempA[1] * TempB[6];
326 C[7+8*n+i] += TempA[1] * TempB[7];
327
328
329
330 TempB[0] = B[(2+8*m)*lda+0+8*n];
331 TempB[1] = B[(2+8*m)*lda+1+8*n];
332 TempB[2] = B[(2+8*m)*lda+2+8*n];
333 TempB[3] = B[(2+8*m)*lda+3+8*n];
334 TempB[4] = B[(2+8*m)*lda+4+8*n];
335 TempB[5] = B[(2+8*m)*lda+5+8*n];
336 TempB[6] = B[(2+8*m)*lda+6+8*n];
337 TempB[7] = B[(2+8*m)*lda+7+8*n];
338
339 C[0+8*n+i] += TempA[2] * TempB[0];
340 C[1+8*n+i] += TempA[2] * TempB[1];
341 C[2+8*n+i] += TempA[2] * TempB[2];
342 C[3+8*n+i] += TempA[2] * TempB[3];
343 C[4+8*n+i] += TempA[2] * TempB[4];
344 C[5+8*n+i] += TempA[2] * TempB[5];
345 C[6+8*n+i] += TempA[2] * TempB[6];
346 C[7+8*n+i] += TempA[2] * TempB[7];
347
348
349
350 TempB[0] = B[(3+8*m)*lda+0+8*n];
351 TempB[1] = B[(3+8*m)*lda+1+8*n];
352 TempB[2] = B[(3+8*m)*lda+2+8*n];
353 TempB[3] = B[(3+8*m)*lda+3+8*n];
354 TempB[4] = B[(3+8*m)*lda+4+8*n];
355 TempB[5] = B[(3+8*m)*lda+5+8*n];
356 TempB[6] = B[(3+8*m)*lda+6+8*n];
357 TempB[7] = B[(3+8*m)*lda+7+8*n];
358
359 C[0+8*n+i] += TempA[3] * TempB[0];
360 C[1+8*n+i] += TempA[3] * TempB[1];
361 C[2+8*n+i] += TempA[3] * TempB[2];
362 C[3+8*n+i] += TempA[3] * TempB[3];
363 C[4+8*n+i] += TempA[3] * TempB[4];
364 C[5+8*n+i] += TempA[3] * TempB[5];
365 C[6+8*n+i] += TempA[3] * TempB[6];
366 C[7+8*n+i] += TempA[3] * TempB[7];
367
368
369 TempB[0] = B[(4+8*m)*lda+0+8*n];
370 TempB[1] = B[(4+8*m)*lda+1+8*n];
371 TempB[2] = B[(4+8*m)*lda+2+8*n];
372 TempB[3] = B[(4+8*m)*lda+3+8*n];
373 TempB[4] = B[(4+8*m)*lda+4+8*n];
374 TempB[5] = B[(4+8*m)*lda+5+8*n];
375 TempB[6] = B[(4+8*m)*lda+6+8*n];
376 TempB[7] = B[(4+8*m)*lda+7+8*n];
377
378 C[0+8*n+i] += TempA[4] * TempB[0];
379 C[1+8*n+i] += TempA[4] * TempB[1];
380 C[2+8*n+i] += TempA[4] * TempB[2];
381 C[3+8*n+i] += TempA[4] * TempB[3];
382 C[4+8*n+i] += TempA[4] * TempB[4];
383 C[5+8*n+i] += TempA[4] * TempB[5];
384 C[6+8*n+i] += TempA[4] * TempB[6];
385 C[7+8*n+i] += TempA[4] * TempB[7];
386
387
388
389 TempB[0] = B[(5+8*m)*lda+0+8*n];
390 TempB[1] = B[(5+8*m)*lda+1+8*n];
391 TempB[2] = B[(5+8*m)*lda+2+8*n];
392 TempB[3] = B[(5+8*m)*lda+3+8*n];
393 TempB[4] = B[(5+8*m)*lda+4+8*n];
394 TempB[5] = B[(5+8*m)*lda+5+8*n];
395 TempB[6] = B[(5+8*m)*lda+6+8*n];
396 TempB[7] = B[(5+8*m)*lda+7+8*n];
397
398 C[0+8*n+i] += TempA[5] * TempB[0];
399 C[1+8*n+i] += TempA[5] * TempB[1];
400 C[2+8*n+i] += TempA[5] * TempB[2];
401 C[3+8*n+i] += TempA[5] * TempB[3];
402 C[4+8*n+i] += TempA[5] * TempB[4];
403 C[5+8*n+i] += TempA[5] * TempB[5];
404 C[6+8*n+i] += TempA[5] * TempB[6];
405 C[7+8*n+i] += TempA[5] * TempB[7];
406
407
408
409 TempB[0] = B[(6+8*m)*lda+0+8*n];
410 TempB[1] = B[(6+8*m)*lda+1+8*n];
411 TempB[2] = B[(6+8*m)*lda+2+8*n];
412 TempB[3] = B[(6+8*m)*lda+3+8*n];
413 TempB[4] = B[(6+8*m)*lda+4+8*n];
414 TempB[5] = B[(6+8*m)*lda+5+8*n];
415 TempB[6] = B[(6+8*m)*lda+6+8*n];
416 TempB[7] = B[(6+8*m)*lda+7+8*n];
417
418 C[0+8*n+i] += TempA[6] * TempB[0];
419 C[1+8*n+i] += TempA[6] * TempB[1];
420 C[2+8*n+i] += TempA[6] * TempB[2];
421 C[3+8*n+i] += TempA[6] * TempB[3];
422 C[4+8*n+i] += TempA[6] * TempB[4];
423 C[5+8*n+i] += TempA[6] * TempB[5];
424 C[6+8*n+i] += TempA[6] * TempB[6];
425 C[7+8*n+i] += TempA[6] * TempB[7];
426
427
428 TempB[0] = B[(7+8*m)*lda+0+8*n];
429 TempB[1] = B[(7+8*m)*lda+1+8*n];
430 TempB[2] = B[(7+8*m)*lda+2+8*n];
431 TempB[3] = B[(7+8*m)*lda+3+8*n];
432 TempB[4] = B[(7+8*m)*lda+4+8*n];
433 TempB[5] = B[(7+8*m)*lda+5+8*n];
434 TempB[6] = B[(7+8*m)*lda+6+8*n];
435 TempB[7] = B[(7+8*m)*lda+7+8*n];
436
437 C[0+8*n+i] += TempA[7] * TempB[0];
438 C[1+8*n+i] += TempA[7] * TempB[1];
439 C[2+8*n+i] += TempA[7] * TempB[2];
440 C[3+8*n+i] += TempA[7] * TempB[3];
441 C[4+8*n+i] += TempA[7] * TempB[4];
442 C[5+8*n+i] += TempA[7] * TempB[5];
443 C[6+8*n+i] += TempA[7] * TempB[6];
444 C[7+8*n+i] += TempA[7] * TempB[7];
445 }
446
447 }
448 }
449 }
450 if(coreid == 0)
451 {
452 for ( j = 0; j < lda; j+=2 )
453 {
454
455 for ( m = 0; m < 4; m++ )
456 {
457
458 TempA[0] = A[j*lda+0+8*m];
459 TempA[1] = A[j*lda+1+8*m];
460 TempA[2] = A[j*lda+2+8*m];
461 TempA[3] = A[j*lda+3+8*m];
462 TempA[4] = A[j*lda+4+8*m];
463 TempA[5] = A[j*lda+5+8*m];
464 TempA[6] = A[j*lda+6+8*m];
465 TempA[7] = A[j*lda+7+8*m];
466
467 for( n = 0; n < 4; n++)
468 {
469 i = j*lda;
470
471 TempB[0] = B[(0+8*m)*lda+0+8*n];
472 TempB[1] = B[(0+8*m)*lda+1+8*n];
473 TempB[2] = B[(0+8*m)*lda+2+8*n];
474 TempB[3] = B[(0+8*m)*lda+3+8*n];
475 TempB[4] = B[(0+8*m)*lda+4+8*n];
476 TempB[5] = B[(0+8*m)*lda+5+8*n];
477 TempB[6] = B[(0+8*m)*lda+6+8*n];
478 TempB[7] = B[(0+8*m)*lda+7+8*n];
479
480 C[0+8*n+i] += TempA[0] * TempB[0];
481 C[1+8*n+i] += TempA[0] * TempB[1];
482 C[2+8*n+i] += TempA[0] * TempB[2];
483 C[3+8*n+i] += TempA[0] * TempB[3];
484 C[4+8*n+i] += TempA[0] * TempB[4];
485 C[5+8*n+i] += TempA[0] * TempB[5];
486 C[6+8*n+i] += TempA[0] * TempB[6];
487 C[7+8*n+i] += TempA[0] * TempB[7];
488
489
490
491 TempB[0] = B[(1+8*m)*lda+0+8*n];
492 TempB[1] = B[(1+8*m)*lda+1+8*n];
493 TempB[2] = B[(1+8*m)*lda+2+8*n];
494 TempB[3] = B[(1+8*m)*lda+3+8*n];
495 TempB[4] = B[(1+8*m)*lda+4+8*n];
496 TempB[5] = B[(1+8*m)*lda+5+8*n];
497 TempB[6] = B[(1+8*m)*lda+6+8*n];
498 TempB[7] = B[(1+8*m)*lda+7+8*n];
499
500 C[0+8*n+i] += TempA[1] * TempB[0];
501 C[1+8*n+i] += TempA[1] * TempB[1];
502 C[2+8*n+i] += TempA[1] * TempB[2];
503 C[3+8*n+i] += TempA[1] * TempB[3];
504 C[4+8*n+i] += TempA[1] * TempB[4];
505 C[5+8*n+i] += TempA[1] * TempB[5];
506 C[6+8*n+i] += TempA[1] * TempB[6];
507 C[7+8*n+i] += TempA[1] * TempB[7];
508
509
510
511 TempB[0] = B[(2+8*m)*lda+0+8*n];
512 TempB[1] = B[(2+8*m)*lda+1+8*n];
513 TempB[2] = B[(2+8*m)*lda+2+8*n];
514 TempB[3] = B[(2+8*m)*lda+3+8*n];
515 TempB[4] = B[(2+8*m)*lda+4+8*n];
516 TempB[5] = B[(2+8*m)*lda+5+8*n];
517 TempB[6] = B[(2+8*m)*lda+6+8*n];
518 TempB[7] = B[(2+8*m)*lda+7+8*n];
519
520 C[0+8*n+i] += TempA[2] * TempB[0];
521 C[1+8*n+i] += TempA[2] * TempB[1];
522 C[2+8*n+i] += TempA[2] * TempB[2];
523 C[3+8*n+i] += TempA[2] * TempB[3];
524 C[4+8*n+i] += TempA[2] * TempB[4];
525 C[5+8*n+i] += TempA[2] * TempB[5];
526 C[6+8*n+i] += TempA[2] * TempB[6];
527 C[7+8*n+i] += TempA[2] * TempB[7];
528
529
530
531 TempB[0] = B[(3+8*m)*lda+0+8*n];
532 TempB[1] = B[(3+8*m)*lda+1+8*n];
533 TempB[2] = B[(3+8*m)*lda+2+8*n];
534 TempB[3] = B[(3+8*m)*lda+3+8*n];
535 TempB[4] = B[(3+8*m)*lda+4+8*n];
536 TempB[5] = B[(3+8*m)*lda+5+8*n];
537 TempB[6] = B[(3+8*m)*lda+6+8*n];
538 TempB[7] = B[(3+8*m)*lda+7+8*n];
539
540 C[0+8*n+i] += TempA[3] * TempB[0];
541 C[1+8*n+i] += TempA[3] * TempB[1];
542 C[2+8*n+i] += TempA[3] * TempB[2];
543 C[3+8*n+i] += TempA[3] * TempB[3];
544 C[4+8*n+i] += TempA[3] * TempB[4];
545 C[5+8*n+i] += TempA[3] * TempB[5];
546 C[6+8*n+i] += TempA[3] * TempB[6];
547 C[7+8*n+i] += TempA[3] * TempB[7];
548
549
550 TempB[0] = B[(4+8*m)*lda+0+8*n];
551 TempB[1] = B[(4+8*m)*lda+1+8*n];
552 TempB[2] = B[(4+8*m)*lda+2+8*n];
553 TempB[3] = B[(4+8*m)*lda+3+8*n];
554 TempB[4] = B[(4+8*m)*lda+4+8*n];
555 TempB[5] = B[(4+8*m)*lda+5+8*n];
556 TempB[6] = B[(4+8*m)*lda+6+8*n];
557 TempB[7] = B[(4+8*m)*lda+7+8*n];
558
559 C[0+8*n+i] += TempA[4] * TempB[0];
560 C[1+8*n+i] += TempA[4] * TempB[1];
561 C[2+8*n+i] += TempA[4] * TempB[2];
562 C[3+8*n+i] += TempA[4] * TempB[3];
563 C[4+8*n+i] += TempA[4] * TempB[4];
564 C[5+8*n+i] += TempA[4] * TempB[5];
565 C[6+8*n+i] += TempA[4] * TempB[6];
566 C[7+8*n+i] += TempA[4] * TempB[7];
567
568
569
570 TempB[0] = B[(5+8*m)*lda+0+8*n];
571 TempB[1] = B[(5+8*m)*lda+1+8*n];
572 TempB[2] = B[(5+8*m)*lda+2+8*n];
573 TempB[3] = B[(5+8*m)*lda+3+8*n];
574 TempB[4] = B[(5+8*m)*lda+4+8*n];
575 TempB[5] = B[(5+8*m)*lda+5+8*n];
576 TempB[6] = B[(5+8*m)*lda+6+8*n];
577 TempB[7] = B[(5+8*m)*lda+7+8*n];
578
579 C[0+8*n+i] += TempA[5] * TempB[0];
580 C[1+8*n+i] += TempA[5] * TempB[1];
581 C[2+8*n+i] += TempA[5] * TempB[2];
582 C[3+8*n+i] += TempA[5] * TempB[3];
583 C[4+8*n+i] += TempA[5] * TempB[4];
584 C[5+8*n+i] += TempA[5] * TempB[5];
585 C[6+8*n+i] += TempA[5] * TempB[6];
586 C[7+8*n+i] += TempA[5] * TempB[7];
587
588
589
590 TempB[0] = B[(6+8*m)*lda+0+8*n];
591 TempB[1] = B[(6+8*m)*lda+1+8*n];
592 TempB[2] = B[(6+8*m)*lda+2+8*n];
593 TempB[3] = B[(6+8*m)*lda+3+8*n];
594 TempB[4] = B[(6+8*m)*lda+4+8*n];
595 TempB[5] = B[(6+8*m)*lda+5+8*n];
596 TempB[6] = B[(6+8*m)*lda+6+8*n];
597 TempB[7] = B[(6+8*m)*lda+7+8*n];
598
599 C[0+8*n+i] += TempA[6] * TempB[0];
600 C[1+8*n+i] += TempA[6] * TempB[1];
601 C[2+8*n+i] += TempA[6] * TempB[2];
602 C[3+8*n+i] += TempA[6] * TempB[3];
603 C[4+8*n+i] += TempA[6] * TempB[4];
604 C[5+8*n+i] += TempA[6] * TempB[5];
605 C[6+8*n+i] += TempA[6] * TempB[6];
606 C[7+8*n+i] += TempA[6] * TempB[7];
607
608
609 TempB[0] = B[(7+8*m)*lda+0+8*n];
610 TempB[1] = B[(7+8*m)*lda+1+8*n];
611 TempB[2] = B[(7+8*m)*lda+2+8*n];
612 TempB[3] = B[(7+8*m)*lda+3+8*n];
613 TempB[4] = B[(7+8*m)*lda+4+8*n];
614 TempB[5] = B[(7+8*m)*lda+5+8*n];
615 TempB[6] = B[(7+8*m)*lda+6+8*n];
616 TempB[7] = B[(7+8*m)*lda+7+8*n];
617
618 C[0+8*n+i] += TempA[7] * TempB[0];
619 C[1+8*n+i] += TempA[7] * TempB[1];
620 C[2+8*n+i] += TempA[7] * TempB[2];
621 C[3+8*n+i] += TempA[7] * TempB[3];
622 C[4+8*n+i] += TempA[7] * TempB[4];
623 C[5+8*n+i] += TempA[7] * TempB[5];
624 C[6+8*n+i] += TempA[7] * TempB[6];
625 C[7+8*n+i] += TempA[7] * TempB[7];
626 }
627
628 }
629 }
630 }
631
632 */
633 //-------------------------------------------------------------version2.2, optimize k. from 4 instead of 8 like v2.1, random failing on MI, unknown reason, MSI,350K, take off each inner loop for core 0 260k, both cores 134k
634 //-------------------------------------------------------------try false sharing for core 0, 136k.
635 /*
636 static __thread int j, m, n;
637 static __thread data_t TempA[4];
638 static __thread data_t TempB[4];
639
640 if(coreid == 1)
641 {
642 for ( j = 1; j < lda; j+=2 )
643 {
644 for ( m = 0; m < 8; m++ )
645 {
646 TempA[0] = A[j*lda+0+4*m];
647 TempA[1] = A[j*lda+1+4*m];
648 TempA[2] = A[j*lda+2+4*m];
649 TempA[3] = A[j*lda+3+4*m];
650
651 for( n = 0; n < 8; n++)
652 {
653
654 TempB[0] = B[(0+4*m)*lda+0+4*n];
655 TempB[1] = B[(0+4*m)*lda+1+4*n];
656 TempB[2] = B[(0+4*m)*lda+2+4*n];
657 TempB[3] = B[(0+4*m)*lda+3+4*n];
658
659
660 C[0+4*n+j*lda] += TempA[0] * TempB[0];
661 C[1+4*n+j*lda] += TempA[0] * TempB[1];
662 C[2+4*n+j*lda] += TempA[0] * TempB[2];
663 C[3+4*n+j*lda] += TempA[0] * TempB[3];
664
665
666
667
668
669 TempB[0] = B[(1+4*m)*lda+0+4*n];
670 TempB[1] = B[(1+4*m)*lda+1+4*n];
671 TempB[2] = B[(1+4*m)*lda+2+4*n];
672 TempB[3] = B[(1+4*m)*lda+3+4*n];
673
674
675 C[0+4*n+j*lda] += TempA[1] * TempB[0];
676 C[1+4*n+j*lda] += TempA[1] * TempB[1];
677 C[2+4*n+j*lda] += TempA[1] * TempB[2];
678 C[3+4*n+j*lda] += TempA[1] * TempB[3];
679
680
681
682 TempB[0] = B[(2+4*m)*lda+0+4*n];
683 TempB[1] = B[(2+4*m)*lda+1+4*n];
684 TempB[2] = B[(2+4*m)*lda+2+4*n];
685 TempB[3] = B[(2+4*m)*lda+3+4*n];
686
687
688 C[0+4*n+j*lda] += TempA[2] * TempB[0];
689 C[1+4*n+j*lda] += TempA[2] * TempB[1];
690 C[2+4*n+j*lda] += TempA[2] * TempB[2];
691 C[3+4*n+j*lda] += TempA[2] * TempB[3];
692
693
694
695
696 TempB[0] = B[(3+4*m)*lda+0+4*n];
697 TempB[1] = B[(3+4*m)*lda+1+4*n];
698 TempB[2] = B[(3+4*m)*lda+2+4*n];
699 TempB[3] = B[(3+4*m)*lda+3+4*n];
700
701
702 C[0+4*n+j*lda] += TempA[3] * TempB[0];
703 C[1+4*n+j*lda] += TempA[3] * TempB[1];
704 C[2+4*n+j*lda] += TempA[3] * TempB[2];
705 C[3+4*n+j*lda] += TempA[3] * TempB[3];
706
707
708 }
709 }
710 }
711 }
712 if(coreid == 0)
713 {
714 for ( j = 0; j < lda; j+=2 )
715 {
716 for ( m = 0; m < 8; m++ )
717 {
718 TempA[0] = A[j*lda+0+4*m];
719 TempA[1] = A[j*lda+1+4*m];
720 TempA[2] = A[j*lda+2+4*m];
721 TempA[3] = A[j*lda+3+4*m];
722
723 for( n = 0; n < 8; n++)
724 {
725
726
727
728
729
730
731
732 TempB[0] = B[(1+4*m)*lda+0+4*n];
733 TempB[1] = B[(1+4*m)*lda+1+4*n];
734 TempB[2] = B[(1+4*m)*lda+2+4*n];
735 TempB[3] = B[(1+4*m)*lda+3+4*n];
736
737
738 C[0+4*n+j*lda] += TempA[1] * TempB[0];
739 C[1+4*n+j*lda] += TempA[1] * TempB[1];
740 C[2+4*n+j*lda] += TempA[1] * TempB[2];
741 C[3+4*n+j*lda] += TempA[1] * TempB[3];
742
743
744
745 TempB[0] = B[(2+4*m)*lda+0+4*n];
746 TempB[1] = B[(2+4*m)*lda+1+4*n];
747 TempB[2] = B[(2+4*m)*lda+2+4*n];
748 TempB[3] = B[(2+4*m)*lda+3+4*n];
749
750
751 C[0+4*n+j*lda] += TempA[2] * TempB[0];
752 C[1+4*n+j*lda] += TempA[2] * TempB[1];
753 C[2+4*n+j*lda] += TempA[2] * TempB[2];
754 C[3+4*n+j*lda] += TempA[2] * TempB[3];
755
756
757
758
759 TempB[0] = B[(3+4*m)*lda+0+4*n];
760 TempB[1] = B[(3+4*m)*lda+1+4*n];
761 TempB[2] = B[(3+4*m)*lda+2+4*n];
762 TempB[3] = B[(3+4*m)*lda+3+4*n];
763
764
765 C[0+4*n+j*lda] += TempA[3] * TempB[0];
766 C[1+4*n+j*lda] += TempA[3] * TempB[1];
767 C[2+4*n+j*lda] += TempA[3] * TempB[2];
768 C[3+4*n+j*lda] += TempA[3] * TempB[3];
769
770 TempB[0] = B[(0+4*m)*lda+0+4*n];
771 TempB[1] = B[(0+4*m)*lda+1+4*n];
772 TempB[2] = B[(0+4*m)*lda+2+4*n];
773 TempB[3] = B[(0+4*m)*lda+3+4*n];
774
775
776 C[0+4*n+j*lda] += TempA[0] * TempB[0];
777 C[1+4*n+j*lda] += TempA[0] * TempB[1];
778 C[2+4*n+j*lda] += TempA[0] * TempB[2];
779 C[3+4*n+j*lda] += TempA[0] * TempB[3];
780
781
782 }
783 }
784 }
785 }
786 */
787
788
789
790 //-------------------------------------------------------------version2.3, read 8 elements in B at one time. make k to 2. 150k mi 128k msi. worse than v2.0
791 /*
792 static __thread int i, j, k, m, n;
793 static __thread data_t TempA[2];
794 static __thread data_t TempB[8];
795
796 if(coreid == 0)
797 {
798 for ( j = 0; j < lda; j+=2 )
799 {
800 for ( m = 0; m < 16; m++ )
801 {
802 TempA[0] = A[j*lda + 0 + 2*m];
803 TempA[1] = A[j*lda + 1 + 2*m];
804 for( n = 0; n < 4; n++)
805 {
806
807 TempB[0] = B[2*m*lda+0+8*n];
808 TempB[1] = B[2*m*lda+1+8*n];
809 TempB[2] = B[2*m*lda+2+8*n];
810 TempB[3] = B[2*m*lda+3+8*n];
811 TempB[4] = B[2*m*lda+4+8*n];
812 TempB[5] = B[2*m*lda+5+8*n];
813 TempB[6] = B[2*m*lda+6+8*n];
814 TempB[7] = B[2*m*lda+7+8*n];
815
816 C[0+8*n+j*lda] += TempA[0] * TempB[0];
817 C[1+8*n+j*lda] += TempA[0] * TempB[1];
818 C[2+8*n+j*lda] += TempA[0] * TempB[2];
819 C[3+8*n+j*lda] += TempA[0] * TempB[3];
820 C[4+8*n+j*lda] += TempA[0] * TempB[4];
821 C[5+8*n+j*lda] += TempA[0] * TempB[5];
822 C[6+8*n+j*lda] += TempA[0] * TempB[6];
823 C[7+8*n+j*lda] += TempA[0] * TempB[7];
824
825 TempB[0] = B[(1+2*m)*lda+0+8*n];
826 TempB[1] = B[(1+2*m)*lda+1+8*n];
827 TempB[2] = B[(1+2*m)*lda+2+8*n];
828 TempB[3] = B[(1+2*m)*lda+3+8*n];
829 TempB[4] = B[(1+2*m)*lda+4+8*n];
830 TempB[5] = B[(1+2*m)*lda+5+8*n];
831 TempB[6] = B[(1+2*m)*lda+6+8*n];
832 TempB[7] = B[(1+2*m)*lda+7+8*n];
833
834 C[0+8*n+j*lda] += TempA[1] * TempB[0];
835 C[1+8*n+j*lda] += TempA[1] * TempB[1];
836 C[2+8*n+j*lda] += TempA[1] * TempB[2];
837 C[3+8*n+j*lda] += TempA[1] * TempB[3];
838 C[4+8*n+j*lda] += TempA[1] * TempB[4];
839 C[5+8*n+j*lda] += TempA[1] * TempB[5];
840 C[6+8*n+j*lda] += TempA[1] * TempB[6];
841 C[7+8*n+j*lda] += TempA[1] * TempB[7];
842
843 }
844
845 }
846 }
847 }
848
849 if(coreid == 1)
850 {
851 for ( j = 1; j < lda; j+=2 )
852 {
853 for ( m = 0; m < 16; m++ )
854 {
855 TempA[0] = A[j*lda + 0 + 2*m];
856 TempA[1] = A[j*lda + 1 + 2*m];
857 for( n = 0; n < 4; n++)
858 {
859
860 TempB[0] = B[2*m*lda+0+8*n];
861 TempB[1] = B[2*m*lda+1+8*n];
862 TempB[2] = B[2*m*lda+2+8*n];
863 TempB[3] = B[2*m*lda+3+8*n];
864 TempB[4] = B[2*m*lda+4+8*n];
865 TempB[5] = B[2*m*lda+5+8*n];
866 TempB[6] = B[2*m*lda+6+8*n];
867 TempB[7] = B[2*m*lda+7+8*n];
868
869 C[0+8*n+j*lda] += TempA[0] * TempB[0];
870 C[1+8*n+j*lda] += TempA[0] * TempB[1];
871 C[2+8*n+j*lda] += TempA[0] * TempB[2];
872 C[3+8*n+j*lda] += TempA[0] * TempB[3];
873 C[4+8*n+j*lda] += TempA[0] * TempB[4];
874 C[5+8*n+j*lda] += TempA[0] * TempB[5];
875 C[6+8*n+j*lda] += TempA[0] * TempB[6];
876 C[7+8*n+j*lda] += TempA[0] * TempB[7];
877
878 TempB[0] = B[(1+2*m)*lda+0+8*n];
879 TempB[1] = B[(1+2*m)*lda+1+8*n];
880 TempB[2] = B[(1+2*m)*lda+2+8*n];
881 TempB[3] = B[(1+2*m)*lda+3+8*n];
882 TempB[4] = B[(1+2*m)*lda+4+8*n];
883 TempB[5] = B[(1+2*m)*lda+5+8*n];
884 TempB[6] = B[(1+2*m)*lda+6+8*n];
885 TempB[7] = B[(1+2*m)*lda+7+8*n];
886
887 C[0+8*n+j*lda] += TempA[1] * TempB[0];
888 C[1+8*n+j*lda] += TempA[1] * TempB[1];
889 C[2+8*n+j*lda] += TempA[1] * TempB[2];
890 C[3+8*n+j*lda] += TempA[1] * TempB[3];
891 C[4+8*n+j*lda] += TempA[1] * TempB[4];
892 C[5+8*n+j*lda] += TempA[1] * TempB[5];
893 C[6+8*n+j*lda] += TempA[1] * TempB[6];
894 C[7+8*n+j*lda] += TempA[1] * TempB[7];
895
896 }
897
898 }
899 }
900 }
901 */
902 //-------------------------------------------------------------version2.4, read 4 170k and 16 140k, error because not enough space elements in B at one time.
903 /*
904 static __thread int i, j, k, m, n;
905 static __thread data_t TempA;
906 static __thread data_t TempB[16];
907
908 if(coreid == 0)
909 {
910 for ( j = 0; j < lda; j+=2 )
911 {
912 for ( k = 0; k < lda; k++ )
913 {
914 TempA = A[j*lda + k];
915 for( n = 0; n < 2; n++)
916 {
917
918 TempB[0] = B[k*lda+0+16*n];
919 TempB[1] = B[k*lda+1+16*n];
920 TempB[2] = B[k*lda+2+16*n];
921 TempB[3] = B[k*lda+3+16*n];
922 TempB[4] = B[k*lda+4+16*n];
923 TempB[5] = B[k*lda+5+16*n];
924 TempB[6] = B[k*lda+6+16*n];
925 TempB[7] = B[k*lda+7+16*n];
926 TempB[8] = B[k*lda+8+16*n];
927 TempB[9] = B[k*lda+9+16*n];
928 TempB[10] = B[k*lda+10+16*n];
929 TempB[11] = B[k*lda+11+16*n];
930 TempB[12] = B[k*lda+12+16*n];
931 TempB[13] = B[k*lda+13+16*n];
932 TempB[14] = B[k*lda+14+16*n];
933 TempB[15] = B[k*lda+15+16*n];
934
935
936 C[0+16*n+j*lda] += TempA * TempB[0];
937 C[1+16*n+j*lda] += TempA * TempB[1];
938 C[2+16*n+j*lda] += TempA * TempB[2];
939 C[3+16*n+j*lda] += TempA * TempB[3];
940 C[4+16*n+j*lda] += TempA * TempB[4];
941 C[5+16*n+j*lda] += TempA * TempB[5];
942 C[6+16*n+j*lda] += TempA * TempB[6];
943 C[7+16*n+j*lda] += TempA * TempB[7];
944 C[8+16*n+j*lda] += TempA * TempB[8];
945 C[9+16*n+j*lda] += TempA * TempB[9];
946 C[10+16*n+j*lda] += TempA * TempB[10];
947 C[11+16*n+j*lda] += TempA * TempB[11];
948 C[12+16*n+j*lda] += TempA * TempB[12];
949 C[13+16*n+j*lda] += TempA * TempB[13];
950 C[14+16*n+j*lda] += TempA * TempB[14];
951 C[15+16*n+j*lda] += TempA * TempB[15];
952
953
954
955 }
956
957 }
958 }
959 }
960 if(coreid == 1)
961 {
962 for ( j = 1; j < lda; j+=2 )
963 {
964 for ( k = 0; k < lda; k++ )
965 {
966 TempA = A[j*lda + k];
967 for( n = 0; n < 2; n++)
968 {
969
970 TempB[0] = B[k*lda+0+16*n];
971 TempB[1] = B[k*lda+1+16*n];
972 TempB[2] = B[k*lda+2+16*n];
973 TempB[3] = B[k*lda+3+16*n];
974 TempB[4] = B[k*lda+4+16*n];
975 TempB[5] = B[k*lda+5+16*n];
976 TempB[6] = B[k*lda+6+16*n];
977 TempB[7] = B[k*lda+7+16*n];
978 TempB[8] = B[k*lda+8+16*n];
979 TempB[9] = B[k*lda+9+16*n];
980 TempB[10] = B[k*lda+10+16*n];
981 TempB[11] = B[k*lda+11+16*n];
982 TempB[12] = B[k*lda+12+16*n];
983 TempB[13] = B[k*lda+13+16*n];
984 TempB[14] = B[k*lda+14+16*n];
985 TempB[15] = B[k*lda+15+16*n];
986
987
988 C[0+16*n+j*lda] += TempA * TempB[0];
989 C[1+16*n+j*lda] += TempA * TempB[1];
990 C[2+16*n+j*lda] += TempA * TempB[2];
991 C[3+16*n+j*lda] += TempA * TempB[3];
992 C[4+16*n+j*lda] += TempA * TempB[4];
993 C[5+16*n+j*lda] += TempA * TempB[5];
994 C[6+16*n+j*lda] += TempA * TempB[6];
995 C[7+16*n+j*lda] += TempA * TempB[7];
996 C[8+16*n+j*lda] += TempA * TempB[8];
997 C[9+16*n+j*lda] += TempA * TempB[9];
998 C[10+16*n+j*lda] += TempA * TempB[10];
999 C[11+16*n+j*lda] += TempA * TempB[11];
1000 C[12+16*n+j*lda] += TempA * TempB[12];
1001 C[13+16*n+j*lda] += TempA * TempB[13];
1002 C[14+16*n+j*lda] += TempA * TempB[14];
1003 C[15+16*n+j*lda] += TempA * TempB[15];
1004
1005
1006
1007 }
1008
1009 }
1010 }
1011 }
1012
1013 */
1014 //-------------------------------------------------------------version2.5, read 10 elements in B at one time. has corner cases. Turns out it hangs.
1015 /*
1016 static __thread int j, k, n;
1017 static __thread data_t TempA;
1018 static __thread data_t TempB[10];
1019
1020 if(coreid == 0)
1021 {
1022 for ( j = 0; j < lda; j+=2 )
1023 {
1024 for ( k = 0; k < lda; k++ )
1025 {
1026 TempA = A[j*lda + k];
1027 for( n = 0; n < 3; n++)
1028 {
1029 TempB[0] = B[k*lda+0+10*n];
1030 TempB[1] = B[k*lda+1+10*n];
1031 TempB[2] = B[k*lda+2+10*n];
1032 TempB[3] = B[k*lda+3+10*n];
1033 TempB[4] = B[k*lda+4+10*n];
1034 TempB[5] = B[k*lda+5+10*n];
1035 TempB[6] = B[k*lda+6+10*n];
1036 TempB[7] = B[k*lda+7+10*n];
1037 TempB[8] = B[k*lda+8+10*n];
1038 TempB[9] = B[k*lda+9+10*n];
1039
1040 C[0+10*n+j*lda] += TempA * TempB[0];
1041 C[1+10*n+j*lda] += TempA * TempB[1];
1042 C[2+10*n+j*lda] += TempA * TempB[2];
1043 C[3+10*n+j*lda] += TempA * TempB[3];
1044 C[4+10*n+j*lda] += TempA * TempB[4];
1045 C[5+10*n+j*lda] += TempA * TempB[5];
1046 C[6+10*n+j*lda] += TempA * TempB[6];
1047 C[7+10*n+j*lda] += TempA * TempB[7];
1048 C[8+10*n+j*lda] += TempA * TempB[8];
1049 C[9+10*n+j*lda] += TempA * TempB[9];
1050 }
1051 TempB[0] = B[k*lda+30];
1052 TempB[1] = B[k*lda+31];
1053 C[30+j*lda] += TempA * TempB[0];
1054 C[31+j*lda] += TempA * TempB[1];
1055 }
1056 }
1057 }
1058 if(coreid == 1)
1059 {
1060 for ( j = 1; j < lda; j+=2 )
1061 {
1062 for ( k = 0; k < lda; k++ )
1063 {
1064 TempA = A[j*lda + k];
1065 for( n = 0; n < 3; n++)
1066 {
1067 TempB[0] = B[k*lda+0+10*n];
1068 TempB[1] = B[k*lda+1+10*n];
1069 TempB[2] = B[k*lda+2+10*n];
1070 TempB[3] = B[k*lda+3+10*n];
1071 TempB[4] = B[k*lda+4+10*n];
1072 TempB[5] = B[k*lda+5+10*n];
1073 TempB[6] = B[k*lda+6+10*n];
1074 TempB[7] = B[k*lda+7+10*n];
1075 TempB[8] = B[k*lda+8+10*n];
1076 TempB[9] = B[k*lda+9+10*n];
1077
1078 C[0+10*n+j*lda] += TempA * TempB[0];
1079 C[1+10*n+j*lda] += TempA * TempB[1];
1080 C[2+10*n+j*lda] += TempA * TempB[2];
1081 C[3+10*n+j*lda] += TempA * TempB[3];
1082 C[4+10*n+j*lda] += TempA * TempB[4];
1083 C[5+10*n+j*lda] += TempA * TempB[5];
1084 C[6+10*n+j*lda] += TempA * TempB[6];
1085 C[7+10*n+j*lda] += TempA * TempB[7];
1086 C[8+10*n+j*lda] += TempA * TempB[8];
1087 C[9+10*n+j*lda] += TempA * TempB[9];
1088 }
1089 TempB[0] = B[k*lda+30];
1090 TempB[1] = B[k*lda+31];
1091 C[30+j*lda] += TempA * TempB[0];
1092 C[31+j*lda] += TempA * TempB[1];
1093 }
1094 }
1095 }
1096
1097 */
1098
1099 //-------------------------------------------------------------version2.6, optimize 2.0. take off n loop and tried different order of reading B
1100 /*
1101 static __thread int j, k, n;
1102 static __thread data_t TempA;
1103 static __thread data_t TempB[8];
1104
1105 if(coreid == 0)
1106 {
1107 for ( j = 0; j < lda; j+=2 )
1108 {
1109 for ( k = 0; k < lda; k++ )
1110 {
1111 TempA = A[j*lda + k];
1112
1113 TempB[0] = B[k*lda+0];
1114 TempB[1] = B[k*lda+1];
1115 TempB[2] = B[k*lda+2];
1116 TempB[3] = B[k*lda+3];
1117 TempB[4] = B[k*lda+4];
1118 TempB[5] = B[k*lda+5];
1119 TempB[6] = B[k*lda+6];
1120 TempB[7] = B[k*lda+7];
1121
1122 C[0+j*lda] += TempA * TempB[0];
1123 C[1+j*lda] += TempA * TempB[1];
1124 C[2+j*lda] += TempA * TempB[2];
1125 C[3+j*lda] += TempA * TempB[3];
1126 C[4+j*lda] += TempA * TempB[4];
1127 C[5+j*lda] += TempA * TempB[5];
1128 C[6+j*lda] += TempA * TempB[6];
1129 C[7+j*lda] += TempA * TempB[7];
1130
1131 TempB[0] = B[k*lda+8];
1132 TempB[1] = B[k*lda+9];
1133 TempB[2] = B[k*lda+10];
1134 TempB[3] = B[k*lda+11];
1135 TempB[4] = B[k*lda+12];
1136 TempB[5] = B[k*lda+13];
1137 TempB[6] = B[k*lda+14];
1138 TempB[7] = B[k*lda+15];
1139
1140 C[8+j*lda] += TempA * TempB[0];
1141 C[9+j*lda] += TempA * TempB[1];
1142 C[10+j*lda] += TempA * TempB[2];
1143 C[11+j*lda] += TempA * TempB[3];
1144 C[12+j*lda] += TempA * TempB[4];
1145 C[13+j*lda] += TempA * TempB[5];
1146 C[14+j*lda] += TempA * TempB[6];
1147 C[15+j*lda] += TempA * TempB[7];
1148
1149 TempB[0] = B[k*lda+16];
1150 TempB[1] = B[k*lda+17];
1151 TempB[2] = B[k*lda+18];
1152 TempB[3] = B[k*lda+19];
1153 TempB[4] = B[k*lda+20];
1154 TempB[5] = B[k*lda+21];
1155 TempB[6] = B[k*lda+22];
1156 TempB[7] = B[k*lda+23];
1157
1158 C[16+j*lda] += TempA * TempB[0];
1159 C[17+j*lda] += TempA * TempB[1];
1160 C[18+j*lda] += TempA * TempB[2];
1161 C[19+j*lda] += TempA * TempB[3];
1162 C[20+j*lda] += TempA * TempB[4];
1163 C[21+j*lda] += TempA * TempB[5];
1164 C[22+j*lda] += TempA * TempB[6];
1165 C[23+j*lda] += TempA * TempB[7];
1166
1167 TempB[0] = B[k*lda+24];
1168 TempB[1] = B[k*lda+25];
1169 TempB[2] = B[k*lda+26];
1170 TempB[3] = B[k*lda+27];
1171 TempB[4] = B[k*lda+28];
1172 TempB[5] = B[k*lda+29];
1173 TempB[6] = B[k*lda+30];
1174 TempB[7] = B[k*lda+31];
1175
1176 C[24+j*lda] += TempA * TempB[0];
1177 C[25+j*lda] += TempA * TempB[1];
1178 C[26+j*lda] += TempA * TempB[2];
1179 C[27+j*lda] += TempA * TempB[3];
1180 C[28+j*lda] += TempA * TempB[4];
1181 C[29+j*lda] += TempA * TempB[5];
1182 C[30+j*lda] += TempA * TempB[6];
1183 C[31+j*lda] += TempA * TempB[7];
1184
1185
1186
1187 }
1188 }
1189 }
1190
1191 if(coreid == 1)
1192 {
1193 for ( j = 1; j < lda; j+=2 )
1194 {
1195 for ( k = 0; k < lda; k++ )
1196 {
1197 TempA = A[j*lda + k];
1198
1199
1200 TempB[0] = B[k*lda+24];
1201 TempB[1] = B[k*lda+25];
1202 TempB[2] = B[k*lda+26];
1203 TempB[3] = B[k*lda+27];
1204 TempB[4] = B[k*lda+28];
1205 TempB[5] = B[k*lda+29];
1206 TempB[6] = B[k*lda+30];
1207 TempB[7] = B[k*lda+31];
1208
1209 C[24+j*lda] += TempA * TempB[0];
1210 C[25+j*lda] += TempA * TempB[1];
1211 C[26+j*lda] += TempA * TempB[2];
1212 C[27+j*lda] += TempA * TempB[3];
1213 C[28+j*lda] += TempA * TempB[4];
1214 C[29+j*lda] += TempA * TempB[5];
1215 C[30+j*lda] += TempA * TempB[6];
1216 C[31+j*lda] += TempA * TempB[7];
1217
1218 TempB[0] = B[k*lda+0];
1219 TempB[1] = B[k*lda+1];
1220 TempB[2] = B[k*lda+2];
1221 TempB[3] = B[k*lda+3];
1222 TempB[4] = B[k*lda+4];
1223 TempB[5] = B[k*lda+5];
1224 TempB[6] = B[k*lda+6];
1225 TempB[7] = B[k*lda+7];
1226
1227 C[0+j*lda] += TempA * TempB[0];
1228 C[1+j*lda] += TempA * TempB[1];
1229 C[2+j*lda] += TempA * TempB[2];
1230 C[3+j*lda] += TempA * TempB[3];
1231 C[4+j*lda] += TempA * TempB[4];
1232 C[5+j*lda] += TempA * TempB[5];
1233 C[6+j*lda] += TempA * TempB[6];
1234 C[7+j*lda] += TempA * TempB[7];
1235
1236 TempB[0] = B[k*lda+8];
1237 TempB[1] = B[k*lda+9];
1238 TempB[2] = B[k*lda+10];
1239 TempB[3] = B[k*lda+11];
1240 TempB[4] = B[k*lda+12];
1241 TempB[5] = B[k*lda+13];
1242 TempB[6] = B[k*lda+14];
1243 TempB[7] = B[k*lda+15];
1244
1245 C[8+j*lda] += TempA * TempB[0];
1246 C[9+j*lda] += TempA * TempB[1];
1247 C[10+j*lda] += TempA * TempB[2];
1248 C[11+j*lda] += TempA * TempB[3];
1249 C[12+j*lda] += TempA * TempB[4];
1250 C[13+j*lda] += TempA * TempB[5];
1251 C[14+j*lda] += TempA * TempB[6];
1252 C[15+j*lda] += TempA * TempB[7];
1253
1254 TempB[0] = B[k*lda+16];
1255 TempB[1] = B[k*lda+17];
1256 TempB[2] = B[k*lda+18];
1257 TempB[3] = B[k*lda+19];
1258 TempB[4] = B[k*lda+20];
1259 TempB[5] = B[k*lda+21];
1260 TempB[6] = B[k*lda+22];
1261 TempB[7] = B[k*lda+23];
1262
1263 C[16+j*lda] += TempA * TempB[0];
1264 C[17+j*lda] += TempA * TempB[1];
1265 C[18+j*lda] += TempA * TempB[2];
1266 C[19+j*lda] += TempA * TempB[3];
1267 C[20+j*lda] += TempA * TempB[4];
1268 C[21+j*lda] += TempA * TempB[5];
1269 C[22+j*lda] += TempA * TempB[6];
1270 C[23+j*lda] += TempA * TempB[7];
1271
1272
1273
1274
1275
1276
1277 }
1278 }
1279 }
1280 */
1281 //-------------------------------------------------------------version2.7, use m=j*lda, i=k*lda,out of stack, only i, MI 150k, only m, MSI 117.9k slower than v2.0
1282 /*
1283 static __thread int i, j, k, m, n;
1284 static __thread data_t TempA;
1285 static __thread data_t TempB[8];
1286
1287 if(coreid == 0)
1288 {
1289 for ( j = 0; j < lda; j+=2 )
1290 {
1291 m = j * lda;
1292 for ( k = 0; k < lda; k++ )
1293 {
1294 TempA = A[m+ k];
1295 for( n = 0; n < 4; n++)
1296 {
1297
1298 TempB[0] = B[k *lda+0+8*n];
1299 TempB[1] = B[k *lda+1+8*n];
1300 TempB[2] = B[k *lda+2+8*n];
1301 TempB[3] = B[k *lda+3+8*n];
1302 TempB[4] = B[k *lda+4+8*n];
1303 TempB[5] = B[k *lda+5+8*n];
1304 TempB[6] = B[k *lda+6+8*n];
1305 TempB[7] = B[k *lda+7+8*n];
1306
1307 C[0+8*n+m] += TempA * TempB[0];
1308 C[1+8*n+m] += TempA * TempB[1];
1309 C[2+8*n+m] += TempA * TempB[2];
1310 C[3+8*n+m] += TempA * TempB[3];
1311 C[4+8*n+m] += TempA * TempB[4];
1312 C[5+8*n+m] += TempA * TempB[5];
1313 C[6+8*n+m] += TempA * TempB[6];
1314 C[7+8*n+m] += TempA * TempB[7];
1315
1316 }
1317
1318 }
1319 }
1320 }
1321 if(coreid == 1)
1322 {
1323 for ( j = 1; j < lda; j+=2 )
1324 {
1325 m = j * lda;
1326 for ( k = 0; k < lda; k++ )
1327 {
1328 TempA = A[m+ k];
1329 for( n = 0; n < 4; n++)
1330 {
1331
1332 TempB[0] = B[k *lda+0+8*n];
1333 TempB[1] = B[k *lda+1+8*n];
1334 TempB[2] = B[k *lda+2+8*n];
1335 TempB[3] = B[k *lda+3+8*n];
1336 TempB[4] = B[k *lda+4+8*n];
1337 TempB[5] = B[k *lda+5+8*n];
1338 TempB[6] = B[k *lda+6+8*n];
1339 TempB[7] = B[k *lda+7+8*n];
1340
1341 C[0+8*n+m] += TempA * TempB[0];
1342 C[1+8*n+m] += TempA * TempB[1];
1343 C[2+8*n+m] += TempA * TempB[2];
1344 C[3+8*n+m] += TempA * TempB[3];
1345 C[4+8*n+m] += TempA * TempB[4];
1346 C[5+8*n+m] += TempA * TempB[5];
1347 C[6+8*n+m] += TempA * TempB[6];
1348 C[7+8*n+m] += TempA * TempB[7];
1349
1350 }
1351
1352 }
1353 }
1354 }
1355 */
1356 //-------------------------------------------------------------version2.8 deal with false sharing, MSI,118K vs v2.0 117.0K. MI 147.629K.
1357 /*
1358 static __thread int i, j, k, m, n;
1359 static __thread data_t TempA;
1360 static __thread data_t TempB[8];
1361
1362 if(coreid == 0)
1363 {
1364 for ( j = 0; j < lda; j+=2 )
1365 {
1366 for ( k = 0; k < lda; k++ )
1367 {
1368 TempA = A[j*lda + k];
1369 for( n = 0; n < 2; n++)
1370 {
1371
1372 TempB[0] = B[k*lda+0+16*n];
1373 TempB[1] = B[k*lda+1+16*n];
1374 TempB[2] = B[k*lda+2+16*n];
1375 TempB[3] = B[k*lda+3+16*n];
1376 TempB[4] = B[k*lda+4+16*n];
1377 TempB[5] = B[k*lda+5+16*n];
1378 TempB[6] = B[k*lda+6+16*n];
1379 TempB[7] = B[k*lda+7+16*n];
1380
1381
1382
1383 C[0+16*n+j*lda] += TempA * TempB[0];
1384 C[1+16*n+j*lda] += TempA * TempB[1];
1385 C[2+16*n+j*lda] += TempA * TempB[2];
1386 C[3+16*n+j*lda] += TempA * TempB[3];
1387 C[4+16*n+j*lda] += TempA * TempB[4];
1388 C[5+16*n+j*lda] += TempA * TempB[5];
1389 C[6+16*n+j*lda] += TempA * TempB[6];
1390 C[7+16*n+j*lda] += TempA * TempB[7];
1391
1392 TempB[0] = B[k*lda+8+16*n];
1393 TempB[1] = B[k*lda+9+16*n];
1394 TempB[2] = B[k*lda+10+16*n];
1395 TempB[3] = B[k*lda+11+16*n];
1396 TempB[4] = B[k*lda+12+16*n];
1397 TempB[5] = B[k*lda+13+16*n];
1398 TempB[6] = B[k*lda+14+16*n];
1399 TempB[7] = B[k*lda+15+16*n];
1400
1401 C[8+16*n+j*lda] += TempA * TempB[0];
1402 C[9+16*n+j*lda] += TempA * TempB[1];
1403 C[10+16*n+j*lda] += TempA * TempB[2];
1404 C[11+16*n+j*lda] += TempA * TempB[3];
1405 C[12+16*n+j*lda] += TempA * TempB[4];
1406 C[13+16*n+j*lda] += TempA * TempB[5];
1407 C[14+16*n+j*lda] += TempA * TempB[6];
1408 C[15+16*n+j*lda] += TempA * TempB[7];
1409
1410
1411
1412 }
1413
1414 }
1415 }
1416 }
1417 if(coreid == 1)
1418 {
1419 for ( j = 1; j < lda; j+=2 )
1420 {
1421 for ( k = 0; k < lda; k++ )
1422 {
1423 TempA = A[j*lda + k];
1424 for( n = 0; n < 2; n++)
1425 {
1426
1427
1428
1429 TempB[0] = B[k*lda+8+16*n];
1430 TempB[1] = B[k*lda+9+16*n];
1431 TempB[2] = B[k*lda+10+16*n];
1432 TempB[3] = B[k*lda+11+16*n];
1433 TempB[4] = B[k*lda+12+16*n];
1434 TempB[5] = B[k*lda+13+16*n];
1435 TempB[6] = B[k*lda+14+16*n];
1436 TempB[7] = B[k*lda+15+16*n];
1437
1438 C[8+16*n+j*lda] += TempA * TempB[0];
1439 C[9+16*n+j*lda] += TempA * TempB[1];
1440 C[10+16*n+j*lda] += TempA * TempB[2];
1441 C[11+16*n+j*lda] += TempA * TempB[3];
1442 C[12+16*n+j*lda] += TempA * TempB[4];
1443 C[13+16*n+j*lda] += TempA * TempB[5];
1444 C[14+16*n+j*lda] += TempA * TempB[6];
1445 C[15+16*n+j*lda] += TempA * TempB[7];
1446
1447 TempB[0] = B[k*lda+0+16*n];
1448 TempB[1] = B[k*lda+1+16*n];
1449 TempB[2] = B[k*lda+2+16*n];
1450 TempB[3] = B[k*lda+3+16*n];
1451 TempB[4] = B[k*lda+4+16*n];
1452 TempB[5] = B[k*lda+5+16*n];
1453 TempB[6] = B[k*lda+6+16*n];
1454 TempB[7] = B[k*lda+7+16*n];
1455
1456
1457
1458 C[0+16*n+j*lda] += TempA * TempB[0];
1459 C[1+16*n+j*lda] += TempA * TempB[1];
1460 C[2+16*n+j*lda] += TempA * TempB[2];
1461 C[3+16*n+j*lda] += TempA * TempB[3];
1462 C[4+16*n+j*lda] += TempA * TempB[4];
1463 C[5+16*n+j*lda] += TempA * TempB[5];
1464 C[6+16*n+j*lda] += TempA * TempB[6];
1465 C[7+16*n+j*lda] += TempA * TempB[7];
1466
1467
1468 }
1469
1470 }
1471 }
1472 }
1473 */
1474
1475 //----------------------------------------------------------------version 2.11 optimize j,use core 1 j from 0 to 15 MSI 98k i = j*lda
1476 //----------------------------------------------------------------version 2.12 not use i = j *lda MSI 95k
1477 /*
1478 static __thread data_t TempA[8];
1479 static __thread data_t TempB[8];
1480 static __thread int j,m,n,i,k;
1481
1482 if(coreid == 1)
1483 {
1484 for ( j = 16; j < 32; j++ )
1485 {
1486
1487 for ( m = 0; m < 4; m++ )
1488 {
1489
1490 TempA[0] = A[j*lda+0+8*m];
1491 TempA[1] = A[j*lda+1+8*m];
1492 TempA[2] = A[j*lda+2+8*m];
1493 TempA[3] = A[j*lda+3+8*m];
1494 TempA[4] = A[j*lda+4+8*m];
1495 TempA[5] = A[j*lda+5+8*m];
1496 TempA[6] = A[j*lda+6+8*m];
1497 TempA[7] = A[j*lda+7+8*m];
1498
1499 for( n = 0; n < 4; n++)
1500 {
1501
1502 TempB[0] = B[(0+8*m)*lda+0+8*n];
1503 TempB[1] = B[(0+8*m)*lda+1+8*n];
1504 TempB[2] = B[(0+8*m)*lda+2+8*n];
1505 TempB[3] = B[(0+8*m)*lda+3+8*n];
1506 TempB[4] = B[(0+8*m)*lda+4+8*n];
1507 TempB[5] = B[(0+8*m)*lda+5+8*n];
1508 TempB[6] = B[(0+8*m)*lda+6+8*n];
1509 TempB[7] = B[(0+8*m)*lda+7+8*n];
1510
1511 C[0+8*n+j*lda] += TempA[0] * TempB[0];
1512 C[1+8*n+j*lda] += TempA[0] * TempB[1];
1513 C[2+8*n+j*lda] += TempA[0] * TempB[2];
1514 C[3+8*n+j*lda] += TempA[0] * TempB[3];
1515 C[4+8*n+j*lda] += TempA[0] * TempB[4];
1516 C[5+8*n+j*lda] += TempA[0] * TempB[5];
1517 C[6+8*n+j*lda] += TempA[0] * TempB[6];
1518 C[7+8*n+j*lda] += TempA[0] * TempB[7];
1519
1520
1521
1522 TempB[0] = B[(1+8*m)*lda+0+8*n];
1523 TempB[1] = B[(1+8*m)*lda+1+8*n];
1524 TempB[2] = B[(1+8*m)*lda+2+8*n];
1525 TempB[3] = B[(1+8*m)*lda+3+8*n];
1526 TempB[4] = B[(1+8*m)*lda+4+8*n];
1527 TempB[5] = B[(1+8*m)*lda+5+8*n];
1528 TempB[6] = B[(1+8*m)*lda+6+8*n];
1529 TempB[7] = B[(1+8*m)*lda+7+8*n];
1530
1531 C[0+8*n+j*lda] += TempA[1] * TempB[0];
1532 C[1+8*n+j*lda] += TempA[1] * TempB[1];
1533 C[2+8*n+j*lda] += TempA[1] * TempB[2];
1534 C[3+8*n+j*lda] += TempA[1] * TempB[3];
1535 C[4+8*n+j*lda] += TempA[1] * TempB[4];
1536 C[5+8*n+j*lda] += TempA[1] * TempB[5];
1537 C[6+8*n+j*lda] += TempA[1] * TempB[6];
1538 C[7+8*n+j*lda] += TempA[1] * TempB[7];
1539
1540
1541
1542 TempB[0] = B[(2+8*m)*lda+0+8*n];
1543 TempB[1] = B[(2+8*m)*lda+1+8*n];
1544 TempB[2] = B[(2+8*m)*lda+2+8*n];
1545 TempB[3] = B[(2+8*m)*lda+3+8*n];
1546 TempB[4] = B[(2+8*m)*lda+4+8*n];
1547 TempB[5] = B[(2+8*m)*lda+5+8*n];
1548 TempB[6] = B[(2+8*m)*lda+6+8*n];
1549 TempB[7] = B[(2+8*m)*lda+7+8*n];
1550
1551 C[0+8*n+j*lda] += TempA[2] * TempB[0];
1552 C[1+8*n+j*lda] += TempA[2] * TempB[1];
1553 C[2+8*n+j*lda] += TempA[2] * TempB[2];
1554 C[3+8*n+j*lda] += TempA[2] * TempB[3];
1555 C[4+8*n+j*lda] += TempA[2] * TempB[4];
1556 C[5+8*n+j*lda] += TempA[2] * TempB[5];
1557 C[6+8*n+j*lda] += TempA[2] * TempB[6];
1558 C[7+8*n+j*lda] += TempA[2] * TempB[7];
1559
1560
1561
1562 TempB[0] = B[(3+8*m)*lda+0+8*n];
1563 TempB[1] = B[(3+8*m)*lda+1+8*n];
1564 TempB[2] = B[(3+8*m)*lda+2+8*n];
1565 TempB[3] = B[(3+8*m)*lda+3+8*n];
1566 TempB[4] = B[(3+8*m)*lda+4+8*n];
1567 TempB[5] = B[(3+8*m)*lda+5+8*n];
1568 TempB[6] = B[(3+8*m)*lda+6+8*n];
1569 TempB[7] = B[(3+8*m)*lda+7+8*n];
1570
1571 C[0+8*n+j*lda] += TempA[3] * TempB[0];
1572 C[1+8*n+j*lda] += TempA[3] * TempB[1];
1573 C[2+8*n+j*lda] += TempA[3] * TempB[2];
1574 C[3+8*n+j*lda] += TempA[3] * TempB[3];
1575 C[4+8*n+j*lda] += TempA[3] * TempB[4];
1576 C[5+8*n+j*lda] += TempA[3] * TempB[5];
1577 C[6+8*n+j*lda] += TempA[3] * TempB[6];
1578 C[7+8*n+j*lda] += TempA[3] * TempB[7];
1579
1580
1581 TempB[0] = B[(4+8*m)*lda+0+8*n];
1582 TempB[1] = B[(4+8*m)*lda+1+8*n];
1583 TempB[2] = B[(4+8*m)*lda+2+8*n];
1584 TempB[3] = B[(4+8*m)*lda+3+8*n];
1585 TempB[4] = B[(4+8*m)*lda+4+8*n];
1586 TempB[5] = B[(4+8*m)*lda+5+8*n];
1587 TempB[6] = B[(4+8*m)*lda+6+8*n];
1588 TempB[7] = B[(4+8*m)*lda+7+8*n];
1589
1590 C[0+8*n+j*lda] += TempA[4] * TempB[0];
1591 C[1+8*n+j*lda] += TempA[4] * TempB[1];
1592 C[2+8*n+j*lda] += TempA[4] * TempB[2];
1593 C[3+8*n+j*lda] += TempA[4] * TempB[3];
1594 C[4+8*n+j*lda] += TempA[4] * TempB[4];
1595 C[5+8*n+j*lda] += TempA[4] * TempB[5];
1596 C[6+8*n+j*lda] += TempA[4] * TempB[6];
1597 C[7+8*n+j*lda] += TempA[4] * TempB[7];
1598
1599
1600
1601 TempB[0] = B[(5+8*m)*lda+0+8*n];
1602 TempB[1] = B[(5+8*m)*lda+1+8*n];
1603 TempB[2] = B[(5+8*m)*lda+2+8*n];
1604 TempB[3] = B[(5+8*m)*lda+3+8*n];
1605 TempB[4] = B[(5+8*m)*lda+4+8*n];
1606 TempB[5] = B[(5+8*m)*lda+5+8*n];
1607 TempB[6] = B[(5+8*m)*lda+6+8*n];
1608 TempB[7] = B[(5+8*m)*lda+7+8*n];
1609
1610 C[0+8*n+j*lda] += TempA[5] * TempB[0];
1611 C[1+8*n+j*lda] += TempA[5] * TempB[1];
1612 C[2+8*n+j*lda] += TempA[5] * TempB[2];
1613 C[3+8*n+j*lda] += TempA[5] * TempB[3];
1614 C[4+8*n+j*lda] += TempA[5] * TempB[4];
1615 C[5+8*n+j*lda] += TempA[5] * TempB[5];
1616 C[6+8*n+j*lda] += TempA[5] * TempB[6];
1617 C[7+8*n+j*lda] += TempA[5] * TempB[7];
1618
1619
1620
1621 TempB[0] = B[(6+8*m)*lda+0+8*n];
1622 TempB[1] = B[(6+8*m)*lda+1+8*n];
1623 TempB[2] = B[(6+8*m)*lda+2+8*n];
1624 TempB[3] = B[(6+8*m)*lda+3+8*n];
1625 TempB[4] = B[(6+8*m)*lda+4+8*n];
1626 TempB[5] = B[(6+8*m)*lda+5+8*n];
1627 TempB[6] = B[(6+8*m)*lda+6+8*n];
1628 TempB[7] = B[(6+8*m)*lda+7+8*n];
1629
1630 C[0+8*n+j*lda] += TempA[6] * TempB[0];
1631 C[1+8*n+j*lda] += TempA[6] * TempB[1];
1632 C[2+8*n+j*lda] += TempA[6] * TempB[2];
1633 C[3+8*n+j*lda] += TempA[6] * TempB[3];
1634 C[4+8*n+j*lda] += TempA[6] * TempB[4];
1635 C[5+8*n+j*lda] += TempA[6] * TempB[5];
1636 C[6+8*n+j*lda] += TempA[6] * TempB[6];
1637 C[7+8*n+j*lda] += TempA[6] * TempB[7];
1638
1639
1640 TempB[0] = B[(7+8*m)*lda+0+8*n];
1641 TempB[1] = B[(7+8*m)*lda+1+8*n];
1642 TempB[2] = B[(7+8*m)*lda+2+8*n];
1643 TempB[3] = B[(7+8*m)*lda+3+8*n];
1644 TempB[4] = B[(7+8*m)*lda+4+8*n];
1645 TempB[5] = B[(7+8*m)*lda+5+8*n];
1646 TempB[6] = B[(7+8*m)*lda+6+8*n];
1647 TempB[7] = B[(7+8*m)*lda+7+8*n];
1648
1649 C[0+8*n+j*lda] += TempA[7] * TempB[0];
1650 C[1+8*n+j*lda] += TempA[7] * TempB[1];
1651 C[2+8*n+j*lda] += TempA[7] * TempB[2];
1652 C[3+8*n+j*lda] += TempA[7] * TempB[3];
1653 C[4+8*n+j*lda] += TempA[7] * TempB[4];
1654 C[5+8*n+j*lda] += TempA[7] * TempB[5];
1655 C[6+8*n+j*lda] += TempA[7] * TempB[6];
1656 C[7+8*n+j*lda] += TempA[7] * TempB[7];
1657 }
1658
1659 }
1660 }
1661 }
1662 if(coreid ==0)
1663 {
1664 for ( j = 0; j < 16; j++ )
1665 {
1666
1667 for ( m = 0; m < 4; m++ )
1668 {
1669
1670 TempA[0] = A[j*lda+0+8*m];
1671 TempA[1] = A[j*lda+1+8*m];
1672 TempA[2] = A[j*lda+2+8*m];
1673 TempA[3] = A[j*lda+3+8*m];
1674 TempA[4] = A[j*lda+4+8*m];
1675 TempA[5] = A[j*lda+5+8*m];
1676 TempA[6] = A[j*lda+6+8*m];
1677 TempA[7] = A[j*lda+7+8*m];
1678
1679 for( n = 0; n < 4; n++)
1680 {
1681
1682 TempB[0] = B[(0+8*m)*lda+0+8*n];
1683 TempB[1] = B[(0+8*m)*lda+1+8*n];
1684 TempB[2] = B[(0+8*m)*lda+2+8*n];
1685 TempB[3] = B[(0+8*m)*lda+3+8*n];
1686 TempB[4] = B[(0+8*m)*lda+4+8*n];
1687 TempB[5] = B[(0+8*m)*lda+5+8*n];
1688 TempB[6] = B[(0+8*m)*lda+6+8*n];
1689 TempB[7] = B[(0+8*m)*lda+7+8*n];
1690
1691 C[0+8*n+j*lda] += TempA[0] * TempB[0];
1692 C[1+8*n+j*lda] += TempA[0] * TempB[1];
1693 C[2+8*n+j*lda] += TempA[0] * TempB[2];
1694 C[3+8*n+j*lda] += TempA[0] * TempB[3];
1695 C[4+8*n+j*lda] += TempA[0] * TempB[4];
1696 C[5+8*n+j*lda] += TempA[0] * TempB[5];
1697 C[6+8*n+j*lda] += TempA[0] * TempB[6];
1698 C[7+8*n+j*lda] += TempA[0] * TempB[7];
1699
1700
1701
1702 TempB[0] = B[(1+8*m)*lda+0+8*n];
1703 TempB[1] = B[(1+8*m)*lda+1+8*n];
1704 TempB[2] = B[(1+8*m)*lda+2+8*n];
1705 TempB[3] = B[(1+8*m)*lda+3+8*n];
1706 TempB[4] = B[(1+8*m)*lda+4+8*n];
1707 TempB[5] = B[(1+8*m)*lda+5+8*n];
1708 TempB[6] = B[(1+8*m)*lda+6+8*n];
1709 TempB[7] = B[(1+8*m)*lda+7+8*n];
1710
1711 C[0+8*n+j*lda] += TempA[1] * TempB[0];
1712 C[1+8*n+j*lda] += TempA[1] * TempB[1];
1713 C[2+8*n+j*lda] += TempA[1] * TempB[2];
1714 C[3+8*n+j*lda] += TempA[1] * TempB[3];
1715 C[4+8*n+j*lda] += TempA[1] * TempB[4];
1716 C[5+8*n+j*lda] += TempA[1] * TempB[5];
1717 C[6+8*n+j*lda] += TempA[1] * TempB[6];
1718 C[7+8*n+j*lda] += TempA[1] * TempB[7];
1719
1720
1721
1722 TempB[0] = B[(2+8*m)*lda+0+8*n];
1723 TempB[1] = B[(2+8*m)*lda+1+8*n];
1724 TempB[2] = B[(2+8*m)*lda+2+8*n];
1725 TempB[3] = B[(2+8*m)*lda+3+8*n];
1726 TempB[4] = B[(2+8*m)*lda+4+8*n];
1727 TempB[5] = B[(2+8*m)*lda+5+8*n];
1728 TempB[6] = B[(2+8*m)*lda+6+8*n];
1729 TempB[7] = B[(2+8*m)*lda+7+8*n];
1730
1731 C[0+8*n+j*lda] += TempA[2] * TempB[0];
1732 C[1+8*n+j*lda] += TempA[2] * TempB[1];
1733 C[2+8*n+j*lda] += TempA[2] * TempB[2];
1734 C[3+8*n+j*lda] += TempA[2] * TempB[3];
1735 C[4+8*n+j*lda] += TempA[2] * TempB[4];
1736 C[5+8*n+j*lda] += TempA[2] * TempB[5];
1737 C[6+8*n+j*lda] += TempA[2] * TempB[6];
1738 C[7+8*n+j*lda] += TempA[2] * TempB[7];
1739
1740
1741
1742 TempB[0] = B[(3+8*m)*lda+0+8*n];
1743 TempB[1] = B[(3+8*m)*lda+1+8*n];
1744 TempB[2] = B[(3+8*m)*lda+2+8*n];
1745 TempB[3] = B[(3+8*m)*lda+3+8*n];
1746 TempB[4] = B[(3+8*m)*lda+4+8*n];
1747 TempB[5] = B[(3+8*m)*lda+5+8*n];
1748 TempB[6] = B[(3+8*m)*lda+6+8*n];
1749 TempB[7] = B[(3+8*m)*lda+7+8*n];
1750
1751 C[0+8*n+j*lda] += TempA[3] * TempB[0];
1752 C[1+8*n+j*lda] += TempA[3] * TempB[1];
1753 C[2+8*n+j*lda] += TempA[3] * TempB[2];
1754 C[3+8*n+j*lda] += TempA[3] * TempB[3];
1755 C[4+8*n+j*lda] += TempA[3] * TempB[4];
1756 C[5+8*n+j*lda] += TempA[3] * TempB[5];
1757 C[6+8*n+j*lda] += TempA[3] * TempB[6];
1758 C[7+8*n+j*lda] += TempA[3] * TempB[7];
1759
1760
1761 TempB[0] = B[(4+8*m)*lda+0+8*n];
1762 TempB[1] = B[(4+8*m)*lda+1+8*n];
1763 TempB[2] = B[(4+8*m)*lda+2+8*n];
1764 TempB[3] = B[(4+8*m)*lda+3+8*n];
1765 TempB[4] = B[(4+8*m)*lda+4+8*n];
1766 TempB[5] = B[(4+8*m)*lda+5+8*n];
1767 TempB[6] = B[(4+8*m)*lda+6+8*n];
1768 TempB[7] = B[(4+8*m)*lda+7+8*n];
1769
1770 C[0+8*n+j*lda] += TempA[4] * TempB[0];
1771 C[1+8*n+j*lda] += TempA[4] * TempB[1];
1772 C[2+8*n+j*lda] += TempA[4] * TempB[2];
1773 C[3+8*n+j*lda] += TempA[4] * TempB[3];
1774 C[4+8*n+j*lda] += TempA[4] * TempB[4];
1775 C[5+8*n+j*lda] += TempA[4] * TempB[5];
1776 C[6+8*n+j*lda] += TempA[4] * TempB[6];
1777 C[7+8*n+j*lda] += TempA[4] * TempB[7];
1778
1779
1780
1781 TempB[0] = B[(5+8*m)*lda+0+8*n];
1782 TempB[1] = B[(5+8*m)*lda+1+8*n];
1783 TempB[2] = B[(5+8*m)*lda+2+8*n];
1784 TempB[3] = B[(5+8*m)*lda+3+8*n];
1785 TempB[4] = B[(5+8*m)*lda+4+8*n];
1786 TempB[5] = B[(5+8*m)*lda+5+8*n];
1787 TempB[6] = B[(5+8*m)*lda+6+8*n];
1788 TempB[7] = B[(5+8*m)*lda+7+8*n];
1789
1790 C[0+8*n+j*lda] += TempA[5] * TempB[0];
1791 C[1+8*n+j*lda] += TempA[5] * TempB[1];
1792 C[2+8*n+j*lda] += TempA[5] * TempB[2];
1793 C[3+8*n+j*lda] += TempA[5] * TempB[3];
1794 C[4+8*n+j*lda] += TempA[5] * TempB[4];
1795 C[5+8*n+j*lda] += TempA[5] * TempB[5];
1796 C[6+8*n+j*lda] += TempA[5] * TempB[6];
1797 C[7+8*n+j*lda] += TempA[5] * TempB[7];
1798
1799
1800
1801 TempB[0] = B[(6+8*m)*lda+0+8*n];
1802 TempB[1] = B[(6+8*m)*lda+1+8*n];
1803 TempB[2] = B[(6+8*m)*lda+2+8*n];
1804 TempB[3] = B[(6+8*m)*lda+3+8*n];
1805 TempB[4] = B[(6+8*m)*lda+4+8*n];
1806 TempB[5] = B[(6+8*m)*lda+5+8*n];
1807 TempB[6] = B[(6+8*m)*lda+6+8*n];
1808 TempB[7] = B[(6+8*m)*lda+7+8*n];
1809
1810 C[0+8*n+j*lda] += TempA[6] * TempB[0];
1811 C[1+8*n+j*lda] += TempA[6] * TempB[1];
1812 C[2+8*n+j*lda] += TempA[6] * TempB[2];
1813 C[3+8*n+j*lda] += TempA[6] * TempB[3];
1814 C[4+8*n+j*lda] += TempA[6] * TempB[4];
1815 C[5+8*n+j*lda] += TempA[6] * TempB[5];
1816 C[6+8*n+j*lda] += TempA[6] * TempB[6];
1817 C[7+8*n+j*lda] += TempA[6] * TempB[7];
1818
1819
1820 TempB[0] = B[(7+8*m)*lda+0+8*n];
1821 TempB[1] = B[(7+8*m)*lda+1+8*n];
1822 TempB[2] = B[(7+8*m)*lda+2+8*n];
1823 TempB[3] = B[(7+8*m)*lda+3+8*n];
1824 TempB[4] = B[(7+8*m)*lda+4+8*n];
1825 TempB[5] = B[(7+8*m)*lda+5+8*n];
1826 TempB[6] = B[(7+8*m)*lda+6+8*n];
1827 TempB[7] = B[(7+8*m)*lda+7+8*n];
1828
1829 C[0+8*n+j*lda] += TempA[7] * TempB[0];
1830 C[1+8*n+j*lda] += TempA[7] * TempB[1];
1831 C[2+8*n+j*lda] += TempA[7] * TempB[2];
1832 C[3+8*n+j*lda] += TempA[7] * TempB[3];
1833 C[4+8*n+j*lda] += TempA[7] * TempB[4];
1834 C[5+8*n+j*lda] += TempA[7] * TempB[5];
1835 C[6+8*n+j*lda] += TempA[7] * TempB[6];
1836 C[7+8*n+j*lda] += TempA[7] * TempB[7];
1837 }
1838
1839 }
1840 }
1841 }
1842 */
1843 //-----------------------------------------------------------------version 2.14 optimize C. when tempc[8] inside n loop, MSI, 98K MI,158k
1844 //-----------------------------------------------------------------version 2.15 optimize v2.14 a little MSI 89k. MI, 161K. don't declare tempc[8]=0 in the loop
1845 /*
1846 static __thread data_t TempA[8];
1847 static __thread data_t TempB[8];
1848 static __thread data_t TempC[8];
1849 static __thread int j,m,n,i,k;
1850
1851 if(coreid == 1)
1852 {
1853 for ( j = 16; j < 32; j++ )
1854 {
1855
1856 for ( m = 0; m < 4; m++ )
1857 {
1858
1859 TempA[0] = A[j*lda+0+8*m];
1860 TempA[1] = A[j*lda+1+8*m];
1861 TempA[2] = A[j*lda+2+8*m];
1862 TempA[3] = A[j*lda+3+8*m];
1863 TempA[4] = A[j*lda+4+8*m];
1864 TempA[5] = A[j*lda+5+8*m];
1865 TempA[6] = A[j*lda+6+8*m];
1866 TempA[7] = A[j*lda+7+8*m];
1867
1868
1869
1870 for( n = 0; n < 4; n++)
1871 {
1872
1873
1874 TempB[0] = B[(0+8*m)*lda+0+8*n];
1875 TempB[1] = B[(0+8*m)*lda+1+8*n];
1876 TempB[2] = B[(0+8*m)*lda+2+8*n];
1877 TempB[3] = B[(0+8*m)*lda+3+8*n];
1878 TempB[4] = B[(0+8*m)*lda+4+8*n];
1879 TempB[5] = B[(0+8*m)*lda+5+8*n];
1880 TempB[6] = B[(0+8*m)*lda+6+8*n];
1881 TempB[7] = B[(0+8*m)*lda+7+8*n];
1882
1883
1884 TempC[0] = TempA[0] * TempB[0];
1885 TempC[1] = TempA[0] * TempB[1];
1886 TempC[2] = TempA[0] * TempB[2];
1887 TempC[3] = TempA[0] * TempB[3];
1888 TempC[4] = TempA[0] * TempB[4];
1889 TempC[5] = TempA[0] * TempB[5];
1890 TempC[6] = TempA[0] * TempB[6];
1891 TempC[7] = TempA[0] * TempB[7];
1892
1893
1894
1895 TempB[0] = B[(1+8*m)*lda+0+8*n];
1896 TempB[1] = B[(1+8*m)*lda+1+8*n];
1897 TempB[2] = B[(1+8*m)*lda+2+8*n];
1898 TempB[3] = B[(1+8*m)*lda+3+8*n];
1899 TempB[4] = B[(1+8*m)*lda+4+8*n];
1900 TempB[5] = B[(1+8*m)*lda+5+8*n];
1901 TempB[6] = B[(1+8*m)*lda+6+8*n];
1902 TempB[7] = B[(1+8*m)*lda+7+8*n];
1903
1904 TempC[0] += TempA[1] * TempB[0];
1905 TempC[1] += TempA[1] * TempB[1];
1906 TempC[2] += TempA[1] * TempB[2];
1907 TempC[3] += TempA[1] * TempB[3];
1908 TempC[4] += TempA[1] * TempB[4];
1909 TempC[5] += TempA[1] * TempB[5];
1910 TempC[6] += TempA[1] * TempB[6];
1911 TempC[7] += TempA[1] * TempB[7];
1912
1913
1914
1915 TempB[0] = B[(2+8*m)*lda+0+8*n];
1916 TempB[1] = B[(2+8*m)*lda+1+8*n];
1917 TempB[2] = B[(2+8*m)*lda+2+8*n];
1918 TempB[3] = B[(2+8*m)*lda+3+8*n];
1919 TempB[4] = B[(2+8*m)*lda+4+8*n];
1920 TempB[5] = B[(2+8*m)*lda+5+8*n];
1921 TempB[6] = B[(2+8*m)*lda+6+8*n];
1922 TempB[7] = B[(2+8*m)*lda+7+8*n];
1923
1924 TempC[0] += TempA[2] * TempB[0];
1925 TempC[1] += TempA[2] * TempB[1];
1926 TempC[2] += TempA[2] * TempB[2];
1927 TempC[3] += TempA[2] * TempB[3];
1928 TempC[4] += TempA[2] * TempB[4];
1929 TempC[5] += TempA[2] * TempB[5];
1930 TempC[6] += TempA[2] * TempB[6];
1931 TempC[7] += TempA[2] * TempB[7];
1932
1933
1934
1935 TempB[0] = B[(3+8*m)*lda+0+8*n];
1936 TempB[1] = B[(3+8*m)*lda+1+8*n];
1937 TempB[2] = B[(3+8*m)*lda+2+8*n];
1938 TempB[3] = B[(3+8*m)*lda+3+8*n];
1939 TempB[4] = B[(3+8*m)*lda+4+8*n];
1940 TempB[5] = B[(3+8*m)*lda+5+8*n];
1941 TempB[6] = B[(3+8*m)*lda+6+8*n];
1942 TempB[7] = B[(3+8*m)*lda+7+8*n];
1943
1944 TempC[0] += TempA[3] * TempB[0];
1945 TempC[1] += TempA[3] * TempB[1];
1946 TempC[2] += TempA[3] * TempB[2];
1947 TempC[3] += TempA[3] * TempB[3];
1948 TempC[4] += TempA[3] * TempB[4];
1949 TempC[5] += TempA[3] * TempB[5];
1950 TempC[6] += TempA[3] * TempB[6];
1951 TempC[7] += TempA[3] * TempB[7];
1952
1953
1954 TempB[0] = B[(4+8*m)*lda+0+8*n];
1955 TempB[1] = B[(4+8*m)*lda+1+8*n];
1956 TempB[2] = B[(4+8*m)*lda+2+8*n];
1957 TempB[3] = B[(4+8*m)*lda+3+8*n];
1958 TempB[4] = B[(4+8*m)*lda+4+8*n];
1959 TempB[5] = B[(4+8*m)*lda+5+8*n];
1960 TempB[6] = B[(4+8*m)*lda+6+8*n];
1961 TempB[7] = B[(4+8*m)*lda+7+8*n];
1962
1963 TempC[0] += TempA[4] * TempB[0];
1964 TempC[1] += TempA[4] * TempB[1];
1965 TempC[2] += TempA[4] * TempB[2];
1966 TempC[3] += TempA[4] * TempB[3];
1967 TempC[4] += TempA[4] * TempB[4];
1968 TempC[5] += TempA[4] * TempB[5];
1969 TempC[6] += TempA[4] * TempB[6];
1970 TempC[7] += TempA[4] * TempB[7];
1971
1972
1973
1974 TempB[0] = B[(5+8*m)*lda+0+8*n];
1975 TempB[1] = B[(5+8*m)*lda+1+8*n];
1976 TempB[2] = B[(5+8*m)*lda+2+8*n];
1977 TempB[3] = B[(5+8*m)*lda+3+8*n];
1978 TempB[4] = B[(5+8*m)*lda+4+8*n];
1979 TempB[5] = B[(5+8*m)*lda+5+8*n];
1980 TempB[6] = B[(5+8*m)*lda+6+8*n];
1981 TempB[7] = B[(5+8*m)*lda+7+8*n];
1982
1983 TempC[0] += TempA[5] * TempB[0];
1984 TempC[1] += TempA[5] * TempB[1];
1985 TempC[2] += TempA[5] * TempB[2];
1986 TempC[3] += TempA[5] * TempB[3];
1987 TempC[4] += TempA[5] * TempB[4];
1988 TempC[5] += TempA[5] * TempB[5];
1989 TempC[6] += TempA[5] * TempB[6];
1990 TempC[7] += TempA[5] * TempB[7];
1991
1992
1993
1994 TempB[0] = B[(6+8*m)*lda+0+8*n];
1995 TempB[1] = B[(6+8*m)*lda+1+8*n];
1996 TempB[2] = B[(6+8*m)*lda+2+8*n];
1997 TempB[3] = B[(6+8*m)*lda+3+8*n];
1998 TempB[4] = B[(6+8*m)*lda+4+8*n];
1999 TempB[5] = B[(6+8*m)*lda+5+8*n];
2000 TempB[6] = B[(6+8*m)*lda+6+8*n];
2001 TempB[7] = B[(6+8*m)*lda+7+8*n];
2002
2003 TempC[0] += TempA[6] * TempB[0];
2004 TempC[1] += TempA[6] * TempB[1];
2005 TempC[2] += TempA[6] * TempB[2];
2006 TempC[3] += TempA[6] * TempB[3];
2007 TempC[4] += TempA[6] * TempB[4];
2008 TempC[5] += TempA[6] * TempB[5];
2009 TempC[6] += TempA[6] * TempB[6];
2010 TempC[7] += TempA[6] * TempB[7];
2011
2012
2013 TempB[0] = B[(7+8*m)*lda+0+8*n];
2014 TempB[1] = B[(7+8*m)*lda+1+8*n];
2015 TempB[2] = B[(7+8*m)*lda+2+8*n];
2016 TempB[3] = B[(7+8*m)*lda+3+8*n];
2017 TempB[4] = B[(7+8*m)*lda+4+8*n];
2018 TempB[5] = B[(7+8*m)*lda+5+8*n];
2019 TempB[6] = B[(7+8*m)*lda+6+8*n];
2020 TempB[7] = B[(7+8*m)*lda+7+8*n];
2021
2022 TempC[0] += TempA[7] * TempB[0];
2023 TempC[1] += TempA[7] * TempB[1];
2024 TempC[2] += TempA[7] * TempB[2];
2025 TempC[3] += TempA[7] * TempB[3];
2026 TempC[4] += TempA[7] * TempB[4];
2027 TempC[5] += TempA[7] * TempB[5];
2028 TempC[6] += TempA[7] * TempB[6];
2029 TempC[7] += TempA[7] * TempB[7];
2030
2031
2032
2033 C[0+8*n+j*lda] += TempC[0];
2034 C[1+8*n+j*lda] += TempC[1];
2035 C[2+8*n+j*lda] += TempC[2];
2036 C[3+8*n+j*lda] += TempC[3];
2037 C[4+8*n+j*lda] += TempC[4];
2038 C[5+8*n+j*lda] += TempC[5];
2039 C[6+8*n+j*lda] += TempC[6];
2040 C[7+8*n+j*lda] += TempC[7];
2041 }
2042 }
2043 }
2044 }
2045 if(coreid == 0)
2046 {
2047 for ( j = 0; j < 16; j++ )
2048 {
2049
2050 for ( m = 0; m < 4; m++ )
2051 {
2052
2053 TempA[0] = A[j*lda+0+8*m];
2054 TempA[1] = A[j*lda+1+8*m];
2055 TempA[2] = A[j*lda+2+8*m];
2056 TempA[3] = A[j*lda+3+8*m];
2057 TempA[4] = A[j*lda+4+8*m];
2058 TempA[5] = A[j*lda+5+8*m];
2059 TempA[6] = A[j*lda+6+8*m];
2060 TempA[7] = A[j*lda+7+8*m];
2061
2062 for( n = 0; n < 4; n++)
2063 {
2064
2065
2066 TempB[0] = B[(0+8*m)*lda+0+8*n];
2067 TempB[1] = B[(0+8*m)*lda+1+8*n];
2068 TempB[2] = B[(0+8*m)*lda+2+8*n];
2069 TempB[3] = B[(0+8*m)*lda+3+8*n];
2070 TempB[4] = B[(0+8*m)*lda+4+8*n];
2071 TempB[5] = B[(0+8*m)*lda+5+8*n];
2072 TempB[6] = B[(0+8*m)*lda+6+8*n];
2073 TempB[7] = B[(0+8*m)*lda+7+8*n];
2074
2075
2076 TempC[0] = TempA[0] * TempB[0];
2077 TempC[1] = TempA[0] * TempB[1];
2078 TempC[2] = TempA[0] * TempB[2];
2079 TempC[3] = TempA[0] * TempB[3];
2080 TempC[4] = TempA[0] * TempB[4];
2081 TempC[5] = TempA[0] * TempB[5];
2082 TempC[6] = TempA[0] * TempB[6];
2083 TempC[7] = TempA[0] * TempB[7];
2084
2085
2086
2087 TempB[0] = B[(1+8*m)*lda+0+8*n];
2088 TempB[1] = B[(1+8*m)*lda+1+8*n];
2089 TempB[2] = B[(1+8*m)*lda+2+8*n];
2090 TempB[3] = B[(1+8*m)*lda+3+8*n];
2091 TempB[4] = B[(1+8*m)*lda+4+8*n];
2092 TempB[5] = B[(1+8*m)*lda+5+8*n];
2093 TempB[6] = B[(1+8*m)*lda+6+8*n];
2094 TempB[7] = B[(1+8*m)*lda+7+8*n];
2095
2096 TempC[0] += TempA[1] * TempB[0];
2097 TempC[1] += TempA[1] * TempB[1];
2098 TempC[2] += TempA[1] * TempB[2];
2099 TempC[3] += TempA[1] * TempB[3];
2100 TempC[4] += TempA[1] * TempB[4];
2101 TempC[5] += TempA[1] * TempB[5];
2102 TempC[6] += TempA[1] * TempB[6];
2103 TempC[7] += TempA[1] * TempB[7];
2104
2105
2106
2107 TempB[0] = B[(2+8*m)*lda+0+8*n];
2108 TempB[1] = B[(2+8*m)*lda+1+8*n];
2109 TempB[2] = B[(2+8*m)*lda+2+8*n];
2110 TempB[3] = B[(2+8*m)*lda+3+8*n];
2111 TempB[4] = B[(2+8*m)*lda+4+8*n];
2112 TempB[5] = B[(2+8*m)*lda+5+8*n];
2113 TempB[6] = B[(2+8*m)*lda+6+8*n];
2114 TempB[7] = B[(2+8*m)*lda+7+8*n];
2115
2116 TempC[0] += TempA[2] * TempB[0];
2117 TempC[1] += TempA[2] * TempB[1];
2118 TempC[2] += TempA[2] * TempB[2];
2119 TempC[3] += TempA[2] * TempB[3];
2120 TempC[4] += TempA[2] * TempB[4];
2121 TempC[5] += TempA[2] * TempB[5];
2122 TempC[6] += TempA[2] * TempB[6];
2123 TempC[7] += TempA[2] * TempB[7];
2124
2125
2126
2127 TempB[0] = B[(3+8*m)*lda+0+8*n];
2128 TempB[1] = B[(3+8*m)*lda+1+8*n];
2129 TempB[2] = B[(3+8*m)*lda+2+8*n];
2130 TempB[3] = B[(3+8*m)*lda+3+8*n];
2131 TempB[4] = B[(3+8*m)*lda+4+8*n];
2132 TempB[5] = B[(3+8*m)*lda+5+8*n];
2133 TempB[6] = B[(3+8*m)*lda+6+8*n];
2134 TempB[7] = B[(3+8*m)*lda+7+8*n];
2135
2136 TempC[0] += TempA[3] * TempB[0];
2137 TempC[1] += TempA[3] * TempB[1];
2138 TempC[2] += TempA[3] * TempB[2];
2139 TempC[3] += TempA[3] * TempB[3];
2140 TempC[4] += TempA[3] * TempB[4];
2141 TempC[5] += TempA[3] * TempB[5];
2142 TempC[6] += TempA[3] * TempB[6];
2143 TempC[7] += TempA[3] * TempB[7];
2144
2145
2146 TempB[0] = B[(4+8*m)*lda+0+8*n];
2147 TempB[1] = B[(4+8*m)*lda+1+8*n];
2148 TempB[2] = B[(4+8*m)*lda+2+8*n];
2149 TempB[3] = B[(4+8*m)*lda+3+8*n];
2150 TempB[4] = B[(4+8*m)*lda+4+8*n];
2151 TempB[5] = B[(4+8*m)*lda+5+8*n];
2152 TempB[6] = B[(4+8*m)*lda+6+8*n];
2153 TempB[7] = B[(4+8*m)*lda+7+8*n];
2154
2155 TempC[0] += TempA[4] * TempB[0];
2156 TempC[1] += TempA[4] * TempB[1];
2157 TempC[2] += TempA[4] * TempB[2];
2158 TempC[3] += TempA[4] * TempB[3];
2159 TempC[4] += TempA[4] * TempB[4];
2160 TempC[5] += TempA[4] * TempB[5];
2161 TempC[6] += TempA[4] * TempB[6];
2162 TempC[7] += TempA[4] * TempB[7];
2163
2164
2165
2166 TempB[0] = B[(5+8*m)*lda+0+8*n];
2167 TempB[1] = B[(5+8*m)*lda+1+8*n];
2168 TempB[2] = B[(5+8*m)*lda+2+8*n];
2169 TempB[3] = B[(5+8*m)*lda+3+8*n];
2170 TempB[4] = B[(5+8*m)*lda+4+8*n];
2171 TempB[5] = B[(5+8*m)*lda+5+8*n];
2172 TempB[6] = B[(5+8*m)*lda+6+8*n];
2173 TempB[7] = B[(5+8*m)*lda+7+8*n];
2174
2175 TempC[0] += TempA[5] * TempB[0];
2176 TempC[1] += TempA[5] * TempB[1];
2177 TempC[2] += TempA[5] * TempB[2];
2178 TempC[3] += TempA[5] * TempB[3];
2179 TempC[4] += TempA[5] * TempB[4];
2180 TempC[5] += TempA[5] * TempB[5];
2181 TempC[6] += TempA[5] * TempB[6];
2182 TempC[7] += TempA[5] * TempB[7];
2183
2184
2185
2186 TempB[0] = B[(6+8*m)*lda+0+8*n];
2187 TempB[1] = B[(6+8*m)*lda+1+8*n];
2188 TempB[2] = B[(6+8*m)*lda+2+8*n];
2189 TempB[3] = B[(6+8*m)*lda+3+8*n];
2190 TempB[4] = B[(6+8*m)*lda+4+8*n];
2191 TempB[5] = B[(6+8*m)*lda+5+8*n];
2192 TempB[6] = B[(6+8*m)*lda+6+8*n];
2193 TempB[7] = B[(6+8*m)*lda+7+8*n];
2194
2195 TempC[0] += TempA[6] * TempB[0];
2196 TempC[1] += TempA[6] * TempB[1];
2197 TempC[2] += TempA[6] * TempB[2];
2198 TempC[3] += TempA[6] * TempB[3];
2199 TempC[4] += TempA[6] * TempB[4];
2200 TempC[5] += TempA[6] * TempB[5];
2201 TempC[6] += TempA[6] * TempB[6];
2202 TempC[7] += TempA[6] * TempB[7];
2203
2204
2205 TempB[0] = B[(7+8*m)*lda+0+8*n];
2206 TempB[1] = B[(7+8*m)*lda+1+8*n];
2207 TempB[2] = B[(7+8*m)*lda+2+8*n];
2208 TempB[3] = B[(7+8*m)*lda+3+8*n];
2209 TempB[4] = B[(7+8*m)*lda+4+8*n];
2210 TempB[5] = B[(7+8*m)*lda+5+8*n];
2211 TempB[6] = B[(7+8*m)*lda+6+8*n];
2212 TempB[7] = B[(7+8*m)*lda+7+8*n];
2213
2214 TempC[0] += TempA[7] * TempB[0];
2215 TempC[1] += TempA[7] * TempB[1];
2216 TempC[2] += TempA[7] * TempB[2];
2217 TempC[3] += TempA[7] * TempB[3];
2218 TempC[4] += TempA[7] * TempB[4];
2219 TempC[5] += TempA[7] * TempB[5];
2220 TempC[6] += TempA[7] * TempB[6];
2221 TempC[7] += TempA[7] * TempB[7];
2222
2223 C[0+8*n+j*lda] += TempC[0];
2224 C[1+8*n+j*lda] += TempC[1];
2225 C[2+8*n+j*lda] += TempC[2];
2226 C[3+8*n+j*lda] += TempC[3];
2227 C[4+8*n+j*lda] += TempC[4];
2228 C[5+8*n+j*lda] += TempC[5];
2229 C[6+8*n+j*lda] += TempC[6];
2230 C[7+8*n+j*lda] += TempC[7];
2231 }
2232
2233 }
2234 }
2235 }
2236 */
2237 //-----------------------------------------------------------------version 2.16: optimizes v2.15 by getting rid of TempB. MSI 83K. w/ test one 81K.
2238
2239
2240 static __thread data_t TempA[8];
2241 static __thread data_t TempB[8];
2242 static __thread data_t TempC[8];
2243 static __thread int j,m,n;
2244
2245 if(coreid == 1)
2246 {
2247 for ( j = 16; j < 32; j++ )
2248 {
2249
2250 for ( m = 0; m < 4; m++ )
2251 {
2252
2253 TempA[0] = A[j*lda+0+8*m];
2254 TempA[1] = A[j*lda+1+8*m];
2255 TempA[2] = A[j*lda+2+8*m];
2256 TempA[3] = A[j*lda+3+8*m];
2257 TempA[4] = A[j*lda+4+8*m];
2258 TempA[5] = A[j*lda+5+8*m];
2259 TempA[6] = A[j*lda+6+8*m];
2260 TempA[7] = A[j*lda+7+8*m];
2261
2262
2263
2264 for( n = 0; n < 4; n++)
2265 {
2266
2267
2268
2269
2270
2271 TempC[0] = TempA[0] * B[(0+8*m)*lda+0+8*n];
2272 TempC[1] = TempA[0] * B[(0+8*m)*lda+1+8*n];
2273 TempC[2] = TempA[0] * B[(0+8*m)*lda+2+8*n];
2274 TempC[3] = TempA[0] * B[(0+8*m)*lda+3+8*n];
2275 TempC[4] = TempA[0] * B[(0+8*m)*lda+4+8*n];
2276 TempC[5] = TempA[0] * B[(0+8*m)*lda+5+8*n];
2277 TempC[6] = TempA[0] * B[(0+8*m)*lda+6+8*n];
2278 TempC[7] = TempA[0] * B[(0+8*m)*lda+7+8*n];
2279
2280
2281 TempC[0] += TempA[1] * B[(1+8*m)*lda+0+8*n];
2282 TempC[1] += TempA[1] * B[(1+8*m)*lda+1+8*n];
2283 TempC[2] += TempA[1] * B[(1+8*m)*lda+2+8*n];
2284 TempC[3] += TempA[1] * B[(1+8*m)*lda+3+8*n];
2285 TempC[4] += TempA[1] * B[(1+8*m)*lda+4+8*n];
2286 TempC[5] += TempA[1] * B[(1+8*m)*lda+5+8*n];
2287 TempC[6] += TempA[1] * B[(1+8*m)*lda+6+8*n];
2288 TempC[7] += TempA[1] * B[(1+8*m)*lda+7+8*n];
2289
2290
2291
2292 TempC[0] += TempA[2] * B[(2+8*m)*lda+0+8*n];
2293 TempC[1] += TempA[2] * B[(2+8*m)*lda+1+8*n];
2294 TempC[2] += TempA[2] * B[(2+8*m)*lda+2+8*n];
2295 TempC[3] += TempA[2] * B[(2+8*m)*lda+3+8*n];
2296 TempC[4] += TempA[2] * B[(2+8*m)*lda+4+8*n];
2297 TempC[5] += TempA[2] * B[(2+8*m)*lda+5+8*n];
2298 TempC[6] += TempA[2] * B[(2+8*m)*lda+6+8*n];
2299 TempC[7] += TempA[2] * B[(2+8*m)*lda+7+8*n];
2300
2301
2302
2303 TempC[0] += TempA[3] * B[(3+8*m)*lda+0+8*n];
2304 TempC[1] += TempA[3] * B[(3+8*m)*lda+1+8*n];
2305 TempC[2] += TempA[3] * B[(3+8*m)*lda+2+8*n];
2306 TempC[3] += TempA[3] * B[(3+8*m)*lda+3+8*n];
2307 TempC[4] += TempA[3] * B[(3+8*m)*lda+4+8*n];
2308 TempC[5] += TempA[3] * B[(3+8*m)*lda+5+8*n];
2309 TempC[6] += TempA[3] * B[(3+8*m)*lda+6+8*n];
2310 TempC[7] += TempA[3] * B[(3+8*m)*lda+7+8*n];
2311
2312 TempC[0] += TempA[4] * B[(4+8*m)*lda+0+8*n];
2313 TempC[1] += TempA[4] * B[(4+8*m)*lda+1+8*n];
2314 TempC[2] += TempA[4] * B[(4+8*m)*lda+2+8*n];
2315 TempC[3] += TempA[4] * B[(4+8*m)*lda+3+8*n];
2316 TempC[4] += TempA[4] * B[(4+8*m)*lda+4+8*n];
2317 TempC[5] += TempA[4] * B[(4+8*m)*lda+5+8*n];
2318 TempC[6] += TempA[4] * B[(4+8*m)*lda+6+8*n];
2319 TempC[7] += TempA[4] * B[(4+8*m)*lda+7+8*n];
2320
2321
2322 TempC[0] += TempA[5] * B[(5+8*m)*lda+0+8*n];
2323 TempC[1] += TempA[5] * B[(5+8*m)*lda+1+8*n];
2324 TempC[2] += TempA[5] * B[(5+8*m)*lda+2+8*n];
2325 TempC[3] += TempA[5] * B[(5+8*m)*lda+3+8*n];
2326 TempC[4] += TempA[5] * B[(5+8*m)*lda+4+8*n];
2327 TempC[5] += TempA[5] * B[(5+8*m)*lda+5+8*n];
2328 TempC[6] += TempA[5] * B[(5+8*m)*lda+6+8*n];
2329 TempC[7] += TempA[5] * B[(5+8*m)*lda+7+8*n];
2330
2331
2332
2333 TempC[0] += TempA[6] * B[(6+8*m)*lda+0+8*n];
2334 TempC[1] += TempA[6] * B[(6+8*m)*lda+1+8*n];
2335 TempC[2] += TempA[6] * B[(6+8*m)*lda+2+8*n];
2336 TempC[3] += TempA[6] * B[(6+8*m)*lda+3+8*n];
2337 TempC[4] += TempA[6] * B[(6+8*m)*lda+4+8*n];
2338 TempC[5] += TempA[6] * B[(6+8*m)*lda+5+8*n];
2339 TempC[6] += TempA[6] * B[(6+8*m)*lda+6+8*n];
2340 TempC[7] += TempA[6] * B[(6+8*m)*lda+7+8*n];
2341
2342
2343 TempC[0] += TempA[7] * B[(7+8*m)*lda+0+8*n];
2344 TempC[1] += TempA[7] * B[(7+8*m)*lda+1+8*n];
2345 TempC[2] += TempA[7] * B[(7+8*m)*lda+2+8*n];
2346 TempC[3] += TempA[7] * B[(7+8*m)*lda+3+8*n];
2347 TempC[4] += TempA[7] * B[(7+8*m)*lda+4+8*n];
2348 TempC[5] += TempA[7] * B[(7+8*m)*lda+5+8*n];
2349 TempC[6] += TempA[7] * B[(7+8*m)*lda+6+8*n];
2350 TempC[7] += TempA[7] * B[(7+8*m)*lda+7+8*n];
2351
2352
2353
2354 C[0+8*n+j*lda] += TempC[0];
2355 C[1+8*n+j*lda] += TempC[1];
2356 C[2+8*n+j*lda] += TempC[2];
2357 C[3+8*n+j*lda] += TempC[3];
2358 C[4+8*n+j*lda] += TempC[4];
2359 C[5+8*n+j*lda] += TempC[5];
2360 C[6+8*n+j*lda] += TempC[6];
2361 C[7+8*n+j*lda] += TempC[7];
2362 }
2363 }
2364 }
2365 }
2366 if(coreid == 0)
2367 {
2368 for ( j = 0; j < 16; j++ )
2369 {
2370
2371 for ( m = 0; m < 4; m++ )
2372 {
2373
2374 TempA[0] = A[j*lda+0+8*m];
2375 TempA[1] = A[j*lda+1+8*m];
2376 TempA[2] = A[j*lda+2+8*m];
2377 TempA[3] = A[j*lda+3+8*m];
2378 TempA[4] = A[j*lda+4+8*m];
2379 TempA[5] = A[j*lda+5+8*m];
2380 TempA[6] = A[j*lda+6+8*m];
2381 TempA[7] = A[j*lda+7+8*m];
2382
2383
2384
2385 for( n = 0; n < 4; n++)
2386 {
2387
2388
2389
2390
2391
2392 TempC[0] = TempA[0] * B[(0+8*m)*lda+0+8*n];
2393 TempC[1] = TempA[0] * B[(0+8*m)*lda+1+8*n];
2394 TempC[2] = TempA[0] * B[(0+8*m)*lda+2+8*n];
2395 TempC[3] = TempA[0] * B[(0+8*m)*lda+3+8*n];
2396 TempC[4] = TempA[0] * B[(0+8*m)*lda+4+8*n];
2397 TempC[5] = TempA[0] * B[(0+8*m)*lda+5+8*n];
2398 TempC[6] = TempA[0] * B[(0+8*m)*lda+6+8*n];
2399 TempC[7] = TempA[0] * B[(0+8*m)*lda+7+8*n];
2400
2401
2402 TempC[0] += TempA[1] * B[(1+8*m)*lda+0+8*n];
2403 TempC[1] += TempA[1] * B[(1+8*m)*lda+1+8*n];
2404 TempC[2] += TempA[1] * B[(1+8*m)*lda+2+8*n];
2405 TempC[3] += TempA[1] * B[(1+8*m)*lda+3+8*n];
2406 TempC[4] += TempA[1] * B[(1+8*m)*lda+4+8*n];
2407 TempC[5] += TempA[1] * B[(1+8*m)*lda+5+8*n];
2408 TempC[6] += TempA[1] * B[(1+8*m)*lda+6+8*n];
2409 TempC[7] += TempA[1] * B[(1+8*m)*lda+7+8*n];
2410
2411
2412
2413 TempC[0] += TempA[2] * B[(2+8*m)*lda+0+8*n];
2414 TempC[1] += TempA[2] * B[(2+8*m)*lda+1+8*n];
2415 TempC[2] += TempA[2] * B[(2+8*m)*lda+2+8*n];
2416 TempC[3] += TempA[2] * B[(2+8*m)*lda+3+8*n];
2417 TempC[4] += TempA[2] * B[(2+8*m)*lda+4+8*n];
2418 TempC[5] += TempA[2] * B[(2+8*m)*lda+5+8*n];
2419 TempC[6] += TempA[2] * B[(2+8*m)*lda+6+8*n];
2420 TempC[7] += TempA[2] * B[(2+8*m)*lda+7+8*n];
2421
2422
2423
2424 TempC[0] += TempA[3] * B[(3+8*m)*lda+0+8*n];
2425 TempC[1] += TempA[3] * B[(3+8*m)*lda+1+8*n];
2426 TempC[2] += TempA[3] * B[(3+8*m)*lda+2+8*n];
2427 TempC[3] += TempA[3] * B[(3+8*m)*lda+3+8*n];
2428 TempC[4] += TempA[3] * B[(3+8*m)*lda+4+8*n];
2429 TempC[5] += TempA[3] * B[(3+8*m)*lda+5+8*n];
2430 TempC[6] += TempA[3] * B[(3+8*m)*lda+6+8*n];
2431 TempC[7] += TempA[3] * B[(3+8*m)*lda+7+8*n];
2432
2433 TempC[0] += TempA[4] * B[(4+8*m)*lda+0+8*n];
2434 TempC[1] += TempA[4] * B[(4+8*m)*lda+1+8*n];
2435 TempC[2] += TempA[4] * B[(4+8*m)*lda+2+8*n];
2436 TempC[3] += TempA[4] * B[(4+8*m)*lda+3+8*n];
2437 TempC[4] += TempA[4] * B[(4+8*m)*lda+4+8*n];
2438 TempC[5] += TempA[4] * B[(4+8*m)*lda+5+8*n];
2439 TempC[6] += TempA[4] * B[(4+8*m)*lda+6+8*n];
2440 TempC[7] += TempA[4] * B[(4+8*m)*lda+7+8*n];
2441
2442
2443 TempC[0] += TempA[5] * B[(5+8*m)*lda+0+8*n];
2444 TempC[1] += TempA[5] * B[(5+8*m)*lda+1+8*n];
2445 TempC[2] += TempA[5] * B[(5+8*m)*lda+2+8*n];
2446 TempC[3] += TempA[5] * B[(5+8*m)*lda+3+8*n];
2447 TempC[4] += TempA[5] * B[(5+8*m)*lda+4+8*n];
2448 TempC[5] += TempA[5] * B[(5+8*m)*lda+5+8*n];
2449 TempC[6] += TempA[5] * B[(5+8*m)*lda+6+8*n];
2450 TempC[7] += TempA[5] * B[(5+8*m)*lda+7+8*n];
2451
2452
2453
2454 TempC[0] += TempA[6] * B[(6+8*m)*lda+0+8*n];
2455 TempC[1] += TempA[6] * B[(6+8*m)*lda+1+8*n];
2456 TempC[2] += TempA[6] * B[(6+8*m)*lda+2+8*n];
2457 TempC[3] += TempA[6] * B[(6+8*m)*lda+3+8*n];
2458 TempC[4] += TempA[6] * B[(6+8*m)*lda+4+8*n];
2459 TempC[5] += TempA[6] * B[(6+8*m)*lda+5+8*n];
2460 TempC[6] += TempA[6] * B[(6+8*m)*lda+6+8*n];
2461 TempC[7] += TempA[6] * B[(6+8*m)*lda+7+8*n];
2462
2463
2464 TempC[0] += TempA[7] * B[(7+8*m)*lda+0+8*n];
2465 TempC[1] += TempA[7] * B[(7+8*m)*lda+1+8*n];
2466 TempC[2] += TempA[7] * B[(7+8*m)*lda+2+8*n];
2467 TempC[3] += TempA[7] * B[(7+8*m)*lda+3+8*n];
2468 TempC[4] += TempA[7] * B[(7+8*m)*lda+4+8*n];
2469 TempC[5] += TempA[7] * B[(7+8*m)*lda+5+8*n];
2470 TempC[6] += TempA[7] * B[(7+8*m)*lda+6+8*n];
2471 TempC[7] += TempA[7] * B[(7+8*m)*lda+7+8*n];
2472
2473
2474
2475 C[0+8*n+j*lda] += TempC[0];
2476 C[1+8*n+j*lda] += TempC[1];
2477 C[2+8*n+j*lda] += TempC[2];
2478 C[3+8*n+j*lda] += TempC[3];
2479 C[4+8*n+j*lda] += TempC[4];
2480 C[5+8*n+j*lda] += TempC[5];
2481 C[6+8*n+j*lda] += TempC[6];
2482 C[7+8*n+j*lda] += TempC[7];
2483 }
2484 }
2485 }
2486 }
2487
2488 //-----------------------------------------------------------------version 2.13 optimize j
2489 /*
2490 static __thread data_t TempA[8];
2491 static __thread data_t TempB[8];
2492 static __thread data_t TempC[8];
2493 static __thread int j,m,n,i,k;
2494
2495 if(coreid == 1)
2496 {
2497 for ( j = 16; j < 32; j++ )
2498 {
2499
2500 for ( m = 0; m < 4; m++ )
2501 {
2502
2503 TempA[0] = A[j*lda+0+8*m];
2504 TempA[1] = A[j*lda+1+8*m];
2505 TempA[2] = A[j*lda+2+8*m];
2506 TempA[3] = A[j*lda+3+8*m];
2507 TempA[4] = A[j*lda+4+8*m];
2508 TempA[5] = A[j*lda+5+8*m];
2509 TempA[6] = A[j*lda+6+8*m];
2510 TempA[7] = A[j*lda+7+8*m];
2511
2512 for( n = 0; n < 4; n++)
2513 {
2514 TempB[0] = B[(0+8*m)*lda+0+8*n];
2515 TempB[1] = B[(0+8*m)*lda+1+8*n];
2516 TempB[2] = B[(0+8*m)*lda+2+8*n];
2517 TempB[3] = B[(0+8*m)*lda+3+8*n];
2518 TempB[4] = B[(0+8*m)*lda+4+8*n];
2519 TempB[5] = B[(0+8*m)*lda+5+8*n];
2520 TempB[6] = B[(0+8*m)*lda+6+8*n];
2521 TempB[7] = B[(0+8*m)*lda+7+8*n];
2522
2523 C[0+8*n+j*lda] += TempA[0] * TempB[0];
2524 C[1+8*n+j*lda] += TempA[0] * TempB[1];
2525 C[2+8*n+j*lda] += TempA[0] * TempB[2];
2526 C[3+8*n+j*lda] += TempA[0] * TempB[3];
2527 C[4+8*n+j*lda] += TempA[0] * TempB[4];
2528 C[5+8*n+j*lda] += TempA[0] * TempB[5];
2529 C[6+8*n+j*lda] += TempA[0] * TempB[6];
2530 C[7+8*n+j*lda] += TempA[0] * TempB[7];
2531
2532
2533
2534 TempB[0] = B[(1+8*m)*lda+0+8*n];
2535 TempB[1] = B[(1+8*m)*lda+1+8*n];
2536 TempB[2] = B[(1+8*m)*lda+2+8*n];
2537 TempB[3] = B[(1+8*m)*lda+3+8*n];
2538 TempB[4] = B[(1+8*m)*lda+4+8*n];
2539 TempB[5] = B[(1+8*m)*lda+5+8*n];
2540 TempB[6] = B[(1+8*m)*lda+6+8*n];
2541 TempB[7] = B[(1+8*m)*lda+7+8*n];
2542
2543 C[0+8*n+j*lda] += TempA[1] * TempB[0];
2544 C[1+8*n+j*lda] += TempA[1] * TempB[1];
2545 C[2+8*n+j*lda] += TempA[1] * TempB[2];
2546 C[3+8*n+j*lda] += TempA[1] * TempB[3];
2547 C[4+8*n+j*lda] += TempA[1] * TempB[4];
2548 C[5+8*n+j*lda] += TempA[1] * TempB[5];
2549 C[6+8*n+j*lda] += TempA[1] * TempB[6];
2550 C[7+8*n+j*lda] += TempA[1] * TempB[7];
2551
2552
2553
2554 TempB[0] = B[(2+8*m)*lda+0+8*n];
2555 TempB[1] = B[(2+8*m)*lda+1+8*n];
2556 TempB[2] = B[(2+8*m)*lda+2+8*n];
2557 TempB[3] = B[(2+8*m)*lda+3+8*n];
2558 TempB[4] = B[(2+8*m)*lda+4+8*n];
2559 TempB[5] = B[(2+8*m)*lda+5+8*n];
2560 TempB[6] = B[(2+8*m)*lda+6+8*n];
2561 TempB[7] = B[(2+8*m)*lda+7+8*n];
2562
2563 C[0+8*n+j*lda] += TempA[2] * TempB[0];
2564 C[1+8*n+j*lda] += TempA[2] * TempB[1];
2565 C[2+8*n+j*lda] += TempA[2] * TempB[2];
2566 C[3+8*n+j*lda] += TempA[2] * TempB[3];
2567 C[4+8*n+j*lda] += TempA[2] * TempB[4];
2568 C[5+8*n+j*lda] += TempA[2] * TempB[5];
2569 C[6+8*n+j*lda] += TempA[2] * TempB[6];
2570 C[7+8*n+j*lda] += TempA[2] * TempB[7];
2571
2572
2573
2574 TempB[0] = B[(3+8*m)*lda+0+8*n];
2575 TempB[1] = B[(3+8*m)*lda+1+8*n];
2576 TempB[2] = B[(3+8*m)*lda+2+8*n];
2577 TempB[3] = B[(3+8*m)*lda+3+8*n];
2578 TempB[4] = B[(3+8*m)*lda+4+8*n];
2579 TempB[5] = B[(3+8*m)*lda+5+8*n];
2580 TempB[6] = B[(3+8*m)*lda+6+8*n];
2581 TempB[7] = B[(3+8*m)*lda+7+8*n];
2582
2583 C[0+8*n+j*lda] += TempA[3] * TempB[0];
2584 C[1+8*n+j*lda] += TempA[3] * TempB[1];
2585 C[2+8*n+j*lda] += TempA[3] * TempB[2];
2586 C[3+8*n+j*lda] += TempA[3] * TempB[3];
2587 C[4+8*n+j*lda] += TempA[3] * TempB[4];
2588 C[5+8*n+j*lda] += TempA[3] * TempB[5];
2589 C[6+8*n+j*lda] += TempA[3] * TempB[6];
2590 C[7+8*n+j*lda] += TempA[3] * TempB[7];
2591
2592
2593 TempB[0] = B[(4+8*m)*lda+0+8*n];
2594 TempB[1] = B[(4+8*m)*lda+1+8*n];
2595 TempB[2] = B[(4+8*m)*lda+2+8*n];
2596 TempB[3] = B[(4+8*m)*lda+3+8*n];
2597 TempB[4] = B[(4+8*m)*lda+4+8*n];
2598 TempB[5] = B[(4+8*m)*lda+5+8*n];
2599 TempB[6] = B[(4+8*m)*lda+6+8*n];
2600 TempB[7] = B[(4+8*m)*lda+7+8*n];
2601
2602 C[0+8*n+j*lda] += TempA[4] * TempB[0];
2603 C[1+8*n+j*lda] += TempA[4] * TempB[1];
2604 C[2+8*n+j*lda] += TempA[4] * TempB[2];
2605 C[3+8*n+j*lda] += TempA[4] * TempB[3];
2606 C[4+8*n+j*lda] += TempA[4] * TempB[4];
2607 C[5+8*n+j*lda] += TempA[4] * TempB[5];
2608 C[6+8*n+j*lda] += TempA[4] * TempB[6];
2609 C[7+8*n+j*lda] += TempA[4] * TempB[7];
2610
2611
2612
2613 TempB[0] = B[(5+8*m)*lda+0+8*n];
2614 TempB[1] = B[(5+8*m)*lda+1+8*n];
2615 TempB[2] = B[(5+8*m)*lda+2+8*n];
2616 TempB[3] = B[(5+8*m)*lda+3+8*n];
2617 TempB[4] = B[(5+8*m)*lda+4+8*n];
2618 TempB[5] = B[(5+8*m)*lda+5+8*n];
2619 TempB[6] = B[(5+8*m)*lda+6+8*n];
2620 TempB[7] = B[(5+8*m)*lda+7+8*n];
2621
2622 C[0+8*n+j*lda] += TempA[5] * TempB[0];
2623 C[1+8*n+j*lda] += TempA[5] * TempB[1];
2624 C[2+8*n+j*lda] += TempA[5] * TempB[2];
2625 C[3+8*n+j*lda] += TempA[5] * TempB[3];
2626 C[4+8*n+j*lda] += TempA[5] * TempB[4];
2627 C[5+8*n+j*lda] += TempA[5] * TempB[5];
2628 C[6+8*n+j*lda] += TempA[5] * TempB[6];
2629 C[7+8*n+j*lda] += TempA[5] * TempB[7];
2630
2631
2632
2633 TempB[0] = B[(6+8*m)*lda+0+8*n];
2634 TempB[1] = B[(6+8*m)*lda+1+8*n];
2635 TempB[2] = B[(6+8*m)*lda+2+8*n];
2636 TempB[3] = B[(6+8*m)*lda+3+8*n];
2637 TempB[4] = B[(6+8*m)*lda+4+8*n];
2638 TempB[5] = B[(6+8*m)*lda+5+8*n];
2639 TempB[6] = B[(6+8*m)*lda+6+8*n];
2640 TempB[7] = B[(6+8*m)*lda+7+8*n];
2641
2642 C[0+8*n+j*lda] += TempA[6] * TempB[0];
2643 C[1+8*n+j*lda] += TempA[6] * TempB[1];
2644 C[2+8*n+j*lda] += TempA[6] * TempB[2];
2645 C[3+8*n+j*lda] += TempA[6] * TempB[3];
2646 C[4+8*n+j*lda] += TempA[6] * TempB[4];
2647 C[5+8*n+j*lda] += TempA[6] * TempB[5];
2648 C[6+8*n+j*lda] += TempA[6] * TempB[6];
2649 C[7+8*n+j*lda] += TempA[6] * TempB[7];
2650
2651
2652 TempB[0] = B[(7+8*m)*lda+0+8*n];
2653 TempB[1] = B[(7+8*m)*lda+1+8*n];
2654 TempB[2] = B[(7+8*m)*lda+2+8*n];
2655 TempB[3] = B[(7+8*m)*lda+3+8*n];
2656 TempB[4] = B[(7+8*m)*lda+4+8*n];
2657 TempB[5] = B[(7+8*m)*lda+5+8*n];
2658 TempB[6] = B[(7+8*m)*lda+6+8*n];
2659 TempB[7] = B[(7+8*m)*lda+7+8*n];
2660
2661 C[0+8*n+j*lda] += TempA[7] * TempB[0];
2662 C[1+8*n+j*lda] += TempA[7] * TempB[1];
2663 C[2+8*n+j*lda] += TempA[7] * TempB[2];
2664 C[3+8*n+j*lda] += TempA[7] * TempB[3];
2665 C[4+8*n+j*lda] += TempA[7] * TempB[4];
2666 C[5+8*n+j*lda] += TempA[7] * TempB[5];
2667 C[6+8*n+j*lda] += TempA[7] * TempB[6];
2668 C[7+8*n+j*lda] += TempA[7] * TempB[7];
2669 }
2670
2671 }
2672 }
2673 }
2674 if(coreid == 0)
2675 {
2676 for ( j = 0; j < 16; j++ )
2677 {
2678
2679 for ( m = 0; m < 4; m++ )
2680 {
2681
2682 TempA[0] = A[j*lda+0+8*m];
2683 TempA[1] = A[j*lda+1+8*m];
2684 TempA[2] = A[j*lda+2+8*m];
2685 TempA[3] = A[j*lda+3+8*m];
2686 TempA[4] = A[j*lda+4+8*m];
2687 TempA[5] = A[j*lda+5+8*m];
2688 TempA[6] = A[j*lda+6+8*m];
2689 TempA[7] = A[j*lda+7+8*m];
2690
2691 for( n = 0; n < 4; n++)
2692 {
2693 TempB[0] = B[(0+8*m)*lda+0+8*n];
2694 TempB[1] = B[(0+8*m)*lda+1+8*n];
2695 TempB[2] = B[(0+8*m)*lda+2+8*n];
2696 TempB[3] = B[(0+8*m)*lda+3+8*n];
2697 TempB[4] = B[(0+8*m)*lda+4+8*n];
2698 TempB[5] = B[(0+8*m)*lda+5+8*n];
2699 TempB[6] = B[(0+8*m)*lda+6+8*n];
2700 TempB[7] = B[(0+8*m)*lda+7+8*n];
2701
2702 C[0+8*n+j*lda] += TempA[0] * TempB[0];
2703 C[1+8*n+j*lda] += TempA[0] * TempB[1];
2704 C[2+8*n+j*lda] += TempA[0] * TempB[2];
2705 C[3+8*n+j*lda] += TempA[0] * TempB[3];
2706 C[4+8*n+j*lda] += TempA[0] * TempB[4];
2707 C[5+8*n+j*lda] += TempA[0] * TempB[5];
2708 C[6+8*n+j*lda] += TempA[0] * TempB[6];
2709 C[7+8*n+j*lda] += TempA[0] * TempB[7];
2710
2711
2712
2713 TempB[0] = B[(1+8*m)*lda+0+8*n];
2714 TempB[1] = B[(1+8*m)*lda+1+8*n];
2715 TempB[2] = B[(1+8*m)*lda+2+8*n];
2716 TempB[3] = B[(1+8*m)*lda+3+8*n];
2717 TempB[4] = B[(1+8*m)*lda+4+8*n];
2718 TempB[5] = B[(1+8*m)*lda+5+8*n];
2719 TempB[6] = B[(1+8*m)*lda+6+8*n];
2720 TempB[7] = B[(1+8*m)*lda+7+8*n];
2721
2722 C[0+8*n+j*lda] += TempA[1] * TempB[0];
2723 C[1+8*n+j*lda] += TempA[1] * TempB[1];
2724 C[2+8*n+j*lda] += TempA[1] * TempB[2];
2725 C[3+8*n+j*lda] += TempA[1] * TempB[3];
2726 C[4+8*n+j*lda] += TempA[1] * TempB[4];
2727 C[5+8*n+j*lda] += TempA[1] * TempB[5];
2728 C[6+8*n+j*lda] += TempA[1] * TempB[6];
2729 C[7+8*n+j*lda] += TempA[1] * TempB[7];
2730
2731
2732
2733 TempB[0] = B[(2+8*m)*lda+0+8*n];
2734 TempB[1] = B[(2+8*m)*lda+1+8*n];
2735 TempB[2] = B[(2+8*m)*lda+2+8*n];
2736 TempB[3] = B[(2+8*m)*lda+3+8*n];
2737 TempB[4] = B[(2+8*m)*lda+4+8*n];
2738 TempB[5] = B[(2+8*m)*lda+5+8*n];
2739 TempB[6] = B[(2+8*m)*lda+6+8*n];
2740 TempB[7] = B[(2+8*m)*lda+7+8*n];
2741
2742 C[0+8*n+j*lda] += TempA[2] * TempB[0];
2743 C[1+8*n+j*lda] += TempA[2] * TempB[1];
2744 C[2+8*n+j*lda] += TempA[2] * TempB[2];
2745 C[3+8*n+j*lda] += TempA[2] * TempB[3];
2746 C[4+8*n+j*lda] += TempA[2] * TempB[4];
2747 C[5+8*n+j*lda] += TempA[2] * TempB[5];
2748 C[6+8*n+j*lda] += TempA[2] * TempB[6];
2749 C[7+8*n+j*lda] += TempA[2] * TempB[7];
2750
2751
2752
2753 TempB[0] = B[(3+8*m)*lda+0+8*n];
2754 TempB[1] = B[(3+8*m)*lda+1+8*n];
2755 TempB[2] = B[(3+8*m)*lda+2+8*n];
2756 TempB[3] = B[(3+8*m)*lda+3+8*n];
2757 TempB[4] = B[(3+8*m)*lda+4+8*n];
2758 TempB[5] = B[(3+8*m)*lda+5+8*n];
2759 TempB[6] = B[(3+8*m)*lda+6+8*n];
2760 TempB[7] = B[(3+8*m)*lda+7+8*n];
2761
2762 C[0+8*n+j*lda] += TempA[3] * TempB[0];
2763 C[1+8*n+j*lda] += TempA[3] * TempB[1];
2764 C[2+8*n+j*lda] += TempA[3] * TempB[2];
2765 C[3+8*n+j*lda] += TempA[3] * TempB[3];
2766 C[4+8*n+j*lda] += TempA[3] * TempB[4];
2767 C[5+8*n+j*lda] += TempA[3] * TempB[5];
2768 C[6+8*n+j*lda] += TempA[3] * TempB[6];
2769 C[7+8*n+j*lda] += TempA[3] * TempB[7];
2770
2771
2772 TempB[0] = B[(4+8*m)*lda+0+8*n];
2773 TempB[1] = B[(4+8*m)*lda+1+8*n];
2774 TempB[2] = B[(4+8*m)*lda+2+8*n];
2775 TempB[3] = B[(4+8*m)*lda+3+8*n];
2776 TempB[4] = B[(4+8*m)*lda+4+8*n];
2777 TempB[5] = B[(4+8*m)*lda+5+8*n];
2778 TempB[6] = B[(4+8*m)*lda+6+8*n];
2779 TempB[7] = B[(4+8*m)*lda+7+8*n];
2780
2781 C[0+8*n+j*lda] += TempA[4] * TempB[0];
2782 C[1+8*n+j*lda] += TempA[4] * TempB[1];
2783 C[2+8*n+j*lda] += TempA[4] * TempB[2];
2784 C[3+8*n+j*lda] += TempA[4] * TempB[3];
2785 C[4+8*n+j*lda] += TempA[4] * TempB[4];
2786 C[5+8*n+j*lda] += TempA[4] * TempB[5];
2787 C[6+8*n+j*lda] += TempA[4] * TempB[6];
2788 C[7+8*n+j*lda] += TempA[4] * TempB[7];
2789
2790
2791
2792 TempB[0] = B[(5+8*m)*lda+0+8*n];
2793 TempB[1] = B[(5+8*m)*lda+1+8*n];
2794 TempB[2] = B[(5+8*m)*lda+2+8*n];
2795 TempB[3] = B[(5+8*m)*lda+3+8*n];
2796 TempB[4] = B[(5+8*m)*lda+4+8*n];
2797 TempB[5] = B[(5+8*m)*lda+5+8*n];
2798 TempB[6] = B[(5+8*m)*lda+6+8*n];
2799 TempB[7] = B[(5+8*m)*lda+7+8*n];
2800
2801 C[0+8*n+j*lda] += TempA[5] * TempB[0];
2802 C[1+8*n+j*lda] += TempA[5] * TempB[1];
2803 C[2+8*n+j*lda] += TempA[5] * TempB[2];
2804 C[3+8*n+j*lda] += TempA[5] * TempB[3];
2805 C[4+8*n+j*lda] += TempA[5] * TempB[4];
2806 C[5+8*n+j*lda] += TempA[5] * TempB[5];
2807 C[6+8*n+j*lda] += TempA[5] * TempB[6];
2808 C[7+8*n+j*lda] += TempA[5] * TempB[7];
2809
2810
2811
2812 TempB[0] = B[(6+8*m)*lda+0+8*n];
2813 TempB[1] = B[(6+8*m)*lda+1+8*n];
2814 TempB[2] = B[(6+8*m)*lda+2+8*n];
2815 TempB[3] = B[(6+8*m)*lda+3+8*n];
2816 TempB[4] = B[(6+8*m)*lda+4+8*n];
2817 TempB[5] = B[(6+8*m)*lda+5+8*n];
2818 TempB[6] = B[(6+8*m)*lda+6+8*n];
2819 TempB[7] = B[(6+8*m)*lda+7+8*n];
2820
2821 C[0+8*n+j*lda] += TempA[6] * TempB[0];
2822 C[1+8*n+j*lda] += TempA[6] * TempB[1];
2823 C[2+8*n+j*lda] += TempA[6] * TempB[2];
2824 C[3+8*n+j*lda] += TempA[6] * TempB[3];
2825 C[4+8*n+j*lda] += TempA[6] * TempB[4];
2826 C[5+8*n+j*lda] += TempA[6] * TempB[5];
2827 C[6+8*n+j*lda] += TempA[6] * TempB[6];
2828 C[7+8*n+j*lda] += TempA[6] * TempB[7];
2829
2830
2831 TempB[0] = B[(7+8*m)*lda+0+8*n];
2832 TempB[1] = B[(7+8*m)*lda+1+8*n];
2833 TempB[2] = B[(7+8*m)*lda+2+8*n];
2834 TempB[3] = B[(7+8*m)*lda+3+8*n];
2835 TempB[4] = B[(7+8*m)*lda+4+8*n];
2836 TempB[5] = B[(7+8*m)*lda+5+8*n];
2837 TempB[6] = B[(7+8*m)*lda+6+8*n];
2838 TempB[7] = B[(7+8*m)*lda+7+8*n];
2839
2840 C[0+8*n+j*lda] += TempA[7] * TempB[0];
2841 C[1+8*n+j*lda] += TempA[7] * TempB[1];
2842 C[2+8*n+j*lda] += TempA[7] * TempB[2];
2843 C[3+8*n+j*lda] += TempA[7] * TempB[3];
2844 C[4+8*n+j*lda] += TempA[7] * TempB[4];
2845 C[5+8*n+j*lda] += TempA[7] * TempB[5];
2846 C[6+8*n+j*lda] += TempA[7] * TempB[6];
2847 C[7+8*n+j*lda] += TempA[7] * TempB[7];
2848 }
2849
2850 }
2851 }
2852 }
2853 */
2854 }
2855
2856 //--------------------------------------------------------------------------
2857 // Main
2858 //
2859 // All threads start executing thread_entry(). Use each thread's "coreid" to
2860 // differentiate between threads (each thread runs on a separate core).
2861
// Per-thread entry point: records this thread's identity, runs the timed
// matmul on a shared result buffer, then verifies the result and exits.
//
//   cid - this thread's core id; copied into the thread-local `coreid`
//   nc  - number of cores; copied into the global `ncores` and handed to
//         every barrier() call below
2862 void thread_entry(int cid, int nc)
2863 {
2864 coreid = cid;
2865 ncores = nc;
2866
2867 // static allocates data in the binary, which is visible to all threads.
// Static storage also starts out zeroed, which matters because the active
// matmul code accumulates into its output with `+=` rather than assigning.
2868 static data_t results_data[ARRAY_SIZE];
2869
2870
// Disabled baseline run of matmul_naive, kept for reference. If it is
// re-enabled, the clearing loop below it must stay so the second run
// starts from a zeroed buffer.
2871 // // Execute the provided, naive matmul
2872 // barrier(nc);
2873 // stats(matmul_naive(DIM_SIZE, input1_data, input2_data, results_data); barrier(nc));
2874 //
2875 //
2876 // // verify
2877 // verifyMT(ARRAY_SIZE, results_data, verify_data);
2878 //
2879 // // clear results from the first trial
2880 // size_t i;
2881 // if (coreid == 0)
2882 // for (i=0; i < ARRAY_SIZE; i++)
2883 // results_data[i] = 0;
2884 // barrier(nc);
2885
2886
2887 // Execute your faster matmul
// barrier() comes from util.h; presumably it blocks until all nc threads
// arrive, so every thread enters the timed region together -- confirm there.
2888 barrier(nc);
// stats() (macro near the top of this file) brackets the statement with
// rdcycle()/rdinstret() and prints the counts on core 0 only. The trailing
// barrier(nc) is part of the macro argument, so it is inside the measured
// region. The macro also stringifies its argument for the printed label:
// editing the text of this call changes the program's output.
2889 stats(matmul(DIM_SIZE, input1_data, input2_data, results_data); barrier(nc));
2890
// Optional dump of computed and reference arrays; printArrayMT returns
// immediately on every core except core 0.
2891 #ifdef DEBUG
2892 printArrayMT("results:", ARRAY_SIZE, results_data);
2893 printArrayMT("verify :", ARRAY_SIZE, verify_data);
2894 #endif
2895
2896 // verify
// NOTE(review): verifyMT's body is not visible from here; presumably it
// compares results_data against verify_data element by element -- confirm.
2897 verifyMT(ARRAY_SIZE, results_data, verify_data);
2898 barrier(nc);
2899
// Every thread that reaches this point calls exit(0).
2900 exit(0);
2901 }
2902