Fix build with riscv-gcc version 4.9
[riscv-tests.git] / mt / av_matmul / matmul_mi.c
1 //**************************************************************************
2 // Multi-threaded Matrix Multiply benchmark
3 //--------------------------------------------------------------------------
4 // TA : Christopher Celio
5 // Student:
6 //
7 //
// This benchmark multiplies two 2-D arrays together and writes the results to
// a third array. The input data (and reference data) should be generated
// using the matmul_gendata.pl perl script and dumped to a file named
// dataset.h.
12
13
14 // print out arrays, etc.
15 //#define DEBUG
16
17 //--------------------------------------------------------------------------
18 // Includes
19
20 #include <string.h>
21 #include <stdlib.h>
22 #include <stdio.h>
23
24
25 //--------------------------------------------------------------------------
26 // Input/Reference Data
27
28 typedef float data_t;
29 #include "dataset.h"
30
31
32 //--------------------------------------------------------------------------
33 // Basic Utilities and Multi-thread Support
34
35 __thread unsigned long coreid;
36 unsigned long ncores;
37
38 #include "util.h"
39
// Two-step stringification: the outer macro lets its argument be
// macro-expanded before the inner one turns it into a string literal.
#define stringify_1(s) #s
#define stringify(s) stringify_1(s)

// stats(code): execute `code` once, bracketed by reads of rdcycle() and
// rdinstret(), then let core 0 print a one-line report.
//
// Each counter variable starts out as the negated initial reading and the
// final reading is added afterwards, leaving (end - start) in the variable.
// The report shows the stringified code, the elapsed cycle count, cycles
// divided by DIM_SIZE three times over ("cycles/iter"), and cycles divided
// by the instruction count ("CPI"); each ratio is printed as an integer
// part followed by one decimal digit.
#define stats(code) do { \
    unsigned long _cyc = -rdcycle(); \
    unsigned long _ins = -rdinstret(); \
    code; \
    _cyc += rdcycle(); \
    _ins += rdinstret(); \
    if (coreid == 0) { \
      printf("%s: %ld cycles, %ld.%ld cycles/iter, %ld.%ld CPI\n", \
             stringify(code), _cyc, \
             _cyc/DIM_SIZE/DIM_SIZE/DIM_SIZE, \
             10*_cyc/DIM_SIZE/DIM_SIZE/DIM_SIZE%10, \
             _cyc/_ins, \
             10*_cyc/_ins%10); \
    } \
  } while(0)
50
51
52 //--------------------------------------------------------------------------
53 // Helper functions
54
55 void printArrayMT( char name[], int n, data_t arr[] )
56 {
57 int i;
58 if (coreid != 0)
59 return;
60
61 printf( " %10s :", name );
62 for ( i = 0; i < n; i++ )
63 printf( " %3ld ", (long) arr[i] );
64 printf( "\n" );
65 }
66
67 void __attribute__((noinline)) verifyMT(size_t n, const data_t* test, const data_t* correct)
68 {
69 if (coreid != 0)
70 return;
71
72 size_t i;
73 for (i = 0; i < n; i++)
74 {
75 if (test[i] != correct[i])
76 {
77 printf("FAILED test[%d]= %3ld, correct[%d]= %3ld\n",
78 i, (long)test[i], i, (long)correct[i]);
79 exit(-1);
80 }
81 }
82
83 return;
84 }
85
86 //--------------------------------------------------------------------------
87 // matmul function
88
89 // single-thread, naive version
90 void __attribute__((noinline)) matmul_naive(const int lda, const data_t A[], const data_t B[], data_t C[] )
91 {
92 int i, j, k;
93
94 if (coreid > 0)
95 return;
96
97 for ( i = 0; i < lda; i++ )
98 for ( j = 0; j < lda; j++ )
99 {
100 for ( k = 0; k < lda; k++ )
101 {
102 C[i + j*lda] += A[j*lda + k] * B[k*lda + i];
103 }
104 }
105
106 }
107
108
109
110 void __attribute__((noinline)) matmul(const int lda, const data_t A[], const data_t B[], data_t C[] )
111 {
112
113 // ***************************** //
114 // **** ADD YOUR CODE HERE ***** //
115 // ***************************** //
116 //
117 // feel free to make a separate function for MI and MSI versions.
118
119 //-------------------------------------------------------------first working version best 500k
120 /*
121 static __thread int i, j, k;
122 if(coreid == 0)
123 {
124 for ( j = 0; j < lda; j+=2 )
125 {
126 for ( k = 0; k < lda; k++ )
127 {
128 for ( i = 0; i < lda; i++)
129 {
130 C[i + j*lda] += A[j*lda + k] * B[k*lda + i];
131 }
132 }
133 }
134 }
135
136 if(coreid ==1)
137 {
138 for ( j = 1; j < lda; j+=2 )
139 {
140 for ( k = 0;k < lda; k++)
141 {
142 for ( i = 0; i < lda; i++)
143 {
144 C[i + j*lda] += A[j*lda + k] * B[k*lda + i];
145
146 }
147 }
148 }
149 }
150 */
151 //-------------------------------------------------------------version1.1, take read out of inner loop,300k
152 /*
153 static __thread int i, j, k;
154 static __thread data_t TempA;
155
156 if(coreid == 0)
157 {
158 for ( j = 0; j < lda; j+=2 )
159 {
160 for ( k = 0; k < lda; k++ )
161 {
162 TempA = A[j*lda + k];
163 for ( i = 0; i < lda; i++)
164 {
165 C[i + j*lda] += TempA* B[k*lda + i];
166 }
167 }
168 }
169 }
170
171 if(coreid ==1)
172 {
173 for ( j = 1; j < lda; j+=2 )
174 {
175 for ( k = 0;k < lda; k++)
176 {
177 TempA = A[j*lda + k];
178 for ( i = 0; i < lda; i++)
179 {
180 C[i + j*lda] += TempA* B[k*lda + i];
181 }
182 }
183 }
184 }
185 */
186 //-------------------------------------------------------------version2.0, read 8 elements in B at one time. 140k mi, MSI117.0k
187 /*
188 static __thread int i, j, k, m, n;
189 static __thread data_t TempA;
190 static __thread data_t TempB[8];
191
192 if(coreid == 0)
193 {
194 for ( j = 0; j < lda; j+=2 )
195 {
196 for ( k = 0; k < lda; k++ )
197 {
198 TempA = A[j*lda + k];
199 for( n = 0; n < 4; n++)
200 {
201
202 TempB[0] = B[k*lda+0+8*n];
203 TempB[1] = B[k*lda+1+8*n];
204 TempB[2] = B[k*lda+2+8*n];
205 TempB[3] = B[k*lda+3+8*n];
206 TempB[4] = B[k*lda+4+8*n];
207 TempB[5] = B[k*lda+5+8*n];
208 TempB[6] = B[k*lda+6+8*n];
209 TempB[7] = B[k*lda+7+8*n];
210
211 C[0+8*n+j*lda] += TempA * TempB[0];
212 C[1+8*n+j*lda] += TempA * TempB[1];
213 C[2+8*n+j*lda] += TempA * TempB[2];
214 C[3+8*n+j*lda] += TempA * TempB[3];
215 C[4+8*n+j*lda] += TempA * TempB[4];
216 C[5+8*n+j*lda] += TempA * TempB[5];
217 C[6+8*n+j*lda] += TempA * TempB[6];
218 C[7+8*n+j*lda] += TempA * TempB[7];
219
220 }
221
222 }
223 }
224 }
225
226 if(coreid == 1)
227 {
228 for ( j = 1; j < lda; j+=2 )
229 {
230 for ( k = 0; k < lda; k++ )
231 {
232 TempA = A[j*lda + k];
233 for( n = 0; n < 4; n++)
234 {
235
236 TempB[0] = B[k*lda+0+8*n];
237 TempB[1] = B[k*lda+1+8*n];
238 TempB[2] = B[k*lda+2+8*n];
239 TempB[3] = B[k*lda+3+8*n];
240 TempB[4] = B[k*lda+4+8*n];
241 TempB[5] = B[k*lda+5+8*n];
242 TempB[6] = B[k*lda+6+8*n];
243 TempB[7] = B[k*lda+7+8*n];
244
245 C[0+8*n+j*lda] += TempA * TempB[0];
246 C[1+8*n+j*lda] += TempA * TempB[1];
247 C[2+8*n+j*lda] += TempA * TempB[2];
248 C[3+8*n+j*lda] += TempA * TempB[3];
249 C[4+8*n+j*lda] += TempA * TempB[4];
250 C[5+8*n+j*lda] += TempA * TempB[5];
251 C[6+8*n+j*lda] += TempA * TempB[6];
252 C[7+8*n+j*lda] += TempA * TempB[7];
253
254 }
255
256 }
257 }
258 }
259 */
260
261 //-------------------------------------------------------------version2.1, optimize k. 700k. bad move to v2.2.
262 //-------------------------------------------------------------version2.9 take off all inner loops for both cores, MSI,109K. MI 182k
263 //-------------------------------------------------------------version2.10 use i= j*lda inside the n loop increase speed. but not out m and n. tried replace first 3, get 104.9k
264 /*
265 static __thread int j, m, i,n;
266 static __thread data_t TempA[8];
267 static __thread data_t TempB[8];
268
269 if(coreid == 1)
270 {
271 for ( j = 1; j < lda; j+=2 )
272 {
273
274 for ( m = 0; m < 4; m++ )
275 {
276
277 TempA[0] = A[j*lda+0+8*m];
278 TempA[1] = A[j*lda+1+8*m];
279 TempA[2] = A[j*lda+2+8*m];
280 TempA[3] = A[j*lda+3+8*m];
281 TempA[4] = A[j*lda+4+8*m];
282 TempA[5] = A[j*lda+5+8*m];
283 TempA[6] = A[j*lda+6+8*m];
284 TempA[7] = A[j*lda+7+8*m];
285
286 for( n = 0; n < 4; n++)
287 {
288 i = j*lda;
289
290 TempB[0] = B[(0+8*m)*lda+0+8*n];
291 TempB[1] = B[(0+8*m)*lda+1+8*n];
292 TempB[2] = B[(0+8*m)*lda+2+8*n];
293 TempB[3] = B[(0+8*m)*lda+3+8*n];
294 TempB[4] = B[(0+8*m)*lda+4+8*n];
295 TempB[5] = B[(0+8*m)*lda+5+8*n];
296 TempB[6] = B[(0+8*m)*lda+6+8*n];
297 TempB[7] = B[(0+8*m)*lda+7+8*n];
298
299 C[0+8*n+i] += TempA[0] * TempB[0];
300 C[1+8*n+i] += TempA[0] * TempB[1];
301 C[2+8*n+i] += TempA[0] * TempB[2];
302 C[3+8*n+i] += TempA[0] * TempB[3];
303 C[4+8*n+i] += TempA[0] * TempB[4];
304 C[5+8*n+i] += TempA[0] * TempB[5];
305 C[6+8*n+i] += TempA[0] * TempB[6];
306 C[7+8*n+i] += TempA[0] * TempB[7];
307
308
309
310 TempB[0] = B[(1+8*m)*lda+0+8*n];
311 TempB[1] = B[(1+8*m)*lda+1+8*n];
312 TempB[2] = B[(1+8*m)*lda+2+8*n];
313 TempB[3] = B[(1+8*m)*lda+3+8*n];
314 TempB[4] = B[(1+8*m)*lda+4+8*n];
315 TempB[5] = B[(1+8*m)*lda+5+8*n];
316 TempB[6] = B[(1+8*m)*lda+6+8*n];
317 TempB[7] = B[(1+8*m)*lda+7+8*n];
318
319 C[0+8*n+i] += TempA[1] * TempB[0];
320 C[1+8*n+i] += TempA[1] * TempB[1];
321 C[2+8*n+i] += TempA[1] * TempB[2];
322 C[3+8*n+i] += TempA[1] * TempB[3];
323 C[4+8*n+i] += TempA[1] * TempB[4];
324 C[5+8*n+i] += TempA[1] * TempB[5];
325 C[6+8*n+i] += TempA[1] * TempB[6];
326 C[7+8*n+i] += TempA[1] * TempB[7];
327
328
329
330 TempB[0] = B[(2+8*m)*lda+0+8*n];
331 TempB[1] = B[(2+8*m)*lda+1+8*n];
332 TempB[2] = B[(2+8*m)*lda+2+8*n];
333 TempB[3] = B[(2+8*m)*lda+3+8*n];
334 TempB[4] = B[(2+8*m)*lda+4+8*n];
335 TempB[5] = B[(2+8*m)*lda+5+8*n];
336 TempB[6] = B[(2+8*m)*lda+6+8*n];
337 TempB[7] = B[(2+8*m)*lda+7+8*n];
338
339 C[0+8*n+i] += TempA[2] * TempB[0];
340 C[1+8*n+i] += TempA[2] * TempB[1];
341 C[2+8*n+i] += TempA[2] * TempB[2];
342 C[3+8*n+i] += TempA[2] * TempB[3];
343 C[4+8*n+i] += TempA[2] * TempB[4];
344 C[5+8*n+i] += TempA[2] * TempB[5];
345 C[6+8*n+i] += TempA[2] * TempB[6];
346 C[7+8*n+i] += TempA[2] * TempB[7];
347
348
349
350 TempB[0] = B[(3+8*m)*lda+0+8*n];
351 TempB[1] = B[(3+8*m)*lda+1+8*n];
352 TempB[2] = B[(3+8*m)*lda+2+8*n];
353 TempB[3] = B[(3+8*m)*lda+3+8*n];
354 TempB[4] = B[(3+8*m)*lda+4+8*n];
355 TempB[5] = B[(3+8*m)*lda+5+8*n];
356 TempB[6] = B[(3+8*m)*lda+6+8*n];
357 TempB[7] = B[(3+8*m)*lda+7+8*n];
358
359 C[0+8*n+i] += TempA[3] * TempB[0];
360 C[1+8*n+i] += TempA[3] * TempB[1];
361 C[2+8*n+i] += TempA[3] * TempB[2];
362 C[3+8*n+i] += TempA[3] * TempB[3];
363 C[4+8*n+i] += TempA[3] * TempB[4];
364 C[5+8*n+i] += TempA[3] * TempB[5];
365 C[6+8*n+i] += TempA[3] * TempB[6];
366 C[7+8*n+i] += TempA[3] * TempB[7];
367
368
369 TempB[0] = B[(4+8*m)*lda+0+8*n];
370 TempB[1] = B[(4+8*m)*lda+1+8*n];
371 TempB[2] = B[(4+8*m)*lda+2+8*n];
372 TempB[3] = B[(4+8*m)*lda+3+8*n];
373 TempB[4] = B[(4+8*m)*lda+4+8*n];
374 TempB[5] = B[(4+8*m)*lda+5+8*n];
375 TempB[6] = B[(4+8*m)*lda+6+8*n];
376 TempB[7] = B[(4+8*m)*lda+7+8*n];
377
378 C[0+8*n+i] += TempA[4] * TempB[0];
379 C[1+8*n+i] += TempA[4] * TempB[1];
380 C[2+8*n+i] += TempA[4] * TempB[2];
381 C[3+8*n+i] += TempA[4] * TempB[3];
382 C[4+8*n+i] += TempA[4] * TempB[4];
383 C[5+8*n+i] += TempA[4] * TempB[5];
384 C[6+8*n+i] += TempA[4] * TempB[6];
385 C[7+8*n+i] += TempA[4] * TempB[7];
386
387
388
389 TempB[0] = B[(5+8*m)*lda+0+8*n];
390 TempB[1] = B[(5+8*m)*lda+1+8*n];
391 TempB[2] = B[(5+8*m)*lda+2+8*n];
392 TempB[3] = B[(5+8*m)*lda+3+8*n];
393 TempB[4] = B[(5+8*m)*lda+4+8*n];
394 TempB[5] = B[(5+8*m)*lda+5+8*n];
395 TempB[6] = B[(5+8*m)*lda+6+8*n];
396 TempB[7] = B[(5+8*m)*lda+7+8*n];
397
398 C[0+8*n+i] += TempA[5] * TempB[0];
399 C[1+8*n+i] += TempA[5] * TempB[1];
400 C[2+8*n+i] += TempA[5] * TempB[2];
401 C[3+8*n+i] += TempA[5] * TempB[3];
402 C[4+8*n+i] += TempA[5] * TempB[4];
403 C[5+8*n+i] += TempA[5] * TempB[5];
404 C[6+8*n+i] += TempA[5] * TempB[6];
405 C[7+8*n+i] += TempA[5] * TempB[7];
406
407
408
409 TempB[0] = B[(6+8*m)*lda+0+8*n];
410 TempB[1] = B[(6+8*m)*lda+1+8*n];
411 TempB[2] = B[(6+8*m)*lda+2+8*n];
412 TempB[3] = B[(6+8*m)*lda+3+8*n];
413 TempB[4] = B[(6+8*m)*lda+4+8*n];
414 TempB[5] = B[(6+8*m)*lda+5+8*n];
415 TempB[6] = B[(6+8*m)*lda+6+8*n];
416 TempB[7] = B[(6+8*m)*lda+7+8*n];
417
418 C[0+8*n+i] += TempA[6] * TempB[0];
419 C[1+8*n+i] += TempA[6] * TempB[1];
420 C[2+8*n+i] += TempA[6] * TempB[2];
421 C[3+8*n+i] += TempA[6] * TempB[3];
422 C[4+8*n+i] += TempA[6] * TempB[4];
423 C[5+8*n+i] += TempA[6] * TempB[5];
424 C[6+8*n+i] += TempA[6] * TempB[6];
425 C[7+8*n+i] += TempA[6] * TempB[7];
426
427
428 TempB[0] = B[(7+8*m)*lda+0+8*n];
429 TempB[1] = B[(7+8*m)*lda+1+8*n];
430 TempB[2] = B[(7+8*m)*lda+2+8*n];
431 TempB[3] = B[(7+8*m)*lda+3+8*n];
432 TempB[4] = B[(7+8*m)*lda+4+8*n];
433 TempB[5] = B[(7+8*m)*lda+5+8*n];
434 TempB[6] = B[(7+8*m)*lda+6+8*n];
435 TempB[7] = B[(7+8*m)*lda+7+8*n];
436
437 C[0+8*n+i] += TempA[7] * TempB[0];
438 C[1+8*n+i] += TempA[7] * TempB[1];
439 C[2+8*n+i] += TempA[7] * TempB[2];
440 C[3+8*n+i] += TempA[7] * TempB[3];
441 C[4+8*n+i] += TempA[7] * TempB[4];
442 C[5+8*n+i] += TempA[7] * TempB[5];
443 C[6+8*n+i] += TempA[7] * TempB[6];
444 C[7+8*n+i] += TempA[7] * TempB[7];
445 }
446
447 }
448 }
449 }
450 if(coreid == 0)
451 {
452 for ( j = 0; j < lda; j+=2 )
453 {
454
455 for ( m = 0; m < 4; m++ )
456 {
457
458 TempA[0] = A[j*lda+0+8*m];
459 TempA[1] = A[j*lda+1+8*m];
460 TempA[2] = A[j*lda+2+8*m];
461 TempA[3] = A[j*lda+3+8*m];
462 TempA[4] = A[j*lda+4+8*m];
463 TempA[5] = A[j*lda+5+8*m];
464 TempA[6] = A[j*lda+6+8*m];
465 TempA[7] = A[j*lda+7+8*m];
466
467 for( n = 0; n < 4; n++)
468 {
469 i = j*lda;
470
471 TempB[0] = B[(0+8*m)*lda+0+8*n];
472 TempB[1] = B[(0+8*m)*lda+1+8*n];
473 TempB[2] = B[(0+8*m)*lda+2+8*n];
474 TempB[3] = B[(0+8*m)*lda+3+8*n];
475 TempB[4] = B[(0+8*m)*lda+4+8*n];
476 TempB[5] = B[(0+8*m)*lda+5+8*n];
477 TempB[6] = B[(0+8*m)*lda+6+8*n];
478 TempB[7] = B[(0+8*m)*lda+7+8*n];
479
480 C[0+8*n+i] += TempA[0] * TempB[0];
481 C[1+8*n+i] += TempA[0] * TempB[1];
482 C[2+8*n+i] += TempA[0] * TempB[2];
483 C[3+8*n+i] += TempA[0] * TempB[3];
484 C[4+8*n+i] += TempA[0] * TempB[4];
485 C[5+8*n+i] += TempA[0] * TempB[5];
486 C[6+8*n+i] += TempA[0] * TempB[6];
487 C[7+8*n+i] += TempA[0] * TempB[7];
488
489
490
491 TempB[0] = B[(1+8*m)*lda+0+8*n];
492 TempB[1] = B[(1+8*m)*lda+1+8*n];
493 TempB[2] = B[(1+8*m)*lda+2+8*n];
494 TempB[3] = B[(1+8*m)*lda+3+8*n];
495 TempB[4] = B[(1+8*m)*lda+4+8*n];
496 TempB[5] = B[(1+8*m)*lda+5+8*n];
497 TempB[6] = B[(1+8*m)*lda+6+8*n];
498 TempB[7] = B[(1+8*m)*lda+7+8*n];
499
500 C[0+8*n+i] += TempA[1] * TempB[0];
501 C[1+8*n+i] += TempA[1] * TempB[1];
502 C[2+8*n+i] += TempA[1] * TempB[2];
503 C[3+8*n+i] += TempA[1] * TempB[3];
504 C[4+8*n+i] += TempA[1] * TempB[4];
505 C[5+8*n+i] += TempA[1] * TempB[5];
506 C[6+8*n+i] += TempA[1] * TempB[6];
507 C[7+8*n+i] += TempA[1] * TempB[7];
508
509
510
511 TempB[0] = B[(2+8*m)*lda+0+8*n];
512 TempB[1] = B[(2+8*m)*lda+1+8*n];
513 TempB[2] = B[(2+8*m)*lda+2+8*n];
514 TempB[3] = B[(2+8*m)*lda+3+8*n];
515 TempB[4] = B[(2+8*m)*lda+4+8*n];
516 TempB[5] = B[(2+8*m)*lda+5+8*n];
517 TempB[6] = B[(2+8*m)*lda+6+8*n];
518 TempB[7] = B[(2+8*m)*lda+7+8*n];
519
520 C[0+8*n+i] += TempA[2] * TempB[0];
521 C[1+8*n+i] += TempA[2] * TempB[1];
522 C[2+8*n+i] += TempA[2] * TempB[2];
523 C[3+8*n+i] += TempA[2] * TempB[3];
524 C[4+8*n+i] += TempA[2] * TempB[4];
525 C[5+8*n+i] += TempA[2] * TempB[5];
526 C[6+8*n+i] += TempA[2] * TempB[6];
527 C[7+8*n+i] += TempA[2] * TempB[7];
528
529
530
531 TempB[0] = B[(3+8*m)*lda+0+8*n];
532 TempB[1] = B[(3+8*m)*lda+1+8*n];
533 TempB[2] = B[(3+8*m)*lda+2+8*n];
534 TempB[3] = B[(3+8*m)*lda+3+8*n];
535 TempB[4] = B[(3+8*m)*lda+4+8*n];
536 TempB[5] = B[(3+8*m)*lda+5+8*n];
537 TempB[6] = B[(3+8*m)*lda+6+8*n];
538 TempB[7] = B[(3+8*m)*lda+7+8*n];
539
540 C[0+8*n+i] += TempA[3] * TempB[0];
541 C[1+8*n+i] += TempA[3] * TempB[1];
542 C[2+8*n+i] += TempA[3] * TempB[2];
543 C[3+8*n+i] += TempA[3] * TempB[3];
544 C[4+8*n+i] += TempA[3] * TempB[4];
545 C[5+8*n+i] += TempA[3] * TempB[5];
546 C[6+8*n+i] += TempA[3] * TempB[6];
547 C[7+8*n+i] += TempA[3] * TempB[7];
548
549
550 TempB[0] = B[(4+8*m)*lda+0+8*n];
551 TempB[1] = B[(4+8*m)*lda+1+8*n];
552 TempB[2] = B[(4+8*m)*lda+2+8*n];
553 TempB[3] = B[(4+8*m)*lda+3+8*n];
554 TempB[4] = B[(4+8*m)*lda+4+8*n];
555 TempB[5] = B[(4+8*m)*lda+5+8*n];
556 TempB[6] = B[(4+8*m)*lda+6+8*n];
557 TempB[7] = B[(4+8*m)*lda+7+8*n];
558
559 C[0+8*n+i] += TempA[4] * TempB[0];
560 C[1+8*n+i] += TempA[4] * TempB[1];
561 C[2+8*n+i] += TempA[4] * TempB[2];
562 C[3+8*n+i] += TempA[4] * TempB[3];
563 C[4+8*n+i] += TempA[4] * TempB[4];
564 C[5+8*n+i] += TempA[4] * TempB[5];
565 C[6+8*n+i] += TempA[4] * TempB[6];
566 C[7+8*n+i] += TempA[4] * TempB[7];
567
568
569
570 TempB[0] = B[(5+8*m)*lda+0+8*n];
571 TempB[1] = B[(5+8*m)*lda+1+8*n];
572 TempB[2] = B[(5+8*m)*lda+2+8*n];
573 TempB[3] = B[(5+8*m)*lda+3+8*n];
574 TempB[4] = B[(5+8*m)*lda+4+8*n];
575 TempB[5] = B[(5+8*m)*lda+5+8*n];
576 TempB[6] = B[(5+8*m)*lda+6+8*n];
577 TempB[7] = B[(5+8*m)*lda+7+8*n];
578
579 C[0+8*n+i] += TempA[5] * TempB[0];
580 C[1+8*n+i] += TempA[5] * TempB[1];
581 C[2+8*n+i] += TempA[5] * TempB[2];
582 C[3+8*n+i] += TempA[5] * TempB[3];
583 C[4+8*n+i] += TempA[5] * TempB[4];
584 C[5+8*n+i] += TempA[5] * TempB[5];
585 C[6+8*n+i] += TempA[5] * TempB[6];
586 C[7+8*n+i] += TempA[5] * TempB[7];
587
588
589
590 TempB[0] = B[(6+8*m)*lda+0+8*n];
591 TempB[1] = B[(6+8*m)*lda+1+8*n];
592 TempB[2] = B[(6+8*m)*lda+2+8*n];
593 TempB[3] = B[(6+8*m)*lda+3+8*n];
594 TempB[4] = B[(6+8*m)*lda+4+8*n];
595 TempB[5] = B[(6+8*m)*lda+5+8*n];
596 TempB[6] = B[(6+8*m)*lda+6+8*n];
597 TempB[7] = B[(6+8*m)*lda+7+8*n];
598
599 C[0+8*n+i] += TempA[6] * TempB[0];
600 C[1+8*n+i] += TempA[6] * TempB[1];
601 C[2+8*n+i] += TempA[6] * TempB[2];
602 C[3+8*n+i] += TempA[6] * TempB[3];
603 C[4+8*n+i] += TempA[6] * TempB[4];
604 C[5+8*n+i] += TempA[6] * TempB[5];
605 C[6+8*n+i] += TempA[6] * TempB[6];
606 C[7+8*n+i] += TempA[6] * TempB[7];
607
608
609 TempB[0] = B[(7+8*m)*lda+0+8*n];
610 TempB[1] = B[(7+8*m)*lda+1+8*n];
611 TempB[2] = B[(7+8*m)*lda+2+8*n];
612 TempB[3] = B[(7+8*m)*lda+3+8*n];
613 TempB[4] = B[(7+8*m)*lda+4+8*n];
614 TempB[5] = B[(7+8*m)*lda+5+8*n];
615 TempB[6] = B[(7+8*m)*lda+6+8*n];
616 TempB[7] = B[(7+8*m)*lda+7+8*n];
617
618 C[0+8*n+i] += TempA[7] * TempB[0];
619 C[1+8*n+i] += TempA[7] * TempB[1];
620 C[2+8*n+i] += TempA[7] * TempB[2];
621 C[3+8*n+i] += TempA[7] * TempB[3];
622 C[4+8*n+i] += TempA[7] * TempB[4];
623 C[5+8*n+i] += TempA[7] * TempB[5];
624 C[6+8*n+i] += TempA[7] * TempB[6];
625 C[7+8*n+i] += TempA[7] * TempB[7];
626 }
627
628 }
629 }
630 }
631
632 */
633 //-------------------------------------------------------------version2.2, optimize k. from 4 instead of 8 like v2.1, random failing on MI, unknown reason, MSI,350K, take off each inner loop for core 0 260k, both cores 134k
634 //-------------------------------------------------------------try false sharing for core 0, 136k.
635 /*
636 static __thread int j, m, n;
637 static __thread data_t TempA[4];
638 static __thread data_t TempB[4];
639
640 if(coreid == 1)
641 {
642 for ( j = 1; j < lda; j+=2 )
643 {
644 for ( m = 0; m < 8; m++ )
645 {
646 TempA[0] = A[j*lda+0+4*m];
647 TempA[1] = A[j*lda+1+4*m];
648 TempA[2] = A[j*lda+2+4*m];
649 TempA[3] = A[j*lda+3+4*m];
650
651 for( n = 0; n < 8; n++)
652 {
653
654 TempB[0] = B[(0+4*m)*lda+0+4*n];
655 TempB[1] = B[(0+4*m)*lda+1+4*n];
656 TempB[2] = B[(0+4*m)*lda+2+4*n];
657 TempB[3] = B[(0+4*m)*lda+3+4*n];
658
659
660 C[0+4*n+j*lda] += TempA[0] * TempB[0];
661 C[1+4*n+j*lda] += TempA[0] * TempB[1];
662 C[2+4*n+j*lda] += TempA[0] * TempB[2];
663 C[3+4*n+j*lda] += TempA[0] * TempB[3];
664
665
666
667
668
669 TempB[0] = B[(1+4*m)*lda+0+4*n];
670 TempB[1] = B[(1+4*m)*lda+1+4*n];
671 TempB[2] = B[(1+4*m)*lda+2+4*n];
672 TempB[3] = B[(1+4*m)*lda+3+4*n];
673
674
675 C[0+4*n+j*lda] += TempA[1] * TempB[0];
676 C[1+4*n+j*lda] += TempA[1] * TempB[1];
677 C[2+4*n+j*lda] += TempA[1] * TempB[2];
678 C[3+4*n+j*lda] += TempA[1] * TempB[3];
679
680
681
682 TempB[0] = B[(2+4*m)*lda+0+4*n];
683 TempB[1] = B[(2+4*m)*lda+1+4*n];
684 TempB[2] = B[(2+4*m)*lda+2+4*n];
685 TempB[3] = B[(2+4*m)*lda+3+4*n];
686
687
688 C[0+4*n+j*lda] += TempA[2] * TempB[0];
689 C[1+4*n+j*lda] += TempA[2] * TempB[1];
690 C[2+4*n+j*lda] += TempA[2] * TempB[2];
691 C[3+4*n+j*lda] += TempA[2] * TempB[3];
692
693
694
695
696 TempB[0] = B[(3+4*m)*lda+0+4*n];
697 TempB[1] = B[(3+4*m)*lda+1+4*n];
698 TempB[2] = B[(3+4*m)*lda+2+4*n];
699 TempB[3] = B[(3+4*m)*lda+3+4*n];
700
701
702 C[0+4*n+j*lda] += TempA[3] * TempB[0];
703 C[1+4*n+j*lda] += TempA[3] * TempB[1];
704 C[2+4*n+j*lda] += TempA[3] * TempB[2];
705 C[3+4*n+j*lda] += TempA[3] * TempB[3];
706
707
708 }
709 }
710 }
711 }
712 if(coreid == 0)
713 {
714 for ( j = 0; j < lda; j+=2 )
715 {
716 for ( m = 0; m < 8; m++ )
717 {
718 TempA[0] = A[j*lda+0+4*m];
719 TempA[1] = A[j*lda+1+4*m];
720 TempA[2] = A[j*lda+2+4*m];
721 TempA[3] = A[j*lda+3+4*m];
722
723 for( n = 0; n < 8; n++)
724 {
725
726
727
728
729
730
731
732 TempB[0] = B[(1+4*m)*lda+0+4*n];
733 TempB[1] = B[(1+4*m)*lda+1+4*n];
734 TempB[2] = B[(1+4*m)*lda+2+4*n];
735 TempB[3] = B[(1+4*m)*lda+3+4*n];
736
737
738 C[0+4*n+j*lda] += TempA[1] * TempB[0];
739 C[1+4*n+j*lda] += TempA[1] * TempB[1];
740 C[2+4*n+j*lda] += TempA[1] * TempB[2];
741 C[3+4*n+j*lda] += TempA[1] * TempB[3];
742
743
744
745 TempB[0] = B[(2+4*m)*lda+0+4*n];
746 TempB[1] = B[(2+4*m)*lda+1+4*n];
747 TempB[2] = B[(2+4*m)*lda+2+4*n];
748 TempB[3] = B[(2+4*m)*lda+3+4*n];
749
750
751 C[0+4*n+j*lda] += TempA[2] * TempB[0];
752 C[1+4*n+j*lda] += TempA[2] * TempB[1];
753 C[2+4*n+j*lda] += TempA[2] * TempB[2];
754 C[3+4*n+j*lda] += TempA[2] * TempB[3];
755
756
757
758
759 TempB[0] = B[(3+4*m)*lda+0+4*n];
760 TempB[1] = B[(3+4*m)*lda+1+4*n];
761 TempB[2] = B[(3+4*m)*lda+2+4*n];
762 TempB[3] = B[(3+4*m)*lda+3+4*n];
763
764
765 C[0+4*n+j*lda] += TempA[3] * TempB[0];
766 C[1+4*n+j*lda] += TempA[3] * TempB[1];
767 C[2+4*n+j*lda] += TempA[3] * TempB[2];
768 C[3+4*n+j*lda] += TempA[3] * TempB[3];
769
770 TempB[0] = B[(0+4*m)*lda+0+4*n];
771 TempB[1] = B[(0+4*m)*lda+1+4*n];
772 TempB[2] = B[(0+4*m)*lda+2+4*n];
773 TempB[3] = B[(0+4*m)*lda+3+4*n];
774
775
776 C[0+4*n+j*lda] += TempA[0] * TempB[0];
777 C[1+4*n+j*lda] += TempA[0] * TempB[1];
778 C[2+4*n+j*lda] += TempA[0] * TempB[2];
779 C[3+4*n+j*lda] += TempA[0] * TempB[3];
780
781
782 }
783 }
784 }
785 }
786 */
787
788
789
790 //-------------------------------------------------------------version2.3, read 8 elements in B at one time. make k to 2. 150k mi 128k msi. worse than v2.0
791 /*
792 static __thread int i, j, k, m, n;
793 static __thread data_t TempA[2];
794 static __thread data_t TempB[8];
795
796 if(coreid == 0)
797 {
798 for ( j = 0; j < lda; j+=2 )
799 {
800 for ( m = 0; m < 16; m++ )
801 {
802 TempA[0] = A[j*lda + 0 + 2*m];
803 TempA[1] = A[j*lda + 1 + 2*m];
804 for( n = 0; n < 4; n++)
805 {
806
807 TempB[0] = B[2*m*lda+0+8*n];
808 TempB[1] = B[2*m*lda+1+8*n];
809 TempB[2] = B[2*m*lda+2+8*n];
810 TempB[3] = B[2*m*lda+3+8*n];
811 TempB[4] = B[2*m*lda+4+8*n];
812 TempB[5] = B[2*m*lda+5+8*n];
813 TempB[6] = B[2*m*lda+6+8*n];
814 TempB[7] = B[2*m*lda+7+8*n];
815
816 C[0+8*n+j*lda] += TempA[0] * TempB[0];
817 C[1+8*n+j*lda] += TempA[0] * TempB[1];
818 C[2+8*n+j*lda] += TempA[0] * TempB[2];
819 C[3+8*n+j*lda] += TempA[0] * TempB[3];
820 C[4+8*n+j*lda] += TempA[0] * TempB[4];
821 C[5+8*n+j*lda] += TempA[0] * TempB[5];
822 C[6+8*n+j*lda] += TempA[0] * TempB[6];
823 C[7+8*n+j*lda] += TempA[0] * TempB[7];
824
825 TempB[0] = B[(1+2*m)*lda+0+8*n];
826 TempB[1] = B[(1+2*m)*lda+1+8*n];
827 TempB[2] = B[(1+2*m)*lda+2+8*n];
828 TempB[3] = B[(1+2*m)*lda+3+8*n];
829 TempB[4] = B[(1+2*m)*lda+4+8*n];
830 TempB[5] = B[(1+2*m)*lda+5+8*n];
831 TempB[6] = B[(1+2*m)*lda+6+8*n];
832 TempB[7] = B[(1+2*m)*lda+7+8*n];
833
834 C[0+8*n+j*lda] += TempA[1] * TempB[0];
835 C[1+8*n+j*lda] += TempA[1] * TempB[1];
836 C[2+8*n+j*lda] += TempA[1] * TempB[2];
837 C[3+8*n+j*lda] += TempA[1] * TempB[3];
838 C[4+8*n+j*lda] += TempA[1] * TempB[4];
839 C[5+8*n+j*lda] += TempA[1] * TempB[5];
840 C[6+8*n+j*lda] += TempA[1] * TempB[6];
841 C[7+8*n+j*lda] += TempA[1] * TempB[7];
842
843 }
844
845 }
846 }
847 }
848
849 if(coreid == 1)
850 {
851 for ( j = 1; j < lda; j+=2 )
852 {
853 for ( m = 0; m < 16; m++ )
854 {
855 TempA[0] = A[j*lda + 0 + 2*m];
856 TempA[1] = A[j*lda + 1 + 2*m];
857 for( n = 0; n < 4; n++)
858 {
859
860 TempB[0] = B[2*m*lda+0+8*n];
861 TempB[1] = B[2*m*lda+1+8*n];
862 TempB[2] = B[2*m*lda+2+8*n];
863 TempB[3] = B[2*m*lda+3+8*n];
864 TempB[4] = B[2*m*lda+4+8*n];
865 TempB[5] = B[2*m*lda+5+8*n];
866 TempB[6] = B[2*m*lda+6+8*n];
867 TempB[7] = B[2*m*lda+7+8*n];
868
869 C[0+8*n+j*lda] += TempA[0] * TempB[0];
870 C[1+8*n+j*lda] += TempA[0] * TempB[1];
871 C[2+8*n+j*lda] += TempA[0] * TempB[2];
872 C[3+8*n+j*lda] += TempA[0] * TempB[3];
873 C[4+8*n+j*lda] += TempA[0] * TempB[4];
874 C[5+8*n+j*lda] += TempA[0] * TempB[5];
875 C[6+8*n+j*lda] += TempA[0] * TempB[6];
876 C[7+8*n+j*lda] += TempA[0] * TempB[7];
877
878 TempB[0] = B[(1+2*m)*lda+0+8*n];
879 TempB[1] = B[(1+2*m)*lda+1+8*n];
880 TempB[2] = B[(1+2*m)*lda+2+8*n];
881 TempB[3] = B[(1+2*m)*lda+3+8*n];
882 TempB[4] = B[(1+2*m)*lda+4+8*n];
883 TempB[5] = B[(1+2*m)*lda+5+8*n];
884 TempB[6] = B[(1+2*m)*lda+6+8*n];
885 TempB[7] = B[(1+2*m)*lda+7+8*n];
886
887 C[0+8*n+j*lda] += TempA[1] * TempB[0];
888 C[1+8*n+j*lda] += TempA[1] * TempB[1];
889 C[2+8*n+j*lda] += TempA[1] * TempB[2];
890 C[3+8*n+j*lda] += TempA[1] * TempB[3];
891 C[4+8*n+j*lda] += TempA[1] * TempB[4];
892 C[5+8*n+j*lda] += TempA[1] * TempB[5];
893 C[6+8*n+j*lda] += TempA[1] * TempB[6];
894 C[7+8*n+j*lda] += TempA[1] * TempB[7];
895
896 }
897
898 }
899 }
900 }
901 */
902 //-------------------------------------------------------------version2.4, read 4 170k and 16 140k, error because not enough space elements in B at one time.
903 /*
904 static __thread int i, j, k, m, n;
905 static __thread data_t TempA;
906 static __thread data_t TempB[16];
907
908 if(coreid == 0)
909 {
910 for ( j = 0; j < lda; j+=2 )
911 {
912 for ( k = 0; k < lda; k++ )
913 {
914 TempA = A[j*lda + k];
915 for( n = 0; n < 2; n++)
916 {
917
918 TempB[0] = B[k*lda+0+16*n];
919 TempB[1] = B[k*lda+1+16*n];
920 TempB[2] = B[k*lda+2+16*n];
921 TempB[3] = B[k*lda+3+16*n];
922 TempB[4] = B[k*lda+4+16*n];
923 TempB[5] = B[k*lda+5+16*n];
924 TempB[6] = B[k*lda+6+16*n];
925 TempB[7] = B[k*lda+7+16*n];
926 TempB[8] = B[k*lda+8+16*n];
927 TempB[9] = B[k*lda+9+16*n];
928 TempB[10] = B[k*lda+10+16*n];
929 TempB[11] = B[k*lda+11+16*n];
930 TempB[12] = B[k*lda+12+16*n];
931 TempB[13] = B[k*lda+13+16*n];
932 TempB[14] = B[k*lda+14+16*n];
933 TempB[15] = B[k*lda+15+16*n];
934
935
936 C[0+16*n+j*lda] += TempA * TempB[0];
937 C[1+16*n+j*lda] += TempA * TempB[1];
938 C[2+16*n+j*lda] += TempA * TempB[2];
939 C[3+16*n+j*lda] += TempA * TempB[3];
940 C[4+16*n+j*lda] += TempA * TempB[4];
941 C[5+16*n+j*lda] += TempA * TempB[5];
942 C[6+16*n+j*lda] += TempA * TempB[6];
943 C[7+16*n+j*lda] += TempA * TempB[7];
944 C[8+16*n+j*lda] += TempA * TempB[8];
945 C[9+16*n+j*lda] += TempA * TempB[9];
946 C[10+16*n+j*lda] += TempA * TempB[10];
947 C[11+16*n+j*lda] += TempA * TempB[11];
948 C[12+16*n+j*lda] += TempA * TempB[12];
949 C[13+16*n+j*lda] += TempA * TempB[13];
950 C[14+16*n+j*lda] += TempA * TempB[14];
951 C[15+16*n+j*lda] += TempA * TempB[15];
952
953
954
955 }
956
957 }
958 }
959 }
960 if(coreid == 1)
961 {
962 for ( j = 1; j < lda; j+=2 )
963 {
964 for ( k = 0; k < lda; k++ )
965 {
966 TempA = A[j*lda + k];
967 for( n = 0; n < 2; n++)
968 {
969
970 TempB[0] = B[k*lda+0+16*n];
971 TempB[1] = B[k*lda+1+16*n];
972 TempB[2] = B[k*lda+2+16*n];
973 TempB[3] = B[k*lda+3+16*n];
974 TempB[4] = B[k*lda+4+16*n];
975 TempB[5] = B[k*lda+5+16*n];
976 TempB[6] = B[k*lda+6+16*n];
977 TempB[7] = B[k*lda+7+16*n];
978 TempB[8] = B[k*lda+8+16*n];
979 TempB[9] = B[k*lda+9+16*n];
980 TempB[10] = B[k*lda+10+16*n];
981 TempB[11] = B[k*lda+11+16*n];
982 TempB[12] = B[k*lda+12+16*n];
983 TempB[13] = B[k*lda+13+16*n];
984 TempB[14] = B[k*lda+14+16*n];
985 TempB[15] = B[k*lda+15+16*n];
986
987
988 C[0+16*n+j*lda] += TempA * TempB[0];
989 C[1+16*n+j*lda] += TempA * TempB[1];
990 C[2+16*n+j*lda] += TempA * TempB[2];
991 C[3+16*n+j*lda] += TempA * TempB[3];
992 C[4+16*n+j*lda] += TempA * TempB[4];
993 C[5+16*n+j*lda] += TempA * TempB[5];
994 C[6+16*n+j*lda] += TempA * TempB[6];
995 C[7+16*n+j*lda] += TempA * TempB[7];
996 C[8+16*n+j*lda] += TempA * TempB[8];
997 C[9+16*n+j*lda] += TempA * TempB[9];
998 C[10+16*n+j*lda] += TempA * TempB[10];
999 C[11+16*n+j*lda] += TempA * TempB[11];
1000 C[12+16*n+j*lda] += TempA * TempB[12];
1001 C[13+16*n+j*lda] += TempA * TempB[13];
1002 C[14+16*n+j*lda] += TempA * TempB[14];
1003 C[15+16*n+j*lda] += TempA * TempB[15];
1004
1005
1006
1007 }
1008
1009 }
1010 }
1011 }
1012
1013 */
1014 //-------------------------------------------------------------version2.5, read 10 elements in B at one time. has corner cases. Turns out it hangs.
1015 /*
1016 static __thread int j, k, n;
1017 static __thread data_t TempA;
1018 static __thread data_t TempB[10];
1019
1020 if(coreid == 0)
1021 {
1022 for ( j = 0; j < lda; j+=2 )
1023 {
1024 for ( k = 0; k < lda; k++ )
1025 {
1026 TempA = A[j*lda + k];
1027 for( n = 0; n < 3; n++)
1028 {
1029 TempB[0] = B[k*lda+0+10*n];
1030 TempB[1] = B[k*lda+1+10*n];
1031 TempB[2] = B[k*lda+2+10*n];
1032 TempB[3] = B[k*lda+3+10*n];
1033 TempB[4] = B[k*lda+4+10*n];
1034 TempB[5] = B[k*lda+5+10*n];
1035 TempB[6] = B[k*lda+6+10*n];
1036 TempB[7] = B[k*lda+7+10*n];
1037 TempB[8] = B[k*lda+8+10*n];
1038 TempB[9] = B[k*lda+9+10*n];
1039
1040 C[0+10*n+j*lda] += TempA * TempB[0];
1041 C[1+10*n+j*lda] += TempA * TempB[1];
1042 C[2+10*n+j*lda] += TempA * TempB[2];
1043 C[3+10*n+j*lda] += TempA * TempB[3];
1044 C[4+10*n+j*lda] += TempA * TempB[4];
1045 C[5+10*n+j*lda] += TempA * TempB[5];
1046 C[6+10*n+j*lda] += TempA * TempB[6];
1047 C[7+10*n+j*lda] += TempA * TempB[7];
1048 C[8+10*n+j*lda] += TempA * TempB[8];
1049 C[9+10*n+j*lda] += TempA * TempB[9];
1050 }
1051 TempB[0] = B[k*lda+30];
1052 TempB[1] = B[k*lda+31];
1053 C[30+j*lda] += TempA * TempB[0];
1054 C[31+j*lda] += TempA * TempB[1];
1055 }
1056 }
1057 }
1058 if(coreid == 1)
1059 {
1060 for ( j = 1; j < lda; j+=2 )
1061 {
1062 for ( k = 0; k < lda; k++ )
1063 {
1064 TempA = A[j*lda + k];
1065 for( n = 0; n < 3; n++)
1066 {
1067 TempB[0] = B[k*lda+0+10*n];
1068 TempB[1] = B[k*lda+1+10*n];
1069 TempB[2] = B[k*lda+2+10*n];
1070 TempB[3] = B[k*lda+3+10*n];
1071 TempB[4] = B[k*lda+4+10*n];
1072 TempB[5] = B[k*lda+5+10*n];
1073 TempB[6] = B[k*lda+6+10*n];
1074 TempB[7] = B[k*lda+7+10*n];
1075 TempB[8] = B[k*lda+8+10*n];
1076 TempB[9] = B[k*lda+9+10*n];
1077
1078 C[0+10*n+j*lda] += TempA * TempB[0];
1079 C[1+10*n+j*lda] += TempA * TempB[1];
1080 C[2+10*n+j*lda] += TempA * TempB[2];
1081 C[3+10*n+j*lda] += TempA * TempB[3];
1082 C[4+10*n+j*lda] += TempA * TempB[4];
1083 C[5+10*n+j*lda] += TempA * TempB[5];
1084 C[6+10*n+j*lda] += TempA * TempB[6];
1085 C[7+10*n+j*lda] += TempA * TempB[7];
1086 C[8+10*n+j*lda] += TempA * TempB[8];
1087 C[9+10*n+j*lda] += TempA * TempB[9];
1088 }
1089 TempB[0] = B[k*lda+30];
1090 TempB[1] = B[k*lda+31];
1091 C[30+j*lda] += TempA * TempB[0];
1092 C[31+j*lda] += TempA * TempB[1];
1093 }
1094 }
1095 }
1096
1097 */
1098
1099 //-------------------------------------------------------------version2.6, optimize 2.0. take off n loop and tried different order of reading B
1100 /*
1101 static __thread int j, k, n;
1102 static __thread data_t TempA;
1103 static __thread data_t TempB[8];
1104
1105 if(coreid == 0)
1106 {
1107 for ( j = 0; j < lda; j+=2 )
1108 {
1109 for ( k = 0; k < lda; k++ )
1110 {
1111 TempA = A[j*lda + k];
1112
1113 TempB[0] = B[k*lda+0];
1114 TempB[1] = B[k*lda+1];
1115 TempB[2] = B[k*lda+2];
1116 TempB[3] = B[k*lda+3];
1117 TempB[4] = B[k*lda+4];
1118 TempB[5] = B[k*lda+5];
1119 TempB[6] = B[k*lda+6];
1120 TempB[7] = B[k*lda+7];
1121
1122 C[0+j*lda] += TempA * TempB[0];
1123 C[1+j*lda] += TempA * TempB[1];
1124 C[2+j*lda] += TempA * TempB[2];
1125 C[3+j*lda] += TempA * TempB[3];
1126 C[4+j*lda] += TempA * TempB[4];
1127 C[5+j*lda] += TempA * TempB[5];
1128 C[6+j*lda] += TempA * TempB[6];
1129 C[7+j*lda] += TempA * TempB[7];
1130
1131 TempB[0] = B[k*lda+8];
1132 TempB[1] = B[k*lda+9];
1133 TempB[2] = B[k*lda+10];
1134 TempB[3] = B[k*lda+11];
1135 TempB[4] = B[k*lda+12];
1136 TempB[5] = B[k*lda+13];
1137 TempB[6] = B[k*lda+14];
1138 TempB[7] = B[k*lda+15];
1139
1140 C[8+j*lda] += TempA * TempB[0];
1141 C[9+j*lda] += TempA * TempB[1];
1142 C[10+j*lda] += TempA * TempB[2];
1143 C[11+j*lda] += TempA * TempB[3];
1144 C[12+j*lda] += TempA * TempB[4];
1145 C[13+j*lda] += TempA * TempB[5];
1146 C[14+j*lda] += TempA * TempB[6];
1147 C[15+j*lda] += TempA * TempB[7];
1148
1149 TempB[0] = B[k*lda+16];
1150 TempB[1] = B[k*lda+17];
1151 TempB[2] = B[k*lda+18];
1152 TempB[3] = B[k*lda+19];
1153 TempB[4] = B[k*lda+20];
1154 TempB[5] = B[k*lda+21];
1155 TempB[6] = B[k*lda+22];
1156 TempB[7] = B[k*lda+23];
1157
1158 C[16+j*lda] += TempA * TempB[0];
1159 C[17+j*lda] += TempA * TempB[1];
1160 C[18+j*lda] += TempA * TempB[2];
1161 C[19+j*lda] += TempA * TempB[3];
1162 C[20+j*lda] += TempA * TempB[4];
1163 C[21+j*lda] += TempA * TempB[5];
1164 C[22+j*lda] += TempA * TempB[6];
1165 C[23+j*lda] += TempA * TempB[7];
1166
1167 TempB[0] = B[k*lda+24];
1168 TempB[1] = B[k*lda+25];
1169 TempB[2] = B[k*lda+26];
1170 TempB[3] = B[k*lda+27];
1171 TempB[4] = B[k*lda+28];
1172 TempB[5] = B[k*lda+29];
1173 TempB[6] = B[k*lda+30];
1174 TempB[7] = B[k*lda+31];
1175
1176 C[24+j*lda] += TempA * TempB[0];
1177 C[25+j*lda] += TempA * TempB[1];
1178 C[26+j*lda] += TempA * TempB[2];
1179 C[27+j*lda] += TempA * TempB[3];
1180 C[28+j*lda] += TempA * TempB[4];
1181 C[29+j*lda] += TempA * TempB[5];
1182 C[30+j*lda] += TempA * TempB[6];
1183 C[31+j*lda] += TempA * TempB[7];
1184
1185
1186
1187 }
1188 }
1189 }
1190
1191 if(coreid == 1)
1192 {
1193 for ( j = 1; j < lda; j+=2 )
1194 {
1195 for ( k = 0; k < lda; k++ )
1196 {
1197 TempA = A[j*lda + k];
1198
1199
1200 TempB[0] = B[k*lda+24];
1201 TempB[1] = B[k*lda+25];
1202 TempB[2] = B[k*lda+26];
1203 TempB[3] = B[k*lda+27];
1204 TempB[4] = B[k*lda+28];
1205 TempB[5] = B[k*lda+29];
1206 TempB[6] = B[k*lda+30];
1207 TempB[7] = B[k*lda+31];
1208
1209 C[24+j*lda] += TempA * TempB[0];
1210 C[25+j*lda] += TempA * TempB[1];
1211 C[26+j*lda] += TempA * TempB[2];
1212 C[27+j*lda] += TempA * TempB[3];
1213 C[28+j*lda] += TempA * TempB[4];
1214 C[29+j*lda] += TempA * TempB[5];
1215 C[30+j*lda] += TempA * TempB[6];
1216 C[31+j*lda] += TempA * TempB[7];
1217
1218 TempB[0] = B[k*lda+0];
1219 TempB[1] = B[k*lda+1];
1220 TempB[2] = B[k*lda+2];
1221 TempB[3] = B[k*lda+3];
1222 TempB[4] = B[k*lda+4];
1223 TempB[5] = B[k*lda+5];
1224 TempB[6] = B[k*lda+6];
1225 TempB[7] = B[k*lda+7];
1226
1227 C[0+j*lda] += TempA * TempB[0];
1228 C[1+j*lda] += TempA * TempB[1];
1229 C[2+j*lda] += TempA * TempB[2];
1230 C[3+j*lda] += TempA * TempB[3];
1231 C[4+j*lda] += TempA * TempB[4];
1232 C[5+j*lda] += TempA * TempB[5];
1233 C[6+j*lda] += TempA * TempB[6];
1234 C[7+j*lda] += TempA * TempB[7];
1235
1236 TempB[0] = B[k*lda+8];
1237 TempB[1] = B[k*lda+9];
1238 TempB[2] = B[k*lda+10];
1239 TempB[3] = B[k*lda+11];
1240 TempB[4] = B[k*lda+12];
1241 TempB[5] = B[k*lda+13];
1242 TempB[6] = B[k*lda+14];
1243 TempB[7] = B[k*lda+15];
1244
1245 C[8+j*lda] += TempA * TempB[0];
1246 C[9+j*lda] += TempA * TempB[1];
1247 C[10+j*lda] += TempA * TempB[2];
1248 C[11+j*lda] += TempA * TempB[3];
1249 C[12+j*lda] += TempA * TempB[4];
1250 C[13+j*lda] += TempA * TempB[5];
1251 C[14+j*lda] += TempA * TempB[6];
1252 C[15+j*lda] += TempA * TempB[7];
1253
1254 TempB[0] = B[k*lda+16];
1255 TempB[1] = B[k*lda+17];
1256 TempB[2] = B[k*lda+18];
1257 TempB[3] = B[k*lda+19];
1258 TempB[4] = B[k*lda+20];
1259 TempB[5] = B[k*lda+21];
1260 TempB[6] = B[k*lda+22];
1261 TempB[7] = B[k*lda+23];
1262
1263 C[16+j*lda] += TempA * TempB[0];
1264 C[17+j*lda] += TempA * TempB[1];
1265 C[18+j*lda] += TempA * TempB[2];
1266 C[19+j*lda] += TempA * TempB[3];
1267 C[20+j*lda] += TempA * TempB[4];
1268 C[21+j*lda] += TempA * TempB[5];
1269 C[22+j*lda] += TempA * TempB[6];
1270 C[23+j*lda] += TempA * TempB[7];
1271
1272
1273
1274
1275
1276
1277 }
1278 }
1279 }
1280 */
1281 //-------------------------------------------------------------version2.7, use m=l*da, i=k*lda,out of stack, only i, MI 150k, only m, MSI 117.9k slower than v2.0
1282 /*
1283 static __thread int i, j, k, m, n;
1284 static __thread data_t TempA;
1285 static __thread data_t TempB[8];
1286
1287 if(coreid == 0)
1288 {
1289 for ( j = 0; j < lda; j+=2 )
1290 {
1291 m = j * lda;
1292 for ( k = 0; k < lda; k++ )
1293 {
1294 TempA = A[m+ k];
1295 for( n = 0; n < 4; n++)
1296 {
1297
1298 TempB[0] = B[k *lda+0+8*n];
1299 TempB[1] = B[k *lda+1+8*n];
1300 TempB[2] = B[k *lda+2+8*n];
1301 TempB[3] = B[k *lda+3+8*n];
1302 TempB[4] = B[k *lda+4+8*n];
1303 TempB[5] = B[k *lda+5+8*n];
1304 TempB[6] = B[k *lda+6+8*n];
1305 TempB[7] = B[k *lda+7+8*n];
1306
1307 C[0+8*n+m] += TempA * TempB[0];
1308 C[1+8*n+m] += TempA * TempB[1];
1309 C[2+8*n+m] += TempA * TempB[2];
1310 C[3+8*n+m] += TempA * TempB[3];
1311 C[4+8*n+m] += TempA * TempB[4];
1312 C[5+8*n+m] += TempA * TempB[5];
1313 C[6+8*n+m] += TempA * TempB[6];
1314 C[7+8*n+m] += TempA * TempB[7];
1315
1316 }
1317
1318 }
1319 }
1320 }
1321 if(coreid == 1)
1322 {
1323 for ( j = 1; j < lda; j+=2 )
1324 {
1325 m = j * lda;
1326 for ( k = 0; k < lda; k++ )
1327 {
1328 TempA = A[m+ k];
1329 for( n = 0; n < 4; n++)
1330 {
1331
1332 TempB[0] = B[k *lda+0+8*n];
1333 TempB[1] = B[k *lda+1+8*n];
1334 TempB[2] = B[k *lda+2+8*n];
1335 TempB[3] = B[k *lda+3+8*n];
1336 TempB[4] = B[k *lda+4+8*n];
1337 TempB[5] = B[k *lda+5+8*n];
1338 TempB[6] = B[k *lda+6+8*n];
1339 TempB[7] = B[k *lda+7+8*n];
1340
1341 C[0+8*n+m] += TempA * TempB[0];
1342 C[1+8*n+m] += TempA * TempB[1];
1343 C[2+8*n+m] += TempA * TempB[2];
1344 C[3+8*n+m] += TempA * TempB[3];
1345 C[4+8*n+m] += TempA * TempB[4];
1346 C[5+8*n+m] += TempA * TempB[5];
1347 C[6+8*n+m] += TempA * TempB[6];
1348 C[7+8*n+m] += TempA * TempB[7];
1349
1350 }
1351
1352 }
1353 }
1354 }
1355 */
1356 //-------------------------------------------------------------version2.8 deal with false sharing, MSI,118K vs v2.0 117.0K. MI 147.629K.
1357 /*
1358 static __thread int i, j, k, m, n;
1359 static __thread data_t TempA;
1360 static __thread data_t TempB[8];
1361
1362 if(coreid == 0)
1363 {
1364 for ( j = 0; j < lda; j+=2 )
1365 {
1366 for ( k = 0; k < lda; k++ )
1367 {
1368 TempA = A[j*lda + k];
1369 for( n = 0; n < 2; n++)
1370 {
1371
1372 TempB[0] = B[k*lda+0+16*n];
1373 TempB[1] = B[k*lda+1+16*n];
1374 TempB[2] = B[k*lda+2+16*n];
1375 TempB[3] = B[k*lda+3+16*n];
1376 TempB[4] = B[k*lda+4+16*n];
1377 TempB[5] = B[k*lda+5+16*n];
1378 TempB[6] = B[k*lda+6+16*n];
1379 TempB[7] = B[k*lda+7+16*n];
1380
1381
1382
1383 C[0+16*n+j*lda] += TempA * TempB[0];
1384 C[1+16*n+j*lda] += TempA * TempB[1];
1385 C[2+16*n+j*lda] += TempA * TempB[2];
1386 C[3+16*n+j*lda] += TempA * TempB[3];
1387 C[4+16*n+j*lda] += TempA * TempB[4];
1388 C[5+16*n+j*lda] += TempA * TempB[5];
1389 C[6+16*n+j*lda] += TempA * TempB[6];
1390 C[7+16*n+j*lda] += TempA * TempB[7];
1391
1392 TempB[0] = B[k*lda+8+16*n];
1393 TempB[1] = B[k*lda+9+16*n];
1394 TempB[2] = B[k*lda+10+16*n];
1395 TempB[3] = B[k*lda+11+16*n];
1396 TempB[4] = B[k*lda+12+16*n];
1397 TempB[5] = B[k*lda+13+16*n];
1398 TempB[6] = B[k*lda+14+16*n];
1399 TempB[7] = B[k*lda+15+16*n];
1400
1401 C[8+16*n+j*lda] += TempA * TempB[0];
1402 C[9+16*n+j*lda] += TempA * TempB[1];
1403 C[10+16*n+j*lda] += TempA * TempB[2];
1404 C[11+16*n+j*lda] += TempA * TempB[3];
1405 C[12+16*n+j*lda] += TempA * TempB[4];
1406 C[13+16*n+j*lda] += TempA * TempB[5];
1407 C[14+16*n+j*lda] += TempA * TempB[6];
1408 C[15+16*n+j*lda] += TempA * TempB[7];
1409
1410
1411
1412 }
1413
1414 }
1415 }
1416 }
1417 if(coreid == 1)
1418 {
1419 for ( j = 1; j < lda; j+=2 )
1420 {
1421 for ( k = 0; k < lda; k++ )
1422 {
1423 TempA = A[j*lda + k];
1424 for( n = 0; n < 2; n++)
1425 {
1426
1427
1428
1429 TempB[0] = B[k*lda+8+16*n];
1430 TempB[1] = B[k*lda+9+16*n];
1431 TempB[2] = B[k*lda+10+16*n];
1432 TempB[3] = B[k*lda+11+16*n];
1433 TempB[4] = B[k*lda+12+16*n];
1434 TempB[5] = B[k*lda+13+16*n];
1435 TempB[6] = B[k*lda+14+16*n];
1436 TempB[7] = B[k*lda+15+16*n];
1437
1438 C[8+16*n+j*lda] += TempA * TempB[0];
1439 C[9+16*n+j*lda] += TempA * TempB[1];
1440 C[10+16*n+j*lda] += TempA * TempB[2];
1441 C[11+16*n+j*lda] += TempA * TempB[3];
1442 C[12+16*n+j*lda] += TempA * TempB[4];
1443 C[13+16*n+j*lda] += TempA * TempB[5];
1444 C[14+16*n+j*lda] += TempA * TempB[6];
1445 C[15+16*n+j*lda] += TempA * TempB[7];
1446
1447 TempB[0] = B[k*lda+0+16*n];
1448 TempB[1] = B[k*lda+1+16*n];
1449 TempB[2] = B[k*lda+2+16*n];
1450 TempB[3] = B[k*lda+3+16*n];
1451 TempB[4] = B[k*lda+4+16*n];
1452 TempB[5] = B[k*lda+5+16*n];
1453 TempB[6] = B[k*lda+6+16*n];
1454 TempB[7] = B[k*lda+7+16*n];
1455
1456
1457
1458 C[0+16*n+j*lda] += TempA * TempB[0];
1459 C[1+16*n+j*lda] += TempA * TempB[1];
1460 C[2+16*n+j*lda] += TempA * TempB[2];
1461 C[3+16*n+j*lda] += TempA * TempB[3];
1462 C[4+16*n+j*lda] += TempA * TempB[4];
1463 C[5+16*n+j*lda] += TempA * TempB[5];
1464 C[6+16*n+j*lda] += TempA * TempB[6];
1465 C[7+16*n+j*lda] += TempA * TempB[7];
1466
1467
1468 }
1469
1470 }
1471 }
1472 }
1473 */
1474
1475 //----------------------------------------------------------------version 2.11 optmize j,use core 1 j from 0 to 15 MSI 98k i = j*lda
1476 //----------------------------------------------------------------version 2.12 not use i = j *lda MSI 95k
1477 static __thread data_t TempA[8];
1478 static __thread data_t TempB[8];
1479 static __thread int j,m,n,i,k;
1480
1481 if(coreid == 1)
1482 {
1483 for ( j = 16; j < 32; j++ )
1484 {
1485
1486 for ( m = 0; m < 4; m++ )
1487 {
1488
1489 TempA[0] = A[j*lda+0+8*m];
1490 TempA[1] = A[j*lda+1+8*m];
1491 TempA[2] = A[j*lda+2+8*m];
1492 TempA[3] = A[j*lda+3+8*m];
1493 TempA[4] = A[j*lda+4+8*m];
1494 TempA[5] = A[j*lda+5+8*m];
1495 TempA[6] = A[j*lda+6+8*m];
1496 TempA[7] = A[j*lda+7+8*m];
1497
1498 for( n = 0; n < 4; n++)
1499 {
1500 /*
1501 i = j*lda;
1502
1503 TempB[0] = B[(0+8*m)*lda+0+8*n];
1504 TempB[1] = B[(0+8*m)*lda+1+8*n];
1505 TempB[2] = B[(0+8*m)*lda+2+8*n];
1506 TempB[3] = B[(0+8*m)*lda+3+8*n];
1507 TempB[4] = B[(0+8*m)*lda+4+8*n];
1508 TempB[5] = B[(0+8*m)*lda+5+8*n];
1509 TempB[6] = B[(0+8*m)*lda+6+8*n];
1510 TempB[7] = B[(0+8*m)*lda+7+8*n];
1511
1512 C[0+8*n+i] += TempA[0] * TempB[0];
1513 C[1+8*n+i] += TempA[0] * TempB[1];
1514 C[2+8*n+i] += TempA[0] * TempB[2];
1515 C[3+8*n+i] += TempA[0] * TempB[3];
1516 C[4+8*n+i] += TempA[0] * TempB[4];
1517 C[5+8*n+i] += TempA[0] * TempB[5];
1518 C[6+8*n+i] += TempA[0] * TempB[6];
1519 C[7+8*n+i] += TempA[0] * TempB[7];
1520
1521
1522
1523 TempB[0] = B[(1+8*m)*lda+0+8*n];
1524 TempB[1] = B[(1+8*m)*lda+1+8*n];
1525 TempB[2] = B[(1+8*m)*lda+2+8*n];
1526 TempB[3] = B[(1+8*m)*lda+3+8*n];
1527 TempB[4] = B[(1+8*m)*lda+4+8*n];
1528 TempB[5] = B[(1+8*m)*lda+5+8*n];
1529 TempB[6] = B[(1+8*m)*lda+6+8*n];
1530 TempB[7] = B[(1+8*m)*lda+7+8*n];
1531
1532 C[0+8*n+i] += TempA[1] * TempB[0];
1533 C[1+8*n+i] += TempA[1] * TempB[1];
1534 C[2+8*n+i] += TempA[1] * TempB[2];
1535 C[3+8*n+i] += TempA[1] * TempB[3];
1536 C[4+8*n+i] += TempA[1] * TempB[4];
1537 C[5+8*n+i] += TempA[1] * TempB[5];
1538 C[6+8*n+i] += TempA[1] * TempB[6];
1539 C[7+8*n+i] += TempA[1] * TempB[7];
1540
1541
1542
1543 TempB[0] = B[(2+8*m)*lda+0+8*n];
1544 TempB[1] = B[(2+8*m)*lda+1+8*n];
1545 TempB[2] = B[(2+8*m)*lda+2+8*n];
1546 TempB[3] = B[(2+8*m)*lda+3+8*n];
1547 TempB[4] = B[(2+8*m)*lda+4+8*n];
1548 TempB[5] = B[(2+8*m)*lda+5+8*n];
1549 TempB[6] = B[(2+8*m)*lda+6+8*n];
1550 TempB[7] = B[(2+8*m)*lda+7+8*n];
1551
1552 C[0+8*n+i] += TempA[2] * TempB[0];
1553 C[1+8*n+i] += TempA[2] * TempB[1];
1554 C[2+8*n+i] += TempA[2] * TempB[2];
1555 C[3+8*n+i] += TempA[2] * TempB[3];
1556 C[4+8*n+i] += TempA[2] * TempB[4];
1557 C[5+8*n+i] += TempA[2] * TempB[5];
1558 C[6+8*n+i] += TempA[2] * TempB[6];
1559 C[7+8*n+i] += TempA[2] * TempB[7];
1560
1561
1562
1563 TempB[0] = B[(3+8*m)*lda+0+8*n];
1564 TempB[1] = B[(3+8*m)*lda+1+8*n];
1565 TempB[2] = B[(3+8*m)*lda+2+8*n];
1566 TempB[3] = B[(3+8*m)*lda+3+8*n];
1567 TempB[4] = B[(3+8*m)*lda+4+8*n];
1568 TempB[5] = B[(3+8*m)*lda+5+8*n];
1569 TempB[6] = B[(3+8*m)*lda+6+8*n];
1570 TempB[7] = B[(3+8*m)*lda+7+8*n];
1571
1572 C[0+8*n+i] += TempA[3] * TempB[0];
1573 C[1+8*n+i] += TempA[3] * TempB[1];
1574 C[2+8*n+i] += TempA[3] * TempB[2];
1575 C[3+8*n+i] += TempA[3] * TempB[3];
1576 C[4+8*n+i] += TempA[3] * TempB[4];
1577 C[5+8*n+i] += TempA[3] * TempB[5];
1578 C[6+8*n+i] += TempA[3] * TempB[6];
1579 C[7+8*n+i] += TempA[3] * TempB[7];
1580
1581
1582 TempB[0] = B[(4+8*m)*lda+0+8*n];
1583 TempB[1] = B[(4+8*m)*lda+1+8*n];
1584 TempB[2] = B[(4+8*m)*lda+2+8*n];
1585 TempB[3] = B[(4+8*m)*lda+3+8*n];
1586 TempB[4] = B[(4+8*m)*lda+4+8*n];
1587 TempB[5] = B[(4+8*m)*lda+5+8*n];
1588 TempB[6] = B[(4+8*m)*lda+6+8*n];
1589 TempB[7] = B[(4+8*m)*lda+7+8*n];
1590
1591 C[0+8*n+i] += TempA[4] * TempB[0];
1592 C[1+8*n+i] += TempA[4] * TempB[1];
1593 C[2+8*n+i] += TempA[4] * TempB[2];
1594 C[3+8*n+i] += TempA[4] * TempB[3];
1595 C[4+8*n+i] += TempA[4] * TempB[4];
1596 C[5+8*n+i] += TempA[4] * TempB[5];
1597 C[6+8*n+i] += TempA[4] * TempB[6];
1598 C[7+8*n+i] += TempA[4] * TempB[7];
1599
1600
1601
1602 TempB[0] = B[(5+8*m)*lda+0+8*n];
1603 TempB[1] = B[(5+8*m)*lda+1+8*n];
1604 TempB[2] = B[(5+8*m)*lda+2+8*n];
1605 TempB[3] = B[(5+8*m)*lda+3+8*n];
1606 TempB[4] = B[(5+8*m)*lda+4+8*n];
1607 TempB[5] = B[(5+8*m)*lda+5+8*n];
1608 TempB[6] = B[(5+8*m)*lda+6+8*n];
1609 TempB[7] = B[(5+8*m)*lda+7+8*n];
1610
1611 C[0+8*n+i] += TempA[5] * TempB[0];
1612 C[1+8*n+i] += TempA[5] * TempB[1];
1613 C[2+8*n+i] += TempA[5] * TempB[2];
1614 C[3+8*n+i] += TempA[5] * TempB[3];
1615 C[4+8*n+i] += TempA[5] * TempB[4];
1616 C[5+8*n+i] += TempA[5] * TempB[5];
1617 C[6+8*n+i] += TempA[5] * TempB[6];
1618 C[7+8*n+i] += TempA[5] * TempB[7];
1619
1620
1621
1622 TempB[0] = B[(6+8*m)*lda+0+8*n];
1623 TempB[1] = B[(6+8*m)*lda+1+8*n];
1624 TempB[2] = B[(6+8*m)*lda+2+8*n];
1625 TempB[3] = B[(6+8*m)*lda+3+8*n];
1626 TempB[4] = B[(6+8*m)*lda+4+8*n];
1627 TempB[5] = B[(6+8*m)*lda+5+8*n];
1628 TempB[6] = B[(6+8*m)*lda+6+8*n];
1629 TempB[7] = B[(6+8*m)*lda+7+8*n];
1630
1631 C[0+8*n+i] += TempA[6] * TempB[0];
1632 C[1+8*n+i] += TempA[6] * TempB[1];
1633 C[2+8*n+i] += TempA[6] * TempB[2];
1634 C[3+8*n+i] += TempA[6] * TempB[3];
1635 C[4+8*n+i] += TempA[6] * TempB[4];
1636 C[5+8*n+i] += TempA[6] * TempB[5];
1637 C[6+8*n+i] += TempA[6] * TempB[6];
1638 C[7+8*n+i] += TempA[6] * TempB[7];
1639
1640
1641 TempB[0] = B[(7+8*m)*lda+0+8*n];
1642 TempB[1] = B[(7+8*m)*lda+1+8*n];
1643 TempB[2] = B[(7+8*m)*lda+2+8*n];
1644 TempB[3] = B[(7+8*m)*lda+3+8*n];
1645 TempB[4] = B[(7+8*m)*lda+4+8*n];
1646 TempB[5] = B[(7+8*m)*lda+5+8*n];
1647 TempB[6] = B[(7+8*m)*lda+6+8*n];
1648 TempB[7] = B[(7+8*m)*lda+7+8*n];
1649
1650 C[0+8*n+i] += TempA[7] * TempB[0];
1651 C[1+8*n+i] += TempA[7] * TempB[1];
1652 C[2+8*n+i] += TempA[7] * TempB[2];
1653 C[3+8*n+i] += TempA[7] * TempB[3];
1654 C[4+8*n+i] += TempA[7] * TempB[4];
1655 C[5+8*n+i] += TempA[7] * TempB[5];
1656 C[6+8*n+i] += TempA[7] * TempB[6];
1657 C[7+8*n+i] += TempA[7] * TempB[7];
1658
1659 */
1660 TempB[0] = B[(0+8*m)*lda+0+8*n];
1661 TempB[1] = B[(0+8*m)*lda+1+8*n];
1662 TempB[2] = B[(0+8*m)*lda+2+8*n];
1663 TempB[3] = B[(0+8*m)*lda+3+8*n];
1664 TempB[4] = B[(0+8*m)*lda+4+8*n];
1665 TempB[5] = B[(0+8*m)*lda+5+8*n];
1666 TempB[6] = B[(0+8*m)*lda+6+8*n];
1667 TempB[7] = B[(0+8*m)*lda+7+8*n];
1668
1669 C[0+8*n+j*lda] += TempA[0] * TempB[0];
1670 C[1+8*n+j*lda] += TempA[0] * TempB[1];
1671 C[2+8*n+j*lda] += TempA[0] * TempB[2];
1672 C[3+8*n+j*lda] += TempA[0] * TempB[3];
1673 C[4+8*n+j*lda] += TempA[0] * TempB[4];
1674 C[5+8*n+j*lda] += TempA[0] * TempB[5];
1675 C[6+8*n+j*lda] += TempA[0] * TempB[6];
1676 C[7+8*n+j*lda] += TempA[0] * TempB[7];
1677
1678
1679
1680 TempB[0] = B[(1+8*m)*lda+0+8*n];
1681 TempB[1] = B[(1+8*m)*lda+1+8*n];
1682 TempB[2] = B[(1+8*m)*lda+2+8*n];
1683 TempB[3] = B[(1+8*m)*lda+3+8*n];
1684 TempB[4] = B[(1+8*m)*lda+4+8*n];
1685 TempB[5] = B[(1+8*m)*lda+5+8*n];
1686 TempB[6] = B[(1+8*m)*lda+6+8*n];
1687 TempB[7] = B[(1+8*m)*lda+7+8*n];
1688
1689 C[0+8*n+j*lda] += TempA[1] * TempB[0];
1690 C[1+8*n+j*lda] += TempA[1] * TempB[1];
1691 C[2+8*n+j*lda] += TempA[1] * TempB[2];
1692 C[3+8*n+j*lda] += TempA[1] * TempB[3];
1693 C[4+8*n+j*lda] += TempA[1] * TempB[4];
1694 C[5+8*n+j*lda] += TempA[1] * TempB[5];
1695 C[6+8*n+j*lda] += TempA[1] * TempB[6];
1696 C[7+8*n+j*lda] += TempA[1] * TempB[7];
1697
1698
1699
1700 TempB[0] = B[(2+8*m)*lda+0+8*n];
1701 TempB[1] = B[(2+8*m)*lda+1+8*n];
1702 TempB[2] = B[(2+8*m)*lda+2+8*n];
1703 TempB[3] = B[(2+8*m)*lda+3+8*n];
1704 TempB[4] = B[(2+8*m)*lda+4+8*n];
1705 TempB[5] = B[(2+8*m)*lda+5+8*n];
1706 TempB[6] = B[(2+8*m)*lda+6+8*n];
1707 TempB[7] = B[(2+8*m)*lda+7+8*n];
1708
1709 C[0+8*n+j*lda] += TempA[2] * TempB[0];
1710 C[1+8*n+j*lda] += TempA[2] * TempB[1];
1711 C[2+8*n+j*lda] += TempA[2] * TempB[2];
1712 C[3+8*n+j*lda] += TempA[2] * TempB[3];
1713 C[4+8*n+j*lda] += TempA[2] * TempB[4];
1714 C[5+8*n+j*lda] += TempA[2] * TempB[5];
1715 C[6+8*n+j*lda] += TempA[2] * TempB[6];
1716 C[7+8*n+j*lda] += TempA[2] * TempB[7];
1717
1718
1719
1720 TempB[0] = B[(3+8*m)*lda+0+8*n];
1721 TempB[1] = B[(3+8*m)*lda+1+8*n];
1722 TempB[2] = B[(3+8*m)*lda+2+8*n];
1723 TempB[3] = B[(3+8*m)*lda+3+8*n];
1724 TempB[4] = B[(3+8*m)*lda+4+8*n];
1725 TempB[5] = B[(3+8*m)*lda+5+8*n];
1726 TempB[6] = B[(3+8*m)*lda+6+8*n];
1727 TempB[7] = B[(3+8*m)*lda+7+8*n];
1728
1729 C[0+8*n+j*lda] += TempA[3] * TempB[0];
1730 C[1+8*n+j*lda] += TempA[3] * TempB[1];
1731 C[2+8*n+j*lda] += TempA[3] * TempB[2];
1732 C[3+8*n+j*lda] += TempA[3] * TempB[3];
1733 C[4+8*n+j*lda] += TempA[3] * TempB[4];
1734 C[5+8*n+j*lda] += TempA[3] * TempB[5];
1735 C[6+8*n+j*lda] += TempA[3] * TempB[6];
1736 C[7+8*n+j*lda] += TempA[3] * TempB[7];
1737
1738
1739 TempB[0] = B[(4+8*m)*lda+0+8*n];
1740 TempB[1] = B[(4+8*m)*lda+1+8*n];
1741 TempB[2] = B[(4+8*m)*lda+2+8*n];
1742 TempB[3] = B[(4+8*m)*lda+3+8*n];
1743 TempB[4] = B[(4+8*m)*lda+4+8*n];
1744 TempB[5] = B[(4+8*m)*lda+5+8*n];
1745 TempB[6] = B[(4+8*m)*lda+6+8*n];
1746 TempB[7] = B[(4+8*m)*lda+7+8*n];
1747
1748 C[0+8*n+j*lda] += TempA[4] * TempB[0];
1749 C[1+8*n+j*lda] += TempA[4] * TempB[1];
1750 C[2+8*n+j*lda] += TempA[4] * TempB[2];
1751 C[3+8*n+j*lda] += TempA[4] * TempB[3];
1752 C[4+8*n+j*lda] += TempA[4] * TempB[4];
1753 C[5+8*n+j*lda] += TempA[4] * TempB[5];
1754 C[6+8*n+j*lda] += TempA[4] * TempB[6];
1755 C[7+8*n+j*lda] += TempA[4] * TempB[7];
1756
1757
1758
1759 TempB[0] = B[(5+8*m)*lda+0+8*n];
1760 TempB[1] = B[(5+8*m)*lda+1+8*n];
1761 TempB[2] = B[(5+8*m)*lda+2+8*n];
1762 TempB[3] = B[(5+8*m)*lda+3+8*n];
1763 TempB[4] = B[(5+8*m)*lda+4+8*n];
1764 TempB[5] = B[(5+8*m)*lda+5+8*n];
1765 TempB[6] = B[(5+8*m)*lda+6+8*n];
1766 TempB[7] = B[(5+8*m)*lda+7+8*n];
1767
1768 C[0+8*n+j*lda] += TempA[5] * TempB[0];
1769 C[1+8*n+j*lda] += TempA[5] * TempB[1];
1770 C[2+8*n+j*lda] += TempA[5] * TempB[2];
1771 C[3+8*n+j*lda] += TempA[5] * TempB[3];
1772 C[4+8*n+j*lda] += TempA[5] * TempB[4];
1773 C[5+8*n+j*lda] += TempA[5] * TempB[5];
1774 C[6+8*n+j*lda] += TempA[5] * TempB[6];
1775 C[7+8*n+j*lda] += TempA[5] * TempB[7];
1776
1777
1778
1779 TempB[0] = B[(6+8*m)*lda+0+8*n];
1780 TempB[1] = B[(6+8*m)*lda+1+8*n];
1781 TempB[2] = B[(6+8*m)*lda+2+8*n];
1782 TempB[3] = B[(6+8*m)*lda+3+8*n];
1783 TempB[4] = B[(6+8*m)*lda+4+8*n];
1784 TempB[5] = B[(6+8*m)*lda+5+8*n];
1785 TempB[6] = B[(6+8*m)*lda+6+8*n];
1786 TempB[7] = B[(6+8*m)*lda+7+8*n];
1787
1788 C[0+8*n+j*lda] += TempA[6] * TempB[0];
1789 C[1+8*n+j*lda] += TempA[6] * TempB[1];
1790 C[2+8*n+j*lda] += TempA[6] * TempB[2];
1791 C[3+8*n+j*lda] += TempA[6] * TempB[3];
1792 C[4+8*n+j*lda] += TempA[6] * TempB[4];
1793 C[5+8*n+j*lda] += TempA[6] * TempB[5];
1794 C[6+8*n+j*lda] += TempA[6] * TempB[6];
1795 C[7+8*n+j*lda] += TempA[6] * TempB[7];
1796
1797
1798 TempB[0] = B[(7+8*m)*lda+0+8*n];
1799 TempB[1] = B[(7+8*m)*lda+1+8*n];
1800 TempB[2] = B[(7+8*m)*lda+2+8*n];
1801 TempB[3] = B[(7+8*m)*lda+3+8*n];
1802 TempB[4] = B[(7+8*m)*lda+4+8*n];
1803 TempB[5] = B[(7+8*m)*lda+5+8*n];
1804 TempB[6] = B[(7+8*m)*lda+6+8*n];
1805 TempB[7] = B[(7+8*m)*lda+7+8*n];
1806
1807 C[0+8*n+j*lda] += TempA[7] * TempB[0];
1808 C[1+8*n+j*lda] += TempA[7] * TempB[1];
1809 C[2+8*n+j*lda] += TempA[7] * TempB[2];
1810 C[3+8*n+j*lda] += TempA[7] * TempB[3];
1811 C[4+8*n+j*lda] += TempA[7] * TempB[4];
1812 C[5+8*n+j*lda] += TempA[7] * TempB[5];
1813 C[6+8*n+j*lda] += TempA[7] * TempB[6];
1814 C[7+8*n+j*lda] += TempA[7] * TempB[7];
1815 }
1816
1817 }
1818 }
1819 }
1820 if(coreid ==0)
1821 {
1822 for ( j = 0; j < 16; j++ )
1823 {
1824
1825 for ( m = 0; m < 4; m++ )
1826 {
1827
1828 TempA[0] = A[j*lda+0+8*m];
1829 TempA[1] = A[j*lda+1+8*m];
1830 TempA[2] = A[j*lda+2+8*m];
1831 TempA[3] = A[j*lda+3+8*m];
1832 TempA[4] = A[j*lda+4+8*m];
1833 TempA[5] = A[j*lda+5+8*m];
1834 TempA[6] = A[j*lda+6+8*m];
1835 TempA[7] = A[j*lda+7+8*m];
1836
1837 for( n = 0; n < 4; n++)
1838 {
1839 /*
1840 i = j*lda;
1841
1842 TempB[0] = B[(0+8*m)*lda+0+8*n];
1843 TempB[1] = B[(0+8*m)*lda+1+8*n];
1844 TempB[2] = B[(0+8*m)*lda+2+8*n];
1845 TempB[3] = B[(0+8*m)*lda+3+8*n];
1846 TempB[4] = B[(0+8*m)*lda+4+8*n];
1847 TempB[5] = B[(0+8*m)*lda+5+8*n];
1848 TempB[6] = B[(0+8*m)*lda+6+8*n];
1849 TempB[7] = B[(0+8*m)*lda+7+8*n];
1850
1851 C[0+8*n+i] += TempA[0] * TempB[0];
1852 C[1+8*n+i] += TempA[0] * TempB[1];
1853 C[2+8*n+i] += TempA[0] * TempB[2];
1854 C[3+8*n+i] += TempA[0] * TempB[3];
1855 C[4+8*n+i] += TempA[0] * TempB[4];
1856 C[5+8*n+i] += TempA[0] * TempB[5];
1857 C[6+8*n+i] += TempA[0] * TempB[6];
1858 C[7+8*n+i] += TempA[0] * TempB[7];
1859
1860
1861
1862 TempB[0] = B[(1+8*m)*lda+0+8*n];
1863 TempB[1] = B[(1+8*m)*lda+1+8*n];
1864 TempB[2] = B[(1+8*m)*lda+2+8*n];
1865 TempB[3] = B[(1+8*m)*lda+3+8*n];
1866 TempB[4] = B[(1+8*m)*lda+4+8*n];
1867 TempB[5] = B[(1+8*m)*lda+5+8*n];
1868 TempB[6] = B[(1+8*m)*lda+6+8*n];
1869 TempB[7] = B[(1+8*m)*lda+7+8*n];
1870
1871 C[0+8*n+i] += TempA[1] * TempB[0];
1872 C[1+8*n+i] += TempA[1] * TempB[1];
1873 C[2+8*n+i] += TempA[1] * TempB[2];
1874 C[3+8*n+i] += TempA[1] * TempB[3];
1875 C[4+8*n+i] += TempA[1] * TempB[4];
1876 C[5+8*n+i] += TempA[1] * TempB[5];
1877 C[6+8*n+i] += TempA[1] * TempB[6];
1878 C[7+8*n+i] += TempA[1] * TempB[7];
1879
1880
1881
1882 TempB[0] = B[(2+8*m)*lda+0+8*n];
1883 TempB[1] = B[(2+8*m)*lda+1+8*n];
1884 TempB[2] = B[(2+8*m)*lda+2+8*n];
1885 TempB[3] = B[(2+8*m)*lda+3+8*n];
1886 TempB[4] = B[(2+8*m)*lda+4+8*n];
1887 TempB[5] = B[(2+8*m)*lda+5+8*n];
1888 TempB[6] = B[(2+8*m)*lda+6+8*n];
1889 TempB[7] = B[(2+8*m)*lda+7+8*n];
1890
1891 C[0+8*n+i] += TempA[2] * TempB[0];
1892 C[1+8*n+i] += TempA[2] * TempB[1];
1893 C[2+8*n+i] += TempA[2] * TempB[2];
1894 C[3+8*n+i] += TempA[2] * TempB[3];
1895 C[4+8*n+i] += TempA[2] * TempB[4];
1896 C[5+8*n+i] += TempA[2] * TempB[5];
1897 C[6+8*n+i] += TempA[2] * TempB[6];
1898 C[7+8*n+i] += TempA[2] * TempB[7];
1899
1900
1901
1902 TempB[0] = B[(3+8*m)*lda+0+8*n];
1903 TempB[1] = B[(3+8*m)*lda+1+8*n];
1904 TempB[2] = B[(3+8*m)*lda+2+8*n];
1905 TempB[3] = B[(3+8*m)*lda+3+8*n];
1906 TempB[4] = B[(3+8*m)*lda+4+8*n];
1907 TempB[5] = B[(3+8*m)*lda+5+8*n];
1908 TempB[6] = B[(3+8*m)*lda+6+8*n];
1909 TempB[7] = B[(3+8*m)*lda+7+8*n];
1910
1911 C[0+8*n+i] += TempA[3] * TempB[0];
1912 C[1+8*n+i] += TempA[3] * TempB[1];
1913 C[2+8*n+i] += TempA[3] * TempB[2];
1914 C[3+8*n+i] += TempA[3] * TempB[3];
1915 C[4+8*n+i] += TempA[3] * TempB[4];
1916 C[5+8*n+i] += TempA[3] * TempB[5];
1917 C[6+8*n+i] += TempA[3] * TempB[6];
1918 C[7+8*n+i] += TempA[3] * TempB[7];
1919
1920
1921 TempB[0] = B[(4+8*m)*lda+0+8*n];
1922 TempB[1] = B[(4+8*m)*lda+1+8*n];
1923 TempB[2] = B[(4+8*m)*lda+2+8*n];
1924 TempB[3] = B[(4+8*m)*lda+3+8*n];
1925 TempB[4] = B[(4+8*m)*lda+4+8*n];
1926 TempB[5] = B[(4+8*m)*lda+5+8*n];
1927 TempB[6] = B[(4+8*m)*lda+6+8*n];
1928 TempB[7] = B[(4+8*m)*lda+7+8*n];
1929
1930 C[0+8*n+i] += TempA[4] * TempB[0];
1931 C[1+8*n+i] += TempA[4] * TempB[1];
1932 C[2+8*n+i] += TempA[4] * TempB[2];
1933 C[3+8*n+i] += TempA[4] * TempB[3];
1934 C[4+8*n+i] += TempA[4] * TempB[4];
1935 C[5+8*n+i] += TempA[4] * TempB[5];
1936 C[6+8*n+i] += TempA[4] * TempB[6];
1937 C[7+8*n+i] += TempA[4] * TempB[7];
1938
1939
1940
1941 TempB[0] = B[(5+8*m)*lda+0+8*n];
1942 TempB[1] = B[(5+8*m)*lda+1+8*n];
1943 TempB[2] = B[(5+8*m)*lda+2+8*n];
1944 TempB[3] = B[(5+8*m)*lda+3+8*n];
1945 TempB[4] = B[(5+8*m)*lda+4+8*n];
1946 TempB[5] = B[(5+8*m)*lda+5+8*n];
1947 TempB[6] = B[(5+8*m)*lda+6+8*n];
1948 TempB[7] = B[(5+8*m)*lda+7+8*n];
1949
1950 C[0+8*n+i] += TempA[5] * TempB[0];
1951 C[1+8*n+i] += TempA[5] * TempB[1];
1952 C[2+8*n+i] += TempA[5] * TempB[2];
1953 C[3+8*n+i] += TempA[5] * TempB[3];
1954 C[4+8*n+i] += TempA[5] * TempB[4];
1955 C[5+8*n+i] += TempA[5] * TempB[5];
1956 C[6+8*n+i] += TempA[5] * TempB[6];
1957 C[7+8*n+i] += TempA[5] * TempB[7];
1958
1959
1960
1961 TempB[0] = B[(6+8*m)*lda+0+8*n];
1962 TempB[1] = B[(6+8*m)*lda+1+8*n];
1963 TempB[2] = B[(6+8*m)*lda+2+8*n];
1964 TempB[3] = B[(6+8*m)*lda+3+8*n];
1965 TempB[4] = B[(6+8*m)*lda+4+8*n];
1966 TempB[5] = B[(6+8*m)*lda+5+8*n];
1967 TempB[6] = B[(6+8*m)*lda+6+8*n];
1968 TempB[7] = B[(6+8*m)*lda+7+8*n];
1969
1970 C[0+8*n+i] += TempA[6] * TempB[0];
1971 C[1+8*n+i] += TempA[6] * TempB[1];
1972 C[2+8*n+i] += TempA[6] * TempB[2];
1973 C[3+8*n+i] += TempA[6] * TempB[3];
1974 C[4+8*n+i] += TempA[6] * TempB[4];
1975 C[5+8*n+i] += TempA[6] * TempB[5];
1976 C[6+8*n+i] += TempA[6] * TempB[6];
1977 C[7+8*n+i] += TempA[6] * TempB[7];
1978
1979
1980 TempB[0] = B[(7+8*m)*lda+0+8*n];
1981 TempB[1] = B[(7+8*m)*lda+1+8*n];
1982 TempB[2] = B[(7+8*m)*lda+2+8*n];
1983 TempB[3] = B[(7+8*m)*lda+3+8*n];
1984 TempB[4] = B[(7+8*m)*lda+4+8*n];
1985 TempB[5] = B[(7+8*m)*lda+5+8*n];
1986 TempB[6] = B[(7+8*m)*lda+6+8*n];
1987 TempB[7] = B[(7+8*m)*lda+7+8*n];
1988
1989 C[0+8*n+i] += TempA[7] * TempB[0];
1990 C[1+8*n+i] += TempA[7] * TempB[1];
1991 C[2+8*n+i] += TempA[7] * TempB[2];
1992 C[3+8*n+i] += TempA[7] * TempB[3];
1993 C[4+8*n+i] += TempA[7] * TempB[4];
1994 C[5+8*n+i] += TempA[7] * TempB[5];
1995 C[6+8*n+i] += TempA[7] * TempB[6];
1996 C[7+8*n+i] += TempA[7] * TempB[7];
1997
1998 */
1999 TempB[0] = B[(0+8*m)*lda+0+8*n];
2000 TempB[1] = B[(0+8*m)*lda+1+8*n];
2001 TempB[2] = B[(0+8*m)*lda+2+8*n];
2002 TempB[3] = B[(0+8*m)*lda+3+8*n];
2003 TempB[4] = B[(0+8*m)*lda+4+8*n];
2004 TempB[5] = B[(0+8*m)*lda+5+8*n];
2005 TempB[6] = B[(0+8*m)*lda+6+8*n];
2006 TempB[7] = B[(0+8*m)*lda+7+8*n];
2007
2008 C[0+8*n+j*lda] += TempA[0] * TempB[0];
2009 C[1+8*n+j*lda] += TempA[0] * TempB[1];
2010 C[2+8*n+j*lda] += TempA[0] * TempB[2];
2011 C[3+8*n+j*lda] += TempA[0] * TempB[3];
2012 C[4+8*n+j*lda] += TempA[0] * TempB[4];
2013 C[5+8*n+j*lda] += TempA[0] * TempB[5];
2014 C[6+8*n+j*lda] += TempA[0] * TempB[6];
2015 C[7+8*n+j*lda] += TempA[0] * TempB[7];
2016
2017
2018
2019 TempB[0] = B[(1+8*m)*lda+0+8*n];
2020 TempB[1] = B[(1+8*m)*lda+1+8*n];
2021 TempB[2] = B[(1+8*m)*lda+2+8*n];
2022 TempB[3] = B[(1+8*m)*lda+3+8*n];
2023 TempB[4] = B[(1+8*m)*lda+4+8*n];
2024 TempB[5] = B[(1+8*m)*lda+5+8*n];
2025 TempB[6] = B[(1+8*m)*lda+6+8*n];
2026 TempB[7] = B[(1+8*m)*lda+7+8*n];
2027
2028 C[0+8*n+j*lda] += TempA[1] * TempB[0];
2029 C[1+8*n+j*lda] += TempA[1] * TempB[1];
2030 C[2+8*n+j*lda] += TempA[1] * TempB[2];
2031 C[3+8*n+j*lda] += TempA[1] * TempB[3];
2032 C[4+8*n+j*lda] += TempA[1] * TempB[4];
2033 C[5+8*n+j*lda] += TempA[1] * TempB[5];
2034 C[6+8*n+j*lda] += TempA[1] * TempB[6];
2035 C[7+8*n+j*lda] += TempA[1] * TempB[7];
2036
2037
2038
2039 TempB[0] = B[(2+8*m)*lda+0+8*n];
2040 TempB[1] = B[(2+8*m)*lda+1+8*n];
2041 TempB[2] = B[(2+8*m)*lda+2+8*n];
2042 TempB[3] = B[(2+8*m)*lda+3+8*n];
2043 TempB[4] = B[(2+8*m)*lda+4+8*n];
2044 TempB[5] = B[(2+8*m)*lda+5+8*n];
2045 TempB[6] = B[(2+8*m)*lda+6+8*n];
2046 TempB[7] = B[(2+8*m)*lda+7+8*n];
2047
2048 C[0+8*n+j*lda] += TempA[2] * TempB[0];
2049 C[1+8*n+j*lda] += TempA[2] * TempB[1];
2050 C[2+8*n+j*lda] += TempA[2] * TempB[2];
2051 C[3+8*n+j*lda] += TempA[2] * TempB[3];
2052 C[4+8*n+j*lda] += TempA[2] * TempB[4];
2053 C[5+8*n+j*lda] += TempA[2] * TempB[5];
2054 C[6+8*n+j*lda] += TempA[2] * TempB[6];
2055 C[7+8*n+j*lda] += TempA[2] * TempB[7];
2056
2057
2058
2059 TempB[0] = B[(3+8*m)*lda+0+8*n];
2060 TempB[1] = B[(3+8*m)*lda+1+8*n];
2061 TempB[2] = B[(3+8*m)*lda+2+8*n];
2062 TempB[3] = B[(3+8*m)*lda+3+8*n];
2063 TempB[4] = B[(3+8*m)*lda+4+8*n];
2064 TempB[5] = B[(3+8*m)*lda+5+8*n];
2065 TempB[6] = B[(3+8*m)*lda+6+8*n];
2066 TempB[7] = B[(3+8*m)*lda+7+8*n];
2067
2068 C[0+8*n+j*lda] += TempA[3] * TempB[0];
2069 C[1+8*n+j*lda] += TempA[3] * TempB[1];
2070 C[2+8*n+j*lda] += TempA[3] * TempB[2];
2071 C[3+8*n+j*lda] += TempA[3] * TempB[3];
2072 C[4+8*n+j*lda] += TempA[3] * TempB[4];
2073 C[5+8*n+j*lda] += TempA[3] * TempB[5];
2074 C[6+8*n+j*lda] += TempA[3] * TempB[6];
2075 C[7+8*n+j*lda] += TempA[3] * TempB[7];
2076
2077
2078 TempB[0] = B[(4+8*m)*lda+0+8*n];
2079 TempB[1] = B[(4+8*m)*lda+1+8*n];
2080 TempB[2] = B[(4+8*m)*lda+2+8*n];
2081 TempB[3] = B[(4+8*m)*lda+3+8*n];
2082 TempB[4] = B[(4+8*m)*lda+4+8*n];
2083 TempB[5] = B[(4+8*m)*lda+5+8*n];
2084 TempB[6] = B[(4+8*m)*lda+6+8*n];
2085 TempB[7] = B[(4+8*m)*lda+7+8*n];
2086
2087 C[0+8*n+j*lda] += TempA[4] * TempB[0];
2088 C[1+8*n+j*lda] += TempA[4] * TempB[1];
2089 C[2+8*n+j*lda] += TempA[4] * TempB[2];
2090 C[3+8*n+j*lda] += TempA[4] * TempB[3];
2091 C[4+8*n+j*lda] += TempA[4] * TempB[4];
2092 C[5+8*n+j*lda] += TempA[4] * TempB[5];
2093 C[6+8*n+j*lda] += TempA[4] * TempB[6];
2094 C[7+8*n+j*lda] += TempA[4] * TempB[7];
2095
2096
2097
2098 TempB[0] = B[(5+8*m)*lda+0+8*n];
2099 TempB[1] = B[(5+8*m)*lda+1+8*n];
2100 TempB[2] = B[(5+8*m)*lda+2+8*n];
2101 TempB[3] = B[(5+8*m)*lda+3+8*n];
2102 TempB[4] = B[(5+8*m)*lda+4+8*n];
2103 TempB[5] = B[(5+8*m)*lda+5+8*n];
2104 TempB[6] = B[(5+8*m)*lda+6+8*n];
2105 TempB[7] = B[(5+8*m)*lda+7+8*n];
2106
2107 C[0+8*n+j*lda] += TempA[5] * TempB[0];
2108 C[1+8*n+j*lda] += TempA[5] * TempB[1];
2109 C[2+8*n+j*lda] += TempA[5] * TempB[2];
2110 C[3+8*n+j*lda] += TempA[5] * TempB[3];
2111 C[4+8*n+j*lda] += TempA[5] * TempB[4];
2112 C[5+8*n+j*lda] += TempA[5] * TempB[5];
2113 C[6+8*n+j*lda] += TempA[5] * TempB[6];
2114 C[7+8*n+j*lda] += TempA[5] * TempB[7];
2115
2116
2117
2118 TempB[0] = B[(6+8*m)*lda+0+8*n];
2119 TempB[1] = B[(6+8*m)*lda+1+8*n];
2120 TempB[2] = B[(6+8*m)*lda+2+8*n];
2121 TempB[3] = B[(6+8*m)*lda+3+8*n];
2122 TempB[4] = B[(6+8*m)*lda+4+8*n];
2123 TempB[5] = B[(6+8*m)*lda+5+8*n];
2124 TempB[6] = B[(6+8*m)*lda+6+8*n];
2125 TempB[7] = B[(6+8*m)*lda+7+8*n];
2126
2127 C[0+8*n+j*lda] += TempA[6] * TempB[0];
2128 C[1+8*n+j*lda] += TempA[6] * TempB[1];
2129 C[2+8*n+j*lda] += TempA[6] * TempB[2];
2130 C[3+8*n+j*lda] += TempA[6] * TempB[3];
2131 C[4+8*n+j*lda] += TempA[6] * TempB[4];
2132 C[5+8*n+j*lda] += TempA[6] * TempB[5];
2133 C[6+8*n+j*lda] += TempA[6] * TempB[6];
2134 C[7+8*n+j*lda] += TempA[6] * TempB[7];
2135
2136
2137 TempB[0] = B[(7+8*m)*lda+0+8*n];
2138 TempB[1] = B[(7+8*m)*lda+1+8*n];
2139 TempB[2] = B[(7+8*m)*lda+2+8*n];
2140 TempB[3] = B[(7+8*m)*lda+3+8*n];
2141 TempB[4] = B[(7+8*m)*lda+4+8*n];
2142 TempB[5] = B[(7+8*m)*lda+5+8*n];
2143 TempB[6] = B[(7+8*m)*lda+6+8*n];
2144 TempB[7] = B[(7+8*m)*lda+7+8*n];
2145
2146 C[0+8*n+j*lda] += TempA[7] * TempB[0];
2147 C[1+8*n+j*lda] += TempA[7] * TempB[1];
2148 C[2+8*n+j*lda] += TempA[7] * TempB[2];
2149 C[3+8*n+j*lda] += TempA[7] * TempB[3];
2150 C[4+8*n+j*lda] += TempA[7] * TempB[4];
2151 C[5+8*n+j*lda] += TempA[7] * TempB[5];
2152 C[6+8*n+j*lda] += TempA[7] * TempB[6];
2153 C[7+8*n+j*lda] += TempA[7] * TempB[7];
2154 }
2155
2156 }
2157 }
2158 }
2159
2160
2161 }
2162
2163 //--------------------------------------------------------------------------
2164 // Main
2165 //
2166 // all threads start executing thread_entry(). Use their "coreid" to
2167 // differentiate between threads (each thread is running on a separate core).
2168
2169 void thread_entry(int cid, int nc)
2170 {
2171 coreid = cid;
2172 ncores = nc;
2173
2174 // static allocates data in the binary, which is visible to both threads
2175 static data_t results_data[ARRAY_SIZE];
2176 /*
2177
2178 // Execute the provided, naive matmul
2179 barrier(nc);
2180 stats(matmul_naive(DIM_SIZE, input1_data, input2_data, results_data); barrier(nc));
2181
2182
2183 // verify
2184 verifyMT(ARRAY_SIZE, results_data, verify_data);
2185
2186 // clear results from the first trial
2187 size_t i;
2188 if (coreid == 0)
2189 for (i=0; i < ARRAY_SIZE; i++)
2190 results_data[i] = 0;
2191 barrier(nc);
2192
2193 */
2194 // Execute your faster matmul
2195 barrier(nc);
2196 stats(matmul(DIM_SIZE, input1_data, input2_data, results_data); barrier(nc));
2197
2198 #ifdef DEBUG
2199 printArrayMT("results:", ARRAY_SIZE, results_data);
2200 printArrayMT("verify :", ARRAY_SIZE, verify_data);
2201 #endif
2202
2203 // verify
2204 verifyMT(ARRAY_SIZE, results_data, verify_data);
2205 barrier(nc);
2206
2207 exit(0);
2208 }
2209