6 void __attribute__((noinline
)) matmul(const int coreid
, const int ncores
, const int lda
, const data_t A
[], const data_t B
[], data_t C
[] )
8 int i
, j
, k
, ii
, jj
, kk
;
11 // for ( ii = 0; ii < 32; ii+=IC )
12 for ( kk
= 0; kk
< 32; kk
+=16 )
13 for ( j
= 0; j
< 16; j
++ )
14 // for ( j = 0; j < 16; j++ )
16 for ( i
= 0; i
< 32; i
+=8 )
17 // for ( i = ii; i < ii + IC && i < 32; i+=8 )
19 data_t temp0
= C
[i
+j
*32];
20 data_t temp1
= C
[i
+j
*32+1];
21 data_t temp2
= C
[i
+j
*32+2];
22 data_t temp3
= C
[i
+j
*32+3];
23 data_t temp4
= C
[i
+j
*32+4];
24 data_t temp5
= C
[i
+j
*32+5];
25 data_t temp6
= C
[i
+j
*32+6];
26 data_t temp7
= C
[i
+j
*32+7];
27 for ( k
= kk
; k
< kk
+16 && k
< 32; k
++ )
28 // for ( k = 0; k < 32; k++ )
30 data_t tempA
= A
[j
*32+k
];
31 temp0
+= tempA
* B
[k
*32 + i
];
32 temp1
+= tempA
* B
[k
*32 + i
+1];
33 temp2
+= tempA
* B
[k
*32 + i
+2];
34 temp3
+= tempA
* B
[k
*32 + i
+3];
35 temp4
+= tempA
* B
[k
*32 + i
+4];
36 temp5
+= tempA
* B
[k
*32 + i
+5];
37 temp6
+= tempA
* B
[k
*32 + i
+6];
38 temp7
+= tempA
* B
[k
*32 + i
+7];
51 if(coreid
== 1 || ncores
== 1) {
52 // for ( ii = 0; ii < 32; ii+=IC )
53 for ( kk
= 0; kk
< 32; kk
+=16 )
54 for ( j
= 16; j
< 32; j
++ )
55 // for ( j = 16; j < 32; j++ )
57 for ( i
= 0; i
< 32; i
+=8 )
58 // for ( i = ii; i < ii + IC && i < 32; i+=8 )
60 data_t temp0
= C
[i
+j
*32];
61 data_t temp1
= C
[i
+j
*32+1];
62 data_t temp2
= C
[i
+j
*32+2];
63 data_t temp3
= C
[i
+j
*32+3];
64 data_t temp4
= C
[i
+j
*32+4];
65 data_t temp5
= C
[i
+j
*32+5];
66 data_t temp6
= C
[i
+j
*32+6];
67 data_t temp7
= C
[i
+j
*32+7];
68 for ( k
= kk
; k
< kk
+16 && k
< 32; k
++ )
70 data_t tempA
= A
[j
*32+k
];
71 temp0
+= tempA
* B
[k
*32 + i
];
72 temp1
+= tempA
* B
[k
*32 + i
+1];
73 temp2
+= tempA
* B
[k
*32 + i
+2];
74 temp3
+= tempA
* B
[k
*32 + i
+3];
75 temp4
+= tempA
* B
[k
*32 + i
+4];
76 temp5
+= tempA
* B
[k
*32 + i
+5];
77 temp6
+= tempA
* B
[k
*32 + i
+6];
78 temp7
+= tempA
* B
[k
*32 + i
+7];