6 void __attribute__((noinline
)) matmul(const int coreid
, const int ncores
, const int lda
, const data_t A
[], const data_t B
[], data_t C
[] )
9 // ***************************** //
10 // **** ADD YOUR CODE HERE ***** //
11 // ***************************** //
13 // feel free to make a separate function for MI and MSI versions.
14 int i
, j
, k
, ii
, jj
, kk
;
15 if(coreid
> 1) return;
16 if (coreid
|| ncores
== 1) {
17 // for ( ii = 0; ii < 32; ii+=IC )
18 for ( kk
= 0; kk
< 32; kk
+=16 )
19 for ( j
= 0; j
< 16; j
++ )
20 // for ( j = 0; j < 16; j++ )
22 for ( i
= 0; i
< 32; i
+=8 )
23 // for ( i = ii; i < ii + IC && i < 32; i+=8 )
25 data_t temp0
= C
[i
+j
*32];
26 data_t temp1
= C
[i
+j
*32+1];
27 data_t temp2
= C
[i
+j
*32+2];
28 data_t temp3
= C
[i
+j
*32+3];
29 data_t temp4
= C
[i
+j
*32+4];
30 data_t temp5
= C
[i
+j
*32+5];
31 data_t temp6
= C
[i
+j
*32+6];
32 data_t temp7
= C
[i
+j
*32+7];
33 for ( k
= kk
; k
< kk
+16 && k
< 32; k
++ )
34 // for ( k = 0; k < 32; k++ )
36 data_t tempA
= A
[j
*32+k
];
37 temp0
+= tempA
* B
[k
*32 + i
];
38 temp1
+= tempA
* B
[k
*32 + i
+1];
39 temp2
+= tempA
* B
[k
*32 + i
+2];
40 temp3
+= tempA
* B
[k
*32 + i
+3];
41 temp4
+= tempA
* B
[k
*32 + i
+4];
42 temp5
+= tempA
* B
[k
*32 + i
+5];
43 temp6
+= tempA
* B
[k
*32 + i
+6];
44 temp7
+= tempA
* B
[k
*32 + i
+7];
58 // for ( ii = 0; ii < 32; ii+=IC )
59 for ( kk
= 0; kk
< 32; kk
+=16 )
60 for ( j
= 16; j
< 32; j
++ )
61 // for ( j = 16; j < 32; j++ )
63 for ( i
= 0; i
< 32; i
+=8 )
64 // for ( i = ii; i < ii + IC && i < 32; i+=8 )
66 data_t temp0
= C
[i
+j
*32];
67 data_t temp1
= C
[i
+j
*32+1];
68 data_t temp2
= C
[i
+j
*32+2];
69 data_t temp3
= C
[i
+j
*32+3];
70 data_t temp4
= C
[i
+j
*32+4];
71 data_t temp5
= C
[i
+j
*32+5];
72 data_t temp6
= C
[i
+j
*32+6];
73 data_t temp7
= C
[i
+j
*32+7];
74 for ( k
= kk
; k
< kk
+16 && k
< 32; k
++ )
76 data_t tempA
= A
[j
*32+k
];
77 temp0
+= tempA
* B
[k
*32 + i
];
78 temp1
+= tempA
* B
[k
*32 + i
+1];
79 temp2
+= tempA
* B
[k
*32 + i
+2];
80 temp3
+= tempA
* B
[k
*32 + i
+3];
81 temp4
+= tempA
* B
[k
*32 + i
+4];
82 temp5
+= tempA
* B
[k
*32 + i
+5];
83 temp6
+= tempA
* B
[k
*32 + i
+6];
84 temp7
+= tempA
* B
[k
*32 + i
+7];