6 void __attribute__((noinline
)) matmul(const int coreid
, const int ncores
, const int lda
, const data_t A
[], const data_t B
[], data_t C
[] )
10 data_t element1
, element2
, element3
, element4
, element5
, element6
, element7
, element8
;
12 int column1
, column2
, column3
, column4
, column5
, column6
, column7
, column8
;
13 data_t temp
[32]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
14 data_t temp2
[32]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
16 for (i
=0; i
<32; i
+=2){
19 for (j
=0; j
<16; j
+=4){
21 element2
= A
[row
+j
+1];
22 element3
= A
[row
+j
+2];
23 element4
= A
[row
+j
+3];
29 element6
= A
[row2
+j
+1];
30 element7
= A
[row2
+j
+2];
31 element8
= A
[row2
+j
+3];
33 for (k
=0; k
<32; k
+=4){
34 temp
[k
]+=element1
*B
[column1
+k
]+element2
*B
[column2
+k
]+element3
*B
[column3
+k
]+element4
*B
[column4
+k
];
35 temp
[k
+1]+=element1
*B
[column1
+k
+1]+element2
*B
[column2
+k
+1]+element3
*B
[column3
+k
+1]+element4
*B
[column4
+k
+1];
36 temp
[k
+2]+=element1
*B
[column1
+k
+2]+element2
*B
[column2
+k
+2]+element3
*B
[column3
+k
+2]+element4
*B
[column4
+k
+2];
37 temp
[k
+3]+=element1
*B
[column1
+k
+3]+element2
*B
[column2
+k
+3]+element3
*B
[column3
+k
+3]+element4
*B
[column4
+k
+3];
38 temp2
[k
]+=element5
*B
[column1
+k
]+element6
*B
[column2
+k
]+element7
*B
[column3
+k
]+element8
*B
[column4
+k
];
39 temp2
[k
+1]+=element5
*B
[column1
+k
+1]+element6
*B
[column2
+k
+1]+element7
*B
[column3
+k
+1]+element8
*B
[column4
+k
+1];
40 temp2
[k
+2]+=element5
*B
[column1
+k
+2]+element6
*B
[column2
+k
+2]+element7
*B
[column3
+k
+2]+element8
*B
[column4
+k
+2];
41 temp2
[k
+3]+=element5
*B
[column1
+k
+3]+element6
*B
[column2
+k
+3]+element7
*B
[column3
+k
+3]+element8
*B
[column4
+k
+3];
55 if(coreid
== 1 || ncores
== 1) {
56 for (i
=0; i
<32; i
+=2){
59 for (j
=16; j
<32; j
+=4){
61 element2
= A
[row
+j
+1];
62 element3
= A
[row
+j
+2];
63 element4
= A
[row
+j
+3];
65 element6
= A
[row2
+j
+1];
66 element7
= A
[row2
+j
+2];
67 element8
= A
[row2
+j
+3];
72 for (k
=0; k
<32; k
+=4){
73 temp
[k
]+=element1
*B
[column1
+k
]+element2
*B
[column2
+k
]+element3
*B
[column3
+k
]+element4
*B
[column4
+k
];
74 temp
[k
+1]+=element1
*B
[column1
+k
+1]+element2
*B
[column2
+k
+1]+element3
*B
[column3
+k
+1]+element4
*B
[column4
+k
+1];
75 temp
[k
+2]+=element1
*B
[column1
+k
+2]+element2
*B
[column2
+k
+2]+element3
*B
[column3
+k
+2]+element4
*B
[column4
+k
+2];
76 temp
[k
+3]+=element1
*B
[column1
+k
+3]+element2
*B
[column2
+k
+3]+element3
*B
[column3
+k
+3]+element4
*B
[column4
+k
+3];
77 temp2
[k
]+=element5
*B
[column1
+k
]+element6
*B
[column2
+k
]+element7
*B
[column3
+k
]+element8
*B
[column4
+k
];
78 temp2
[k
+1]+=element5
*B
[column1
+k
+1]+element6
*B
[column2
+k
+1]+element7
*B
[column3
+k
+1]+element8
*B
[column4
+k
+1];
79 temp2
[k
+2]+=element5
*B
[column1
+k
+2]+element6
*B
[column2
+k
+2]+element7
*B
[column3
+k
+2]+element8
*B
[column4
+k
+2];
80 temp2
[k
+3]+=element5
*B
[column1
+k
+3]+element6
*B
[column2
+k
+3]+element7
*B
[column3
+k
+3]+element8
*B
[column4
+k
+3];
94 // ***************************** //
95 // **** ADD YOUR CODE HERE ***** //
96 // ***************************** //
98 // feel free to make a separate function for MI and MSI versions.