6 void __attribute__((noinline
)) matmul(const int coreid
, const int ncores
, const int lda
, const data_t A
[], const data_t B
[], data_t C
[] )
9 data_t element1
, element2
, element3
, element4
, element5
, element6
, element7
, element8
;
11 int column1
, column2
, column3
, column4
, column5
, column6
, column7
, column8
;
12 data_t temp
[32]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
13 data_t temp2
[32]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
15 for (i
=0; i
<lda
; i
+=2){
18 for (j
=0; j
<16; j
+=4){
20 element2
= A
[row
+j
+1];
21 element3
= A
[row
+j
+2];
22 element4
= A
[row
+j
+3];
28 element6
= A
[row2
+j
+1];
29 element7
= A
[row2
+j
+2];
30 element8
= A
[row2
+j
+3];
32 for (k
=0; k
<32; k
+=4){
33 temp
[k
]+=element1
*B
[column1
+k
]+element2
*B
[column2
+k
]+element3
*B
[column3
+k
]+element4
*B
[column4
+k
];
34 temp
[k
+1]+=element1
*B
[column1
+k
+1]+element2
*B
[column2
+k
+1]+element3
*B
[column3
+k
+1]+element4
*B
[column4
+k
+1];
35 temp
[k
+2]+=element1
*B
[column1
+k
+2]+element2
*B
[column2
+k
+2]+element3
*B
[column3
+k
+2]+element4
*B
[column4
+k
+2];
36 temp
[k
+3]+=element1
*B
[column1
+k
+3]+element2
*B
[column2
+k
+3]+element3
*B
[column3
+k
+3]+element4
*B
[column4
+k
+3];
37 temp2
[k
]+=element5
*B
[column1
+k
]+element6
*B
[column2
+k
]+element7
*B
[column3
+k
]+element8
*B
[column4
+k
];
38 temp2
[k
+1]+=element5
*B
[column1
+k
+1]+element6
*B
[column2
+k
+1]+element7
*B
[column3
+k
+1]+element8
*B
[column4
+k
+1];
39 temp2
[k
+2]+=element5
*B
[column1
+k
+2]+element6
*B
[column2
+k
+2]+element7
*B
[column3
+k
+2]+element8
*B
[column4
+k
+2];
40 temp2
[k
+3]+=element5
*B
[column1
+k
+3]+element6
*B
[column2
+k
+3]+element7
*B
[column3
+k
+3]+element8
*B
[column4
+k
+3];
53 if (coreid
==1 || ncores
== 1){
54 for (i
=0; i
<32; i
+=2){
57 for (j
=16; j
<32; j
+=4){
59 element2
= A
[row
+j
+1];
60 element3
= A
[row
+j
+2];
61 element4
= A
[row
+j
+3];
63 element6
= A
[row2
+j
+1];
64 element7
= A
[row2
+j
+2];
65 element8
= A
[row2
+j
+3];
70 for (k
=0; k
<32; k
+=4){
71 temp
[k
]+=element1
*B
[column1
+k
]+element2
*B
[column2
+k
]+element3
*B
[column3
+k
]+element4
*B
[column4
+k
];
72 temp
[k
+1]+=element1
*B
[column1
+k
+1]+element2
*B
[column2
+k
+1]+element3
*B
[column3
+k
+1]+element4
*B
[column4
+k
+1];
73 temp
[k
+2]+=element1
*B
[column1
+k
+2]+element2
*B
[column2
+k
+2]+element3
*B
[column3
+k
+2]+element4
*B
[column4
+k
+2];
74 temp
[k
+3]+=element1
*B
[column1
+k
+3]+element2
*B
[column2
+k
+3]+element3
*B
[column3
+k
+3]+element4
*B
[column4
+k
+3];
75 temp2
[k
]+=element5
*B
[column1
+k
]+element6
*B
[column2
+k
]+element7
*B
[column3
+k
]+element8
*B
[column4
+k
];
76 temp2
[k
+1]+=element5
*B
[column1
+k
+1]+element6
*B
[column2
+k
+1]+element7
*B
[column3
+k
+1]+element8
*B
[column4
+k
+1];
77 temp2
[k
+2]+=element5
*B
[column1
+k
+2]+element6
*B
[column2
+k
+2]+element7
*B
[column3
+k
+2]+element8
*B
[column4
+k
+2];
78 temp2
[k
+3]+=element5
*B
[column1
+k
+3]+element6
*B
[column2
+k
+3]+element7
*B
[column3
+k
+3]+element8
*B
[column4
+k
+3];