6   void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda, const data_t A[], const data_t B[], data_t C[] )
9     int j = coreid*(lda/ncores);
10    int jend = (coreid+1)*(lda/ncores);
11    for ( ; j < jend; j++ )
14      data_t* Cj32 = C + j32;
15      for ( k = 0; k < 32; k+=2 )
17        data_t Aj32k = A[k + j32];
18        data_t Aj32k2 = A[k + 1 + j32];
19        data_t* Bk32 = B + (k << 5);
20        data_t* Bk322 = Bk32 + 32;
21        for ( i = 0; i < 32; i+=4 )
23          Cj32[i] += Aj32k * Bk32[i];
24          Cj32[i] += Aj32k2 * Bk322[i];
25          Cj32[i+1] += Aj32k * Bk32[i+1];
26          Cj32[i+1] += Aj32k2 * Bk322[i+1];
27          Cj32[i+2] += Aj32k * Bk32[i+2];
28          Cj32[i+2] += Aj32k2 * Bk322[i+2];
29          Cj32[i+3] += Aj32k * Bk32[i+3];
30          Cj32[i+3] += Aj32k2 * Bk322[i+3];