// See LICENSE for license details.
// Ask GCC to apply its "unroll-loops" optimization option to the functions
// defined after this point in the file.
7 #pragma GCC optimize ("unroll-loops")
9 void matmul(const size_t coreid
, const size_t ncores
, const size_t lda
, const data_t A
[], const data_t B
[], data_t C
[])
12 size_t block
= lda
/ ncores
;
13 size_t start
= block
* coreid
;
15 for (i
= 0; i
< lda
; i
++) {
16 for (j
= start
; j
< (start
+block
); j
++) {
18 for (k
= 0; k
< lda
; k
++)
19 sum
+= A
[j
*lda
+ k
] * B
[k
*lda
+ i
];