6 void __attribute__((noinline
)) matmul(const int coreid
, const int ncores
, const int lda
, const data_t A
[], const data_t B
[], data_t C
[] )
35 static data_t BB
[1024];
39 for ( k
= 0; k
< lda
; k
++) {
40 for ( i
= coreid
*(lda
/ncores
); i
< (coreid
+1)*(lda
/ncores
); i
++ ) {
41 BB
[i
*lda
+ k
] = B
[k
*lda
+ i
];
46 for ( int x
= 0; x
< ncores
; x
++) {
47 //split the i values into ncores chunks (two in the setup this was written for) so the threads don't interfere on the B loads
48 //the hard-coded 32 could be generalized if needed, but I won't bother since it would be tricky
49 //and we already know the matrix size and the number of threads
50 start
= x
* (32 / ncores
);
51 end
= (x
+1) * (32 / ncores
);
52 for ( i
= start
; i
< end
; i
+=8 ) {
53 for ( j
= coreid
*(lda
/ncores
); j
< (coreid
+1)*(lda
/ncores
); j
++ ) {
54 c1
=0;c2
=0;c3
=0;c4
=0;c5
=0;c6
=0;c7
=0;c8
=0;
64 for ( k
= 0; k
< lda
; k
+=8 ) {
146 C
[i
+0 + j
*lda
] += c1
;
147 C
[i
+1 + j
*lda
] += c2
;
148 C
[i
+2 + j
*lda
] += c3
;
149 C
[i
+3 + j
*lda
] += c4
;
150 C
[i
+4 + j
*lda
] += c5
;
151 C
[i
+5 + j
*lda
] += c6
;
152 C
[i
+6 + j
*lda
] += c7
;
153 C
[i
+7 + j
*lda
] += c8
;