6 void __attribute__((noinline
)) matmul(const int coreid
, const int ncores
, const int lda
, const data_t A
[], const data_t B
[], data_t C
[] )
9 // ***************************** //
10 // **** ADD YOUR CODE HERE ***** //
11 // ***************************** //
13 // feel free to make a separate function for MI and MSI versions.
14 int i
, j
, k
, ii
, jj
, bsize
;
16 for ( jj
= bsize
*coreid
; jj
< lda
; jj
+= bsize
*ncores
) {
17 for ( ii
= 0; ii
< lda
; ii
+= bsize
) {
18 for ( j
= jj
; j
< lda
&& j
< jj
+ bsize
; j
++) {
19 for ( i
= ii
; i
< lda
&& i
< ii
+ bsize
; i
+= 8) {
20 data_t c1
= C
[i
+ j
*lda
];
21 data_t c2
= C
[i
+ j
*lda
+ 1];
22 data_t c3
= C
[i
+ j
*lda
+ 2];
23 data_t c4
= C
[i
+ j
*lda
+ 3];
24 data_t c5
= C
[i
+ j
*lda
+ 4];
25 data_t c6
= C
[i
+ j
*lda
+ 5];
26 data_t c7
= C
[i
+ j
*lda
+ 6];
27 data_t c8
= C
[i
+ j
*lda
+ 7];
28 for ( k
= 0; k
< lda
; k
+=4 ) {
29 for (int x
= 0; x
< 4; x
++) {
30 data_t a
= A
[j
*lda
+ k
+x
];
31 data_t b1
= B
[(k
+x
)*lda
+ i
];
32 data_t b2
= B
[(k
+x
)*lda
+ i
+ 1];
33 data_t b3
= B
[(k
+x
)*lda
+ i
+ 2];
34 data_t b4
= B
[(k
+x
)*lda
+ i
+ 3];
35 data_t b5
= B
[(k
+x
)*lda
+ i
+ 4];
36 data_t b6
= B
[(k
+x
)*lda
+ i
+ 5];
37 data_t b7
= B
[(k
+x
)*lda
+ i
+ 6];
38 data_t b8
= B
[(k
+x
)*lda
+ i
+ 7];
50 C
[i
+ j
*lda
+ 1] = c2
;
51 C
[i
+ j
*lda
+ 2] = c3
;
52 C
[i
+ j
*lda
+ 3] = c4
;
53 C
[i
+ j
*lda
+ 4] = c5
;
54 C
[i
+ j
*lda
+ 5] = c6
;
55 C
[i
+ j
*lda
+ 6] = c7
;
56 C
[i
+ j
*lda
+ 7] = c8
;