6 void __attribute__((noinline
)) matmul(const int coreid
, const int ncores
, const int lda
, const data_t A
[], const data_t B
[], data_t C
[] )
9 // ***************************** //
10 // **** ADD YOUR CODE HERE ***** //
11 // ***************************** //
13 // Feel free to write separate functions for the MI and MSI versions.
14 int i
, j
, k
, ii
, jj
, bsize
, start
;
17 for ( jj
= start
; jj
< lda
; jj
+= bsize
*ncores
) {
19 for ( ii
= start
; ii
!=start
|| first
; ii
=(bsize
+ii
) % lda
) {
21 for ( j
= jj
; j
< lda
&& j
< jj
+ bsize
; j
+=4) {
22 for ( i
= ii
; i
< lda
&& i
< ii
+ bsize
; i
+=2) {
23 data_t c1
= C
[i
+ j
*lda
];
24 data_t c2
= C
[i
+ j
*lda
+ 1];
25 data_t c3
= C
[i
+ (j
+1)*lda
];
26 data_t c4
= C
[i
+ (j
+1)*lda
+ 1];
27 data_t c5
= C
[i
+ (j
+2)*lda
];
28 data_t c6
= C
[i
+ (j
+2)*lda
+ 1];
29 data_t c7
= C
[i
+ (j
+3)*lda
];
30 data_t c8
= C
[i
+ (j
+3)*lda
+ 1];
31 for ( k
= 0; k
< lda
; k
+=8){
32 for (int x
= 0; x
< 8; x
++) {
33 data_t a
= A
[j
*lda
+ k
+x
];
34 data_t a1
= A
[(j
+1)*lda
+k
+x
];
35 data_t a2
= A
[(j
+2)*lda
+k
+x
];
36 data_t a3
= A
[(j
+3)*lda
+k
+x
];
37 data_t b1
= B
[(k
+x
)*lda
+ i
];
38 data_t b2
= B
[(k
+x
)*lda
+ i
+ 1];
50 C
[i
+ j
*lda
+ 1] = c2
;
51 C
[i
+ (j
+1)*lda
] = c3
;
52 C
[i
+ (j
+1)*lda
+ 1] = c4
;
53 C
[i
+ (j
+2)*lda
] = c5
;
54 C
[i
+ (j
+2)*lda
+ 1] = c6
;
55 C
[i
+ (j
+3)*lda
] = c7
;
56 C
[i
+ (j
+3)*lda
+ 1] = c8
;