/*
 * Blocked matrix multiply-accumulate: C += A * B.
 *
 * A, B and C are indexed as flat row-major lda x lda arrays
 * (element (r, c) lives at [r*lda + c]).  The result is *added* into C,
 * so the caller must pre-initialise C (NOTE(review): presumably zeroed by
 * the caller -- confirm).
 *
 * Work split: this call only updates rows
 *     [coreid * (lda / ncores), (coreid + 1) * (lda / ncores))
 * of C.  NOTE(review): coreid is presumably in [0, ncores) -- confirm
 * against the caller.
 *
 * The kernel works on a fixed tile of 4 rows (j) x 2 columns (i) x 4 inner
 * products (k) with no remainder handling, so it reads/writes out of the
 * intended range unless:
 *   - lda is a multiple of 4 (covers the i += 2 and k += 4 steps), and
 *   - lda / ncores is a multiple of 4 (covers the j += 4 step).
 *
 * Parameters:
 *   coreid  index of the calling core's row slice
 *   ncores  number of row slices the matrix is divided into
 *   lda     matrix dimension (and row stride)
 *   A, B    input matrices (read only)
 *   C       output matrix, accumulated into
 */
void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda, const data_t A[], const data_t B[], data_t C[])
{
    int i, j, k;

    /* Nothing to compute; also keeps lda / ncores below well defined. */
    if (lda <= 0 || ncores <= 0)
        return;

    /* Row slice owned by this core (loop-invariant, so computed once). */
    const int rows_per_core = lda / ncores;
    const int j_begin = coreid * rows_per_core;
    const int j_end = (coreid + 1) * rows_per_core;

    for (i = 0; i < lda; i += 2) {
        for (k = 0; k < lda; k += 4) {
            /*
             * Cache the 4x2 patch of B (rows k..k+3, columns i and i+1):
             * dN holds column i, cN holds column i+1.  It is reused for
             * every row j of the slice.
             */
            int d0 = B[k*lda + i];
            int c0 = B[k*lda + i + 1];
            int d1 = B[(k+1)*lda + i];
            int c1 = B[(k+1)*lda + i + 1];
            int d2 = B[(k+2)*lda + i];
            int c2 = B[(k+2)*lda + i + 1];
            int d3 = B[(k+3)*lda + i];
            int c3 = B[(k+3)*lda + i + 1];

            for (j = j_begin; j < j_end; j += 4) {
                /* Row j, column i. */
                int sum = A[j*lda + k] * d0;
                sum += A[j*lda + k + 1] * d1;
                sum += A[j*lda + k + 2] * d2;
                sum += A[j*lda + k + 3] * d3;
                /* This store was missing: the partial product was being
                 * overwritten below without ever reaching C. */
                C[j*lda + i] += sum;

                /* Row j, column i+1. */
                sum = A[j*lda + k] * c0;
                sum += A[j*lda + k + 1] * c1;
                sum += A[j*lda + k + 2] * c2;
                sum += A[j*lda + k + 3] * c3;
                C[j*lda + i + 1] += sum;

                /* Row j+1, column i. */
                sum = A[(j+1)*lda + k] * d0;
                sum += A[(j+1)*lda + k + 1] * d1;
                sum += A[(j+1)*lda + k + 2] * d2;
                sum += A[(j+1)*lda + k + 3] * d3;
                C[(j+1)*lda + i] += sum;

                /* Row j+1, column i+1. */
                sum = A[(j+1)*lda + k] * c0;
                sum += A[(j+1)*lda + k + 1] * c1;
                sum += A[(j+1)*lda + k + 2] * c2;
                sum += A[(j+1)*lda + k + 3] * c3;
                C[(j+1)*lda + i + 1] += sum;

                /* Row j+2, column i. */
                sum = A[(j+2)*lda + k] * d0;
                sum += A[(j+2)*lda + k + 1] * d1;
                sum += A[(j+2)*lda + k + 2] * d2;
                sum += A[(j+2)*lda + k + 3] * d3;
                C[(j+2)*lda + i] += sum;

                /* Row j+2, column i+1. */
                sum = A[(j+2)*lda + k] * c0;
                sum += A[(j+2)*lda + k + 1] * c1;
                sum += A[(j+2)*lda + k + 2] * c2;
                sum += A[(j+2)*lda + k + 3] * c3;
                C[(j+2)*lda + i + 1] += sum;

                /* Row j+3, column i. */
                sum = A[(j+3)*lda + k] * d0;
                sum += A[(j+3)*lda + k + 1] * d1;
                sum += A[(j+3)*lda + k + 2] * d2;
                sum += A[(j+3)*lda + k + 3] * d3;
                C[(j+3)*lda + i] += sum;

                /* Row j+3, column i+1. */
                sum = A[(j+3)*lda + k] * c0;
                sum += A[(j+3)*lda + k + 1] * c1;
                sum += A[(j+3)*lda + k + 2] * c2;
                sum += A[(j+3)*lda + k + 3] * c3;
                C[(j+3)*lda + i + 1] += sum;
            }
        }
    }
}