7 void __attribute__((noinline
)) matmul(const int coreid
, const int ncores
, const int lda
, const data_t A
[], const data_t B
[], data_t C
[] )
11 for ( i
= 0; i
< lda
; i
+=2 )
13 for (k
= 0; k
< lda
; k
+=4)
15 int d0
= B
[k
*lda
+ i
];
16 int c0
= B
[k
*lda
+ i
+ 1];
17 int d1
= B
[(k
+1)*lda
+ i
];
18 int c1
= B
[(k
+1)*lda
+ i
+ 1];
19 int d2
= B
[(k
+2)*lda
+ i
];
20 int c2
= B
[(k
+2)*lda
+ i
+ 1];
21 int d3
= B
[(k
+3)*lda
+ i
];
22 int c3
= B
[(k
+3)*lda
+ i
+ 1];
24 for ( j
= coreid
*(lda
/ncores
); j
< (coreid
+1)*(lda
/ncores
); j
+=4)
27 int sum
= A
[j
*lda
+ k
] * d0
;
28 sum
+= A
[j
*lda
+ k
+ 1] * d1
;
29 sum
+= A
[j
*lda
+ k
+ 2] * d2
;
30 sum
+= A
[j
*lda
+ k
+ 3] * d3
;
33 sum
= A
[j
*lda
+ k
] * c0
;
34 sum
+= A
[j
*lda
+ k
+ 1] * c1
;
35 sum
+= A
[j
*lda
+ k
+ 2] * c2
;
36 sum
+= A
[j
*lda
+ k
+ 3] * c3
;
37 C
[j
*lda
+ i
+ 1] += sum
;
39 sum
= A
[(j
+1)*lda
+ k
] * d0
;
40 sum
+= A
[(j
+1)*lda
+ k
+ 1] * d1
;
41 sum
+= A
[(j
+1)*lda
+ k
+ 2] * d2
;
42 sum
+= A
[(j
+1)*lda
+ k
+ 3] * d3
;
43 C
[(j
+1)*lda
+i
] += sum
;
45 sum
= A
[(j
+1)*lda
+ k
] * c0
;
46 sum
+= A
[(j
+1)*lda
+ k
+ 1] * c1
;
47 sum
+= A
[(j
+1)*lda
+ k
+ 2] * c2
;
48 sum
+= A
[(j
+1)*lda
+ k
+ 3] * c3
;
49 C
[(j
+1)*lda
+ i
+ 1] += sum
;
51 sum
= A
[(j
+2)*lda
+ k
] * d0
;
52 sum
+= A
[(j
+2)*lda
+ k
+ 1] * d1
;
53 sum
+= A
[(j
+2)*lda
+ k
+ 2] * d2
;
54 sum
+= A
[(j
+2)*lda
+ k
+ 3] * d3
;
55 C
[(j
+2)*lda
+i
] += sum
;
57 sum
= A
[(j
+2)*lda
+ k
] * c0
;
58 sum
+= A
[(j
+2)*lda
+ k
+ 1] * c1
;
59 sum
+= A
[(j
+2)*lda
+ k
+ 2] * c2
;
60 sum
+= A
[(j
+2)*lda
+ k
+ 3] * c3
;
61 C
[(j
+2)*lda
+ i
+ 1] += sum
;
63 sum
= A
[(j
+3)*lda
+ k
] * d0
;
64 sum
+= A
[(j
+3)*lda
+ k
+ 1] * d1
;
65 sum
+= A
[(j
+3)*lda
+ k
+ 2] * d2
;
66 sum
+= A
[(j
+3)*lda
+ k
+ 3] * d3
;
67 C
[(j
+3)*lda
+i
] += sum
;
69 sum
= A
[(j
+3)*lda
+ k
] * c0
;
70 sum
+= A
[(j
+3)*lda
+ k
+ 1] * c1
;
71 sum
+= A
[(j
+3)*lda
+ k
+ 2] * c2
;
72 sum
+= A
[(j
+3)*lda
+ k
+ 3] * c3
;
73 C
[(j
+3)*lda
+ i
+ 1] += sum
;