6 void __attribute__((noinline
)) matmul(const int coreid
, const int ncores
, const int lda
, const data_t A
[], const data_t B
[], data_t C
[] )
10 data_t acc_temp0
, acc_temp1
;
12 data_t
*A_j_k
, *B_i_k
;
15 //for (i = 0; i < 32; i++) {
16 // for (j = 0; j < 32; j++) {
17 // B_trans[i*lda+j] = B[i+j*lda];
22 for (i
= 0; i
< 32; i
++) {
24 for (z
= 0; z
< 32; z
++) {
27 for (j
= 0; j
< 16; j
+=2) {
30 for (k
= 0; k
< 32; k
+=8) {
33 acc_temp0
+= *(A_j_k
) * *(B_i_k
);
34 acc_temp0
+= *(A_j_k
+ 1) * *(B_i_k
+ 1);
35 acc_temp0
+= *(A_j_k
+ 2) * *(B_i_k
+ 2);
36 acc_temp0
+= *(A_j_k
+ 3) * *(B_i_k
+ 3);
37 acc_temp0
+= *(A_j_k
+ 4) * *(B_i_k
+ 4);
38 acc_temp0
+= *(A_j_k
+ 5) * *(B_i_k
+ 5);
39 acc_temp0
+= *(A_j_k
+ 6) * *(B_i_k
+ 6);
40 acc_temp0
+= *(A_j_k
+ 7) * *(B_i_k
+ 7);
45 for (k
= 0; k
< 32; k
+=8) {
46 acc_temp1
+= *(A_j
+k
) * *(B_i
+k
);
47 acc_temp1
+= *(A_j
+k
+ 1) * *(B_i
+k
+ 1);
48 acc_temp1
+= *(A_j
+k
+ 2) * *(B_i
+k
+ 2);
49 acc_temp1
+= *(A_j
+k
+ 3) * *(B_i
+k
+ 3);
50 acc_temp1
+= *(A_j
+k
+ 4) * *(B_i
+k
+ 4);
51 acc_temp1
+= *(A_j
+k
+ 5) * *(B_i
+k
+ 5);
52 acc_temp1
+= *(A_j
+k
+ 6) * *(B_i
+k
+ 6);
53 acc_temp1
+= *(A_j
+k
+ 7) * *(B_i
+k
+ 7);
56 C
[i
+ j
*lda
] = acc_temp0
;
57 C
[i
+ (j
+1)*lda
] = acc_temp1
;
61 if (coreid
== 1 || ncores
== 1) {
62 for (i
= 0; i
< 32; i
++) {
64 for (z
= 0; z
< 32; z
++) {
67 for (j
= 16; j
< 32; j
+=2) {
70 for (k
= 0; k
< 32; k
+=8) {
71 acc_temp0
+= *(A_j
+k
) * *(B_i
+k
);
72 acc_temp0
+= *(A_j
+k
+ 1) * *(B_i
+k
+ 1);
73 acc_temp0
+= *(A_j
+k
+ 2) * *(B_i
+k
+ 2);
74 acc_temp0
+= *(A_j
+k
+ 3) * *(B_i
+k
+ 3);
75 acc_temp0
+= *(A_j
+k
+ 4) * *(B_i
+k
+ 4);
76 acc_temp0
+= *(A_j
+k
+ 5) * *(B_i
+k
+ 5);
77 acc_temp0
+= *(A_j
+k
+ 6) * *(B_i
+k
+ 6);
78 acc_temp0
+= *(A_j
+k
+ 7) * *(B_i
+k
+ 7);
83 for (k
= 0; k
< 32; k
+=8) {
84 acc_temp1
+= *(A_j
+k
) * *(B_i
+k
);
85 acc_temp1
+= *(A_j
+k
+ 1) * *(B_i
+k
+ 1);
86 acc_temp1
+= *(A_j
+k
+ 2) * *(B_i
+k
+ 2);
87 acc_temp1
+= *(A_j
+k
+ 3) * *(B_i
+k
+ 3);
88 acc_temp1
+= *(A_j
+k
+ 4) * *(B_i
+k
+ 4);
89 acc_temp1
+= *(A_j
+k
+ 5) * *(B_i
+k
+ 5);
90 acc_temp1
+= *(A_j
+k
+ 6) * *(B_i
+k
+ 6);
91 acc_temp1
+= *(A_j
+k
+ 7) * *(B_i
+k
+ 7);
93 C
[i
+ j
*lda
] = acc_temp0
;
94 C
[i
+ (j
+1)*lda
] = acc_temp1
;