6 void __attribute__((noinline
)) matmul(const int coreid
, const int ncores
, const int lda
, const data_t A
[], const data_t B
[], data_t C
[] )
9 // ***************************** //
10 // **** ADD YOUR CODE HERE ***** //
11 // ***************************** //
13 // Feel free to write separate functions for the MI and MSI versions.
16 int max
= space
*coreid
+space
;
17 static data_t B1
[32*32];
18 if (coreid
==ncores
-1){
19 for (i
=0; i
<lda
*lda
/ncores
;i
++)
25 for (i
=lda
*lda
/ncores
;i
<lda
*lda
;i
++)
52 if (coreid
!=ncores
-1){
53 for (i
=space
*coreid
;i
<max
/4*4;i
+=4)
55 for(j
=0;j
<lda
/4*4;j
+=4)
62 temp1_1
=C
[j
+1+(i
+1)*lda
];
63 temp2_1
=C
[j
+1+(i
+2)*lda
];
64 temp3_1
=C
[j
+1+(i
+3)*lda
];
66 temp1_2
=C
[j
+2+(i
+1)*lda
];
67 temp2_2
=C
[j
+2+(i
+2)*lda
];
68 temp3_2
=C
[j
+2+(i
+3)*lda
];
70 temp1_3
=C
[j
+3+(i
+1)*lda
];
71 temp2_3
=C
[j
+3+(i
+2)*lda
];
72 temp3_3
=C
[j
+3+(i
+3)*lda
];
76 temp
+=A
[k
+i
*lda
]*tempB
;
77 temp1
+=A
[k
+(i
+1)*lda
]*tempB
;
78 temp2
+=A
[k
+(i
+2)*lda
]*tempB
;
79 temp3
+=A
[k
+(i
+3)*lda
]*tempB
;
82 temp_1
+=A
[k
+i
*lda
]*tempB_1
;
83 temp1_1
+=A
[k
+(i
+1)*lda
]*tempB_1
;
84 temp2_1
+=A
[k
+(i
+2)*lda
]*tempB_1
;
85 temp3_1
+=A
[k
+(i
+3)*lda
]*tempB_1
;
88 temp_2
+=A
[k
+i
*lda
]*tempB_2
;
89 temp1_2
+=A
[k
+(i
+1)*lda
]*tempB_2
;
90 temp2_2
+=A
[k
+(i
+2)*lda
]*tempB_2
;
91 temp3_2
+=A
[k
+(i
+3)*lda
]*tempB_2
;
94 temp_3
+=A
[k
+i
*lda
]*tempB_3
;
95 temp1_3
+=A
[k
+(i
+1)*lda
]*tempB_3
;
96 temp2_3
+=A
[k
+(i
+2)*lda
]*tempB_3
;
97 temp3_3
+=A
[k
+(i
+3)*lda
]*tempB_3
;
100 C
[j
+(i
+1)*lda
]=temp1
;
101 C
[j
+(i
+2)*lda
]=temp2
;
102 C
[j
+(i
+3)*lda
]=temp3
;
105 C
[j
+1+(i
+1)*lda
]=temp1_1
;
106 C
[j
+1+(i
+2)*lda
]=temp2_1
;
107 C
[j
+1+(i
+3)*lda
]=temp3_1
;
110 C
[j
+2+(i
+1)*lda
]=temp1_2
;
111 C
[j
+2+(i
+2)*lda
]=temp2_2
;
112 C
[j
+2+(i
+3)*lda
]=temp3_2
;
115 C
[j
+3+(i
+1)*lda
]=temp1_3
;
116 C
[j
+3+(i
+2)*lda
]=temp2_3
;
117 C
[j
+3+(i
+3)*lda
]=temp3_3
;
123 for (i
=space
*coreid
;i
<lda
/4*4;i
+=4)
125 for(j
=0;j
<lda
/4*4;j
+=4)
128 temp1
=C
[j
+(i
+1)*lda
];
129 temp2
=C
[j
+(i
+2)*lda
];
130 temp3
=C
[j
+(i
+3)*lda
];
132 temp1_1
=C
[j
+1+(i
+1)*lda
];
133 temp2_1
=C
[j
+1+(i
+2)*lda
];
134 temp3_1
=C
[j
+1+(i
+3)*lda
];
136 temp1_2
=C
[j
+2+(i
+1)*lda
];
137 temp2_2
=C
[j
+2+(i
+2)*lda
];
138 temp3_2
=C
[j
+2+(i
+3)*lda
];
140 temp1_3
=C
[j
+3+(i
+1)*lda
];
141 temp2_3
=C
[j
+3+(i
+2)*lda
];
142 temp3_3
=C
[j
+3+(i
+3)*lda
];
146 temp
+=A
[k
+i
*lda
]*tempB
;
147 temp1
+=A
[k
+(i
+1)*lda
]*tempB
;
148 temp2
+=A
[k
+(i
+2)*lda
]*tempB
;
149 temp3
+=A
[k
+(i
+3)*lda
]*tempB
;
151 tempB_1
=B1
[j
+1+k
*lda
];
152 temp_1
+=A
[k
+i
*lda
]*tempB_1
;
153 temp1_1
+=A
[k
+(i
+1)*lda
]*tempB_1
;
154 temp2_1
+=A
[k
+(i
+2)*lda
]*tempB_1
;
155 temp3_1
+=A
[k
+(i
+3)*lda
]*tempB_1
;
157 tempB_2
=B1
[j
+2+k
*lda
];
158 temp_2
+=A
[k
+i
*lda
]*tempB_2
;
159 temp1_2
+=A
[k
+(i
+1)*lda
]*tempB_2
;
160 temp2_2
+=A
[k
+(i
+2)*lda
]*tempB_2
;
161 temp3_2
+=A
[k
+(i
+3)*lda
]*tempB_2
;
163 tempB_3
=B1
[j
+3+k
*lda
];
164 temp_3
+=A
[k
+i
*lda
]*tempB_3
;
165 temp1_3
+=A
[k
+(i
+1)*lda
]*tempB_3
;
166 temp2_3
+=A
[k
+(i
+2)*lda
]*tempB_3
;
167 temp3_3
+=A
[k
+(i
+3)*lda
]*tempB_3
;
170 C
[j
+(i
+1)*lda
]=temp1
;
171 C
[j
+(i
+2)*lda
]=temp2
;
172 C
[j
+(i
+3)*lda
]=temp3
;
175 C
[j
+1+(i
+1)*lda
]=temp1_1
;
176 C
[j
+1+(i
+2)*lda
]=temp2_1
;
177 C
[j
+1+(i
+3)*lda
]=temp3_1
;
180 C
[j
+2+(i
+1)*lda
]=temp1_2
;
181 C
[j
+2+(i
+2)*lda
]=temp2_2
;
182 C
[j
+2+(i
+3)*lda
]=temp3_2
;
185 C
[j
+3+(i
+1)*lda
]=temp1_3
;
186 C
[j
+3+(i
+2)*lda
]=temp2_3
;
187 C
[j
+3+(i
+3)*lda
]=temp3_3
;