6 void __attribute__((noinline
)) matmul(const int coreid
, const int ncores
, const int lda
, const data_t A
[], const data_t B
[], data_t C
[] )
9 // ***************************** //
10 // **** ADD YOUR CODE HERE ***** //
11 // ***************************** //
13 // feel free to make a separate function for MI and MSI versions.
16 int max
= space
*coreid
+space
;
45 if (coreid
!=ncores
-1){
47 for (i
=space
*coreid
;i
<max
/4*4;i
+=4)
56 temp1_1
=C
[j
+(i
+1)*lda
];
57 temp2_1
=C
[j
+1+(i
+1)*lda
];
58 temp3_1
=C
[j
+2+(i
+1)*lda
];
59 temp4_1
=C
[j
+3+(i
+1)*lda
];
61 temp1_2
=C
[j
+(i
+2)*lda
];
62 temp2_2
=C
[j
+1+(i
+2)*lda
];
63 temp3_2
=C
[j
+2+(i
+2)*lda
];
64 temp4_2
=C
[j
+3+(i
+2)*lda
];
66 temp1_3
=C
[j
+(i
+3)*lda
];
67 temp2_3
=C
[j
+1+(i
+3)*lda
];
68 temp3_3
=C
[j
+2+(i
+3)*lda
];
69 temp4_3
=C
[j
+3+(i
+3)*lda
];
73 temp1
+=temp
*B
[j
+k
*lda
];
74 temp2
+=temp
*B
[j
+1+k
*lda
];
75 temp3
+=temp
*B
[j
+2+k
*lda
];
76 temp4
+=temp
*B
[j
+3+k
*lda
];
78 temp_1
=A
[k
+(i
+1)*lda
];
79 temp1_1
+=temp_1
*B
[j
+k
*lda
];
80 temp2_1
+=temp_1
*B
[j
+1+k
*lda
];
81 temp3_1
+=temp_1
*B
[j
+2+k
*lda
];
82 temp4_1
+=temp_1
*B
[j
+3+k
*lda
];
84 temp_2
=A
[k
+(i
+2)*lda
];
85 temp1_2
+=temp_2
*B
[j
+k
*lda
];
86 temp2_2
+=temp_2
*B
[j
+1+k
*lda
];
87 temp3_2
+=temp_2
*B
[j
+2+k
*lda
];
88 temp4_2
+=temp_2
*B
[j
+3+k
*lda
];
90 temp_3
=A
[k
+(i
+3)*lda
];
91 temp1_3
+=temp_3
*B
[j
+k
*lda
];
92 temp2_3
+=temp_3
*B
[j
+1+k
*lda
];
93 temp3_3
+=temp_3
*B
[j
+2+k
*lda
];
94 temp4_3
+=temp_3
*B
[j
+3+k
*lda
];
102 C
[j
+(i
+1)*lda
]=temp1_1
;
103 C
[j
+1+(i
+1)*lda
]=temp2_1
;
104 C
[j
+2+(i
+1)*lda
]=temp3_1
;
105 C
[j
+3+(i
+1)*lda
]=temp4_1
;
107 C
[j
+(i
+2)*lda
]=temp1_2
;
108 C
[j
+1+(i
+2)*lda
]=temp2_2
;
109 C
[j
+2+(i
+2)*lda
]=temp3_2
;
110 C
[j
+3+(i
+2)*lda
]=temp4_2
;
112 C
[j
+(i
+3)*lda
]=temp1_3
;
113 C
[j
+1+(i
+3)*lda
]=temp2_3
;
114 C
[j
+2+(i
+3)*lda
]=temp3_3
;
115 C
[j
+3+(i
+3)*lda
]=temp4_3
;
127 for (i
=space
*coreid
;i
<lda
/4*4;i
+=4)
136 temp1_1
=C
[j
+(i
+1)*lda
];
137 temp2_1
=C
[j
+1+(i
+1)*lda
];
138 temp3_1
=C
[j
+2+(i
+1)*lda
];
139 temp4_1
=C
[j
+3+(i
+1)*lda
];
141 temp1_2
=C
[j
+(i
+2)*lda
];
142 temp2_2
=C
[j
+1+(i
+2)*lda
];
143 temp3_2
=C
[j
+2+(i
+2)*lda
];
144 temp4_2
=C
[j
+3+(i
+2)*lda
];
146 temp1_3
=C
[j
+(i
+3)*lda
];
147 temp2_3
=C
[j
+1+(i
+3)*lda
];
148 temp3_3
=C
[j
+2+(i
+3)*lda
];
149 temp4_3
=C
[j
+3+(i
+3)*lda
];
153 temp1
+=temp
*B
[j
+k
*lda
];
154 temp2
+=temp
*B
[j
+1+k
*lda
];
155 temp3
+=temp
*B
[j
+2+k
*lda
];
156 temp4
+=temp
*B
[j
+3+k
*lda
];
158 temp_1
=A
[k
+(i
+1)*lda
];
159 temp1_1
+=temp_1
*B
[j
+k
*lda
];
160 temp2_1
+=temp_1
*B
[j
+1+k
*lda
];
161 temp3_1
+=temp_1
*B
[j
+2+k
*lda
];
162 temp4_1
+=temp_1
*B
[j
+3+k
*lda
];
164 temp_2
=A
[k
+(i
+2)*lda
];
165 temp1_2
+=temp_2
*B
[j
+k
*lda
];
166 temp2_2
+=temp_2
*B
[j
+1+k
*lda
];
167 temp3_2
+=temp_2
*B
[j
+2+k
*lda
];
168 temp4_2
+=temp_2
*B
[j
+3+k
*lda
];
170 temp_3
=A
[k
+(i
+3)*lda
];
171 temp1_3
+=temp_3
*B
[j
+k
*lda
];
172 temp2_3
+=temp_3
*B
[j
+1+k
*lda
];
173 temp3_3
+=temp_3
*B
[j
+2+k
*lda
];
174 temp4_3
+=temp_3
*B
[j
+3+k
*lda
];
182 C
[j
+(i
+1)*lda
]=temp1_1
;
183 C
[j
+1+(i
+1)*lda
]=temp2_1
;
184 C
[j
+2+(i
+1)*lda
]=temp3_1
;
185 C
[j
+3+(i
+1)*lda
]=temp4_1
;
187 C
[j
+(i
+2)*lda
]=temp1_2
;
188 C
[j
+1+(i
+2)*lda
]=temp2_2
;
189 C
[j
+2+(i
+2)*lda
]=temp3_2
;
190 C
[j
+3+(i
+2)*lda
]=temp4_2
;
192 C
[j
+(i
+3)*lda
]=temp1_3
;
193 C
[j
+1+(i
+3)*lda
]=temp2_3
;
194 C
[j
+2+(i
+3)*lda
]=temp3_3
;
195 C
[j
+3+(i
+3)*lda
]=temp4_3
;