Change the global pointer symbol to __global_pointer$
[riscv-tests.git] / mt / df_matmul.c
1 #include "stdlib.h"
2
3 #include "util.h"
4
5 #include "dataset.h"
6 void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda, const data_t A[], const data_t B[], data_t C[] )
7 {
8
9 // ***************************** //
10 // **** ADD YOUR CODE HERE ***** //
11 // ***************************** //
12 //
13 // feel free to make a separate function for MI and MSI versions.
14 int j, k;
15 data_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
16 data_t temp8, temp9, temp10, temp11, temp12, temp13, temp14, temp15;
17 if(coreid == 0) {
18 //16*0:16*(0+1) ;; 16*1+16*(1+1)
19 //0:16 ;; 16:32
20
21 //complete Q1
22 for(j = 0; j < 16; j++) {
23 temp0 = C[j*lda];
24 temp1 = C[1 + j*lda];
25 temp2 = C[2 + j*lda];
26 temp3 = C[3 + j*lda];
27 temp4 = C[4 + j*lda];
28 temp5 = C[5 + j*lda];
29 temp6 = C[6 + j*lda];
30 temp7 = C[7 + j*lda];
31 temp8 = C[8 + j*lda];
32 temp9 = C[9 + j*lda];
33 temp10 = C[10 + j*lda];
34 temp11 = C[11 + j*lda];
35 temp12 = C[12 + j*lda];
36 temp13 = C[13 + j*lda];
37 temp14 = C[14 + j*lda];
38 temp15 = C[15 + j*lda];
39 for(k = 0; k < 32; k++) {
40 temp0 += A[j*lda + k] * B[k*lda];
41 temp1 += A[j*lda + k] * B[1+k*lda];
42 temp2 += A[j*lda + k] * B[2+k*lda];
43 temp3 += A[j*lda + k] * B[3+k*lda];
44 temp4 += A[j*lda + k] * B[4+k*lda];
45 temp5 += A[j*lda + k] * B[5+k*lda];
46 temp6 += A[j*lda + k] * B[6+k*lda];
47 temp7 += A[j*lda + k] * B[7+k*lda];
48 temp8 += A[j*lda + k] * B[8+k*lda];
49 temp9 += A[j*lda + k] * B[9+k*lda];
50 temp10 += A[j*lda + k] * B[10+k*lda];
51 temp11 += A[j*lda + k] * B[11+k*lda];
52 temp12 += A[j*lda + k] * B[12+k*lda];
53 temp13 += A[j*lda + k] * B[13+k*lda];
54 temp14 += A[j*lda + k] * B[14+k*lda];
55 temp15 += A[j*lda + k] * B[15+k*lda];
56 }
57 C[j*lda] = temp0;
58 C[1 + j*lda] = temp1;
59 C[2 + j*lda] = temp2;
60 C[3 + j*lda] = temp3;
61 C[4 + j*lda] = temp4;
62 C[5 + j*lda] = temp5;
63 C[6 + j*lda] = temp6;
64 C[7 + j*lda] = temp7;
65 C[8 + j*lda] = temp8;
66 C[9 + j*lda] = temp9;
67 C[10 + j*lda] = temp10;
68 C[11 + j*lda] = temp11;
69 C[12 + j*lda] = temp12;
70 C[13 + j*lda] = temp13;
71 C[14 + j*lda] = temp14;
72 C[15 + j*lda] = temp15;
73 }
74 for(j = 16; j < 32; j++) {
75 temp0 = C[j*lda];
76 temp1 = C[1 + j*lda];
77 temp2 = C[2 + j*lda];
78 temp3 = C[3 + j*lda];
79 temp4 = C[4 + j*lda];
80 temp5 = C[5 + j*lda];
81 temp6 = C[6 + j*lda];
82 temp7 = C[7 + j*lda];
83 temp8 = C[8 + j*lda];
84 temp9 = C[9 + j*lda];
85 temp10 = C[10 + j*lda];
86 temp11 = C[11 + j*lda];
87 temp12 = C[12 + j*lda];
88 temp13 = C[13 + j*lda];
89 temp14 = C[14 + j*lda];
90 temp15 = C[15 + j*lda];
91 for(k = 0; k < 32; k++) {
92 temp0 += A[j*lda + k] * B[k*lda];
93 temp1 += A[j*lda + k] * B[1+k*lda];
94 temp2 += A[j*lda + k] * B[2+k*lda];
95 temp3 += A[j*lda + k] * B[3+k*lda];
96 temp4 += A[j*lda + k] * B[4+k*lda];
97 temp5 += A[j*lda + k] * B[5+k*lda];
98 temp6 += A[j*lda + k] * B[6+k*lda];
99 temp7 += A[j*lda + k] * B[7+k*lda];
100 temp8 += A[j*lda + k] * B[8+k*lda];
101 temp9 += A[j*lda + k] * B[9+k*lda];
102 temp10 += A[j*lda + k] * B[10+k*lda];
103 temp11 += A[j*lda + k] * B[11+k*lda];
104 temp12 += A[j*lda + k] * B[12+k*lda];
105 temp13 += A[j*lda + k] * B[13+k*lda];
106 temp14 += A[j*lda + k] * B[14+k*lda];
107 temp15 += A[j*lda + k] * B[15+k*lda];
108 }
109 C[j*lda] = temp0;
110 C[1 + j*lda] = temp1;
111 C[2 + j*lda] = temp2;
112 C[3 + j*lda] = temp3;
113 C[4 + j*lda] = temp4;
114 C[5 + j*lda] = temp5;
115 C[6 + j*lda] = temp6;
116 C[7 + j*lda] = temp7;
117 C[8 + j*lda] = temp8;
118 C[9 + j*lda] = temp9;
119 C[10 + j*lda] = temp10;
120 C[11 + j*lda] = temp11;
121 C[12 + j*lda] = temp12;
122 C[13 + j*lda] = temp13;
123 C[14 + j*lda] = temp14;
124 C[15 + j*lda] = temp15;
125 }
126 }
127 //16*(2-1) : 16*2 ;; 16*(1-1) : 16*1
128 //16:32 ;; 0:16
129 if(coreid == 1 || ncores == 1) {
130 //complete Q3
131 for(j = 16; j < 32; j++) {
132 temp0 = C[16+j*lda];
133 temp1 = C[17+j*lda];
134 temp2 = C[18+j*lda];
135 temp3 = C[19+j*lda];
136 temp4 = C[20+j*lda];
137 temp5 = C[21+j*lda];
138 temp6 = C[22+j*lda];
139 temp7 = C[23+j*lda];
140 temp8 = C[24+j*lda];
141 temp9 = C[25+j*lda];
142 temp10 = C[26+j*lda];
143 temp11 = C[27+j*lda];
144 temp12 = C[28+j*lda];
145 temp13 = C[29+j*lda];
146 temp14 = C[30+j*lda];
147 temp15 = C[31+j*lda];
148 for(k = 0; k < 32; k++) {
149 temp0 += A[j*lda + k] * B[16+k*lda];
150 temp1 += A[j*lda + k] * B[17+k*lda];
151 temp2 += A[j*lda + k] * B[18+k*lda];
152 temp3 += A[j*lda + k] * B[19+k*lda];
153 temp4 += A[j*lda + k] * B[20+k*lda];
154 temp5 += A[j*lda + k] * B[21+k*lda];
155 temp6 += A[j*lda + k] * B[22+k*lda];
156 temp7 += A[j*lda + k] * B[23+k*lda];
157 temp8 += A[j*lda + k] * B[24+k*lda];
158 temp9 += A[j*lda + k] * B[25+k*lda];
159 temp10 += A[j*lda + k] * B[26+k*lda];
160 temp11 += A[j*lda + k] * B[27+k*lda];
161 temp12 += A[j*lda + k] * B[28+k*lda];
162 temp13 += A[j*lda + k] * B[29+k*lda];
163 temp14 += A[j*lda + k] * B[30+k*lda];
164 temp15 += A[j*lda + k] * B[31+k*lda];
165 }
166 C[16 + j*lda] = temp0;
167 C[17 + j*lda] = temp1;
168 C[18 + j*lda] = temp2;
169 C[19 + j*lda] = temp3;
170 C[20 + j*lda] = temp4;
171 C[21 + j*lda] = temp5;
172 C[22 + j*lda] = temp6;
173 C[23 + j*lda] = temp7;
174 C[24 + j*lda] = temp8;
175 C[25 + j*lda] = temp9;
176 C[26 + j*lda] = temp10;
177 C[27 + j*lda] = temp11;
178 C[28 + j*lda] = temp12;
179 C[29 + j*lda] = temp13;
180 C[30 + j*lda] = temp14;
181 C[31 + j*lda] = temp15;
182 }
183 //complete Q4
184 for(j = 0; j < 16; j++) {
185 temp0 = C[16 + j*lda];
186 temp1 = C[17 + j*lda];
187 temp2 = C[18 + j*lda];
188 temp3 = C[19 + j*lda];
189 temp4 = C[20 + j*lda];
190 temp5 = C[21 + j*lda];
191 temp6 = C[22 + j*lda];
192 temp7 = C[23 + j*lda];
193 temp8 = C[24 + j*lda];
194 temp9 = C[25 + j*lda];
195 temp10 = C[26 + j*lda];
196 temp11 = C[27 + j*lda];
197 temp12 = C[28 + j*lda];
198 temp13 = C[29 + j*lda];
199 temp14 = C[30 + j*lda];
200 temp15 = C[31 + j*lda];
201 for(k = 0; k < 32; k++) {
202 temp0 += A[j*lda + k] * B[16 + k*lda];
203 temp1 += A[j*lda + k] * B[17 + k*lda];
204 temp2 += A[j*lda + k] * B[18 + k*lda];
205 temp3 += A[j*lda + k] * B[19 + k*lda];
206 temp4 += A[j*lda + k] * B[20 + k*lda];
207 temp5 += A[j*lda + k] * B[21 + k*lda];
208 temp6 += A[j*lda + k] * B[22 + k*lda];
209 temp7 += A[j*lda + k] * B[23 + k*lda];
210 temp8 += A[j*lda + k] * B[24 + k*lda];
211 temp9 += A[j*lda + k] * B[25 + k*lda];
212 temp10 += A[j*lda + k] * B[26 + k*lda];
213 temp11 += A[j*lda + k] * B[27 + k*lda];
214 temp12 += A[j*lda + k] * B[28 + k*lda];
215 temp13 += A[j*lda + k] * B[29 + k*lda];
216 temp14 += A[j*lda + k] * B[30 + k*lda];
217 temp15 += A[j*lda + k] * B[31 + k*lda];
218 }
219 C[16 + j*lda] = temp0;
220 C[17 + j*lda] = temp1;
221 C[18 + j*lda] = temp2;
222 C[19 + j*lda] = temp3;
223 C[20 + j*lda] = temp4;
224 C[21 + j*lda] = temp5;
225 C[22 + j*lda] = temp6;
226 C[23 + j*lda] = temp7;
227 C[24 + j*lda] = temp8;
228 C[25 + j*lda] = temp9;
229 C[26 + j*lda] = temp10;
230 C[27 + j*lda] = temp11;
231 C[28 + j*lda] = temp12;
232 C[29 + j*lda] = temp13;
233 C[30 + j*lda] = temp14;
234 C[31 + j*lda] = temp15;
235 }
236 }
237 }