//**************************************************************************
// Multi-threaded Matrix Multiply benchmark
//--------------------------------------------------------------------------
// TA     : Christopher Celio
// Student:
//
//
// This benchmark multiplies two 2-D arrays together and writes the result to
// a third array. The input data (and reference data) should be generated
// using the matmul_gendata.pl perl script and dumped to a file named
// dataset.h.


// print out arrays, etc.
//#define DEBUG

//--------------------------------------------------------------------------
// Includes

#include <string.h>
#include <stdlib.h>
#include <stdio.h>

//--------------------------------------------------------------------------
// Input/Reference Data

typedef float data_t;
#include "dataset.h"
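
// dataset.h is generated by matmul_gendata.pl. Based on how it is used below,
// it is expected to provide something along these lines (a sketch only --
// the actual sizes and values come from the generator):
//
//   #define ARRAY_SIZE 1024                               // DIM_SIZE * DIM_SIZE
//   #define DIM_SIZE   32
//   static data_t input1_data[ARRAY_SIZE] = { ... };      // matrix A
//   static data_t input2_data[ARRAY_SIZE] = { ... };      // matrix B
//   static data_t verify_data[ARRAY_SIZE] = { ... };      // reference result A*B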


//--------------------------------------------------------------------------
// Basic Utilities and Multi-thread Support

__thread unsigned long coreid;
unsigned long ncores;

#include "util.h"

#define stringify_1(s) #s
#define stringify(s) stringify_1(s)
#define stats(code) do { \
    unsigned long _c = -rdcycle(), _i = -rdinstret(); \
    code; \
    _c += rdcycle(), _i += rdinstret(); \
    if (coreid == 0) \
      printf("%s: %ld cycles, %ld.%ld cycles/iter, %ld.%ld CPI\n", \
             stringify(code), _c, _c/DIM_SIZE/DIM_SIZE/DIM_SIZE, 10*_c/DIM_SIZE/DIM_SIZE/DIM_SIZE%10, _c/_i, 10*_c/_i%10); \
  } while(0)
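
// Typical use, as in thread_entry() below -- time the kernel together with the
// trailing barrier so the report reflects the slowest core:
//
//   stats(matmul(DIM_SIZE, input1_data, input2_data, results_data); barrier());
//
// Only core 0 prints. rdcycle(), rdinstret() and barrier() are not defined in
// this file; they are presumably provided by util.h.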


//--------------------------------------------------------------------------
// Helper functions

void printArray( char name[], int n, data_t arr[] )
{
   int i;
   if (coreid != 0)
      return;

   printf( " %10s :", name );
   for ( i = 0; i < n; i++ )
      printf( " %3ld ", (long) arr[i] );
   printf( "\n" );
}

void __attribute__((noinline)) verify(size_t n, const data_t* test, const data_t* correct)
{
   if (coreid != 0)
      return;

   size_t i;
   for (i = 0; i < n; i++)
   {
      if (test[i] != correct[i])
      {
         printf("FAILED test[%ld]= %3ld, correct[%ld]= %3ld\n",
                (long)i, (long)test[i], (long)i, (long)correct[i]);
         exit(-1);
      }
   }

   return;
}

//--------------------------------------------------------------------------
// matmul function

// single-thread, naive version
void __attribute__((noinline)) matmul_naive(const int lda, const data_t A[], const data_t B[], data_t C[] )
{
   int i, j, k;

   if (coreid > 0)
      return;

   for ( i = 0; i < lda; i++ )
   {
      for ( j = 0; j < lda; j++ )
      {
         for ( k = 0; k < lda; k++ )
         {
            C[i + j*lda] += A[j*lda + k] * B[k*lda + i];
         }
      }
   }
}



void __attribute__((noinline)) matmul(const int lda, const data_t A[], const data_t B[], data_t C[] )
{

   // ***************************** //
   // **** ADD YOUR CODE HERE ***** //
   // ***************************** //
   //
   // Feel free to make separate functions for the MI and MSI versions.

   int i, j, k;

   // Earlier attempt (noted at 547287, presumably the cycle count): split the
   // output columns across the cores and fully unroll the dot product by hand.
   // B is still walked column-wise, so this is disabled in favor of the
   // version below.
#if 0
   for ( i = coreid*lda/ncores; i < (coreid+1)*lda/ncores; i++ )
   {
      for ( j = 0; j < lda; j++ )
      {
         int aIndex = j*lda;
         int cIndex = i + aIndex;
         C[cIndex] += A[aIndex] * B[i];
         C[cIndex] += A[aIndex + 1] * B[1*lda + i];
         C[cIndex] += A[aIndex + 2] * B[2*lda + i];
         C[cIndex] += A[aIndex + 3] * B[3*lda + i];
         C[cIndex] += A[aIndex + 4] * B[4*lda + i];
         C[cIndex] += A[aIndex + 5] * B[5*lda + i];
         C[cIndex] += A[aIndex + 6] * B[6*lda + i];
         C[cIndex] += A[aIndex + 7] * B[7*lda + i];
         C[cIndex] += A[aIndex + 8] * B[8*lda + i];
         C[cIndex] += A[aIndex + 9] * B[9*lda + i];
         C[cIndex] += A[aIndex + 10] * B[10*lda + i];
         C[cIndex] += A[aIndex + 11] * B[11*lda + i];
         C[cIndex] += A[aIndex + 12] * B[12*lda + i];
         C[cIndex] += A[aIndex + 13] * B[13*lda + i];
         C[cIndex] += A[aIndex + 14] * B[14*lda + i];
         C[cIndex] += A[aIndex + 15] * B[15*lda + i];
         C[cIndex] += A[aIndex + 16] * B[16*lda + i];
         C[cIndex] += A[aIndex + 17] * B[17*lda + i];
         C[cIndex] += A[aIndex + 18] * B[18*lda + i];
         C[cIndex] += A[aIndex + 19] * B[19*lda + i];
         C[cIndex] += A[aIndex + 20] * B[20*lda + i];
         C[cIndex] += A[aIndex + 21] * B[21*lda + i];
         C[cIndex] += A[aIndex + 22] * B[22*lda + i];
         C[cIndex] += A[aIndex + 23] * B[23*lda + i];
         C[cIndex] += A[aIndex + 24] * B[24*lda + i];
         C[cIndex] += A[aIndex + 25] * B[25*lda + i];
         C[cIndex] += A[aIndex + 26] * B[26*lda + i];
         C[cIndex] += A[aIndex + 27] * B[27*lda + i];
         C[cIndex] += A[aIndex + 28] * B[28*lda + i];
         C[cIndex] += A[aIndex + 29] * B[29*lda + i];
         C[cIndex] += A[aIndex + 30] * B[30*lda + i];
         C[cIndex] += A[aIndex + 31] * B[31*lda + i];
      }
   }
#endif

   // Earlier attempt (492827): same column split, with a rolled inner loop
   // over k. A partially unrolled variant is kept below, still commented out.
#if 0
   for ( i = coreid*lda/ncores; i < (coreid+1)*lda/ncores; i++ )
   {
      for ( j = 0; j < lda; j++ )
      {
         int aIndex = j*lda;
         int cIndex = i + aIndex;
         for ( k = 0; k < lda; k++ )
         {
            C[cIndex] += A[aIndex + k] * B[k*lda + i];
            /*
            C[cIndex] += A[aIndex + k+1] * B[(k+1)*lda + i];
            C[cIndex] += A[aIndex + k+2] * B[(k+2)*lda + i];
            C[cIndex] += A[aIndex + k+3] * B[(k+3)*lda + i];
            C[cIndex] += A[aIndex + k+4] * B[(k+4)*lda + i];
            C[cIndex] += A[aIndex + k+5] * B[(k+5)*lda + i];
            C[cIndex] += A[aIndex + k+6] * B[(k+6)*lda + i];
            C[cIndex] += A[aIndex + k+7] * B[(k+7)*lda + i];
            C[cIndex] += A[aIndex + k+8] * B[(k+8)*lda + i];
            C[cIndex] += A[aIndex + k+9] * B[(k+9)*lda + i];
            C[cIndex] += A[aIndex + k+10] * B[(k+10)*lda + i];
            C[cIndex] += A[aIndex + k+11] * B[(k+11)*lda + i];
            C[cIndex] += A[aIndex + k+12] * B[(k+12)*lda + i];
            C[cIndex] += A[aIndex + k+13] * B[(k+13)*lda + i];
            C[cIndex] += A[aIndex + k+14] * B[(k+14)*lda + i];
            C[cIndex] += A[aIndex + k+15] * B[(k+15)*lda + i];
            */
         }
      }
   }
#endif

   // Earlier attempt (326378): transpose B into a local scratch array first so
   // the inner dot product has unit stride, split the rows of C across the
   // cores, and unroll by 8. Superseded by the version below.
#if 0
   data_t bTrans[1024];

   for (int counti = 0; counti < 32; counti++) {
      for (int countj = 0; countj < 32; countj++) {
         bTrans[counti + countj*lda] = B[countj + counti*lda];
      }
   }

   int BLOCKSIZE = 8;
   for ( i = 0; i < lda; i += BLOCKSIZE )
   {
      for ( int iTemp = i; iTemp < i + BLOCKSIZE; iTemp++ ) {
         int iFlag = iTemp*lda;
         for ( j = coreid*lda/ncores; j < (coreid+1)*lda/ncores; j++ ) {
            int jFlag = j*lda;
            int cLoc = jFlag + iTemp;
            for ( k = 0; k < lda; k += 8 ) {
               C[cLoc] += A[jFlag+k]   * bTrans[iFlag+k];
               C[cLoc] += A[jFlag+k+1] * bTrans[iFlag+k+1];
               C[cLoc] += A[jFlag+k+2] * bTrans[iFlag+k+2];
               C[cLoc] += A[jFlag+k+3] * bTrans[iFlag+k+3];
               C[cLoc] += A[jFlag+k+4] * bTrans[iFlag+k+4];
               C[cLoc] += A[jFlag+k+5] * bTrans[iFlag+k+5];
               C[cLoc] += A[jFlag+k+6] * bTrans[iFlag+k+6];
               C[cLoc] += A[jFlag+k+7] * bTrans[iFlag+k+7];
            }
         }
      }
   }
#endif

   // Final version: B is first transposed into a local scratch array so the
   // inner dot product walks both A and bTrans with unit stride. Each core
   // produces a contiguous slice of the output columns, and the dot product
   // is unrolled by 16.

   data_t bTrans[1024];   // scratch copy of B transposed (sized for DIM_SIZE == 32)

   for (int counti = 0; counti < lda; counti++) {
      for (int countj = 0; countj < lda; countj++) {
         bTrans[counti + countj*lda] = B[countj + counti*lda];
      }
   }

   int BLOCKSIZE = 8;
   for ( j = 0; j < lda; j++ )
   {
      int jFlag = j*lda;   // start of row j in A and C

      // This core only computes columns [coreid*lda/ncores, (coreid+1)*lda/ncores) of C.
      for ( i = coreid*lda/ncores; i < (coreid+1)*lda/ncores; i += BLOCKSIZE ) {
         for ( int iTemp = i; iTemp < i + BLOCKSIZE; iTemp++ ) {
            int iFlag = iTemp*lda;   // start of row iTemp in bTrans (= column iTemp of B)
            int cLoc = jFlag + iTemp;

            // C[j*lda + iTemp] += dot(row j of A, column iTemp of B), unrolled by 16.
            for ( k = 0; k < lda; k += 16 ) {
               C[cLoc] += A[jFlag+k]    * bTrans[iFlag+k];
               C[cLoc] += A[jFlag+k+1]  * bTrans[iFlag+k+1];
               C[cLoc] += A[jFlag+k+2]  * bTrans[iFlag+k+2];
               C[cLoc] += A[jFlag+k+3]  * bTrans[iFlag+k+3];
               C[cLoc] += A[jFlag+k+4]  * bTrans[iFlag+k+4];
               C[cLoc] += A[jFlag+k+5]  * bTrans[iFlag+k+5];
               C[cLoc] += A[jFlag+k+6]  * bTrans[iFlag+k+6];
               C[cLoc] += A[jFlag+k+7]  * bTrans[iFlag+k+7];
               C[cLoc] += A[jFlag+k+8]  * bTrans[iFlag+k+8];
               C[cLoc] += A[jFlag+k+9]  * bTrans[iFlag+k+9];
               C[cLoc] += A[jFlag+k+10] * bTrans[iFlag+k+10];
               C[cLoc] += A[jFlag+k+11] * bTrans[iFlag+k+11];
               C[cLoc] += A[jFlag+k+12] * bTrans[iFlag+k+12];
               C[cLoc] += A[jFlag+k+13] * bTrans[iFlag+k+13];
               C[cLoc] += A[jFlag+k+14] * bTrans[iFlag+k+14];
               C[cLoc] += A[jFlag+k+15] * bTrans[iFlag+k+15];
            }
         }
      }
   }
}

//--------------------------------------------------------------------------
// Main
//
// all threads start executing thread_entry(). Use their "coreid" to
// differentiate between threads (each thread is running on a separate core).
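//
// For example, matmul() above gives each core the contiguous slice of output
// columns [coreid*lda/ncores, (coreid+1)*lda/ncores):
//
//   for ( i = coreid*lda/ncores; i < (coreid+1)*lda/ncores; i++ )
//      ...   // work owned exclusively by this core
//
// barrier() (presumably provided by util.h) keeps the cores in lock-step
// between phases so that timing and verification see completed results.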

void thread_entry(int cid, int nc)
{
   coreid = cid;
   ncores = nc;

   // static allocates data in the binary, which is visible to both threads
   static data_t results_data[ARRAY_SIZE];


//   // Execute the provided, naive matmul
//   barrier();
//   stats(matmul_naive(DIM_SIZE, input1_data, input2_data, results_data); barrier());
//
//
//   // verify
//   verify(ARRAY_SIZE, results_data, verify_data);
//
//   // clear results from the first trial
//   size_t i;
//   if (coreid == 0)
//      for (i=0; i < ARRAY_SIZE; i++)
//         results_data[i] = 0;
//   barrier();

   // Execute your faster matmul
   barrier();
   stats(matmul(DIM_SIZE, input1_data, input2_data, results_data); barrier());

#ifdef DEBUG
   printArray("results:", ARRAY_SIZE, results_data);
   printArray("verify :", ARRAY_SIZE, verify_data);
#endif

   // verify
   verify(ARRAY_SIZE, results_data, verify_data);
   barrier();

   exit(0);
}