// Multithreading tests from CS152 Lab 5
// Origin: riscv-tests.git / mt / bp_matmul / matmul_mi.c
1 //**************************************************************************
2 // Multi-threaded Matrix Multiply benchmark
3 //--------------------------------------------------------------------------
4 // TA : Christopher Celio
5 // Student:
6 //
7 //
8 // This benchmark multiplies two 2-D arrays together and writes the results to
9 // a third vector. The input data (and reference data) should be generated
10 // using the matmul_gendata.pl perl script and dumped to a file named
11 // dataset.h.
12
13
14 // print out arrays, etc.
15 //#define DEBUG
16
17 //--------------------------------------------------------------------------
18 // Includes
19
20 #include <string.h>
21 #include <stdlib.h>
22 #include <stdio.h>
23
24
25 //--------------------------------------------------------------------------
26 // Input/Reference Data
27
28 typedef float data_t;
29 #include "dataset.h"
30
31
32 //--------------------------------------------------------------------------
33 // Basic Utilities and Multi-thread Support
34
35 __thread unsigned long coreid;
36 unsigned long ncores;
37
38 #include "util.h"
39
// Two-level stringification: stringify(s) expands macro arguments first,
// then turns the expansion into a string literal.
#define stringify_1(s) #s
#define stringify(s) stringify_1(s)
// Time `code` using the cycle and retired-instruction counters, then have
// core 0 print total cycles, cycles per inner-loop iteration (DIM_SIZE^3
// multiply-adds, printed with one fractional digit), and CPI.
// The negate-then-add idiom (_c = -rdcycle(); code; _c += rdcycle())
// leaves _c holding the elapsed cycle count.
#define stats(code) do { \
    unsigned long _c = -rdcycle(), _i = -rdinstret(); \
    code; \
    _c += rdcycle(), _i += rdinstret(); \
    if (coreid == 0) \
      printf("%s: %ld cycles, %ld.%ld cycles/iter, %ld.%ld CPI\n", \
             stringify(code), _c, _c/DIM_SIZE/DIM_SIZE/DIM_SIZE, 10*_c/DIM_SIZE/DIM_SIZE/DIM_SIZE%10, _c/_i, 10*_c/_i%10); \
  } while(0)
50
51
52 //--------------------------------------------------------------------------
53 // Helper functions
54
55 void printArray( char name[], int n, data_t arr[] )
56 {
57 int i;
58 if (coreid != 0)
59 return;
60
61 printf( " %10s :", name );
62 for ( i = 0; i < n; i++ )
63 printf( " %3ld ", (long) arr[i] );
64 printf( "\n" );
65 }
66
67 void __attribute__((noinline)) verify(size_t n, const data_t* test, const data_t* correct)
68 {
69 if (coreid != 0)
70 return;
71
72 size_t i;
73 for (i = 0; i < n; i++)
74 {
75 if (test[i] != correct[i])
76 {
77 printf("FAILED test[%d]= %3ld, correct[%d]= %3ld\n",
78 i, (long)test[i], i, (long)correct[i]);
79 exit(-1);
80 }
81 }
82
83 return;
84 }
85
86 //--------------------------------------------------------------------------
87 // matmul function
88
89 // single-thread, naive version
90 void __attribute__((noinline)) matmul_naive(const int lda, const data_t A[], const data_t B[], data_t C[] )
91 {
92 int i, j, k;
93
94 if (coreid > 0)
95 return;
96
97 for ( i = 0; i < lda; i++ )
98 for ( j = 0; j < lda; j++ )
99 {
100 for ( k = 0; k < lda; k++ )
101 {
102 C[i + j*lda] += A[j*lda + k] * B[k*lda + i];
103 }
104 }
105
106 }
107
108 void __attribute__((noinline)) matmul_MI_transpose(const int lda, const data_t A[], const data_t B[], data_t C[] )
109 {
110 int i, j, k;
111 data_t B_trans[32*32];
112 data_t acc_temp0, acc_temp1;
113 data_t *A_j, *B_i;
114 data_t *A_j_k, *B_i_k;
115 int z;
116
117 //for (i = 0; i < 32; i++) {
118 // for (j = 0; j < 32; j++) {
119 // B_trans[i*lda+j] = B[i+j*lda];
120 // }
121 //}
122
123 if (coreid == 0) {
124 for (i = 0; i < 32; i++) {
125 B_i = B_trans+i*32;
126 for (z = 0; z < 32; z++) {
127 *(B_i+z) = B[i+z*32];
128 }
129 for (j = 0; j < 16; j+=2) {
130 A_j = A+j*lda;
131 acc_temp0 = 0;
132 for (k = 0; k < 32; k+=8) {
133 A_j_k = A_j+k;
134 B_i_k = B_i+k;
135 acc_temp0 += *(A_j_k) * *(B_i_k);
136 acc_temp0 += *(A_j_k + 1) * *(B_i_k + 1);
137 acc_temp0 += *(A_j_k + 2) * *(B_i_k + 2);
138 acc_temp0 += *(A_j_k + 3) * *(B_i_k + 3);
139 acc_temp0 += *(A_j_k + 4) * *(B_i_k + 4);
140 acc_temp0 += *(A_j_k + 5) * *(B_i_k + 5);
141 acc_temp0 += *(A_j_k + 6) * *(B_i_k + 6);
142 acc_temp0 += *(A_j_k + 7) * *(B_i_k + 7);
143 }
144 A_j += 32;
145
146 acc_temp1 = 0;
147 for (k = 0; k < 32; k+=8) {
148 acc_temp1 += *(A_j+k) * *(B_i+k);
149 acc_temp1 += *(A_j+k + 1) * *(B_i+k + 1);
150 acc_temp1 += *(A_j+k + 2) * *(B_i+k + 2);
151 acc_temp1 += *(A_j+k + 3) * *(B_i+k + 3);
152 acc_temp1 += *(A_j+k + 4) * *(B_i+k + 4);
153 acc_temp1 += *(A_j+k + 5) * *(B_i+k + 5);
154 acc_temp1 += *(A_j+k + 6) * *(B_i+k + 6);
155 acc_temp1 += *(A_j+k + 7) * *(B_i+k + 7);
156 }
157
158 C[i + j*lda] = acc_temp0;
159 C[i + (j+1)*lda] = acc_temp1;
160 }
161 }
162 } else if (coreid == 1) {
163 for (i = 0; i < 32; i++) {
164 B_i = B_trans+i*32;
165 for (z = 0; z < 32; z++) {
166 *(B_i+z) = B[i+z*32];
167 }
168 for (j = 16; j < 32; j+=2) {
169 A_j = A+j*lda;
170 acc_temp0 = 0;
171 for (k = 0; k < 32; k+=8) {
172 acc_temp0 += *(A_j+k) * *(B_i+k);
173 acc_temp0 += *(A_j+k + 1) * *(B_i+k + 1);
174 acc_temp0 += *(A_j+k + 2) * *(B_i+k + 2);
175 acc_temp0 += *(A_j+k + 3) * *(B_i+k + 3);
176 acc_temp0 += *(A_j+k + 4) * *(B_i+k + 4);
177 acc_temp0 += *(A_j+k + 5) * *(B_i+k + 5);
178 acc_temp0 += *(A_j+k + 6) * *(B_i+k + 6);
179 acc_temp0 += *(A_j+k + 7) * *(B_i+k + 7);
180 }
181 A_j += 32;
182
183 acc_temp1 = 0;
184 for (k = 0; k < 32; k+=8) {
185 acc_temp1 += *(A_j+k) * *(B_i+k);
186 acc_temp1 += *(A_j+k + 1) * *(B_i+k + 1);
187 acc_temp1 += *(A_j+k + 2) * *(B_i+k + 2);
188 acc_temp1 += *(A_j+k + 3) * *(B_i+k + 3);
189 acc_temp1 += *(A_j+k + 4) * *(B_i+k + 4);
190 acc_temp1 += *(A_j+k + 5) * *(B_i+k + 5);
191 acc_temp1 += *(A_j+k + 6) * *(B_i+k + 6);
192 acc_temp1 += *(A_j+k + 7) * *(B_i+k + 7);
193 }
194 C[i + j*lda] = acc_temp0;
195 C[i + (j+1)*lda] = acc_temp1;
196 }
197 }
198 }
199 }
200
201 void __attribute__((noinline)) matmul_MI(const int lda, const data_t A[], const data_t B[], data_t C[] )
202 {
203 int i, j, k;
204 data_t acc_temp;
205 data_t *A_j, *B_i;
206 int j_start = coreid*16;
207 int j_end = (coreid*16)+16;
208 if (coreid == 0) {
209 for ( i = 0; i < 32; i++ ) {
210 B_i = B + i;
211 for ( j = j_start; j < j_end; j++ )
212 {
213 acc_temp = 0;
214 A_j = A + j*32;
215 for ( k = 0; k < 32; k++ )
216 {
217 acc_temp += *(A_j + k) * *(B_i + k*32);
218 }
219 C[i + j*32] = acc_temp;
220 }
221 }
222 } else if (coreid == 1) {
223 for ( i = 16; i < 32; i++ ) {
224 B_i = B + i;
225 for ( j = j_start; j < j_end; j++ )
226 {
227 acc_temp = 0;
228 A_j = A + j*32;
229 for ( k = 0; k < 32; k+=4 )
230 {
231 acc_temp += *(A_j + k) * *(B_i + k*32);
232 acc_temp += *(A_j + k + 1) * *(B_i + (k+1)*32);
233 acc_temp += *(A_j + k + 2) * *(B_i + (k+2)*32);
234 acc_temp += *(A_j + k + 3) * *(B_i + (k+3)*32);
235 }
236 C[i + j*32] = acc_temp;
237 }
238 }
239 for ( i = 0; i < 16; i++ ) {
240 B_i = B + i;
241 for ( j = j_start; j < j_end; j++ )
242 {
243 acc_temp = 0;
244 A_j = A + j*32;
245 for ( k = 0; k < 32; k+=4 )
246 {
247 acc_temp += *(A_j + k) * *(B_i + k*32);
248 acc_temp += *(A_j + k + 1) * *(B_i + (k+1)*32);
249 acc_temp += *(A_j + k + 2) * *(B_i + (k+2)*32);
250 acc_temp += *(A_j + k + 3) * *(B_i + (k+3)*32);
251 }
252 C[i + j*32] = acc_temp;
253 }
254 }
255
256 }
257 }
258
259 void __attribute__((noinline)) matmul_MSI(const int lda, const data_t A[], const data_t B[], data_t C[] )
260 {
261 int i, j, k;
262 data_t acc_temp;
263 data_t *A_j, *B_i;
264 int j_start = coreid*16;
265 int j_end = (coreid*16)+16;
266 for ( i = 0; i < 32; i++ ) {
267 B_i = B + i;
268 for ( j = j_start; j < j_end; j++ )
269 {
270 acc_temp = 0;
271 A_j = A + j*32;
272 for ( k = 0; k < 32; k++ )
273 {
274 acc_temp += *(A_j + k) * *(B_i + k*32);
275 }
276 C[i + j*32] = acc_temp;
277 }
278 }
279 }
280
281 void __attribute__((noinline)) matmul(const int lda, const data_t A[], const data_t B[], data_t C[] )
282 {
283
284 // ***************************** //
285 // **** ADD YOUR CODE HERE ***** //
286 // ***************************** //
287 //
288 // feel free to make a separate function for MI and MSI versions.
289 // ENABLE_SHARING = false is MI
290 // ENABLE_SHARING = true is MSI
291 matmul_MI_transpose(lda, A, B, C);
292 //matmul_MSI(lda, A, B, C);
293 }
294
295 //--------------------------------------------------------------------------
296 // Main
297 //
298 // all threads start executing thread_entry(). Use their "coreid" to
299 // differentiate between threads (each thread is running on a separate core).
300
// Per-thread entry point: every hardware thread starts here with its core id
// (cid) and the total core count (nc). Threads synchronize via barrier()
// (from util.h); core 0 handles printing and verification.
void thread_entry(int cid, int nc)
{
   coreid = cid;
   ncores = nc;

   // static allocates data in the binary, which is visible to both threads
   static data_t results_data[ARRAY_SIZE];


// // Execute the provided, naive matmul
// barrier();
// //stats(matmul_naive(DIM_SIZE, input1_data, input2_data, results_data); barrier());
//
//
// // verify
// //verify(ARRAY_SIZE, results_data, verify_data);
//
// // clear results from the first trial
// size_t i;
// if (coreid == 0)
//    for (i=0; i < ARRAY_SIZE; i++)
//       results_data[i] = 0;
// barrier();


   // Execute your faster matmul.
   // The trailing barrier() inside stats(...) is deliberate: it keeps the
   // timed region open until BOTH cores finish, so _c measures the slower
   // core, not just this one.
   barrier();
   stats(matmul(DIM_SIZE, input1_data, input2_data, results_data); barrier());

#ifdef DEBUG
   printArray("results:", ARRAY_SIZE, results_data);
   printArray("verify :", ARRAY_SIZE, verify_data);
#endif

   // verify (core 0 only; exits with -1 on mismatch)
   verify(ARRAY_SIZE, results_data, verify_data);
   barrier();

   exit(0);
}
341