1 //**************************************************************************
2 // Multi-threaded Matrix Multiply benchmark
3 //--------------------------------------------------------------------------
4 // TA : Christopher Celio
8 // This benchmark multiplies two 2-D arrays together and writes the results to
9 // a third vector. The input data (and reference data) should be generated
10 // using the matmul_gendata.pl perl script and dumped to a file named
14 // print out arrays, etc.
17 //--------------------------------------------------------------------------
// MIN(X,Y): yields the smaller of X and Y.
// Each argument and the whole expansion are parenthesized so that operands
// containing lower-precedence operators (e.g. `a & 1`, `c ? x : y`) group as
// the caller intended. The selected argument is evaluated twice, so do not
// pass expressions with side effects.
#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
33 //--------------------------------------------------------------------------
34 // Input/Reference Data
40 //--------------------------------------------------------------------------
41 // Basic Utilities and Multi-thread Support
// Identifier of the core running the current thread. Declared __thread, so
// every thread has its own copy. Where it is assigned is not visible here.
43 __thread
unsigned long coreid
;
// Two-step stringification: stringify(s) lets macros inside s expand first,
// then stringify_1 turns the expanded text into a string literal with #s.
48 #define stringify_1(s) #s
49 #define stringify(s) stringify_1(s)
// stats(code): measures `code` with the cycle and retired-instruction counters.
// _c and _i start as the negated counter readings and later have fresh readings
// added, leaving the elapsed counts (unsigned wrap-around makes this exact).
// The report prints total cycles, cycles per inner iteration (cycles divided by
// DIM_SIZE three times) and CPI (_c/_i); the fractional digit of each comes
// from the integer expression `10*x/y%10`.
// Presumably `code` itself runs between the two samples; that statement and the
// closing of the do-block are not visible in this excerpt.
// NOTE(review): _c and _i are unsigned long but are printed with %ld; %lu is
// the matching conversion. Confirm before changing the output format.
50 #define stats(code) do { \
51 unsigned long _c = -rdcycle(), _i = -rdinstret(); \
53 _c += rdcycle(), _i += rdinstret(); \
55 printf("%s: %ld cycles, %ld.%ld cycles/iter, %ld.%ld CPI\n", \
56 stringify(code), _c, _c/DIM_SIZE/DIM_SIZE/DIM_SIZE, 10*_c/DIM_SIZE/DIM_SIZE/DIM_SIZE%10, _c/_i, 10*_c/_i%10); \
60 //--------------------------------------------------------------------------
// printArrayMT: prints a labelled array on one line.
//   name - label, printed right-aligned in a 10-character field followed by " :"
//   n    - number of elements of arr to print
//   arr  - elements; each is cast to long and printed with " %3ld "
// The declaration of i and any statements before the first printf or after the
// loop are not visible in this excerpt.
63 void printArrayMT( char name
[], int n
, data_t arr
[] )
69 printf( " %10s :", name
);
70 for ( i
= 0; i
< n
; i
++ )
71 printf( " %3ld ", (long) arr
[i
] );
// verifyMT: compares test[0..n) against correct[0..n) element by element.
//   n       - number of elements to compare
//   test    - computed results
//   correct - reference results
// On a mismatch it prints a "FAILED" line with the index and both values
// (values cast to long). What happens after the message is printed is not
// visible in this excerpt.
// Marked noinline, so the compiler keeps it as a separate function.
// NOTE(review): the index i is printed with %d; if i is declared size_t (like
// n) the matching conversion is %zu. Its declaration is not shown; confirm.
75 void __attribute__((noinline
)) verifyMT(size_t n
, const data_t
* test
, const data_t
* correct
)
81 for (i
= 0; i
< n
; i
++)
83 if (test
[i
] != correct
[i
])
85 printf("FAILED test[%d]= %3ld, correct[%d]= %3ld\n",
86 i
, (long)test
[i
], i
, (long)correct
[i
]);
94 //--------------------------------------------------------------------------
97 // single-thread, naive version
// matmul_naive: triple-loop matrix multiply over flat arrays with stride lda.
//   lda - matrix dimension; all three loops run over [0, lda)
//   A,B - inputs, read as A[j*lda + k] and B[k*lda + i]
//   C   - output, updated as C[i + j*lda] += A[j*lda + k] * B[k*lda + i]
// C is accumulated into (+=), not overwritten, so the result is only the
// product if C holds zeros on entry.
// Statements between the signature and the first loop are not visible in this
// excerpt.
98 void __attribute__((noinline
)) matmul_naive(const int lda
, const data_t A
[], const data_t B
[], data_t C
[] )
105 for ( i
= 0; i
< lda
; i
++ )
106 for ( j
= 0; j
< lda
; j
++ )
108 for ( k
= 0; k
< lda
; k
++ )
110 C
[i
+ j
*lda
] += A
[j
*lda
+ k
] * B
[k
*lda
+ i
];
// matmul: multi-threaded matrix multiply, C += A * B, over flat arrays.
//   lda - matrix dimension supplied by the caller
//   A,B - input matrices
//   C   - output matrix, accumulated into (loaded, updated, stored back)
//
// Fast path (taken only when ncores == NCORES and lda == LDA):
//   * each core handles the j range [start, end) derived from coreid;
//   * j is tiled by BLOCK_J and k by BLOCK_K;
//   * inside a tile, a REG_I x REG_J patch of C is copied into the local
//     array c, updated for every k of the current k-tile, and written back.
//   The update is written out by hand for b[0..7] and a[0..1], so it
//   presumably requires REG_I == 8 and REG_J == 2 - confirm against the
//   definitions, which are not in this excerpt.
// Otherwise the function falls through to the same naive triple loop as
// matmul_naive.
//
// Not visible in this excerpt: the assignments to Aj, Cj and Bi, the loads of
// a[0], a[1] and b[0], and the closing braces of the loops.
// NOTE(review): ii is declared but used only in a commented-out loop.
118 void __attribute__((noinline
)) matmul(const int lda
, const data_t A
[], const data_t B
[], data_t C
[] )
121 // ***************************** //
122 // **** ADD YOUR CODE HERE ***** //
123 // ***************************** //
125 // feel free to make a separate function for MI and MSI versions.
127 int i
, j
, k
, ri
, rj
, ii
, jj
, kk
;
128 data_t
*Aj
, *Cj
, *Bi
;
129 data_t c
[REG_I
][REG_J
], a
[REG_J
], b
[REG_I
];
// Per-core j range: LDA / NCORES indices per core; the last core runs to LDA
// and so also takes any remainder of the integer division.
130 size_t start
= coreid
* (LDA
/ NCORES
), end
= (coreid
== NCORES
- 1 ? LDA
: (coreid
+ 1) * (LDA
/ NCORES
));
132 /* if (coreid > 0) { */
135 /* start = 0, end = lda; */
// Tuned path: only when the runtime configuration equals the compile-time one.
136 if (ncores
== NCORES
&& lda
== LDA
) {
137 for (jj
= start
; jj
< end
; jj
+= BLOCK_J
)
138 for (kk
= 0; kk
< LDA
; kk
+= BLOCK_K
)
139 //for (ii = 0; ii < LDA; ii += BLOCK_I)
140 for (j
= jj
; j
< MIN(end
, jj
+ BLOCK_J
); j
+= REG_J
) {
143 for (i
= 0; i
< LDA
; i
+= REG_I
) {
144 /* Load C in register blocks. */
146 for (ri
= 0; ri
< REG_I
; ri
++) {
147 for (rj
= 0; rj
< REG_J
; rj
++) {
148 c
[ri
][rj
] = Cj
[i
+ ri
+ ( rj
)*LDA
];
153 for (k
= kk
; k
< MIN(LDA
, kk
+ BLOCK_K
); k
++) {
154 /* Load a,b in register blocks. */
155 /* for (rj = 0; rj < REG_J; rj++) {
156 a[rj] = A[(j + rj)*LDA + k];
158 /* for (ri = 0; ri < REG_I; ri++) { */
159 /* b[ri] = Bi[k*LDA + ri]; */
161 /* /\* Compute C in register blocks. *\/ */
162 /* for (rj = 0; rj < REG_J; rj++) { */
163 /* a[rj] = Aj[( rj)*LDA + k]; */
164 /* for (ri = 0; ri < REG_I; ri++) { */
165 /* c[ri][rj] += a[rj] * b[ri]; */
// Hand-unrolled loads: b[r] = Bi[k*LDA + r] for r = 1..7 (b[0] is not shown).
171 b
[1] = Bi
[k
*LDA
+ 1];
172 b
[2] = Bi
[k
*LDA
+ 2];
173 b
[3] = Bi
[k
*LDA
+ 3];
174 b
[4] = Bi
[k
*LDA
+ 4];
175 b
[5] = Bi
[k
*LDA
+ 5];
176 b
[6] = Bi
[k
*LDA
+ 6];
177 b
[7] = Bi
[k
*LDA
+ 7];
// Hand-unrolled update: c[r][s] += b[r] * a[s] for r = 0..7, s = 0..1.
180 c
[0][0] += b
[0] * a
[0];
181 c
[0][1] += b
[0] * a
[1];
182 c
[1][0] += b
[1] * a
[0];
183 c
[1][1] += b
[1] * a
[1];
184 c
[2][0] += b
[2] * a
[0];
185 c
[2][1] += b
[2] * a
[1];
186 c
[3][0] += b
[3] * a
[0];
187 c
[3][1] += b
[3] * a
[1];
188 c
[4][0] += b
[4] * a
[0];
189 c
[4][1] += b
[4] * a
[1];
190 c
[5][0] += b
[5] * a
[0];
191 c
[5][1] += b
[5] * a
[1];
192 c
[6][0] += b
[6] * a
[0];
193 c
[6][1] += b
[6] * a
[1];
194 c
[7][0] += b
[7] * a
[0];
195 c
[7][1] += b
[7] * a
[1];
198 /* c[0][0] += b[0] * a[0]; */
199 /* c[1][1] += b[1] * a[1]; */
200 /* c[2][0] += b[2] * a[0]; */
201 /* c[3][1] += b[3] * a[1]; */
202 /* c[4][0] += b[4] * a[0]; */
203 /* c[5][1] += b[5] * a[1]; */
204 /* c[6][0] += b[6] * a[0]; */
205 /* c[7][1] += b[7] * a[1]; */
206 /* c[0][0] += b[0] * a[0]; */
207 /* c[1][1] += b[1] * a[1]; */
208 /* c[2][0] += b[2] * a[0]; */
209 /* c[3][1] += b[3] * a[1]; */
210 /* c[4][0] += b[4] * a[0]; */
211 /* c[5][1] += b[5] * a[1]; */
212 /* c[6][0] += b[6] * a[0]; */
213 /* c[7][1] += b[7] * a[1]; */
217 /* store C in register blocks. */
218 for (ri
= 0; ri
< REG_I
; ri
++) {
219 for (rj
= 0; rj
< REG_J
; rj
++) {
220 Cj
[i
+ ri
+ (rj
)*LDA
] = c
[ri
][rj
];
229 /* We only care about performance for 32x32 matrices and 2 cores. Otherwise just naive mat_mul */
// Fallback: plain triple loop over the caller-supplied lda.
234 for ( i
= 0; i
< lda
; i
++ )
235 for ( j
= 0; j
< lda
; j
++ )
236 for ( k
= 0; k
< lda
; k
++ )
237 C
[i
+ j
*lda
] += A
[j
*lda
+ k
] * B
[k
*lda
+ i
];
241 //--------------------------------------------------------------------------
244 // all threads start executing thread_entry(). Use their "coreid" to
245 // differentiate between threads (each thread is running on a separate core).
247 void thread_entry(int cid
, int nc
)
252 // static allocates data in the binary, which is visible to both threads
253 static data_t results_data
[ARRAY_SIZE
];
256 // /* // Execute the provided, naive matmul */
258 // stats(matmul_naive(DIM_SIZE, input1_data, input2_data, results_data); barrier(nc));
262 // verifyMT(ARRAY_SIZE, results_data, verify_data);
264 // // clear results from the first trial
267 // for (i=0; i < ARRAY_SIZE; i++)
268 // results_data[i] = 0;
272 // Execute your faster matmul
274 stats(matmul(DIM_SIZE
, input1_data
, input2_data
, results_data
); barrier(nc
));
277 printArrayMT("results:", ARRAY_SIZE
, results_data
);
278 printArrayMT("verify :", ARRAY_SIZE
, verify_data
);
282 verifyMT(ARRAY_SIZE
, results_data
, verify_data
);