1 //**************************************************************************
2 // Multi-threaded Matrix Multiply benchmark
3 //--------------------------------------------------------------------------
4 // TA : Christopher Celio
// This benchmark multiplies two 2-D arrays together and writes the results to
// a third array. The input data (and reference data) should be generated
// using the matmul_gendata.pl perl script and dumped to a header file that is
// included at build time. Utility routines below print out arrays, etc.
17 //--------------------------------------------------------------------------
// Minimum of two values. Arguments and the whole expansion are fully
// parenthesized so operands containing low-precedence operators (e.g.
// MIN(a | b, c)) expand correctly. NOTE: arguments are still evaluated
// twice, so avoid side effects in X and Y.
#define MIN(X,Y) (((X) < (Y)) ? (X) : (Y))
33 //--------------------------------------------------------------------------
34 // Input/Reference Data
40 //--------------------------------------------------------------------------
//--------------------------------------------------------------------------
// Basic Utilities and Multi-thread Support

// Thread-local identity of the calling core, set in thread_entry(); used by
// matmul() to partition work between cores.
__thread unsigned long coreid;

// Thread-local total core count, set in thread_entry(); read by matmul() to
// decide whether the optimized NCORES-way path applies.
// NOTE(review): this declaration is reconstructed — the file uses `ncores`
// but its declaration was not visible. Confirm against the original source.
__thread unsigned long ncores;
// Two-step stringification so macro arguments are expanded before being
// turned into a string literal (standard C preprocessor idiom).
#define stringify_1(s) #s
#define stringify(s) stringify_1(s)

// Time `code` with the cycle and retired-instruction counters, then print
// total cycles, cycles per inner-loop iteration (DIM_SIZE^3 iterations for
// matmul), and CPI. Only core 0 prints.
// NOTE(review): the `code;` execution line, the coreid==0 guard, and the
// do/while(0) tail were reconstructed from the canonical benchmark harness —
// confirm against the original file.
#define stats(code) do { \
    unsigned long _c = -rdcycle(), _i = -rdinstret(); \
    code; \
    _c += rdcycle(), _i += rdinstret(); \
    if (coreid == 0) \
      printf("%s: %ld cycles, %ld.%ld cycles/iter, %ld.%ld CPI\n", \
             stringify(code), _c, _c/DIM_SIZE/DIM_SIZE/DIM_SIZE, \
             10*_c/DIM_SIZE/DIM_SIZE/DIM_SIZE%10, _c/_i, 10*_c/_i%10); \
  } while(0)
60 //--------------------------------------------------------------------------
63 void printArray( char name
[], int n
, data_t arr
[] )
69 printf( " %10s :", name
);
70 for ( i
= 0; i
< n
; i
++ )
71 printf( " %3ld ", (long) arr
[i
] );
75 void __attribute__((noinline
)) verify(size_t n
, const data_t
* test
, const data_t
* correct
)
81 for (i
= 0; i
< n
; i
++)
83 if (test
[i
] != correct
[i
])
85 printf("FAILED test[%d]= %3ld, correct[%d]= %3ld\n",
86 i
, (long)test
[i
], i
, (long)correct
[i
]);
94 //--------------------------------------------------------------------------
97 // single-thread, naive version
98 void __attribute__((noinline
)) matmul_naive(const int lda
, const data_t A
[], const data_t B
[], data_t C
[] )
105 for ( i
= 0; i
< lda
; i
++ )
106 for ( j
= 0; j
< lda
; j
++ )
108 for ( k
= 0; k
< lda
; k
++ )
110 C
[i
+ j
*lda
] += A
[j
*lda
+ k
] * B
[k
*lda
+ i
];
118 void __attribute__((noinline
)) matmul(const int lda
, const data_t A
[], const data_t B
[], data_t C
[] )
121 // ***************************** //
122 // **** ADD YOUR CODE HERE ***** //
123 // ***************************** //
125 // feel free to make a separate function for MI and MSI versions.
127 int i
, j
, k
, ri
, rj
, ii
, jj
, kk
;
128 data_t
*Aj
, *Cj
, *Bi
;
129 data_t c
[REG_I
][REG_J
], a
[REG_J
], b
[REG_I
];
130 size_t start
= coreid
* (LDA
/ NCORES
), end
= (coreid
== NCORES
- 1 ? LDA
: (coreid
+ 1) * (LDA
/ NCORES
));
132 /* if (coreid > 0) { */
135 /* start = 0, end = lda; */
136 if (ncores
== NCORES
&& lda
== LDA
) {
137 for (jj
= start
; jj
< end
; jj
+= BLOCK_J
) {
138 int kk_start
= (coreid
== 0 ? 0 : LDA
/2) ,kk_end
= (coreid
== 0 ? LDA
/2 : LDA
);
139 for (kk
= kk_start
; kk
< kk_end
; kk
+= BLOCK_K
) {
140 // for (ii = 0; ii < LDA; ii += BLOCK_I)
141 for (j
= jj
; j
< MIN(end
, jj
+ BLOCK_J
); j
+= REG_J
) {
144 for (i
= 0; i
< LDA
/*, ii + BLOCK_I)*/; i
+= REG_I
) {
145 /* Load C in register blocks. */
147 for (ri
= 0; ri
< REG_I
; ri
++) {
148 for (rj
= 0; rj
< REG_J
; rj
++) {
149 c
[ri
][rj
] = Cj
[i
+ ri
+ ( rj
)*LDA
];
154 for (k
= kk
; k
< MIN(LDA
, kk
+ BLOCK_K
); k
++) {
155 for (ri
= 0; ri
< REG_I
; ri
++) {
156 b
[ri
] = Bi
[k
*LDA
+ ri
];
158 /* Compute C in register blocks. */
159 for (rj
= 0; rj
< REG_J
; rj
++) {
160 a
[rj
] = Aj
[(rj
)*LDA
+ k
];
161 for (ri
= 0; ri
< REG_I
; ri
++) {
162 c
[ri
][rj
] += a
[rj
] * b
[ri
];
167 /* store C in register blocks. */
168 for (ri
= 0; ri
< REG_I
; ri
++) {
169 for (rj
= 0; rj
< REG_J
; rj
++) {
170 Cj
[i
+ ri
+ ( rj
)*LDA
] = c
[ri
][rj
];
177 /* kk_start= (coreid == 1 ? 0 : LDA/2); */
178 /* kk_end = (coreid == 1 ? LDA/2 : LDA); */
179 /* for (kk = kk_start; kk < kk_end; kk += BLOCK_K) { */
180 /* // for (ii = 0; ii < LDA; ii += BLOCK_I) */
181 /* for (j = jj; j < MIN(end, jj + BLOCK_J); j += REG_J) { */
182 /* Aj = A + j*LDA; */
183 /* Cj = C + j*LDA; */
184 /* for (i = 0; i < LDA/\*, ii + BLOCK_I)*\/; i += REG_I) { */
185 /* /\* Load C in register blocks. *\/ */
187 /* for (ri = 0; ri < REG_I; ri++) { */
188 /* for (rj = 0; rj < REG_J; rj++) { */
189 /* c[ri][rj] = Cj[i + ri + ( rj)*LDA]; */
194 /* for (k = kk; k < MIN(LDA, kk + BLOCK_K); k++) { */
195 /* for (ri = 0; ri < REG_I; ri++) { */
196 /* b[ri] = Bi[k*LDA + ri]; */
198 /* /\* Compute C in register blocks. *\/ */
199 /* for (rj = 0; rj < REG_J; rj++) { */
200 /* a[rj] = Aj[(rj)*LDA + k]; */
201 /* for (ri = 0; ri < REG_I; ri++) { */
202 /* c[ri][rj] += a[rj] * b[ri]; */
207 /* store C in register blocks. */
208 /* for (ri = 0; ri < REG_I; ri++) { */
209 /* for (rj = 0; rj < REG_J; rj++) { */
210 /* Cj[i + ri + ( rj)*LDA] = c[ri][rj]; */
220 for (jj
= start
; jj
< end
; jj
+= BLOCK_J
) {
221 int kk_start
= (coreid
!= 0 ? 0 : LDA
/2), kk_end
= (coreid
!= 0 ? LDA
/2 : LDA
);
222 for (kk
= kk_start
; kk
< kk_end
; kk
+= BLOCK_K
) {
223 // for (ii = 0; ii < LDA; ii += BLOCK_I)
224 for (j
= jj
; j
< MIN(end
, jj
+ BLOCK_J
); j
+= REG_J
) {
227 for (i
= 0; i
< LDA
/*, ii + BLOCK_I)*/; i
+= REG_I
) {
228 /* Load C in register blocks. */
230 for (ri
= 0; ri
< REG_I
; ri
++) {
231 for (rj
= 0; rj
< REG_J
; rj
++) {
232 c
[ri
][rj
] = Cj
[i
+ ri
+ ( rj
)*LDA
];
237 for (k
= kk
; k
< MIN(LDA
, kk
+ BLOCK_K
); k
++) {
238 for (ri
= 0; ri
< REG_I
; ri
++) {
239 b
[ri
] = Bi
[k
*LDA
+ ri
];
241 /* Compute C in register blocks. */
242 for (rj
= 0; rj
< REG_J
; rj
++) {
243 a
[rj
] = Aj
[(rj
)*LDA
+ k
];
244 for (ri
= 0; ri
< REG_I
; ri
++) {
245 c
[ri
][rj
] += a
[rj
] * b
[ri
];
250 /* store C in register blocks. */
251 for (ri
= 0; ri
< REG_I
; ri
++) {
252 for (rj
= 0; rj
< REG_J
; rj
++) {
253 Cj
[i
+ ri
+ ( rj
)*LDA
] = c
[ri
][rj
];
260 /* We only care about performance for 32x32 matrices and 2 cores. Otherwise just naive mat_mul */
265 for ( i
= 0; i
< lda
; i
++ )
266 for ( j
= 0; j
< lda
; j
++ )
267 for ( k
= 0; k
< lda
; k
++ )
268 C
[i
+ j
*lda
] += A
[j
*lda
+ k
] * B
[k
*lda
+ i
];
272 //--------------------------------------------------------------------------
275 // all threads start executing thread_entry(). Use their "coreid" to
276 // differentiate between threads (each thread is running on a separate core).
278 void thread_entry(int cid
, int nc
)
283 // static allocates data in the binary, which is visible to both threads
284 static data_t results_data
[ARRAY_SIZE
];
287 // /* // Execute the provided, naive matmul */
289 // stats(matmul_naive(DIM_SIZE, input1_data, input2_data, results_data); barrier());
293 // verify(ARRAY_SIZE, results_data, verify_data);
295 // // clear results from the first trial
298 // for (i=0; i < ARRAY_SIZE; i++)
299 // results_data[i] = 0;
303 // Execute your faster matmul
305 stats(matmul(DIM_SIZE
, input1_data
, input2_data
, results_data
); barrier());
308 printArray("results:", ARRAY_SIZE
, results_data
);
309 printArray("verify :", ARRAY_SIZE
, verify_data
);
313 verify(ARRAY_SIZE
, results_data
, verify_data
);