1 //**************************************************************************
2 // Multi-threaded Matrix Multiply benchmark
3 //--------------------------------------------------------------------------
4 // TA : Christopher Celio
// This benchmark multiplies two 2-D matrices together and writes the result
// to a third matrix. The input data (and reference data) should be generated
// using the matmul_gendata.pl perl script and dumped to a header file.
// Helper routines are provided to print out arrays, etc.
17 //--------------------------------------------------------------------------
25 //--------------------------------------------------------------------------
26 // Input/Reference Data
32 //--------------------------------------------------------------------------
33 // Basic Utilities and Multi-thread Support
// Thread-local core ID (one thread per core). Read by matmul() to select
// this core's slice of the rows and its starting quarter of the k-dimension.
// Presumably assigned from `cid` in thread_entry() on elided lines -- confirm.
__thread unsigned long coreid;
// Two-level expansion idiom: stringify(x) expands macro arguments before
// turning them into a string literal.
#define stringify_1(s) #s
#define stringify(s) stringify_1(s)
// stats(code): run `code` once while counting cycles and retired
// instructions, then print total cycles, cycles per inner-loop iteration
// (total / DIM_SIZE^3) and CPI, each with one decimal digit computed in
// integer arithmetic (no floating point on the target).
// Negating the starting counter values makes the later `+=` yield the
// elapsed delta without a temporary.
// NOTE(review): the `code;` statement and the `} while(0)` terminator were
// missing from the truncated original and are restored here; the original
// may additionally have guarded the printf to coreid == 0 -- confirm
// against the full file.
#define stats(code) do { \
    unsigned long _c = -rdcycle(), _i = -rdinstret(); \
    code; \
    _c += rdcycle(), _i += rdinstret(); \
    printf("%s: %ld cycles, %ld.%ld cycles/iter, %ld.%ld CPI\n", \
           stringify(code), _c, _c/DIM_SIZE/DIM_SIZE/DIM_SIZE, 10*_c/DIM_SIZE/DIM_SIZE/DIM_SIZE%10, _c/_i, 10*_c/_i%10); \
  } while(0)
52 //--------------------------------------------------------------------------
55 void printArrayMT( char name
[], int n
, data_t arr
[] )
61 printf( " %10s :", name
);
62 for ( i
= 0; i
< n
; i
++ )
63 printf( " %3ld ", (long) arr
[i
] );
67 void __attribute__((noinline
)) verifyMT(size_t n
, const data_t
* test
, const data_t
* correct
)
73 for (i
= 0; i
< n
; i
++)
75 if (test
[i
] != correct
[i
])
77 printf("FAILED test[%d]= %3ld, correct[%d]= %3ld\n",
78 i
, (long)test
[i
], i
, (long)correct
[i
]);
86 //--------------------------------------------------------------------------
89 // single-thread, naive version
90 void __attribute__((noinline
)) matmul_naive(const int lda
, const data_t A
[], const data_t B
[], data_t C
[] )
98 for ( i = 0; i < lda; i++ )
99 for ( j = 0; j < lda; j++ )
101 for ( k = 0; k < lda; k++ )
103 C[i + j*lda] += A[j*lda + k] * B[k*lda + i];
// Optimized multi-threaded matmul: accumulates C += A * B (square,
// row-major, stride lda) in 2-row x 4-column register tiles held in c1..c8,
// with the inner (k) loop unrolled by 4.
//
// NOTE(review): many physical lines are elided in this view -- the opening
// brace, the local declarations (block, colSplit, useSplit, c_row, c_col, i,
// c1..c8, a1..a8), the load of a1, all closing braces, and any barrier()
// synchronization between phases. Only the visible statements are documented
// below; confirm the elided parts against the full file.
void __attribute__((noinline)) matmul(const int lda, const data_t A[], const data_t B[], data_t C[] )
   // First C row owned by this core: rows are striped lda/ncores per core.
   size_t c_start = lda / ncores * coreid;
   // Two passes of 8 rows each over this core's slice (assumes each core
   // owns 16 rows, i.e. lda/ncores == 16 -- TODO confirm).
   for (block = 0; block < 2; block++) {
      // Walk the k-dimension in quarters; the second core starts two
      // quarters ahead, presumably so the two cores stream different
      // regions of A/B at any given time -- assumes ncores == 2, confirm.
      for (colSplit = 0; colSplit < 4; colSplit++) {
         useSplit = (coreid == 0) ? colSplit : (colSplit + 2 ) % 4;
         // Two rows of C at a time ...
         for (c_row = c_start + block * 8; c_row < c_start + block * 8 + 8; c_row += 2) {
            // ... by four columns of C at a time.
            for (c_col = 0; c_col < lda; c_col +=4) {
               // Load the current 2x4 tile of C into register locals.
               c1 = C[c_row*lda+c_col];
               c2 = C[(c_row+1)*lda+c_col];
               c3 = C[c_row*lda+c_col+1];
               c4 = C[(c_row+1)*lda+c_col+1];
               c5 = C[c_row*lda+c_col+2];
               c6 = C[(c_row+1)*lda+c_col+2];
               c7 = C[c_row*lda+c_col+3];
               c8 = C[(c_row+1)*lda+c_col+3];
               // Accumulate over this core's current quarter of the
               // k-dimension, four k-steps per iteration.
               for (i = useSplit * lda / 4; i < (useSplit + 1) * lda / 4; i +=4) {
                  // Load a 2x4 tile of A. NOTE(review): the load of a1
                  // (presumably A[c_row*lda+i]) is on an elided line.
                  a2 = A[(c_row+1)*lda+i];
                  a3 = A[c_row*lda+i+1];
                  a4 = A[(c_row+1)*lda+i+1];
                  a5 = A[c_row*lda+i+2];
                  a6 = A[(c_row+1)*lda+i+2];
                  a7 = A[c_row*lda+i+3];
                  a8 = A[(c_row+1)*lda+i+3];
                  // Column c_col of the C tile: both rows, four k-steps.
                  c1 += a1 * B[i*lda+c_col];
                  c2 += a2 * B[i*lda+c_col];
                  c1 += a3 * B[(i+1)*lda+c_col];
                  c2 += a4 * B[(i+1)*lda+c_col];
                  c1 += a5 * B[(i+2)*lda+c_col];
                  c2 += a6 * B[(i+2)*lda+c_col];
                  c1 += a7 * B[(i+3)*lda+c_col];
                  c2 += a8 * B[(i+3)*lda+c_col];
                  // Column c_col+1.
                  c3 += a1 * B[i*lda+c_col+1];
                  c4 += a2 * B[i*lda+c_col+1];
                  c3 += a3 * B[(i+1)*lda+c_col+1];
                  c4 += a4 * B[(i+1)*lda+c_col+1];
                  c3 += a5 * B[(i+2)*lda+c_col+1];
                  c4 += a6 * B[(i+2)*lda+c_col+1];
                  c3 += a7 * B[(i+3)*lda+c_col+1];
                  c4 += a8 * B[(i+3)*lda+c_col+1];
                  // Column c_col+2.
                  c5 += a1 * B[i*lda+c_col+2];
                  c6 += a2 * B[i*lda+c_col+2];
                  c5 += a3 * B[(i+1)*lda+c_col+2];
                  c6 += a4 * B[(i+1)*lda+c_col+2];
                  c5 += a5 * B[(i+2)*lda+c_col+2];
                  c6 += a6 * B[(i+2)*lda+c_col+2];
                  c5 += a7 * B[(i+3)*lda+c_col+2];
                  c6 += a8 * B[(i+3)*lda+c_col+2];
                  // Column c_col+3.
                  c7 += a1 * B[i*lda+c_col+3];
                  c8 += a2 * B[i*lda+c_col+3];
                  c7 += a3 * B[(i+1)*lda+c_col+3];
                  c8 += a4 * B[(i+1)*lda+c_col+3];
                  c7 += a5 * B[(i+2)*lda+c_col+3];
                  c8 += a6 * B[(i+2)*lda+c_col+3];
                  c7 += a7 * B[(i+3)*lda+c_col+3];
                  c8 += a8 * B[(i+3)*lda+c_col+3];
               // NOTE(review): the closing brace of the i-loop is elided
               // here. Write the finished 2x4 tile back to C.
               C[c_row*lda+c_col] = c1;
               C[(c_row+1)*lda+c_col] = c2;
               C[c_row*lda+c_col+1] = c3;
               C[(c_row+1)*lda+c_col+1] = c4;
               C[c_row*lda+c_col+2] = c5;
               C[(c_row+1)*lda+c_col+2] = c6;
               C[c_row*lda+c_col+3] = c7;
               C[(c_row+1)*lda+c_col+3] = c8;
               // NOTE(review): closing braces for the remaining loops and
               // the function (and any barrier() calls) are on elided lines.
225 //--------------------------------------------------------------------------
// all threads start executing thread_entry(). Use their "coreid" to
// differentiate between threads (each thread is running on a separate core).
//
// NOTE(review): several lines are elided in this view -- the opening brace,
// the assignment of coreid/ncores from cid/nc, any loop-variable
// declarations, barrier() calls, the final exit(), and the closing brace.
// Confirm against the full file.
void thread_entry(int cid, int nc)
   // static allocates data in the binary, which is visible to both threads
   static data_t results_data[ARRAY_SIZE];

   // Execute the provided, naive matmul
   // stats(matmul_naive(DIM_SIZE, input1_data, input2_data, results_data); barrier(nc));

   // verifyMT(ARRAY_SIZE, results_data, verify_data);

   // clear results from the first trial
   // for (i=0; i < ARRAY_SIZE; i++)
   //    results_data[i] = 0;

   // Execute your faster matmul; the trailing barrier(nc) is timed too, so
   // the measurement covers both cores finishing their share of the work.
   stats(matmul(DIM_SIZE, input1_data, input2_data, results_data); barrier(nc));

   // Print both the computed and the reference arrays, then check them.
   printArrayMT("results:", ARRAY_SIZE, results_data);
   printArrayMT("verify :", ARRAY_SIZE, verify_data);

   verifyMT(ARRAY_SIZE, results_data, verify_data);