1 //**************************************************************************
2 // Multi-threaded Matrix Multiply benchmark
3 //--------------------------------------------------------------------------
4 // TA : Christopher Celio
8 // This benchmark multiplies two 2-D arrays together and writes the results to
9 // a third vector. The input data (and reference data) should be generated
10 // using the matmul_gendata.pl perl script and dumped to a file named
14 // print out arrays, etc.
17 //--------------------------------------------------------------------------
25 //--------------------------------------------------------------------------
26 // Input/Reference Data
32 //--------------------------------------------------------------------------
33 // Basic Utilities and Multi-thread Support
// Thread-local variable: the __thread storage class gives every thread its
// own copy of coreid.
// NOTE(review): no initializer or assignment to coreid is visible in this
// extract; where it is set must be confirmed against the complete file.
35 __thread
unsigned long coreid
;
// Two-level stringification: stringify(s) lets the preprocessor expand s
// before stringify_1 applies the # operator to turn it into a string literal.
40 #define stringify_1(s) #s
41 #define stringify(s) stringify_1(s)
// stats(code): timing report for `code`.
//   - _c and _i start as the negated values of rdcycle() and rdinstret();
//     adding the later readings leaves the elapsed difference in each
//     (unsigned arithmetic, so the negate-then-add wraps correctly).
//   - The printf shows the stringified code, the total in _c, _c divided by
//     DIM_SIZE^3 as an integer part plus one decimal digit, and _c/_i in the
//     same integer.digit form.
// NOTE(review): the statement that actually runs `code` between the two
// counter reads, and the closing `} while (0)`, are not visible in this
// extract; confirm against the complete file.
// NOTE(review): rdcycle()/rdinstret() are defined elsewhere; presumably they
// read the cycle and retired-instruction counters. TODO confirm.
// NOTE(review): _c and _i are unsigned long but are printed with %ld, and
// _c/_i is evaluated without a check for _i == 0.
42 #define stats(code) do { \
43 unsigned long _c = -rdcycle(), _i = -rdinstret(); \
45 _c += rdcycle(), _i += rdinstret(); \
47 printf("%s: %ld cycles, %ld.%ld cycles/iter, %ld.%ld CPI\n", \
48 stringify(code), _c, _c/DIM_SIZE/DIM_SIZE/DIM_SIZE, 10*_c/DIM_SIZE/DIM_SIZE/DIM_SIZE%10, _c/_i, 10*_c/_i%10); \
52 //--------------------------------------------------------------------------
// printArrayMT: prints a labelled dump of an array.
//   name - label, printed right-aligned in a 10-character field followed by " :"
//   n    - number of elements of arr to print
//   arr  - elements to print; each is cast to long and printed with " %3ld "
// NOTE(review): the function's braces, the declaration of `i`, and any
// statements between the signature and the first printf (or after the loop)
// are not visible in this extract; confirm against the complete file.
55 void printArrayMT( char name
[], int n
, data_t arr
[] )
// Label first, then the elements on the same output line.
61 printf( " %10s :", name
);
62 for ( i
= 0; i
< n
; i
++ )
63 printf( " %3ld ", (long) arr
[i
] );
// verifyMT: element-wise comparison of two arrays.
//   n       - number of elements to compare
//   test    - computed values
//   correct - reference values
// For every index i in [0, n) where test[i] != correct[i], a "FAILED" line
// is printed with the index and both values (cast to long).
// The noinline attribute keeps the compiler from inlining this function.
// NOTE(review): the braces, the declaration of `i`, and whatever follows the
// printf (e.g. an early exit) are not visible in this extract.
// NOTE(review): the index is printed with %d; if `i` is declared as size_t
// (to match n) the format specifier does not match the argument type.
// Verify the declaration in the complete file.
67 void __attribute__((noinline
)) verifyMT(size_t n
, const data_t
* test
, const data_t
* correct
)
73 for (i
= 0; i
< n
; i
++)
75 if (test
[i
] != correct
[i
])
77 printf("FAILED test[%d]= %3ld, correct[%d]= %3ld\n",
78 i
, (long)test
[i
], i
, (long)correct
[i
]);
86 //--------------------------------------------------------------------------
89 // single-thread, naive version
// matmul_naive: reference triple-loop matrix multiply on flat arrays.
//   lda - dimension used both as loop bound and as the row stride
//   A,B - read-only inputs, indexed as A[j*lda + k] and B[k*lda + i]
//   C   - output, indexed as C[i + j*lda]; results are accumulated with +=,
//         so the values already in C are part of the result
// The noinline attribute keeps the compiler from inlining this function.
// NOTE(review): the braces, the declarations of i/j/k, and any statements
// before the loops are not visible in this extract.
// NOTE(review): presumably C is expected to be zero on entry. TODO confirm
// against the caller.
90 void __attribute__((noinline
)) matmul_naive(const int lda
, const data_t A
[], const data_t B
[], data_t C
[] )
// i selects the column offset within a row, j the row (multiplied by lda),
// and k walks the shared dimension.
98 for ( i
= 0; i
< lda
; i
++ )
99 for ( j
= 0; j
< lda
; j
++ )
101 for ( k
= 0; k
< lda
; k
++ )
103 C
[i
+ j
*lda
] += A
[j
*lda
+ k
] * B
[k
*lda
+ i
];
// matmul: hand-unrolled matrix multiply on flat lda-stride arrays.
//   lda - dimension used as loop bound and row stride
//   A,B - read-only inputs
//   C   - output; existing values are loaded, accumulated into, and stored back
// Two loop nests are visible:
//   1. rows j in [0, lda/2), walking i and k upwards;
//   2. rows j from lda-1 down to lda/2, walking i and k downwards.
// Each nest processes 8 consecutive C elements per i-step and 2 k values per
// k-step, and covers the k range in two halves selected by m.
// NOTE(review): the braces, the assignments to tempA0, iB0 and iB1, and any
// condition that chooses between the two nests (e.g. a per-thread test) are
// not visible in this extract; confirm against the complete file.
// NOTE(review): no remainder handling is visible, so the code appears to
// require lda to be a multiple of 8 and lda/2 to be even. TODO confirm.
111 void __attribute__((noinline
)) matmul(const int lda
, const data_t A
[], const data_t B
[], data_t C
[] )
114 // ***************************** //
115 // **** ADD YOUR CODE HERE ***** //
116 // ***************************** //
118 // feel free to make a separate function for MI and MSI versions.
// m: which half of the k range; i/j/k: loop indices; iB0/iB1: base indices
// into B used by the first nest.
120 int m
, i
, j
, k
, iB0
, iB1
;
// Accumulators for the 8 C elements handled per i-step.
121 data_t tempC0
, tempC1
, tempC2
, tempC3
, tempC4
, tempC5
, tempC6
, tempC7
;
// The two A values used per k-step.
122 data_t tempA0
, tempA1
;
// ---- First nest: rows 0 .. lda/2-1, ascending i and k ----
// m = 0 covers k in [0, lda/2); m = 1 covers k in [lda/2, lda).
125 for (m
= 0; m
< 2; m
++){
126 for (j
= 0; j
< lda
/2; j
++){
127 for (i
= 0; i
< lda
; i
+=8){
// Load the current values of C[i + j*lda + 0..7] into the accumulators.
128 tempC0
= C
[i
+ j
*lda
];
129 tempC1
= C
[i
+ j
*lda
+1];
130 tempC2
= C
[i
+ j
*lda
+2];
131 tempC3
= C
[i
+ j
*lda
+3];
132 tempC4
= C
[i
+ j
*lda
+4];
133 tempC5
= C
[i
+ j
*lda
+5];
134 tempC6
= C
[i
+ j
*lda
+6];
135 tempC7
= C
[i
+ j
*lda
+7];
// Two k values per step: tempA1 is A[j*lda + k + 1], paired with B[iB1 + x];
// tempA0 is paired with B[iB0 + x].
// NOTE(review): the assignments to tempA0, iB0 and iB1 are not visible here.
138 for (k
= m
*lda
/2; k
< (m
+1)*lda
/2; k
+=2){
140 tempA1
= A
[j
*lda
+k
+1];
141 tempC0
+= tempA0
*B
[iB0
]+tempA1
*B
[iB1
];
142 tempC1
+= tempA0
*B
[iB0
+1]+tempA1
*B
[iB1
+1];
143 tempC2
+= tempA0
*B
[iB0
+2]+tempA1
*B
[iB1
+2];
144 tempC3
+= tempA0
*B
[iB0
+3]+tempA1
*B
[iB1
+3];
145 tempC4
+= tempA0
*B
[iB0
+4]+tempA1
*B
[iB1
+4];
146 tempC5
+= tempA0
*B
[iB0
+5]+tempA1
*B
[iB1
+5];
147 tempC6
+= tempA0
*B
[iB0
+6]+tempA1
*B
[iB1
+6];
148 tempC7
+= tempA0
*B
[iB0
+7]+tempA1
*B
[iB1
+7];
// Store the accumulators back to C[i + j*lda + 0..7].
153 C
[i
+ j
*lda
] = tempC0
;
154 C
[i
+ j
*lda
+ 1] = tempC1
;
155 C
[i
+ j
*lda
+ 2] = tempC2
;
156 C
[i
+ j
*lda
+ 3] = tempC3
;
157 C
[i
+ j
*lda
+ 4] = tempC4
;
158 C
[i
+ j
*lda
+ 5] = tempC5
;
159 C
[i
+ j
*lda
+ 6] = tempC6
;
160 C
[i
+ j
*lda
+ 7] = tempC7
;
// ---- Second nest: rows lda-1 .. lda/2, descending i and k ----
// m = 2 covers k from lda-1 down to lda/2; m = 1 covers k from lda/2-1 down to 0.
165 for (m
= 2; m
> 0; m
--){
166 for (j
= lda
-1; j
>= lda
/2; j
--){
167 for (i
= lda
-1; i
>= 0; i
-=8){
// Load the current values of C[i + j*lda - 0..7] into the accumulators.
168 tempC0
= C
[i
+ j
*lda
];
169 tempC1
= C
[i
+ j
*lda
- 1];
170 tempC2
= C
[i
+ j
*lda
- 2];
171 tempC3
= C
[i
+ j
*lda
- 3];
172 tempC4
= C
[i
+ j
*lda
- 4];
173 tempC5
= C
[i
+ j
*lda
- 5];
174 tempC6
= C
[i
+ j
*lda
- 6];
175 tempC7
= C
[i
+ j
*lda
- 7];
// Two k values per step: tempA1 is A[j*lda + k - 1], paired with row k-1 of
// B; tempA0 is paired with row k of B.
// NOTE(review): the assignment to tempA0 is not visible here.
176 for (k
= m
*lda
/2-1; k
>= (m
-1)*lda
/2; k
-=2){
178 tempA1
= A
[j
*lda
+k
-1];
179 tempC0
+= tempA0
*B
[k
*lda
+i
]+tempA1
*B
[(k
-1)*lda
+i
];
180 tempC1
+= tempA0
*B
[k
*lda
+i
-1]+tempA1
*B
[(k
-1)*lda
+i
-1];
181 tempC2
+= tempA0
*B
[k
*lda
+i
-2]+tempA1
*B
[(k
-1)*lda
+i
-2];
182 tempC3
+= tempA0
*B
[k
*lda
+i
-3]+tempA1
*B
[(k
-1)*lda
+i
-3];
183 tempC4
+= tempA0
*B
[k
*lda
+i
-4]+tempA1
*B
[(k
-1)*lda
+i
-4];
184 tempC5
+= tempA0
*B
[k
*lda
+i
-5]+tempA1
*B
[(k
-1)*lda
+i
-5];
185 tempC6
+= tempA0
*B
[k
*lda
+i
-6]+tempA1
*B
[(k
-1)*lda
+i
-6];
186 tempC7
+= tempA0
*B
[k
*lda
+i
-7]+tempA1
*B
[(k
-1)*lda
+i
-7];
// Store the accumulators back to C[i + j*lda - 0..7].
188 C
[i
+ j
*lda
] = tempC0
;
189 C
[i
+ j
*lda
- 1] = tempC1
;
190 C
[i
+ j
*lda
- 2] = tempC2
;
191 C
[i
+ j
*lda
- 3] = tempC3
;
192 C
[i
+ j
*lda
- 4] = tempC4
;
193 C
[i
+ j
*lda
- 5] = tempC5
;
194 C
[i
+ j
*lda
- 6] = tempC6
;
195 C
[i
+ j
*lda
- 7] = tempC7
;
202 //--------------------------------------------------------------------------
205 // all threads start executing thread_entry(). Use their "coreid" to
206 // differentiate between threads (each thread is running on a separate core).
// thread_entry: benchmark driver.
//   cid - not referenced in the lines visible here
//   nc  - passed to barrier() inside the timed region
// Visible steps: run matmul() on input1_data/input2_data into results_data
// inside the stats() timing macro (followed by barrier(nc)), print the
// results and reference arrays, then compare them with verifyMT().
// NOTE(review): the braces and any other statements (e.g. setting coreid,
// additional barriers, a final exit) are not visible in this extract.
208 void thread_entry(int cid
, int nc
)
213 // static allocates data in the binary, which is visible to both threads
214 static data_t results_data
[ARRAY_SIZE
];
217 // // Execute the provided, naive matmul
219 // stats(matmul_naive(DIM_SIZE, input1_data, input2_data, results_data); barrier(nc));
223 // verifyMT(ARRAY_SIZE, results_data, verify_data);
225 // // clear results from the first trial
228 // for (i=0; i < ARRAY_SIZE; i++)
229 // results_data[i] = 0;
233 // Execute your faster matmul
// The barrier is inside the stats() argument, so it is part of the timed code.
235 stats(matmul(DIM_SIZE
, input1_data
, input2_data
, results_data
); barrier(nc
));
// Dump the computed and the reference arrays for visual comparison.
238 printArrayMT("results:", ARRAY_SIZE
, results_data
);
239 printArrayMT("verify :", ARRAY_SIZE
, verify_data
);
// Element-wise check of the computed results against the reference data.
243 verifyMT(ARRAY_SIZE
, results_data
, verify_data
);