Clean up benchmarks build
[riscv-tests.git] / mt / ck_matmul.c
1 #include "stdlib.h"
2
3 #include "util.h"
4
5 #include "dataset.h"
6 void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda, const data_t A[], const data_t B[], data_t C[] )
7 {
8
9 // ***************************** //
10 // **** ADD YOUR CODE HERE ***** //
11 // ***************************** //
12 //
13 // feel free to make a separate function for MI and MSI versions.
14 int i, j, k, ii, jj, bsize, start;
15 bsize = 16;
16 start = bsize*coreid;
17 for ( jj = start; jj < lda; jj += bsize*ncores) {
18 int first = 1;
19 for ( ii = start; ii !=start || first; ii=(bsize+ii) % lda) {
20 first = 0;
21 for ( j = jj; j < lda && j < jj + bsize; j+=4) {
22 for ( i = ii; i < lda && i < ii + bsize; i+=2) {
23 data_t c1 = C[i + j*lda];
24 data_t c2 = C[i + j*lda + 1];
25 data_t c3 = C[i + (j+1)*lda];
26 data_t c4 = C[i + (j+1)*lda + 1];
27 data_t c5 = C[i + (j+2)*lda];
28 data_t c6 = C[i + (j+2)*lda + 1];
29 data_t c7 = C[i + (j+3)*lda];
30 data_t c8 = C[i + (j+3)*lda + 1];
31 for ( k = 0; k < lda; k+=8){
32 for (int x = 0; x < 8; x++) {
33 data_t a = A[j*lda + k+x];
34 data_t a1 = A[(j+1)*lda +k+x];
35 data_t a2 = A[(j+2)*lda +k+x];
36 data_t a3 = A[(j+3)*lda +k+x];
37 data_t b1 = B[(k+x)*lda + i];
38 data_t b2 = B[(k+x)*lda + i + 1];
39 c1 += a * b1;
40 c2 += a * b2;
41 c3 += a1* b1;
42 c4 += a1* b2;
43 c5 += a2* b1;
44 c6 += a2* b2;
45 c7 += a3* b1;
46 c8 += a3* b2;
47 }
48 }
49 C[i + j*lda] = c1;
50 C[i + j*lda + 1] = c2;
51 C[i + (j+1)*lda] = c3;
52 C[i + (j+1)*lda + 1] = c4;
53 C[i + (j+2)*lda] = c5;
54 C[i + (j+2)*lda + 1] = c6;
55 C[i + (j+3)*lda] = c7;
56 C[i + (j+3)*lda + 1] = c8;
57 }
58 }
59 }
60 }
61 }