Add missing RV32 slt[i]u tests
[riscv-tests.git] / mt / bc_matmul.c
1 #include "stdlib.h"
2
3 #include "util.h"
4
5 #include "dataset.h"
6
/* Register tile: REG_I consecutive columns by REG_J consecutive rows of C are
 * accumulated in local variables at a time.  The inner kernel of matmul() is
 * hand-unrolled for exactly these values. */
#define REG_I 8
#define REG_J 2
/* Cache-block sizes along the row (j) and reduction (k) dimensions.  The
 * column (i) dimension is not blocked, so BLOCK_I stays disabled. */
//#define BLOCK_I 32
#define BLOCK_J 16
#define BLOCK_K 16
/* Matrix dimension and core count the optimized path is specialized for. */
#define LDA 32
#define NCORES 2
/* Smaller of X and Y.  Each argument is parenthesized so that expressions
 * such as `a & b` expand correctly; note that the selected argument is still
 * evaluated twice, so avoid arguments with side effects. */
#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
15
16 void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda, const data_t A[], const data_t B[], data_t C[] )
17 {
18
19 // ***************************** //
20 // **** ADD YOUR CODE HERE ***** //
21 // ***************************** //
22 //
23 // feel free to make a separate function for MI and MSI versions.
24
25 int i, j, k, ri, rj, ii, jj, kk;
26 data_t *Aj, *Cj, *Bi;
27 data_t c[REG_I][REG_J], a[REG_J], b[REG_I];
28 size_t start = coreid * (LDA / NCORES), end = (coreid == NCORES - 1 ? LDA : (coreid + 1) * (LDA / NCORES));
29
30 /* if (coreid > 0) { */
31 /* return; */
32 /* } */
33 /* start = 0, end = lda; */
34 if (ncores == NCORES && lda == LDA) {
35 for (jj = start; jj < end; jj += BLOCK_J)
36 for (kk = 0; kk < LDA; kk += BLOCK_K)
37 //for (ii = 0; ii < LDA; ii += BLOCK_I)
38 for (j = jj; j < MIN(end, jj + BLOCK_J); j += REG_J) {
39 Aj = A + j*LDA;
40 Cj = C + j*LDA;
41 for (i = 0; i < LDA; i += REG_I) {
42 /* Load C in register blocks. */
43 Bi = B + i;
44 for (ri = 0; ri < REG_I; ri++) {
45 for (rj = 0; rj < REG_J; rj++) {
46 c[ri][rj] = Cj[i + ri + ( rj)*LDA];
47 }
48 }
49
50
51 for (k = kk; k < MIN(LDA, kk + BLOCK_K); k++) {
52 /* Load a,b in register blocks. */
53 /* for (rj = 0; rj < REG_J; rj++) {
54 a[rj] = A[(j + rj)*LDA + k];
55 }*/
56 /* for (ri = 0; ri < REG_I; ri++) { */
57 /* b[ri] = Bi[k*LDA + ri]; */
58 /* } */
59 /* /\* Compute C in register blocks. *\/ */
60 /* for (rj = 0; rj < REG_J; rj++) { */
61 /* a[rj] = Aj[( rj)*LDA + k]; */
62 /* for (ri = 0; ri < REG_I; ri++) { */
63 /* c[ri][rj] += a[rj] * b[ri]; */
64 /* } */
65 /* } */
66 a[0] = Aj[k];
67 a[1] = Aj[k + LDA];
68 b[0] = Bi[k*LDA];
69 b[1] = Bi[k*LDA + 1];
70 b[2] = Bi[k*LDA + 2];
71 b[3] = Bi[k*LDA + 3];
72 b[4] = Bi[k*LDA + 4];
73 b[5] = Bi[k*LDA + 5];
74 b[6] = Bi[k*LDA + 6];
75 b[7] = Bi[k*LDA + 7];
76
77
78 c[0][0] += b[0] * a[0];
79 c[0][1] += b[0] * a[1];
80 c[1][0] += b[1] * a[0];
81 c[1][1] += b[1] * a[1];
82 c[2][0] += b[2] * a[0];
83 c[2][1] += b[2] * a[1];
84 c[3][0] += b[3] * a[0];
85 c[3][1] += b[3] * a[1];
86 c[4][0] += b[4] * a[0];
87 c[4][1] += b[4] * a[1];
88 c[5][0] += b[5] * a[0];
89 c[5][1] += b[5] * a[1];
90 c[6][0] += b[6] * a[0];
91 c[6][1] += b[6] * a[1];
92 c[7][0] += b[7] * a[0];
93 c[7][1] += b[7] * a[1];
94
95
96 /* c[0][0] += b[0] * a[0]; */
97 /* c[1][1] += b[1] * a[1]; */
98 /* c[2][0] += b[2] * a[0]; */
99 /* c[3][1] += b[3] * a[1]; */
100 /* c[4][0] += b[4] * a[0]; */
101 /* c[5][1] += b[5] * a[1]; */
102 /* c[6][0] += b[6] * a[0]; */
103 /* c[7][1] += b[7] * a[1]; */
104 /* c[0][0] += b[0] * a[0]; */
105 /* c[1][1] += b[1] * a[1]; */
106 /* c[2][0] += b[2] * a[0]; */
107 /* c[3][1] += b[3] * a[1]; */
108 /* c[4][0] += b[4] * a[0]; */
109 /* c[5][1] += b[5] * a[1]; */
110 /* c[6][0] += b[6] * a[0]; */
111 /* c[7][1] += b[7] * a[1]; */
112
113 }
114
115 /* store C in register blocks. */
116 for (ri = 0; ri < REG_I; ri++) {
117 for (rj = 0; rj < REG_J; rj++) {
118 Cj[i + ri + (rj)*LDA] = c[ri][rj];
119 }
120 }
121 }
122
123
124
125
126 }
127 /* We only care about performance for 32x32 matrices and 2 cores. Otherwise just naive mat_mul */
128 } else {
129 if (coreid > 0)
130 return;
131
132 for ( i = 0; i < lda; i++ )
133 for ( j = 0; j < lda; j++ )
134 for ( k = 0; k < lda; k++ )
135 C[i + j*lda] += A[j*lda + k] * B[k*lda + i];
136 }
137 }