Run debug tests from main Makefile.
[riscv-tests.git] / mt / ce_matmul.c
1 #include "stdlib.h"
2
3 #include "util.h"
4
5 #include "dataset.h"
6 void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda, const data_t A[], const data_t B[], data_t C[] )
7 {
8
9 data_t a1;
10 data_t a2;
11 data_t a3;
12 data_t a4;
13 data_t a5;
14 data_t a6;
15 data_t a7;
16 data_t a8;
17 data_t *b1;
18 data_t *b2;
19 data_t *b3;
20 data_t *b4;
21 data_t *b5;
22 data_t *b6;
23 data_t *b7;
24 data_t *b8;
25 data_t c1;
26 data_t c2;
27 data_t c3;
28 data_t c4;
29 data_t c5;
30 data_t c6;
31 data_t c7;
32 data_t c8;
33 int i, j, k;
34 int start, end;
35 static data_t BB[1024];
36
37
38 //transpose B
39 for ( k = 0; k < lda; k++) {
40 for ( i = coreid*(lda/ncores); i < (coreid+1)*(lda/ncores); i++ ) {
41 BB[i*lda + k] = B[k*lda + i];
42 }
43 barrier(ncores);
44 }
45
46 for ( int x = 0; x < ncores; x++) {
47 //split the i values into two chunks so the threads don't interfere on the B loads
48 //this could be generalized if needed, but I won't bother since it would be tricky
49 //and we already know the size and numthreads
50 start = x * (32 / ncores);
51 end = (x+1) * (32 / ncores);
52 for ( i = start; i < end; i+=8 ) {
53 for ( j = coreid*(lda/ncores); j < (coreid+1)*(lda/ncores); j++ ) {
54 c1=0;c2=0;c3=0;c4=0;c5=0;c6=0;c7=0;c8=0;
55 b1 = &BB[(i+0)*lda];
56 b2 = &BB[(i+1)*lda];
57 b3 = &BB[(i+2)*lda];
58 b4 = &BB[(i+3)*lda];
59 b5 = &BB[(i+4)*lda];
60 b6 = &BB[(i+5)*lda];
61 b7 = &BB[(i+6)*lda];
62 b8 = &BB[(i+7)*lda];
63
64 for ( k = 0; k < lda; k+=8 ) {
65 a1 = A[j*lda + k+0];
66 a2 = A[j*lda + k+1];
67 a3 = A[j*lda + k+2];
68 a4 = A[j*lda + k+3];
69 a5 = A[j*lda + k+4];
70 a6 = A[j*lda + k+5];
71 a7 = A[j*lda + k+6];
72 a8 = A[j*lda + k+7];
73
74 c1 += a1 * b1[k+0];
75 c1 += a2 * b1[k+1];
76 c1 += a3 * b1[k+2];
77 c1 += a4 * b1[k+3];
78 c1 += a5 * b1[k+4];
79 c1 += a6 * b1[k+5];
80 c1 += a7 * b1[k+6];
81 c1 += a8 * b1[k+7];
82
83 c2 += a1 * b2[k+0];
84 c2 += a2 * b2[k+1];
85 c2 += a3 * b2[k+2];
86 c2 += a4 * b2[k+3];
87 c2 += a5 * b2[k+4];
88 c2 += a6 * b2[k+5];
89 c2 += a7 * b2[k+6];
90 c2 += a8 * b2[k+7];
91
92 c3 += a1 * b3[k+0];
93 c3 += a2 * b3[k+1];
94 c3 += a3 * b3[k+2];
95 c3 += a4 * b3[k+3];
96 c3 += a5 * b3[k+4];
97 c3 += a6 * b3[k+5];
98 c3 += a7 * b3[k+6];
99 c3 += a8 * b3[k+7];
100
101 c4 += a1 * b4[k+0];
102 c4 += a2 * b4[k+1];
103 c4 += a3 * b4[k+2];
104 c4 += a4 * b4[k+3];
105 c4 += a5 * b4[k+4];
106 c4 += a6 * b4[k+5];
107 c4 += a7 * b4[k+6];
108 c4 += a8 * b4[k+7];
109
110 c5 += a1 * b5[k+0];
111 c5 += a2 * b5[k+1];
112 c5 += a3 * b5[k+2];
113 c5 += a4 * b5[k+3];
114 c5 += a5 * b5[k+4];
115 c5 += a6 * b5[k+5];
116 c5 += a7 * b5[k+6];
117 c5 += a8 * b5[k+7];
118
119 c6 += a1 * b6[k+0];
120 c6 += a2 * b6[k+1];
121 c6 += a3 * b6[k+2];
122 c6 += a4 * b6[k+3];
123 c6 += a5 * b6[k+4];
124 c6 += a6 * b6[k+5];
125 c6 += a7 * b6[k+6];
126 c6 += a8 * b6[k+7];
127
128 c7 += a1 * b7[k+0];
129 c7 += a2 * b7[k+1];
130 c7 += a3 * b7[k+2];
131 c7 += a4 * b7[k+3];
132 c7 += a5 * b7[k+4];
133 c7 += a6 * b7[k+5];
134 c7 += a7 * b7[k+6];
135 c7 += a8 * b7[k+7];
136
137 c8 += a1 * b8[k+0];
138 c8 += a2 * b8[k+1];
139 c8 += a3 * b8[k+2];
140 c8 += a4 * b8[k+3];
141 c8 += a5 * b8[k+4];
142 c8 += a6 * b8[k+5];
143 c8 += a7 * b8[k+6];
144 c8 += a8 * b8[k+7];
145 }
146 C[i+0 + j*lda] += c1;
147 C[i+1 + j*lda] += c2;
148 C[i+2 + j*lda] += c3;
149 C[i+3 + j*lda] += c4;
150 C[i+4 + j*lda] += c5;
151 C[i+5 + j*lda] += c6;
152 C[i+6 + j*lda] += c7;
153 C[i+7 + j*lda] += c8;
154 }
155 }
156 }
157 }