bump env
[riscv-tests.git] / mt / cl_matmul.c
1 #include "stdlib.h"
2
3 #include "util.h"
4
5 #include "dataset.h"
6 void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda, const data_t A[], const data_t B[], data_t C[] )
7 {
8 if(coreid > 1) return;
9 // feel free to make a separate function for MI and MSI versions.
10 int i, j, k, x;
11 data_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
12 data_t temp8, temp9, temp10, temp11, temp12, temp13, temp14, temp15;
13
14
15 if(coreid == 0) {
16 for(j = 0; j < 32; j++) {
17 temp0 = C[j*lda];
18 temp1 = C[1 + j*lda];
19 temp2 = C[2 + j*lda];
20 temp3 = C[3 + j*lda];
21 temp4 = C[4 + j*lda];
22 temp5 = C[5 + j*lda];
23 temp6 = C[6 + j*lda];
24 temp7 = C[7 + j*lda];
25 temp8 = C[8 + j*lda];
26 temp9 = C[9 + j*lda];
27 temp10 = C[10 + j*lda];
28 temp11 = C[11 + j*lda];
29 temp12 = C[12 + j*lda];
30 temp13 = C[13 + j*lda];
31 temp14 = C[14 + j*lda];
32 temp15 = C[15 + j*lda];
33 for(k = 0; k < 32; k++) {
34 temp0 += A[j*lda + k] * B[k*lda];
35 temp1 += A[j*lda + k] * B[1 + k*lda];
36 temp2 += A[j*lda + k] * B[2 + k*lda];
37 temp3 += A[j*lda + k] * B[3 + k*lda];
38 temp4 += A[j*lda + k] * B[4 + k*lda];
39 temp5 += A[j*lda + k] * B[5 + k*lda];
40 temp6 += A[j*lda + k] * B[6 + k*lda];
41 temp7 += A[j*lda + k] * B[7 + k*lda];
42 temp8 += A[j*lda + k] * B[8 + k*lda];
43 temp9 += A[j*lda + k] * B[9 + k*lda];
44 temp10 += A[j*lda + k] * B[10 + k*lda];
45 temp11 += A[j*lda + k] * B[11 + k*lda];
46 temp12 += A[j*lda + k] * B[12 + k*lda];
47 temp13 += A[j*lda + k] * B[13 + k*lda];
48 temp14 += A[j*lda + k] * B[14 + k*lda];
49 temp15 += A[j*lda + k] * B[15 + k*lda];
50 }
51 C[j*lda] = temp0;
52 C[1 + j*lda] = temp1;
53 C[2 + j*lda] = temp2;
54 C[3 + j*lda] = temp3;
55 C[4 + j*lda] = temp4;
56 C[5 + j*lda] = temp5;
57 C[6 + j*lda] = temp6;
58 C[7 + j*lda] = temp7;
59 C[8 + j*lda] = temp8;
60 C[9 + j*lda] = temp9;
61 C[10 + j*lda] = temp10;
62 C[11 + j*lda] = temp11;
63 C[12 + j*lda] = temp12;
64 C[13 + j*lda] = temp13;
65 C[14 + j*lda] = temp14;
66 C[15 + j*lda] = temp15;
67 }
68 }
69
70 if(coreid == 1 || ncores == 1) {
71 for(j = 16; j < 32; j++) {
72 temp0 = C[16 + j*lda];
73 temp1 = C[17 + j*lda];
74 temp2 = C[18 + j*lda];
75 temp3 = C[19 + j*lda];
76 temp4 = C[20 + j*lda];
77 temp5 = C[21 + j*lda];
78 temp6 = C[22 + j*lda];
79 temp7 = C[23 + j*lda];
80 temp8 = C[24 + j*lda];
81 temp9 = C[25 + j*lda];
82 temp10 = C[26 + j*lda];
83 temp11 = C[27 + j*lda];
84 temp12 = C[28 + j*lda];
85 temp13 = C[29 + j*lda];
86 temp14 = C[30 + j*lda];
87 temp15 = C[31 + j*lda];
88 for(k = 0; k < 32; k++) {
89 temp0 += A[j*lda + k] * B[16 + k*lda];
90 temp1 += A[j*lda + k] * B[17 + k*lda];
91 temp2 += A[j*lda + k] * B[18 + k*lda];
92 temp3 += A[j*lda + k] * B[19 + k*lda];
93 temp4 += A[j*lda + k] * B[20 + k*lda];
94 temp5 += A[j*lda + k] * B[21 + k*lda];
95 temp6 += A[j*lda + k] * B[22 + k*lda];
96 temp7 += A[j*lda + k] * B[23 + k*lda];
97 temp8 += A[j*lda + k] * B[24 + k*lda];
98 temp9 += A[j*lda + k] * B[25 + k*lda];
99 temp10 += A[j*lda + k] * B[26 + k*lda];
100 temp11 += A[j*lda + k] * B[27 + k*lda];
101 temp12 += A[j*lda + k] * B[28 + k*lda];
102 temp13 += A[j*lda + k] * B[29 + k*lda];
103 temp14 += A[j*lda + k] * B[30 + k*lda];
104 temp15 += A[j*lda + k] * B[31 + k*lda];
105 }
106 C[16 + j*lda] = temp0;
107 C[17 + j*lda] = temp1;
108 C[18 + j*lda] = temp2;
109 C[19 + j*lda] = temp3;
110 C[20 + j*lda] = temp4;
111 C[21 + j*lda] = temp5;
112 C[22 + j*lda] = temp6;
113 C[23 + j*lda] = temp7;
114 C[24 + j*lda] = temp8;
115 C[25 + j*lda] = temp9;
116 C[26 + j*lda] = temp10;
117 C[27 + j*lda] = temp11;
118 C[28 + j*lda] = temp12;
119 C[29 + j*lda] = temp13;
120 C[30 + j*lda] = temp14;
121 C[31 + j*lda] = temp15;
122 }
123 for(j = 0; j <16; j++) {
124 temp0 = C[16 + j*lda];
125 temp1 = C[17 + j*lda];
126 temp2 = C[18 + j*lda];
127 temp3 = C[19 + j*lda];
128 temp4 = C[20 + j*lda];
129 temp5 = C[21 + j*lda];
130 temp6 = C[22 + j*lda];
131 temp7 = C[23 + j*lda];
132 temp8 = C[24 + j*lda];
133 temp9 = C[25 + j*lda];
134 temp10 = C[26 + j*lda];
135 temp11 = C[27 + j*lda];
136 temp12 = C[28 + j*lda];
137 temp13 = C[29 + j*lda];
138 temp14 = C[30 + j*lda];
139 temp15 = C[31 + j*lda];
140 for(k = 0; k < 32; k++) {
141 temp0 += A[j*lda + k] * B[16 + k*lda];
142 temp1 += A[j*lda + k] * B[17 + k*lda];
143 temp2 += A[j*lda + k] * B[18 + k*lda];
144 temp3 += A[j*lda + k] * B[19 + k*lda];
145 temp4 += A[j*lda + k] * B[20 + k*lda];
146 temp5 += A[j*lda + k] * B[21 + k*lda];
147 temp6 += A[j*lda + k] * B[22 + k*lda];
148 temp7 += A[j*lda + k] * B[23 + k*lda];
149 temp8 += A[j*lda + k] * B[24 + k*lda];
150 temp9 += A[j*lda + k] * B[25 + k*lda];
151 temp10 += A[j*lda + k] * B[26 + k*lda];
152 temp11 += A[j*lda + k] * B[27 + k*lda];
153 temp12 += A[j*lda + k] * B[28 + k*lda];
154 temp13 += A[j*lda + k] * B[29 + k*lda];
155 temp14 += A[j*lda + k] * B[30 + k*lda];
156 temp15 += A[j*lda + k] * B[31 + k*lda];
157 }
158 C[16 + j*lda] = temp0;
159 C[17 + j*lda] = temp1;
160 C[18 + j*lda] = temp2;
161 C[19 + j*lda] = temp3;
162 C[20 + j*lda] = temp4;
163 C[21 + j*lda] = temp5;
164 C[22 + j*lda] = temp6;
165 C[23 + j*lda] = temp7;
166 C[24 + j*lda] = temp8;
167 C[25 + j*lda] = temp9;
168 C[26 + j*lda] = temp10;
169 C[27 + j*lda] = temp11;
170 C[28 + j*lda] = temp12;
171 C[29 + j*lda] = temp13;
172 C[30 + j*lda] = temp14;
173 C[31 + j*lda] = temp15;
174 }
175 }
176 }