Merge pull request #8 from riscv/sqrt-171
[riscv-tests.git] / mt / dm_matmul.c
1 #include "stdlib.h"
2
3 #include "util.h"
4
5 #include "dataset.h"
6 void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda, const data_t A[], const data_t B[], data_t C[] )
7 {
8
9 // ***************************** //
10 // **** ADD YOUR CODE HERE ***** //
11 // ***************************** //
12 //
13 // feel free to make a separate function for MI and MSI versions.
14 int i, j, k;
15 int space=lda/ncores;
16 int max= space*coreid+space;
17 static data_t B1[32*32];
18 if (coreid==ncores-1){
19 for (i=0; i<lda*lda/ncores;i++)
20 {
21 B1[i]=B[i];
22 }
23 }
24 else{
25 for (i=lda*lda/ncores;i<lda*lda;i++)
26 B1[i]=B[i];
27 }
28 data_t temp=0;
29 data_t temp1=0;
30 data_t temp2=0;
31 data_t temp3=0;
32 data_t tempB=0;
33
34 data_t temp_1=0;
35 data_t temp1_1=0;
36 data_t temp2_1=0;
37 data_t temp3_1=0;
38 data_t tempB_1=0;
39
40 data_t temp_2=0;
41 data_t temp1_2=0;
42 data_t temp2_2=0;
43 data_t temp3_2=0;
44 data_t tempB_2=0;
45
46 data_t temp_3=0;
47 data_t temp1_3=0;
48 data_t temp2_3=0;
49 data_t temp3_3=0;
50 data_t tempB_3=0;
51 barrier(ncores);
52 if (coreid!=ncores-1){
53 for (i=space*coreid;i<max/4*4;i+=4)
54 {
55 for(j=0;j<lda/4*4;j+=4)
56 {
57 temp=C[j+i*lda];
58 temp1=C[j+(i+1)*lda];
59 temp2=C[j+(i+2)*lda];
60 temp3=C[j+(i+3)*lda];
61 temp_1=C[j+1+i*lda];
62 temp1_1=C[j+1+(i+1)*lda];
63 temp2_1=C[j+1+(i+2)*lda];
64 temp3_1=C[j+1+(i+3)*lda];
65 temp_2=C[j+2+i*lda];
66 temp1_2=C[j+2+(i+1)*lda];
67 temp2_2=C[j+2+(i+2)*lda];
68 temp3_2=C[j+2+(i+3)*lda];
69 temp_3=C[j+3+i*lda];
70 temp1_3=C[j+3+(i+1)*lda];
71 temp2_3=C[j+3+(i+2)*lda];
72 temp3_3=C[j+3+(i+3)*lda];
73 for (k=0;k<lda;k++)
74 {
75 tempB=B[j+k*lda];
76 temp+=A[k+i*lda]*tempB;
77 temp1+=A[k+(i+1)*lda]*tempB;
78 temp2+=A[k+(i+2)*lda]*tempB;
79 temp3+=A[k+(i+3)*lda]*tempB;
80
81 tempB_1=B[j+1+k*lda];
82 temp_1+=A[k+i*lda]*tempB_1;
83 temp1_1+=A[k+(i+1)*lda]*tempB_1;
84 temp2_1+=A[k+(i+2)*lda]*tempB_1;
85 temp3_1+=A[k+(i+3)*lda]*tempB_1;
86
87 tempB_2=B[j+2+k*lda];
88 temp_2+=A[k+i*lda]*tempB_2;
89 temp1_2+=A[k+(i+1)*lda]*tempB_2;
90 temp2_2+=A[k+(i+2)*lda]*tempB_2;
91 temp3_2+=A[k+(i+3)*lda]*tempB_2;
92
93 tempB_3=B[j+3+k*lda];
94 temp_3+=A[k+i*lda]*tempB_3;
95 temp1_3+=A[k+(i+1)*lda]*tempB_3;
96 temp2_3+=A[k+(i+2)*lda]*tempB_3;
97 temp3_3+=A[k+(i+3)*lda]*tempB_3;
98 }
99 C[j+i*lda]=temp;
100 C[j+(i+1)*lda]=temp1;
101 C[j+(i+2)*lda]=temp2;
102 C[j+(i+3)*lda]=temp3;
103
104 C[j+1+i*lda]=temp_1;
105 C[j+1+(i+1)*lda]=temp1_1;
106 C[j+1+(i+2)*lda]=temp2_1;
107 C[j+1+(i+3)*lda]=temp3_1;
108
109 C[j+2+i*lda]=temp_2;
110 C[j+2+(i+1)*lda]=temp1_2;
111 C[j+2+(i+2)*lda]=temp2_2;
112 C[j+2+(i+3)*lda]=temp3_2;
113
114 C[j+3+i*lda]=temp_3;
115 C[j+3+(i+1)*lda]=temp1_3;
116 C[j+3+(i+2)*lda]=temp2_3;
117 C[j+3+(i+3)*lda]=temp3_3;
118
119 }
120 }
121 }
122 else{
123 for (i=space*coreid;i<lda/4*4;i+=4)
124 {
125 for(j=0;j<lda/4*4;j+=4)
126 {
127 temp=C[j+i*lda];
128 temp1=C[j+(i+1)*lda];
129 temp2=C[j+(i+2)*lda];
130 temp3=C[j+(i+3)*lda];
131 temp_1=C[j+1+i*lda];
132 temp1_1=C[j+1+(i+1)*lda];
133 temp2_1=C[j+1+(i+2)*lda];
134 temp3_1=C[j+1+(i+3)*lda];
135 temp_2=C[j+2+i*lda];
136 temp1_2=C[j+2+(i+1)*lda];
137 temp2_2=C[j+2+(i+2)*lda];
138 temp3_2=C[j+2+(i+3)*lda];
139 temp_3=C[j+3+i*lda];
140 temp1_3=C[j+3+(i+1)*lda];
141 temp2_3=C[j+3+(i+2)*lda];
142 temp3_3=C[j+3+(i+3)*lda];
143 for (k=0;k<lda;k++)
144 {
145 tempB=B1[j+k*lda];
146 temp+=A[k+i*lda]*tempB;
147 temp1+=A[k+(i+1)*lda]*tempB;
148 temp2+=A[k+(i+2)*lda]*tempB;
149 temp3+=A[k+(i+3)*lda]*tempB;
150
151 tempB_1=B1[j+1+k*lda];
152 temp_1+=A[k+i*lda]*tempB_1;
153 temp1_1+=A[k+(i+1)*lda]*tempB_1;
154 temp2_1+=A[k+(i+2)*lda]*tempB_1;
155 temp3_1+=A[k+(i+3)*lda]*tempB_1;
156
157 tempB_2=B1[j+2+k*lda];
158 temp_2+=A[k+i*lda]*tempB_2;
159 temp1_2+=A[k+(i+1)*lda]*tempB_2;
160 temp2_2+=A[k+(i+2)*lda]*tempB_2;
161 temp3_2+=A[k+(i+3)*lda]*tempB_2;
162
163 tempB_3=B1[j+3+k*lda];
164 temp_3+=A[k+i*lda]*tempB_3;
165 temp1_3+=A[k+(i+1)*lda]*tempB_3;
166 temp2_3+=A[k+(i+2)*lda]*tempB_3;
167 temp3_3+=A[k+(i+3)*lda]*tempB_3;
168 }
169 C[j+i*lda]=temp;
170 C[j+(i+1)*lda]=temp1;
171 C[j+(i+2)*lda]=temp2;
172 C[j+(i+3)*lda]=temp3;
173
174 C[j+1+i*lda]=temp_1;
175 C[j+1+(i+1)*lda]=temp1_1;
176 C[j+1+(i+2)*lda]=temp2_1;
177 C[j+1+(i+3)*lda]=temp3_1;
178
179 C[j+2+i*lda]=temp_2;
180 C[j+2+(i+1)*lda]=temp1_2;
181 C[j+2+(i+2)*lda]=temp2_2;
182 C[j+2+(i+3)*lda]=temp3_2;
183
184 C[j+3+i*lda]=temp_3;
185 C[j+3+(i+1)*lda]=temp1_3;
186 C[j+3+(i+2)*lda]=temp2_3;
187 C[j+3+(i+3)*lda]=temp3_3;
188
189 }
190 }
191 }
192
193
194
195
196 }