Separate page faults from physical memory access exceptions
[riscv-tests.git] / mt / bm_matmul.c
1 #include "stdlib.h"
2
3 #include "util.h"
4
5 #include "dataset.h"
6 void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda, const data_t A[], const data_t B[], data_t C[] )
7 {
8
9 // ***************************** //
10 // **** ADD YOUR CODE HERE ***** //
11 // ***************************** //
12 //
13 // feel free to make a separate function for MI and MSI versions.
14 int i, j, k;
15 int space=lda/ncores;
16 int max= space*coreid+space;
17 data_t temp=0;
18
19 data_t temp1=0;
20 data_t temp2=0;
21 data_t temp3=0;
22 data_t temp4=0;
23
24 data_t temp_1=0;
25
26 data_t temp1_1=0;
27 data_t temp2_1=0;
28 data_t temp3_1=0;
29 data_t temp4_1=0;
30
31 data_t temp_2=0;
32
33 data_t temp1_2=0;
34 data_t temp2_2=0;
35 data_t temp3_2=0;
36 data_t temp4_2=0;
37
38 data_t temp_3=0;
39
40 data_t temp1_3=0;
41 data_t temp2_3=0;
42 data_t temp3_3=0;
43 data_t temp4_3=0;
44
45 if (coreid!=ncores-1){
46 //main loop
47 for (i=space*coreid;i<max/4*4;i+=4)
48 {
49 for(j=0;j<lda;j+=4)
50 {
51 temp1=C[j+i*lda];
52 temp2=C[j+1+i*lda];
53 temp3=C[j+2+i*lda];
54 temp4=C[j+3+i*lda];
55
56 temp1_1=C[j+(i+1)*lda];
57 temp2_1=C[j+1+(i+1)*lda];
58 temp3_1=C[j+2+(i+1)*lda];
59 temp4_1=C[j+3+(i+1)*lda];
60
61 temp1_2=C[j+(i+2)*lda];
62 temp2_2=C[j+1+(i+2)*lda];
63 temp3_2=C[j+2+(i+2)*lda];
64 temp4_2=C[j+3+(i+2)*lda];
65
66 temp1_3=C[j+(i+3)*lda];
67 temp2_3=C[j+1+(i+3)*lda];
68 temp3_3=C[j+2+(i+3)*lda];
69 temp4_3=C[j+3+(i+3)*lda];
70 for (k=0;k<lda;k++)
71 {
72 temp=A[k+i*lda];
73 temp1+=temp*B[j+k*lda];
74 temp2+=temp*B[j+1+k*lda];
75 temp3+=temp*B[j+2+k*lda];
76 temp4+=temp*B[j+3+k*lda];
77
78 temp_1=A[k+(i+1)*lda];
79 temp1_1+=temp_1*B[j+k*lda];
80 temp2_1+=temp_1*B[j+1+k*lda];
81 temp3_1+=temp_1*B[j+2+k*lda];
82 temp4_1+=temp_1*B[j+3+k*lda];
83
84 temp_2=A[k+(i+2)*lda];
85 temp1_2+=temp_2*B[j+k*lda];
86 temp2_2+=temp_2*B[j+1+k*lda];
87 temp3_2+=temp_2*B[j+2+k*lda];
88 temp4_2+=temp_2*B[j+3+k*lda];
89
90 temp_3=A[k+(i+3)*lda];
91 temp1_3+=temp_3*B[j+k*lda];
92 temp2_3+=temp_3*B[j+1+k*lda];
93 temp3_3+=temp_3*B[j+2+k*lda];
94 temp4_3+=temp_3*B[j+3+k*lda];
95
96 }
97 C[j+i*lda]=temp1;
98 C[j+1+i*lda]=temp2;
99 C[j+2+i*lda]=temp3;
100 C[j+3+i*lda]=temp4;
101
102 C[j+(i+1)*lda]=temp1_1;
103 C[j+1+(i+1)*lda]=temp2_1;
104 C[j+2+(i+1)*lda]=temp3_1;
105 C[j+3+(i+1)*lda]=temp4_1;
106
107 C[j+(i+2)*lda]=temp1_2;
108 C[j+1+(i+2)*lda]=temp2_2;
109 C[j+2+(i+2)*lda]=temp3_2;
110 C[j+3+(i+2)*lda]=temp4_2;
111
112 C[j+(i+3)*lda]=temp1_3;
113 C[j+1+(i+3)*lda]=temp2_3;
114 C[j+2+(i+3)*lda]=temp3_3;
115 C[j+3+(i+3)*lda]=temp4_3;
116
117 }
118
119 }
120
121
122
123 }
124
125 //second core
126 else{
127 for (i=space*coreid;i<lda/4*4;i+=4)
128 {
129 for(j=0;j<lda;j+=4)
130 {
131 temp1=C[j+i*lda];
132 temp2=C[j+1+i*lda];
133 temp3=C[j+2+i*lda];
134 temp4=C[j+3+i*lda];
135
136 temp1_1=C[j+(i+1)*lda];
137 temp2_1=C[j+1+(i+1)*lda];
138 temp3_1=C[j+2+(i+1)*lda];
139 temp4_1=C[j+3+(i+1)*lda];
140
141 temp1_2=C[j+(i+2)*lda];
142 temp2_2=C[j+1+(i+2)*lda];
143 temp3_2=C[j+2+(i+2)*lda];
144 temp4_2=C[j+3+(i+2)*lda];
145
146 temp1_3=C[j+(i+3)*lda];
147 temp2_3=C[j+1+(i+3)*lda];
148 temp3_3=C[j+2+(i+3)*lda];
149 temp4_3=C[j+3+(i+3)*lda];
150 for (k=0;k<lda;k++)
151 {
152 temp=A[k+i*lda];
153 temp1+=temp*B[j+k*lda];
154 temp2+=temp*B[j+1+k*lda];
155 temp3+=temp*B[j+2+k*lda];
156 temp4+=temp*B[j+3+k*lda];
157
158 temp_1=A[k+(i+1)*lda];
159 temp1_1+=temp_1*B[j+k*lda];
160 temp2_1+=temp_1*B[j+1+k*lda];
161 temp3_1+=temp_1*B[j+2+k*lda];
162 temp4_1+=temp_1*B[j+3+k*lda];
163
164 temp_2=A[k+(i+2)*lda];
165 temp1_2+=temp_2*B[j+k*lda];
166 temp2_2+=temp_2*B[j+1+k*lda];
167 temp3_2+=temp_2*B[j+2+k*lda];
168 temp4_2+=temp_2*B[j+3+k*lda];
169
170 temp_3=A[k+(i+3)*lda];
171 temp1_3+=temp_3*B[j+k*lda];
172 temp2_3+=temp_3*B[j+1+k*lda];
173 temp3_3+=temp_3*B[j+2+k*lda];
174 temp4_3+=temp_3*B[j+3+k*lda];
175
176 }
177 C[j+i*lda]=temp1;
178 C[j+1+i*lda]=temp2;
179 C[j+2+i*lda]=temp3;
180 C[j+3+i*lda]=temp4;
181
182 C[j+(i+1)*lda]=temp1_1;
183 C[j+1+(i+1)*lda]=temp2_1;
184 C[j+2+(i+1)*lda]=temp3_1;
185 C[j+3+(i+1)*lda]=temp4_1;
186
187 C[j+(i+2)*lda]=temp1_2;
188 C[j+1+(i+2)*lda]=temp2_2;
189 C[j+2+(i+2)*lda]=temp3_2;
190 C[j+3+(i+2)*lda]=temp4_2;
191
192 C[j+(i+3)*lda]=temp1_3;
193 C[j+1+(i+3)*lda]=temp2_3;
194 C[j+2+(i+3)*lda]=temp3_3;
195 C[j+3+(i+3)*lda]=temp4_3;
196
197 }
198
199 }
200
201
202 }
203
204
205 }