New PMP encoding
[riscv-tests.git] / mt / dv_matmul.c
1 #include "stdlib.h"
2
3 #include "util.h"
4
5 #include "dataset.h"
6 void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda, const data_t A[], const data_t B[], data_t C[] )
7 {
8
9 // ***************************** //
10 // **** ADD YOUR CODE HERE ***** //
11 // ***************************** //
12 //
13 // feel free to make a separate function for MI and MSI versions.
14 int i, j, k, ii, jj, kk;
15 if(coreid > 1) return;
16 if (coreid || ncores == 1) {
17 // for ( ii = 0; ii < 32; ii+=IC )
18 for ( kk = 0; kk < 32; kk+=16 )
19 for ( j = 0; j < 16; j++ )
20 // for ( j = 0; j < 16; j++ )
21 {
22 for ( i = 0; i < 32; i+=8 )
23 // for ( i = ii; i < ii + IC && i < 32; i+=8 )
24 {
25 data_t temp0 = C[i+j*32];
26 data_t temp1 = C[i+j*32+1];
27 data_t temp2 = C[i+j*32+2];
28 data_t temp3 = C[i+j*32+3];
29 data_t temp4 = C[i+j*32+4];
30 data_t temp5 = C[i+j*32+5];
31 data_t temp6 = C[i+j*32+6];
32 data_t temp7 = C[i+j*32+7];
33 for ( k = kk; k < kk+16 && k < 32; k++ )
34 // for ( k = 0; k < 32; k++ )
35 {
36 data_t tempA = A[j*32+k];
37 temp0 += tempA * B[k*32 + i];
38 temp1 += tempA * B[k*32 + i+1];
39 temp2 += tempA * B[k*32 + i+2];
40 temp3 += tempA * B[k*32 + i+3];
41 temp4 += tempA * B[k*32 + i+4];
42 temp5 += tempA * B[k*32 + i+5];
43 temp6 += tempA * B[k*32 + i+6];
44 temp7 += tempA * B[k*32 + i+7];
45 }
46 C[i+j*32] = temp0;
47 C[i+j*32+1] = temp1;
48 C[i+j*32+2] = temp2;
49 C[i+j*32+3] = temp3;
50 C[i+j*32+4] = temp4;
51 C[i+j*32+5] = temp5;
52 C[i+j*32+6] = temp6;
53 C[i+j*32+7] = temp7;
54 }
55 }
56 }
57 if(coreid == 0){
58 // for ( ii = 0; ii < 32; ii+=IC )
59 for ( kk = 0; kk < 32; kk+=16 )
60 for ( j = 16; j < 32; j++ )
61 // for ( j = 16; j < 32; j++ )
62 {
63 for ( i = 0; i < 32; i+=8 )
64 // for ( i = ii; i < ii + IC && i < 32; i+=8 )
65 {
66 data_t temp0 = C[i+j*32];
67 data_t temp1 = C[i+j*32+1];
68 data_t temp2 = C[i+j*32+2];
69 data_t temp3 = C[i+j*32+3];
70 data_t temp4 = C[i+j*32+4];
71 data_t temp5 = C[i+j*32+5];
72 data_t temp6 = C[i+j*32+6];
73 data_t temp7 = C[i+j*32+7];
74 for ( k = kk; k < kk+16 && k < 32; k++ )
75 {
76 data_t tempA = A[j*32+k];
77 temp0 += tempA * B[k*32 + i];
78 temp1 += tempA * B[k*32 + i+1];
79 temp2 += tempA * B[k*32 + i+2];
80 temp3 += tempA * B[k*32 + i+3];
81 temp4 += tempA * B[k*32 + i+4];
82 temp5 += tempA * B[k*32 + i+5];
83 temp6 += tempA * B[k*32 + i+6];
84 temp7 += tempA * B[k*32 + i+7];
85 }
86 C[i+j*32] = temp0;
87 C[i+j*32+1] = temp1;
88 C[i+j*32+2] = temp2;
89 C[i+j*32+3] = temp3;
90 C[i+j*32+4] = temp4;
91 C[i+j*32+5] = temp5;
92 C[i+j*32+6] = temp6;
93 C[i+j*32+7] = temp7;
94 }
95
96 }
97 }
98 }