Use XLEN macro for these sources as well.
[riscv-tests.git] / mt / at_matmul.c
1 #include "stdlib.h"
2
3 #include "util.h"
4
5 #include "dataset.h"
6 void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda, const data_t A[], const data_t B[], data_t C[] )
7 {
8
9 // ***************************** //
10 // **** ADD YOUR CODE HERE ***** //
11 // ***************************** //
12 //
13 // feel free to make a separate function for MI and MSI versions.
14
15 int i, j, k;
16
17 /*547287
18 for ( i = coreid*lda/ncores; i < (coreid+1)*lda/ncores; i++ )
19 {
20 for ( j = 0; j < lda; j++ )
21 {
22 int aIndex = j*lda;
23 int cIndex = i + aIndex;
24 C[cIndex] += A[aIndex] * B[i];
25 C[cIndex] += A[aIndex + 1] * B[1*lda + i];
26 C[cIndex] += A[aIndex + 2] * B[2*lda + i];
27 C[cIndex] += A[aIndex + 3] * B[3*lda + i];
28 C[cIndex] += A[aIndex + 4] * B[4*lda + i];
29 C[cIndex] += A[aIndex + 5] * B[5*lda + i];
30 C[cIndex] += A[aIndex + 6] * B[6*lda + i];
31 C[cIndex] += A[aIndex + 7] * B[7*lda + i];
32 C[cIndex] += A[aIndex + 8] * B[8*lda + i];
33 C[cIndex] += A[aIndex + 9] * B[9*lda + i];
34 C[cIndex] += A[aIndex + 10] * B[10*lda + i];
35 C[cIndex] += A[aIndex + 11] * B[11*lda + i];
36 C[cIndex] += A[aIndex + 12] * B[12*lda + i];
37 C[cIndex] += A[aIndex + 13] * B[13*lda + i];
38 C[cIndex] += A[aIndex + 14] * B[14*lda + i];
39 C[cIndex] += A[aIndex + 15] * B[15*lda + i];
40 C[cIndex] += A[aIndex + 16] * B[16*lda + i];
41 C[cIndex] += A[aIndex + 17] * B[17*lda + i];
42 C[cIndex] += A[aIndex + 18] * B[18*lda + i];
43 C[cIndex] += A[aIndex + 19] * B[19*lda + i];
44 C[cIndex] += A[aIndex + 20] * B[20*lda + i];
45 C[cIndex] += A[aIndex + 21] * B[21*lda + i];
46 C[cIndex] += A[aIndex + 22] * B[22*lda + i];
47 C[cIndex] += A[aIndex + 23] * B[23*lda + i];
48 C[cIndex] += A[aIndex + 24] * B[24*lda + i];
49 C[cIndex] += A[aIndex + 25] * B[25*lda + i];
50 C[cIndex] += A[aIndex + 26] * B[26*lda + i];
51 C[cIndex] += A[aIndex + 27] * B[27*lda + i];
52 C[cIndex] += A[aIndex + 28] * B[28*lda + i];
53 C[cIndex] += A[aIndex + 29] * B[29*lda + i];
54 C[cIndex] += A[aIndex + 30] * B[30*lda + i];
55 C[cIndex] += A[aIndex + 31] * B[31*lda + i];
56 }
57 }
58 */
59
60 //492827
61 /* for ( i = coreid*lda/ncores; i < (coreid+1)*lda/ncores; i++ )
62 {
63 for ( j = 0; j < lda; j++ )
64 {
65
66 int aIndex = j*lda;
67 int cIndex = i + aIndex;
68 for ( k = 0; k < lda; k++)
69 {
70 C[cIndex] += A[aIndex + k] * B[k*lda + i];
71 /* C[cIndex] += A[aIndex + k+1] * B[(k+1)*lda + i];
72 C[cIndex] += A[aIndex + k+2] * B[(k+2)*lda + i];
73 C[cIndex] += A[aIndex + k+3] * B[(k+3)*lda + i];
74 C[cIndex] += A[aIndex + k+4] * B[(k+4)*lda + i];
75 C[cIndex] += A[aIndex + k+5] * B[(k+5)*lda + i];
76 C[cIndex] += A[aIndex + k+6] * B[(k+6)*lda + i];
77 C[cIndex] += A[aIndex + k+7] * B[(k+7)*lda + i];
78 C[cIndex] += A[aIndex + k+8] * B[(k+8)*lda + i];
79 C[cIndex] += A[aIndex + k+9] * B[(k+9)*lda + i];
80 C[cIndex] += A[aIndex + k+10] * B[(k+10)*lda + i];
81 C[cIndex] += A[aIndex + k+11] * B[(k+11)*lda + i];
82 C[cIndex] += A[aIndex + k+12] * B[(k+12)*lda + i];
83 C[cIndex] += A[aIndex + k+13] * B[(k+13)*lda + i];
84 C[cIndex] += A[aIndex + k+14] * B[(k+14)*lda + i];
85 C[cIndex] += A[aIndex + k+15] * B[(k+15)*lda + i];*/
86 /* }
87 }
88 }*/
89 /*
90 //326378
91 data_t bTrans[1024];
92
93 for (int counti = 0; counti < 32; counti++) {
94 for (int countj = 0; countj < 32; countj++) {
95 *(bTrans + counti + countj*lda) = *(B + countj + counti*lda);
96 }
97 }
98
99
100 int BLOCKSIZE = 8;
101 for ( i = 0; i < lda; i+=BLOCKSIZE )
102 {
103 for ( int iTemp = i; iTemp < i + BLOCKSIZE; iTemp++ ) {
104 int iFlag = iTemp*lda;
105 for ( j = coreid*lda/ncores; j < (coreid+1)*lda/ncores; j++ ) {
106 int jFlag = j*lda;
107 int cLoc = jFlag+iTemp;
108 for ( k = 0; k < lda; k+=8) {
109 *(C+cLoc) += *(A+jFlag+k) * *(bTrans+iFlag+k);
110 *(C+cLoc) += *(A+jFlag+k+1) * *(bTrans+iFlag+k+1);
111 *(C+cLoc) += *(A+jFlag+k+2) * *(bTrans+iFlag+k+2);
112 *(C+cLoc) += *(A+jFlag+k+3) * *(bTrans+iFlag+k+3);
113 *(C+cLoc) += *(A+jFlag+k+4) * *(bTrans+iFlag+k+4);
114 *(C+cLoc) += *(A+jFlag+k+5) * *(bTrans+iFlag+k+5);
115 *(C+cLoc) += *(A+jFlag+k+6) * *(bTrans+iFlag+k+6);
116 *(C+cLoc) += *(A+jFlag+k+7) * *(bTrans+iFlag+k+7);
117 }
118 }
119 }
120 }*/
121 data_t bTrans[1024];
122
123 for (int counti = 0; counti < 32; counti++) {
124 for (int countj = 0; countj < 32; countj++) {
125 *(bTrans + counti + countj*lda) = *(B + countj + counti*lda);
126 }
127 }
128
129
130 int BLOCKSIZE = 8;
131 for ( j = 0; j < lda; j++ )
132 {
133 //for ( int jTemp = j; jTemp < j + BLOCKSIZE; jTemp++ ) {
134 int jFlag = j*lda;
135 for ( i = coreid*lda/ncores; i < (coreid+1)*lda/ncores; i+=BLOCKSIZE ) {
136 for ( int iTemp = i; iTemp < i + BLOCKSIZE; iTemp++ ) {
137
138 int iFlag = iTemp*lda;
139 int cLoc = jFlag+iTemp;
140 for ( k = 0; k < lda; k+=16) {
141 *(C+cLoc) += *(A+jFlag+k) * *(bTrans+iFlag+k);
142 *(C+cLoc) += *(A+jFlag+k+1) * *(bTrans+iFlag+k+1);
143 *(C+cLoc) += *(A+jFlag+k+2) * *(bTrans+iFlag+k+2);
144 *(C+cLoc) += *(A+jFlag+k+3) * *(bTrans+iFlag+k+3);
145 *(C+cLoc) += *(A+jFlag+k+4) * *(bTrans+iFlag+k+4);
146 *(C+cLoc) += *(A+jFlag+k+5) * *(bTrans+iFlag+k+5);
147 *(C+cLoc) += *(A+jFlag+k+6) * *(bTrans+iFlag+k+6);
148 *(C+cLoc) += *(A+jFlag+k+7) * *(bTrans+iFlag+k+7);
149 *(C+cLoc) += *(A+jFlag+k+8) * *(bTrans+iFlag+k+8);
150 *(C+cLoc) += *(A+jFlag+k+9) * *(bTrans+iFlag+k+9);
151 *(C+cLoc) += *(A+jFlag+k+10) * *(bTrans+iFlag+k+10);
152 *(C+cLoc) += *(A+jFlag+k+11) * *(bTrans+iFlag+k+11);
153 *(C+cLoc) += *(A+jFlag+k+12) * *(bTrans+iFlag+k+12);
154 *(C+cLoc) += *(A+jFlag+k+13) * *(bTrans+iFlag+k+13);
155 *(C+cLoc) += *(A+jFlag+k+14) * *(bTrans+iFlag+k+14);
156 *(C+cLoc) += *(A+jFlag+k+15) * *(bTrans+iFlag+k+15);
157 }
158 }
159 }
160 //}
161 }
162
163
164 }