6 void __attribute__((noinline
)) matmul(const int coreid
, const int ncores
, const int lda
, const data_t A
[], const data_t B
[], data_t C
[] )
9 // ***************************** //
10 // **** ADD YOUR CODE HERE ***** //
11 // ***************************** //
13 // feel free to make a separate function for MI and MSI versions.
18 for ( i = coreid*lda/ncores; i < (coreid+1)*lda/ncores; i++ )
20 for ( j = 0; j < lda; j++ )
23 int cIndex = i + aIndex;
24 C[cIndex] += A[aIndex] * B[i];
25 C[cIndex] += A[aIndex + 1] * B[1*lda + i];
26 C[cIndex] += A[aIndex + 2] * B[2*lda + i];
27 C[cIndex] += A[aIndex + 3] * B[3*lda + i];
28 C[cIndex] += A[aIndex + 4] * B[4*lda + i];
29 C[cIndex] += A[aIndex + 5] * B[5*lda + i];
30 C[cIndex] += A[aIndex + 6] * B[6*lda + i];
31 C[cIndex] += A[aIndex + 7] * B[7*lda + i];
32 C[cIndex] += A[aIndex + 8] * B[8*lda + i];
33 C[cIndex] += A[aIndex + 9] * B[9*lda + i];
34 C[cIndex] += A[aIndex + 10] * B[10*lda + i];
35 C[cIndex] += A[aIndex + 11] * B[11*lda + i];
36 C[cIndex] += A[aIndex + 12] * B[12*lda + i];
37 C[cIndex] += A[aIndex + 13] * B[13*lda + i];
38 C[cIndex] += A[aIndex + 14] * B[14*lda + i];
39 C[cIndex] += A[aIndex + 15] * B[15*lda + i];
40 C[cIndex] += A[aIndex + 16] * B[16*lda + i];
41 C[cIndex] += A[aIndex + 17] * B[17*lda + i];
42 C[cIndex] += A[aIndex + 18] * B[18*lda + i];
43 C[cIndex] += A[aIndex + 19] * B[19*lda + i];
44 C[cIndex] += A[aIndex + 20] * B[20*lda + i];
45 C[cIndex] += A[aIndex + 21] * B[21*lda + i];
46 C[cIndex] += A[aIndex + 22] * B[22*lda + i];
47 C[cIndex] += A[aIndex + 23] * B[23*lda + i];
48 C[cIndex] += A[aIndex + 24] * B[24*lda + i];
49 C[cIndex] += A[aIndex + 25] * B[25*lda + i];
50 C[cIndex] += A[aIndex + 26] * B[26*lda + i];
51 C[cIndex] += A[aIndex + 27] * B[27*lda + i];
52 C[cIndex] += A[aIndex + 28] * B[28*lda + i];
53 C[cIndex] += A[aIndex + 29] * B[29*lda + i];
54 C[cIndex] += A[aIndex + 30] * B[30*lda + i];
55 C[cIndex] += A[aIndex + 31] * B[31*lda + i];
61 /* for ( i = coreid*lda/ncores; i < (coreid+1)*lda/ncores; i++ )
63 for ( j = 0; j < lda; j++ )
67 int cIndex = i + aIndex;
68 for ( k = 0; k < lda; k++)
70 C[cIndex] += A[aIndex + k] * B[k*lda + i];
71 /* C[cIndex] += A[aIndex + k+1] * B[(k+1)*lda + i];
72 C[cIndex] += A[aIndex + k+2] * B[(k+2)*lda + i];
73 C[cIndex] += A[aIndex + k+3] * B[(k+3)*lda + i];
74 C[cIndex] += A[aIndex + k+4] * B[(k+4)*lda + i];
75 C[cIndex] += A[aIndex + k+5] * B[(k+5)*lda + i];
76 C[cIndex] += A[aIndex + k+6] * B[(k+6)*lda + i];
77 C[cIndex] += A[aIndex + k+7] * B[(k+7)*lda + i];
78 C[cIndex] += A[aIndex + k+8] * B[(k+8)*lda + i];
79 C[cIndex] += A[aIndex + k+9] * B[(k+9)*lda + i];
80 C[cIndex] += A[aIndex + k+10] * B[(k+10)*lda + i];
81 C[cIndex] += A[aIndex + k+11] * B[(k+11)*lda + i];
82 C[cIndex] += A[aIndex + k+12] * B[(k+12)*lda + i];
83 C[cIndex] += A[aIndex + k+13] * B[(k+13)*lda + i];
84 C[cIndex] += A[aIndex + k+14] * B[(k+14)*lda + i];
85 C[cIndex] += A[aIndex + k+15] * B[(k+15)*lda + i];*/
93 for (int counti = 0; counti < 32; counti++) {
94 for (int countj = 0; countj < 32; countj++) {
95 *(bTrans + counti + countj*lda) = *(B + countj + counti*lda);
101 for ( i = 0; i < lda; i+=BLOCKSIZE )
103 for ( int iTemp = i; iTemp < i + BLOCKSIZE; iTemp++ ) {
104 int iFlag = iTemp*lda;
105 for ( j = coreid*lda/ncores; j < (coreid+1)*lda/ncores; j++ ) {
107 int cLoc = jFlag+iTemp;
108 for ( k = 0; k < lda; k+=8) {
109 *(C+cLoc) += *(A+jFlag+k) * *(bTrans+iFlag+k);
110 *(C+cLoc) += *(A+jFlag+k+1) * *(bTrans+iFlag+k+1);
111 *(C+cLoc) += *(A+jFlag+k+2) * *(bTrans+iFlag+k+2);
112 *(C+cLoc) += *(A+jFlag+k+3) * *(bTrans+iFlag+k+3);
113 *(C+cLoc) += *(A+jFlag+k+4) * *(bTrans+iFlag+k+4);
114 *(C+cLoc) += *(A+jFlag+k+5) * *(bTrans+iFlag+k+5);
115 *(C+cLoc) += *(A+jFlag+k+6) * *(bTrans+iFlag+k+6);
116 *(C+cLoc) += *(A+jFlag+k+7) * *(bTrans+iFlag+k+7);
123 for (int counti
= 0; counti
< 32; counti
++) {
124 for (int countj
= 0; countj
< 32; countj
++) {
125 *(bTrans
+ counti
+ countj
*lda
) = *(B
+ countj
+ counti
*lda
);
131 for ( j
= 0; j
< lda
; j
++ )
133 //for ( int jTemp = j; jTemp < j + BLOCKSIZE; jTemp++ ) {
135 for ( i
= coreid
*lda
/ncores
; i
< (coreid
+1)*lda
/ncores
; i
+=BLOCKSIZE
) {
136 for ( int iTemp
= i
; iTemp
< i
+ BLOCKSIZE
; iTemp
++ ) {
138 int iFlag
= iTemp
*lda
;
139 int cLoc
= jFlag
+iTemp
;
140 for ( k
= 0; k
< lda
; k
+=16) {
141 *(C
+cLoc
) += *(A
+jFlag
+k
) * *(bTrans
+iFlag
+k
);
142 *(C
+cLoc
) += *(A
+jFlag
+k
+1) * *(bTrans
+iFlag
+k
+1);
143 *(C
+cLoc
) += *(A
+jFlag
+k
+2) * *(bTrans
+iFlag
+k
+2);
144 *(C
+cLoc
) += *(A
+jFlag
+k
+3) * *(bTrans
+iFlag
+k
+3);
145 *(C
+cLoc
) += *(A
+jFlag
+k
+4) * *(bTrans
+iFlag
+k
+4);
146 *(C
+cLoc
) += *(A
+jFlag
+k
+5) * *(bTrans
+iFlag
+k
+5);
147 *(C
+cLoc
) += *(A
+jFlag
+k
+6) * *(bTrans
+iFlag
+k
+6);
148 *(C
+cLoc
) += *(A
+jFlag
+k
+7) * *(bTrans
+iFlag
+k
+7);
149 *(C
+cLoc
) += *(A
+jFlag
+k
+8) * *(bTrans
+iFlag
+k
+8);
150 *(C
+cLoc
) += *(A
+jFlag
+k
+9) * *(bTrans
+iFlag
+k
+9);
151 *(C
+cLoc
) += *(A
+jFlag
+k
+10) * *(bTrans
+iFlag
+k
+10);
152 *(C
+cLoc
) += *(A
+jFlag
+k
+11) * *(bTrans
+iFlag
+k
+11);
153 *(C
+cLoc
) += *(A
+jFlag
+k
+12) * *(bTrans
+iFlag
+k
+12);
154 *(C
+cLoc
) += *(A
+jFlag
+k
+13) * *(bTrans
+iFlag
+k
+13);
155 *(C
+cLoc
) += *(A
+jFlag
+k
+14) * *(bTrans
+iFlag
+k
+14);
156 *(C
+cLoc
) += *(A
+jFlag
+k
+15) * *(bTrans
+iFlag
+k
+15);