/* Minimum of two values.  Every argument use and the whole expansion are
 * parenthesized so operator precedence in the caller's expressions cannot
 * change the meaning (e.g. MIN(x & 3, y) previously expanded to
 * (x & (3 < y) ? ... )).  X and Y are still evaluated twice — do not pass
 * side-effecting arguments. */
#define MIN(X,Y) (((X) < (Y)) ? (X) : (Y))
/*
 * matmul — multicore blocked matrix-multiply kernel.
 *
 * NOTE(review): this listing is a garbled extract.  The numeric prefix
 * fused onto many lines ("16", "19", ...) is an original line number
 * from the source listing; gaps in that numbering are dropped source
 * lines, and several statements and closing braces are missing as a
 * result.  The file does not compile as shown.
 *
 * Contract (established by the naive reference loop at the bottom):
 *     C[i + j*lda] += A[j*lda + k] * B[k*lda + i]   for 0 <= i,j,k < lda
 * with the work intended to be split across `ncores` cores, the caller
 * identifying each core by `coreid`.
 */
16 void __attribute__((noinline
)) matmul(const int coreid
, const int ncores
, const int lda
, const data_t A
[], const data_t B
[], data_t C
[] )
/* NOTE(review): the function's opening '{' (original line ~17) was lost
 * in extraction — it must be restored before this compiles. */
19 // ***************************** //
20 // **** ADD YOUR CODE HERE ***** //
21 // ***************************** //
23 // feel free to make a separate function for MI and MSI versions.
/* Loop indices: i/j/k walk elements, ri/rj walk the register tile,
 * ii/jj/kk are cache-tile origins (ii is unused by the visible code). */
25 int i
, j
, k
, ri
, rj
, ii
, jj
, kk
;
/* Register tiles: c is the REG_I x REG_J accumulator; a and b hold the
 * matching slices of A and B.  NOTE(review): the code that loads a[]
 * and b[] (original lines ~64-77, dropped from this extract) is
 * missing, so as shown the unrolled updates below read both arrays
 * uninitialized (undefined behavior) — restore those loads. */
27 data_t c
[REG_I
][REG_J
], a
[REG_J
], b
[REG_I
];
/* Static partition of the j range across cores: every core gets an
 * equal LDA/NCORES slice; the last core also absorbs the remainder. */
28 size_t start
= coreid
* (LDA
/ NCORES
), end
= (coreid
== NCORES
- 1 ? LDA
: (coreid
+ 1) * (LDA
/ NCORES
));
30 /* if (coreid > 0) { */
33 /* start = 0, end = lda; */
/* Fast path only when the runtime configuration matches the
 * compile-time tile constants; any other configuration falls through
 * to the naive triple loop at the bottom. */
34 if (ncores
== NCORES
&& lda
== LDA
) {
/* Tile over j (this core's slice) and k; i is covered in full inside. */
35 for (jj
= start
; jj
< end
; jj
+= BLOCK_J
)
36 for (kk
= 0; kk
< LDA
; kk
+= BLOCK_K
)
37 //for (ii = 0; ii < LDA; ii += BLOCK_I)
38 for (j
= jj
; j
< MIN(end
, jj
+ BLOCK_J
); j
+= REG_J
) {
41 for (i
= 0; i
< LDA
; i
+= REG_I
) {
42 /* Load C in register blocks. */
44 for (ri
= 0; ri
< REG_I
; ri
++) {
45 for (rj
= 0; rj
< REG_J
; rj
++) {
/* NOTE(review): Cj is not declared anywhere in this extract —
 * presumably a per-j base pointer into C set up on a dropped line
 * (~39-40); confirm against the full source. */
46 c
[ri
][rj
] = Cj
[i
+ ri
+ ( rj
)*LDA
];
/* Inner product over the current k tile. */
51 for (k
= kk
; k
< MIN(LDA
, kk
+ BLOCK_K
); k
++) {
52 /* Load a,b in register blocks. */
/* NOTE(review): the block comment opened on the next line ("53") lost
 * its terminator in extraction; it is only closed, by accident, by the
 * terminator at the end of line "56", so lines 53-56 all lex as one
 * comment.  Bi/Aj referenced inside are base pointers not visible in
 * this extract. */
53 /* for (rj = 0; rj < REG_J; rj++) {
54 a[rj] = A[(j + rj)*LDA + k];
56 /* for (ri = 0; ri < REG_I; ri++) { */
57 /* b[ri] = Bi[k*LDA + ri]; */
59 /* /\* Compute C in register blocks. *\/ */
60 /* for (rj = 0; rj < REG_J; rj++) { */
61 /* a[rj] = Aj[( rj)*LDA + k]; */
62 /* for (ri = 0; ri < REG_I; ri++) { */
63 /* c[ri][rj] += a[rj] * b[ri]; */
/* Hand-unrolled 8x2 (REG_I x REG_J) multiply-accumulate:
 * c[ri][rj] += b[ri] * a[rj]. */
78 c
[0][0] += b
[0] * a
[0];
79 c
[0][1] += b
[0] * a
[1];
80 c
[1][0] += b
[1] * a
[0];
81 c
[1][1] += b
[1] * a
[1];
82 c
[2][0] += b
[2] * a
[0];
83 c
[2][1] += b
[2] * a
[1];
84 c
[3][0] += b
[3] * a
[0];
85 c
[3][1] += b
[3] * a
[1];
86 c
[4][0] += b
[4] * a
[0];
87 c
[4][1] += b
[4] * a
[1];
88 c
[5][0] += b
[5] * a
[0];
89 c
[5][1] += b
[5] * a
[1];
90 c
[6][0] += b
[6] * a
[0];
91 c
[6][1] += b
[6] * a
[1];
92 c
[7][0] += b
[7] * a
[0];
93 c
[7][1] += b
[7] * a
[1];
/* Earlier (disabled) interleaving of the same unrolled kernel. */
96 /* c[0][0] += b[0] * a[0]; */
97 /* c[1][1] += b[1] * a[1]; */
98 /* c[2][0] += b[2] * a[0]; */
99 /* c[3][1] += b[3] * a[1]; */
100 /* c[4][0] += b[4] * a[0]; */
101 /* c[5][1] += b[5] * a[1]; */
102 /* c[6][0] += b[6] * a[0]; */
103 /* c[7][1] += b[7] * a[1]; */
104 /* c[0][0] += b[0] * a[0]; */
105 /* c[1][1] += b[1] * a[1]; */
106 /* c[2][0] += b[2] * a[0]; */
107 /* c[3][1] += b[3] * a[1]; */
108 /* c[4][0] += b[4] * a[0]; */
109 /* c[5][1] += b[5] * a[1]; */
110 /* c[6][0] += b[6] * a[0]; */
111 /* c[7][1] += b[7] * a[1]; */
115 /* store C in register blocks. */
116 for (ri
= 0; ri
< REG_I
; ri
++) {
117 for (rj
= 0; rj
< REG_J
; rj
++) {
118 Cj
[i
+ ri
+ (rj
)*LDA
] = c
[ri
][rj
];
/* NOTE(review): the closing braces for the blocked path (original
 * lines ~119-126) are missing from this extract. */
127 /* We only care about performance for 32x32 matrices and 2 cores. Otherwise just naive mat_mul */
/* Fallback: naive O(n^3) reference loop.  NOTE(review): as visible it
 * computes the FULL product on every core; unless a coreid guard exists
 * on the dropped lines (~128-131), running this path with ncores > 1
 * would accumulate each product once per core — verify against the
 * full source. */
132 for ( i
= 0; i
< lda
; i
++ )
133 for ( j
= 0; j
< lda
; j
++ )
134 for ( k
= 0; k
< lda
; k
++ )
135 C
[i
+ j
*lda
] += A
[j
*lda
+ k
] * B
[k
*lda
+ i
];
/* NOTE(review): the function's closing '}' lies beyond the end of this
 * extract. */