#define MIN(X, Y) ((X) < (Y) ? (X) : (Y))

void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda,
                                      const data_t A[], const data_t B[], data_t C[])
{
  // ***************************** //
  // **** ADD YOUR CODE HERE ***** //
  // ***************************** //

  // Feel free to make a separate function for the MI and MSI versions.
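
  /*
   * Fast path (taken below when ncores == NCORES and lda == LDA):
   *   - the columns of C are split across the cores ([start, end) below);
   *   - each REG_I x REG_J tile of C is accumulated in the register block c[][];
   *   - k is walked in BLOCK_K-wide cache blocks;
   *   - the two cores sweep the k range in opposite halves over two passes,
   *     which is intended to keep them out of each other's working set under
   *     the MI/MSI coherence protocols.
   * Any other problem size falls back to the naive triple loop at the bottom.
   */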
  int i, j, k, ri, rj, ii, jj, kk;
  data_t c[REG_I][REG_J], a[REG_J], b[REG_I];
  size_t start = coreid * (LDA / NCORES);
  size_t end   = (coreid == NCORES - 1 ? LDA : (coreid + 1) * (LDA / NCORES));
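  /* Each core computes a contiguous range [start, end) of the columns of C;
     the last core also picks up any columns left over by the division. */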
  if (ncores == NCORES && lda == LDA) {
    for (jj = start; jj < end; jj += BLOCK_J) {
      int kk_start = (coreid == 0 ? 0 : LDA / 2), kk_end = (coreid == 0 ? LDA / 2 : LDA);
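      /* First pass over k: core 0 covers k in [0, LDA/2) while the other core
         covers [LDA/2, LDA), so at any moment the two cores are reading
         disjoint halves of B. */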
      for (kk = kk_start; kk < kk_end; kk += BLOCK_K) {
        // for (ii = 0; ii < LDA; ii += BLOCK_I)
        for (j = jj; j < MIN(end, jj + BLOCK_J); j += REG_J) {
          const data_t *Aj = A + j * LDA;  /* rows j .. j+REG_J-1 of A */
          data_t *Cj = C + j * LDA;        /* columns j .. j+REG_J-1 of C */
          for (i = 0; i < LDA /*, ii + BLOCK_I) */; i += REG_I) {
            const data_t *Bi = B + i;      /* columns i .. i+REG_I-1 of B */
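            /* The REG_I x REG_J tile of C at (i, j) is accumulated entirely in
               c[][]: each a[rj] is reused across REG_I multiply-accumulates and
               each b[ri] across REG_J. */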
            /* Load C in register blocks. */
            for (ri = 0; ri < REG_I; ri++) {
              for (rj = 0; rj < REG_J; rj++) {
                c[ri][rj] = Cj[i + ri + rj * LDA];
              }
            }
            for (k = kk; k < MIN(LDA, kk + BLOCK_K); k++) {
              for (ri = 0; ri < REG_I; ri++) {
                b[ri] = Bi[k * LDA + ri];
              }
              /* Compute C in register blocks. */
              for (rj = 0; rj < REG_J; rj++) {
                a[rj] = Aj[rj * LDA + k];
                for (ri = 0; ri < REG_I; ri++) {
                  c[ri][rj] += a[rj] * b[ri];
                }
              }
            }
            /* Store C in register blocks. */
            for (ri = 0; ri < REG_I; ri++) {
              for (rj = 0; rj < REG_J; rj++) {
                Cj[i + ri + rj * LDA] = c[ri][rj];
              }
            }
          }
        }
      }
    }
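
    /* Second pass: the k halves are swapped (core 0 now takes [LDA/2, LDA),
       the other core [0, LDA/2)), so every tile of C ends up accumulated over
       the full k range. */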
    for (jj = start; jj < end; jj += BLOCK_J) {
      int kk_start = (coreid != 0 ? 0 : LDA / 2), kk_end = (coreid != 0 ? LDA / 2 : LDA);
      for (kk = kk_start; kk < kk_end; kk += BLOCK_K) {
        // for (ii = 0; ii < LDA; ii += BLOCK_I)
        for (j = jj; j < MIN(end, jj + BLOCK_J); j += REG_J) {
          const data_t *Aj = A + j * LDA;
          data_t *Cj = C + j * LDA;
          for (i = 0; i < LDA /*, ii + BLOCK_I) */; i += REG_I) {
            const data_t *Bi = B + i;
            /* Load C in register blocks. */
            for (ri = 0; ri < REG_I; ri++) {
              for (rj = 0; rj < REG_J; rj++) {
                c[ri][rj] = Cj[i + ri + rj * LDA];
              }
            }
            for (k = kk; k < MIN(LDA, kk + BLOCK_K); k++) {
              for (ri = 0; ri < REG_I; ri++) {
                b[ri] = Bi[k * LDA + ri];
              }
              /* Compute C in register blocks. */
              for (rj = 0; rj < REG_J; rj++) {
                a[rj] = Aj[rj * LDA + k];
                for (ri = 0; ri < REG_I; ri++) {
                  c[ri][rj] += a[rj] * b[ri];
                }
              }
            }
            /* Store C in register blocks. */
            for (ri = 0; ri < REG_I; ri++) {
              for (rj = 0; rj < REG_J; rj++) {
                Cj[i + ri + rj * LDA] = c[ri][rj];
              }
            }
          }
        }
      }
    }
  } else {
    /* We only care about performance for 32x32 matrices and 2 cores;
       otherwise just do the naive matmul. */
    for (i = 0; i < lda; i++)
      for (j = 0; j < lda; j++)
        for (k = 0; k < lda; k++)
          C[i + j * lda] += A[j * lda + k] * B[k * lda + i];
  }
}