bump env
[riscv-tests.git] / mt / ak_matmul.c
1 #include "stdlib.h"
2
3 #include "util.h"
4
5 #include "dataset.h"
6 void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda, const data_t A[], const data_t B[], data_t C[] )
7 {
8
9 // ***************************** //
10 // **** ADD YOUR CODE HERE ***** //
11 // ***************************** //
12 //
13 // feel free to make a separate function for MI and MSI versions.
14 int i, j, k, ii, jj, bsize;
15 bsize = 16;
16 for ( jj = bsize*coreid; jj < lda; jj += bsize*ncores) {
17 for ( ii = 0; ii < lda; ii += bsize) {
18 for ( j = jj; j < lda && j < jj + bsize; j++) {
19 for ( i = ii; i < lda && i < ii + bsize; i += 8) {
20 data_t c1 = C[i + j*lda];
21 data_t c2 = C[i + j*lda + 1];
22 data_t c3 = C[i + j*lda + 2];
23 data_t c4 = C[i + j*lda + 3];
24 data_t c5 = C[i + j*lda + 4];
25 data_t c6 = C[i + j*lda + 5];
26 data_t c7 = C[i + j*lda + 6];
27 data_t c8 = C[i + j*lda + 7];
28 for ( k = 0; k < lda; k+=4 ) {
29 for (int x = 0; x < 4; x++) {
30 data_t a = A[j*lda + k+x];
31 data_t b1 = B[(k+x)*lda + i];
32 data_t b2 = B[(k+x)*lda + i + 1];
33 data_t b3 = B[(k+x)*lda + i + 2];
34 data_t b4 = B[(k+x)*lda + i + 3];
35 data_t b5 = B[(k+x)*lda + i + 4];
36 data_t b6 = B[(k+x)*lda + i + 5];
37 data_t b7 = B[(k+x)*lda + i + 6];
38 data_t b8 = B[(k+x)*lda + i + 7];
39 c1 += a * b1;
40 c2 += a * b2;
41 c3 += a * b3;
42 c4 += a * b4;
43 c5 += a * b5;
44 c6 += a * b6;
45 c7 += a * b7;
46 c8 += a * b8;
47 }
48 }
49 C[i + j*lda] = c1;
50 C[i + j*lda + 1] = c2;
51 C[i + j*lda + 2] = c3;
52 C[i + j*lda + 3] = c4;
53 C[i + j*lda + 4] = c5;
54 C[i + j*lda + 5] = c6;
55 C[i + j*lda + 6] = c7;
56 C[i + j*lda + 7] = c8;
57 }
58 }
59 }
60 }
61
62 }