Merge rv64si and rv32si tests
[riscv-tests.git] / mt / ae_matmul.c
1 #include "stdlib.h"
2
3 #include "util.h"
4
5 #include "dataset.h"
6 void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda, const data_t A[], const data_t B[], data_t C[] )
7 {
8
9 // ***************************** //
10 // **** ADD YOUR CODE HERE ***** //
11 // ***************************** //
12 //
13 // feel free to make a separate function for MI and MSI versions.
14
15
16
17 data_t *b1;
18 data_t *b2;
19 data_t *b3;
20 data_t *b4;
21 data_t c1;
22 data_t c2;
23 data_t c3;
24 data_t c4;
25 data_t a1;
26 data_t a2;
27 data_t a3;
28 data_t a4;
29 data_t a5;
30 data_t a6;
31 data_t a7;
32 data_t a8;
33 int i, j, k;
34 static data_t BB[1024];
35
36
37
38 //transpose B
39 for ( k = 0; k < lda; k++) {
40 for ( i = coreid*(lda/ncores); i < (coreid+1)*(lda/ncores); i++ ) {
41 BB[i*lda + k] = B[k*lda + i];
42 }
43 barrier(ncores);
44 }
45
46 for ( i = 0; i < lda; i+=4 ) {
47 for ( j = coreid*(lda/ncores); j < (coreid+1)*(lda/ncores); j++ ) {
48 c1 = 0; c2 = 0; c3 = 0; c4 = 0;
49 b1 = &BB[(i+0)*lda];
50 b2 = &BB[(i+1)*lda];
51 b3 = &BB[(i+2)*lda];
52 b4 = &BB[(i+3)*lda];
53 for ( k = 0; k < lda; k+=8 ) {
54
55 a1 = A[j*lda + k+0];
56 a2 = A[j*lda + k+1];
57 a3 = A[j*lda + k+2];
58 a4 = A[j*lda + k+3];
59 a5 = A[j*lda + k+4];
60 a6 = A[j*lda + k+5];
61 a7 = A[j*lda + k+6];
62 a8 = A[j*lda + k+7];
63
64 c1 += a1 * b1[k+0];
65 c1 += a2 * b1[k+1];
66 c1 += a3 * b1[k+2];
67 c1 += a4 * b1[k+3];
68 c1 += a5 * b1[k+4];
69 c1 += a6 * b1[k+5];
70 c1 += a7 * b1[k+6];
71 c1 += a8 * b1[k+7];
72
73 c2 += a1 * b2[k+0];
74 c2 += a2 * b2[k+1];
75 c2 += a3 * b2[k+2];
76 c2 += a4 * b2[k+3];
77 c2 += a5 * b2[k+4];
78 c2 += a6 * b2[k+5];
79 c2 += a7 * b2[k+6];
80 c2 += a8 * b2[k+7];
81
82 c3 += a1 * b3[k+0];
83 c3 += a2 * b3[k+1];
84 c3 += a3 * b3[k+2];
85 c3 += a4 * b3[k+3];
86 c3 += a5 * b3[k+4];
87 c3 += a6 * b3[k+5];
88 c3 += a7 * b3[k+6];
89 c3 += a8 * b3[k+7];
90
91 c4 += a1 * b4[k+0];
92 c4 += a2 * b4[k+1];
93 c4 += a3 * b4[k+2];
94 c4 += a4 * b4[k+3];
95 c4 += a5 * b4[k+4];
96 c4 += a6 * b4[k+5];
97 c4 += a7 * b4[k+6];
98 c4 += a8 * b4[k+7];
99
100
101 }
102 C[i+0 + j*lda] = c1;
103 C[i+1 + j*lda] = c2;
104 C[i+2 + j*lda] = c3;
105 C[i+3 + j*lda] = c4;
106 barrier(ncores);
107 }
108 }
109
110 }