Merge pull request #4 from pmundkur/devel
[riscv-tests.git] / mt / bo_matmul.c
1 #include "stdlib.h"
2
3 #include "util.h"
4
5 #include "dataset.h"
6 void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda, const data_t A[], const data_t B[], data_t C[] )
7 {
8 int i, j, k;
9 data_t B_trans[32*32];
10 data_t acc_temp0, acc_temp1;
11 data_t *A_j, *B_i;
12 data_t *A_j_k, *B_i_k;
13 int z;
14
15 //for (i = 0; i < 32; i++) {
16 // for (j = 0; j < 32; j++) {
17 // B_trans[i*lda+j] = B[i+j*lda];
18 // }
19 //}
20
21 if (coreid == 0) {
22 for (i = 0; i < 32; i++) {
23 B_i = B_trans+i*32;
24 for (z = 0; z < 32; z++) {
25 *(B_i+z) = B[i+z*32];
26 }
27 for (j = 0; j < 16; j+=2) {
28 A_j = A+j*lda;
29 acc_temp0 = 0;
30 for (k = 0; k < 32; k+=8) {
31 A_j_k = A_j+k;
32 B_i_k = B_i+k;
33 acc_temp0 += *(A_j_k) * *(B_i_k);
34 acc_temp0 += *(A_j_k + 1) * *(B_i_k + 1);
35 acc_temp0 += *(A_j_k + 2) * *(B_i_k + 2);
36 acc_temp0 += *(A_j_k + 3) * *(B_i_k + 3);
37 acc_temp0 += *(A_j_k + 4) * *(B_i_k + 4);
38 acc_temp0 += *(A_j_k + 5) * *(B_i_k + 5);
39 acc_temp0 += *(A_j_k + 6) * *(B_i_k + 6);
40 acc_temp0 += *(A_j_k + 7) * *(B_i_k + 7);
41 }
42 A_j += 32;
43
44 acc_temp1 = 0;
45 for (k = 0; k < 32; k+=8) {
46 acc_temp1 += *(A_j+k) * *(B_i+k);
47 acc_temp1 += *(A_j+k + 1) * *(B_i+k + 1);
48 acc_temp1 += *(A_j+k + 2) * *(B_i+k + 2);
49 acc_temp1 += *(A_j+k + 3) * *(B_i+k + 3);
50 acc_temp1 += *(A_j+k + 4) * *(B_i+k + 4);
51 acc_temp1 += *(A_j+k + 5) * *(B_i+k + 5);
52 acc_temp1 += *(A_j+k + 6) * *(B_i+k + 6);
53 acc_temp1 += *(A_j+k + 7) * *(B_i+k + 7);
54 }
55
56 C[i + j*lda] = acc_temp0;
57 C[i + (j+1)*lda] = acc_temp1;
58 }
59 }
60 }
61 if (coreid == 1 || ncores == 1) {
62 for (i = 0; i < 32; i++) {
63 B_i = B_trans+i*32;
64 for (z = 0; z < 32; z++) {
65 *(B_i+z) = B[i+z*32];
66 }
67 for (j = 16; j < 32; j+=2) {
68 A_j = A+j*lda;
69 acc_temp0 = 0;
70 for (k = 0; k < 32; k+=8) {
71 acc_temp0 += *(A_j+k) * *(B_i+k);
72 acc_temp0 += *(A_j+k + 1) * *(B_i+k + 1);
73 acc_temp0 += *(A_j+k + 2) * *(B_i+k + 2);
74 acc_temp0 += *(A_j+k + 3) * *(B_i+k + 3);
75 acc_temp0 += *(A_j+k + 4) * *(B_i+k + 4);
76 acc_temp0 += *(A_j+k + 5) * *(B_i+k + 5);
77 acc_temp0 += *(A_j+k + 6) * *(B_i+k + 6);
78 acc_temp0 += *(A_j+k + 7) * *(B_i+k + 7);
79 }
80 A_j += 32;
81
82 acc_temp1 = 0;
83 for (k = 0; k < 32; k+=8) {
84 acc_temp1 += *(A_j+k) * *(B_i+k);
85 acc_temp1 += *(A_j+k + 1) * *(B_i+k + 1);
86 acc_temp1 += *(A_j+k + 2) * *(B_i+k + 2);
87 acc_temp1 += *(A_j+k + 3) * *(B_i+k + 3);
88 acc_temp1 += *(A_j+k + 4) * *(B_i+k + 4);
89 acc_temp1 += *(A_j+k + 5) * *(B_i+k + 5);
90 acc_temp1 += *(A_j+k + 6) * *(B_i+k + 6);
91 acc_temp1 += *(A_j+k + 7) * *(B_i+k + 7);
92 }
93 C[i + j*lda] = acc_temp0;
94 C[i + (j+1)*lda] = acc_temp1;
95 }
96 }
97 }
98 }