Remove smips/host-debugging cruft
[riscv-tests.git] / mt / bh_matmul.c
1 #include "stdlib.h"
2
3 #include "util.h"
4
5 #include "dataset.h"
6 void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda, const data_t A[], const data_t B[], data_t C[] )
7 {
8
9 // ***************************** //
10 // **** ADD YOUR CODE HERE ***** //
11 // ***************************** //
12 //
13 // feel free to make a separate function for MI and MSI versions.
14 if(coreid > 1) return;
15
16 int m, i, j, k, iB0, iB1;
17 data_t tempC0, tempC1, tempC2, tempC3, tempC4, tempC5, tempC6, tempC7;
18 data_t tempA0, tempA1;
19
20 if (coreid == 0){
21 for (m = 0; m < 2; m++){
22 for (j = 0; j < lda/2; j++){
23 for (i = 0; i < lda; i+=8){
24 tempC0 = C[i + j*lda];
25 tempC1 = C[i + j*lda+1];
26 tempC2 = C[i + j*lda+2];
27 tempC3 = C[i + j*lda+3];
28 tempC4 = C[i + j*lda+4];
29 tempC5 = C[i + j*lda+5];
30 tempC6 = C[i + j*lda+6];
31 tempC7 = C[i + j*lda+7];
32 iB0 = m*lda*lda/2+i;
33 iB1 = iB0+lda;
34 for (k = m*lda/2; k < (m+1)*lda/2; k+=2){
35 tempA0 = A[j*lda+k];
36 tempA1 = A[j*lda+k+1];
37 tempC0 += tempA0*B[iB0]+tempA1*B[iB1];
38 tempC1 += tempA0*B[iB0+1]+tempA1*B[iB1+1];
39 tempC2 += tempA0*B[iB0+2]+tempA1*B[iB1+2];
40 tempC3 += tempA0*B[iB0+3]+tempA1*B[iB1+3];
41 tempC4 += tempA0*B[iB0+4]+tempA1*B[iB1+4];
42 tempC5 += tempA0*B[iB0+5]+tempA1*B[iB1+5];
43 tempC6 += tempA0*B[iB0+6]+tempA1*B[iB1+6];
44 tempC7 += tempA0*B[iB0+7]+tempA1*B[iB1+7];
45 iB0 += 2*lda;
46 iB1 += 2*lda;
47
48 }
49 C[i + j*lda] = tempC0;
50 C[i + j*lda + 1] = tempC1;
51 C[i + j*lda + 2] = tempC2;
52 C[i + j*lda + 3] = tempC3;
53 C[i + j*lda + 4] = tempC4;
54 C[i + j*lda + 5] = tempC5;
55 C[i + j*lda + 6] = tempC6;
56 C[i + j*lda + 7] = tempC7;
57 }
58 }
59 }
60 }
61 if(coreid == 1 || ncores == 1) {
62 for (m = 2; m > 0; m--){
63 for (j = lda-1; j >= lda/2; j--){
64 for (i = lda-1; i >= 0; i-=8){
65 tempC0 = C[i + j*lda];
66 tempC1 = C[i + j*lda - 1];
67 tempC2 = C[i + j*lda - 2];
68 tempC3 = C[i + j*lda - 3];
69 tempC4 = C[i + j*lda - 4];
70 tempC5 = C[i + j*lda - 5];
71 tempC6 = C[i + j*lda - 6];
72 tempC7 = C[i + j*lda - 7];
73 for (k = m*lda/2-1; k >= (m-1)*lda/2; k-=2){
74 tempA0 = A[j*lda+k];
75 tempA1 = A[j*lda+k-1];
76 tempC0 += tempA0*B[k*lda+i]+tempA1*B[(k-1)*lda+i];
77 tempC1 += tempA0*B[k*lda+i-1]+tempA1*B[(k-1)*lda+i-1];
78 tempC2 += tempA0*B[k*lda+i-2]+tempA1*B[(k-1)*lda+i-2];
79 tempC3 += tempA0*B[k*lda+i-3]+tempA1*B[(k-1)*lda+i-3];
80 tempC4 += tempA0*B[k*lda+i-4]+tempA1*B[(k-1)*lda+i-4];
81 tempC5 += tempA0*B[k*lda+i-5]+tempA1*B[(k-1)*lda+i-5];
82 tempC6 += tempA0*B[k*lda+i-6]+tempA1*B[(k-1)*lda+i-6];
83 tempC7 += tempA0*B[k*lda+i-7]+tempA1*B[(k-1)*lda+i-7];
84 }
85 C[i + j*lda] = tempC0;
86 C[i + j*lda - 1] = tempC1;
87 C[i + j*lda - 2] = tempC2;
88 C[i + j*lda - 3] = tempC3;
89 C[i + j*lda - 4] = tempC4;
90 C[i + j*lda - 5] = tempC5;
91 C[i + j*lda - 6] = tempC6;
92 C[i + j*lda - 7] = tempC7;
93 }
94 }
95 }
96 }
97 }