Run debug tests from main Makefile.
[riscv-tests.git] / mt / cg_matmul.c
1 #include "stdlib.h"
2
3 #include "util.h"
4
5 #include "dataset.h"
6 void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda, const data_t A[], const data_t B[], data_t C[] )
7 {
8 int i, j, k;
9
10 for ( i = 0; i < lda; i+=2 )
11 {
12 for (k = 0; k < lda; k+=4)
13 {
14 int d0 = B[k*lda + i];
15 int c0 = B[k*lda + i + 1];
16 int d1 = B[(k+1)*lda + i];
17 int c1 = B[(k+1)*lda + i + 1];
18 int d2 = B[(k+2)*lda + i];
19 int c2 = B[(k+2)*lda + i + 1];
20 int d3 = B[(k+3)*lda + i];
21 int c3 = B[(k+3)*lda + i + 1];
22
23 for ( j = coreid*(lda/ncores); j < (coreid+1)*(lda/ncores); j+=4)
24 {
25
26 int sum = A[j*lda + k] * d0;
27 sum += A[j*lda + k + 1] * d1;
28 sum += A[j*lda + k + 2] * d2;
29 sum += A[j*lda + k + 3] * d3;
30 C[j*lda +i] += sum;
31
32 sum = A[j*lda + k] * c0;
33 sum += A[j*lda + k + 1] * c1;
34 sum += A[j*lda + k + 2] * c2;
35 sum += A[j*lda + k + 3] * c3;
36 C[j*lda + i + 1] += sum;
37
38 sum = A[(j+1)*lda + k] * d0;
39 sum += A[(j+1)*lda + k + 1] * d1;
40 sum += A[(j+1)*lda + k + 2] * d2;
41 sum += A[(j+1)*lda + k + 3] * d3;
42 C[(j+1)*lda +i] += sum;
43
44 sum = A[(j+1)*lda + k] * c0;
45 sum += A[(j+1)*lda + k + 1] * c1;
46 sum += A[(j+1)*lda + k + 2] * c2;
47 sum += A[(j+1)*lda + k + 3] * c3;
48 C[(j+1)*lda + i + 1] += sum;
49
50 sum = A[(j+2)*lda + k] * d0;
51 sum += A[(j+2)*lda + k + 1] * d1;
52 sum += A[(j+2)*lda + k + 2] * d2;
53 sum += A[(j+2)*lda + k + 3] * d3;
54 C[(j+2)*lda +i] += sum;
55
56 sum = A[(j+2)*lda + k] * c0;
57 sum += A[(j+2)*lda + k + 1] * c1;
58 sum += A[(j+2)*lda + k + 2] * c2;
59 sum += A[(j+2)*lda + k + 3] * c3;
60 C[(j+2)*lda + i + 1] += sum;
61
62 sum = A[(j+3)*lda + k] * d0;
63 sum += A[(j+3)*lda + k + 1] * d1;
64 sum += A[(j+3)*lda + k + 2] * d2;
65 sum += A[(j+3)*lda + k + 3] * d3;
66 C[(j+3)*lda +i] += sum;
67
68 sum = A[(j+3)*lda + k] * c0;
69 sum += A[(j+3)*lda + k + 1] * c1;
70 sum += A[(j+3)*lda + k + 2] * c2;
71 sum += A[(j+3)*lda + k + 3] * c3;
72 C[(j+3)*lda + i + 1] += sum;
73
74 }
75 barrier(ncores);
76 }
77 }
78 }