fb08386104d8b9205f98c7ab7f4a8461f328e094
[riscv-tests.git] / benchmarks / mm / rb.h
1 // See LICENSE for license details.
2
/*
 * Blocking factors used by kloop() in this file.
 *
 * RBK is the number of k steps kloop() advances per main-loop iteration.
 * RBM and RBN equal the row and column counts of the c tile that kloop()
 * handles (4 x 5); kloop() itself spells those two sizes out literally.
 */
static const int RBM = 4;
static const int RBN = 5;
static const int RBK = 6;

/*
 * Not referenced anywhere in this file.
 * NOTE(review): presumably outer (cache-level) block sizes consumed by the
 * code that includes this header -- confirm against the callers.
 */
static const int CBM = 36;
static const int CBN = 35;
static const int CBK = 36;
5 static inline void kloop(size_t p, t* a0, size_t lda, t* b0, size_t ldb, t* c, size_t ldc)
6 {
7 t* c_0 = &c[ldc*0];
8 t* c_1 = &c[ldc*1];
9 t* c_2 = &c[ldc*2];
10 t* c_3 = &c[ldc*3];
11 t c_0_0 = c_0[0];
12 t c_0_1 = c_0[1];
13 t c_0_2 = c_0[2];
14 t c_0_3 = c_0[3];
15 t c_0_4 = c_0[4];
16 t c_1_0 = c_1[0];
17 t c_1_1 = c_1[1];
18 t c_1_2 = c_1[2];
19 t c_1_3 = c_1[3];
20 t c_1_4 = c_1[4];
21 t c_2_0 = c_2[0];
22 t c_2_1 = c_2[1];
23 t c_2_2 = c_2[2];
24 t c_2_3 = c_2[3];
25 t c_2_4 = c_2[4];
26 t c_3_0 = c_3[0];
27 t c_3_1 = c_3[1];
28 t c_3_2 = c_3[2];
29 t c_3_3 = c_3[3];
30 t c_3_4 = c_3[4];
31 for (t *a = a0, *b = b0; a < a0 + p/RBK*RBK; a += RBK, b += RBK*ldb)
32 {
33 t* a_0 = &a[lda*0];
34 t* a_1 = &a[lda*1];
35 t* a_2 = &a[lda*2];
36 t* a_3 = &a[lda*3];
37 t* b_0 = &b[ldb*0];
38 t* b_1 = &b[ldb*1];
39 t* b_2 = &b[ldb*2];
40 t* b_3 = &b[ldb*3];
41 t* b_4 = &b[ldb*4];
42 t* b_5 = &b[ldb*5];
43 c_0_0 = fma(a_0[0], b_0[0], c_0_0);
44 c_0_0 = fma(a_0[1], b_1[0], c_0_0);
45 c_0_0 = fma(a_0[2], b_2[0], c_0_0);
46 c_0_0 = fma(a_0[3], b_3[0], c_0_0);
47 c_0_0 = fma(a_0[4], b_4[0], c_0_0);
48 c_0_0 = fma(a_0[5], b_5[0], c_0_0);
49 c_0_1 = fma(a_0[0], b_0[1], c_0_1);
50 c_0_1 = fma(a_0[1], b_1[1], c_0_1);
51 c_0_1 = fma(a_0[2], b_2[1], c_0_1);
52 c_0_1 = fma(a_0[3], b_3[1], c_0_1);
53 c_0_1 = fma(a_0[4], b_4[1], c_0_1);
54 c_0_1 = fma(a_0[5], b_5[1], c_0_1);
55 c_0_2 = fma(a_0[0], b_0[2], c_0_2);
56 c_0_2 = fma(a_0[1], b_1[2], c_0_2);
57 c_0_2 = fma(a_0[2], b_2[2], c_0_2);
58 c_0_2 = fma(a_0[3], b_3[2], c_0_2);
59 c_0_2 = fma(a_0[4], b_4[2], c_0_2);
60 c_0_2 = fma(a_0[5], b_5[2], c_0_2);
61 c_0_3 = fma(a_0[0], b_0[3], c_0_3);
62 c_0_3 = fma(a_0[1], b_1[3], c_0_3);
63 c_0_3 = fma(a_0[2], b_2[3], c_0_3);
64 c_0_3 = fma(a_0[3], b_3[3], c_0_3);
65 c_0_3 = fma(a_0[4], b_4[3], c_0_3);
66 c_0_3 = fma(a_0[5], b_5[3], c_0_3);
67 c_0_4 = fma(a_0[0], b_0[4], c_0_4);
68 c_0_4 = fma(a_0[1], b_1[4], c_0_4);
69 c_0_4 = fma(a_0[2], b_2[4], c_0_4);
70 c_0_4 = fma(a_0[3], b_3[4], c_0_4);
71 c_0_4 = fma(a_0[4], b_4[4], c_0_4);
72 c_0_4 = fma(a_0[5], b_5[4], c_0_4);
73 c_1_0 = fma(a_1[0], b_0[0], c_1_0);
74 c_1_0 = fma(a_1[1], b_1[0], c_1_0);
75 c_1_0 = fma(a_1[2], b_2[0], c_1_0);
76 c_1_0 = fma(a_1[3], b_3[0], c_1_0);
77 c_1_0 = fma(a_1[4], b_4[0], c_1_0);
78 c_1_0 = fma(a_1[5], b_5[0], c_1_0);
79 c_1_1 = fma(a_1[0], b_0[1], c_1_1);
80 c_1_1 = fma(a_1[1], b_1[1], c_1_1);
81 c_1_1 = fma(a_1[2], b_2[1], c_1_1);
82 c_1_1 = fma(a_1[3], b_3[1], c_1_1);
83 c_1_1 = fma(a_1[4], b_4[1], c_1_1);
84 c_1_1 = fma(a_1[5], b_5[1], c_1_1);
85 c_1_2 = fma(a_1[0], b_0[2], c_1_2);
86 c_1_2 = fma(a_1[1], b_1[2], c_1_2);
87 c_1_2 = fma(a_1[2], b_2[2], c_1_2);
88 c_1_2 = fma(a_1[3], b_3[2], c_1_2);
89 c_1_2 = fma(a_1[4], b_4[2], c_1_2);
90 c_1_2 = fma(a_1[5], b_5[2], c_1_2);
91 c_1_3 = fma(a_1[0], b_0[3], c_1_3);
92 c_1_3 = fma(a_1[1], b_1[3], c_1_3);
93 c_1_3 = fma(a_1[2], b_2[3], c_1_3);
94 c_1_3 = fma(a_1[3], b_3[3], c_1_3);
95 c_1_3 = fma(a_1[4], b_4[3], c_1_3);
96 c_1_3 = fma(a_1[5], b_5[3], c_1_3);
97 c_1_4 = fma(a_1[0], b_0[4], c_1_4);
98 c_1_4 = fma(a_1[1], b_1[4], c_1_4);
99 c_1_4 = fma(a_1[2], b_2[4], c_1_4);
100 c_1_4 = fma(a_1[3], b_3[4], c_1_4);
101 c_1_4 = fma(a_1[4], b_4[4], c_1_4);
102 c_1_4 = fma(a_1[5], b_5[4], c_1_4);
103 c_2_0 = fma(a_2[0], b_0[0], c_2_0);
104 c_2_0 = fma(a_2[1], b_1[0], c_2_0);
105 c_2_0 = fma(a_2[2], b_2[0], c_2_0);
106 c_2_0 = fma(a_2[3], b_3[0], c_2_0);
107 c_2_0 = fma(a_2[4], b_4[0], c_2_0);
108 c_2_0 = fma(a_2[5], b_5[0], c_2_0);
109 c_2_1 = fma(a_2[0], b_0[1], c_2_1);
110 c_2_1 = fma(a_2[1], b_1[1], c_2_1);
111 c_2_1 = fma(a_2[2], b_2[1], c_2_1);
112 c_2_1 = fma(a_2[3], b_3[1], c_2_1);
113 c_2_1 = fma(a_2[4], b_4[1], c_2_1);
114 c_2_1 = fma(a_2[5], b_5[1], c_2_1);
115 c_2_2 = fma(a_2[0], b_0[2], c_2_2);
116 c_2_2 = fma(a_2[1], b_1[2], c_2_2);
117 c_2_2 = fma(a_2[2], b_2[2], c_2_2);
118 c_2_2 = fma(a_2[3], b_3[2], c_2_2);
119 c_2_2 = fma(a_2[4], b_4[2], c_2_2);
120 c_2_2 = fma(a_2[5], b_5[2], c_2_2);
121 c_2_3 = fma(a_2[0], b_0[3], c_2_3);
122 c_2_3 = fma(a_2[1], b_1[3], c_2_3);
123 c_2_3 = fma(a_2[2], b_2[3], c_2_3);
124 c_2_3 = fma(a_2[3], b_3[3], c_2_3);
125 c_2_3 = fma(a_2[4], b_4[3], c_2_3);
126 c_2_3 = fma(a_2[5], b_5[3], c_2_3);
127 c_2_4 = fma(a_2[0], b_0[4], c_2_4);
128 c_2_4 = fma(a_2[1], b_1[4], c_2_4);
129 c_2_4 = fma(a_2[2], b_2[4], c_2_4);
130 c_2_4 = fma(a_2[3], b_3[4], c_2_4);
131 c_2_4 = fma(a_2[4], b_4[4], c_2_4);
132 c_2_4 = fma(a_2[5], b_5[4], c_2_4);
133 c_3_0 = fma(a_3[0], b_0[0], c_3_0);
134 c_3_0 = fma(a_3[1], b_1[0], c_3_0);
135 c_3_0 = fma(a_3[2], b_2[0], c_3_0);
136 c_3_0 = fma(a_3[3], b_3[0], c_3_0);
137 c_3_0 = fma(a_3[4], b_4[0], c_3_0);
138 c_3_0 = fma(a_3[5], b_5[0], c_3_0);
139 c_3_1 = fma(a_3[0], b_0[1], c_3_1);
140 c_3_1 = fma(a_3[1], b_1[1], c_3_1);
141 c_3_1 = fma(a_3[2], b_2[1], c_3_1);
142 c_3_1 = fma(a_3[3], b_3[1], c_3_1);
143 c_3_1 = fma(a_3[4], b_4[1], c_3_1);
144 c_3_1 = fma(a_3[5], b_5[1], c_3_1);
145 c_3_2 = fma(a_3[0], b_0[2], c_3_2);
146 c_3_2 = fma(a_3[1], b_1[2], c_3_2);
147 c_3_2 = fma(a_3[2], b_2[2], c_3_2);
148 c_3_2 = fma(a_3[3], b_3[2], c_3_2);
149 c_3_2 = fma(a_3[4], b_4[2], c_3_2);
150 c_3_2 = fma(a_3[5], b_5[2], c_3_2);
151 c_3_3 = fma(a_3[0], b_0[3], c_3_3);
152 c_3_3 = fma(a_3[1], b_1[3], c_3_3);
153 c_3_3 = fma(a_3[2], b_2[3], c_3_3);
154 c_3_3 = fma(a_3[3], b_3[3], c_3_3);
155 c_3_3 = fma(a_3[4], b_4[3], c_3_3);
156 c_3_3 = fma(a_3[5], b_5[3], c_3_3);
157 c_3_4 = fma(a_3[0], b_0[4], c_3_4);
158 c_3_4 = fma(a_3[1], b_1[4], c_3_4);
159 c_3_4 = fma(a_3[2], b_2[4], c_3_4);
160 c_3_4 = fma(a_3[3], b_3[4], c_3_4);
161 c_3_4 = fma(a_3[4], b_4[4], c_3_4);
162 c_3_4 = fma(a_3[5], b_5[4], c_3_4);
163 }
164 for (t *a = a0 + p/RBK*RBK, *b = b0 + p/RBK*RBK*ldb; a < a0 + p; a++, b += ldb)
165 {
166 t* a_0 = &a[lda*0];
167 t* a_1 = &a[lda*1];
168 t* a_2 = &a[lda*2];
169 t* a_3 = &a[lda*3];
170 t* b_0 = &b[ldb*0];
171 c_0_0 = fma(a_0[0], b_0[0], c_0_0);
172 c_0_1 = fma(a_0[0], b_0[1], c_0_1);
173 c_0_2 = fma(a_0[0], b_0[2], c_0_2);
174 c_0_3 = fma(a_0[0], b_0[3], c_0_3);
175 c_0_4 = fma(a_0[0], b_0[4], c_0_4);
176 c_1_0 = fma(a_1[0], b_0[0], c_1_0);
177 c_1_1 = fma(a_1[0], b_0[1], c_1_1);
178 c_1_2 = fma(a_1[0], b_0[2], c_1_2);
179 c_1_3 = fma(a_1[0], b_0[3], c_1_3);
180 c_1_4 = fma(a_1[0], b_0[4], c_1_4);
181 c_2_0 = fma(a_2[0], b_0[0], c_2_0);
182 c_2_1 = fma(a_2[0], b_0[1], c_2_1);
183 c_2_2 = fma(a_2[0], b_0[2], c_2_2);
184 c_2_3 = fma(a_2[0], b_0[3], c_2_3);
185 c_2_4 = fma(a_2[0], b_0[4], c_2_4);
186 c_3_0 = fma(a_3[0], b_0[0], c_3_0);
187 c_3_1 = fma(a_3[0], b_0[1], c_3_1);
188 c_3_2 = fma(a_3[0], b_0[2], c_3_2);
189 c_3_3 = fma(a_3[0], b_0[3], c_3_3);
190 c_3_4 = fma(a_3[0], b_0[4], c_3_4);
191 }
192 c_0[0] = c_0_0;
193 c_0[1] = c_0_1;
194 c_0[2] = c_0_2;
195 c_0[3] = c_0_3;
196 c_0[4] = c_0_4;
197 c_1[0] = c_1_0;
198 c_1[1] = c_1_1;
199 c_1[2] = c_1_2;
200 c_1[3] = c_1_3;
201 c_1[4] = c_1_4;
202 c_2[0] = c_2_0;
203 c_2[1] = c_2_1;
204 c_2[2] = c_2_2;
205 c_2[3] = c_2_3;
206 c_2[4] = c_2_4;
207 c_3[0] = c_3_0;
208 c_3[1] = c_3_1;
209 c_3[2] = c_3_2;
210 c_3[3] = c_3_3;
211 c_3[4] = c_3_4;
212 }