minor mt updates
[riscv-tests.git] / benchmarks / mm / rb.h
/*
 * Register-block dimensions.  kloop() in this file works on a tile of
 * 4 rows by 5 columns and consumes the shared dimension 6 elements at a
 * time, matching RBM, RBN and RBK; only RBK is referenced there by name.
 */
static const int RBM = 4;
static const int RBN = 5;
static const int RBK = 6;

/*
 * Not referenced in the visible part of this file.
 * NOTE(review): presumably the cache-block dimensions used by the caller
 * (each is a whole multiple of the matching RB* value) -- confirm.
 */
static const int CBM = 36;
static const int CBN = 35;
static const int CBK = 36;
3 static inline void kloop(size_t p, t* a0, size_t lda, t* b0, size_t ldb, t* c, size_t ldc)
4 {
5 t* c_0 = &c[ldc*0];
6 t* c_1 = &c[ldc*1];
7 t* c_2 = &c[ldc*2];
8 t* c_3 = &c[ldc*3];
9 t c_0_0 = c_0[0];
10 t c_0_1 = c_0[1];
11 t c_0_2 = c_0[2];
12 t c_0_3 = c_0[3];
13 t c_0_4 = c_0[4];
14 t c_1_0 = c_1[0];
15 t c_1_1 = c_1[1];
16 t c_1_2 = c_1[2];
17 t c_1_3 = c_1[3];
18 t c_1_4 = c_1[4];
19 t c_2_0 = c_2[0];
20 t c_2_1 = c_2[1];
21 t c_2_2 = c_2[2];
22 t c_2_3 = c_2[3];
23 t c_2_4 = c_2[4];
24 t c_3_0 = c_3[0];
25 t c_3_1 = c_3[1];
26 t c_3_2 = c_3[2];
27 t c_3_3 = c_3[3];
28 t c_3_4 = c_3[4];
29 for (t *a = a0, *b = b0; a < a0 + p/RBK*RBK; a += RBK, b += RBK*ldb)
30 {
31 t* a_0 = &a[lda*0];
32 t* a_1 = &a[lda*1];
33 t* a_2 = &a[lda*2];
34 t* a_3 = &a[lda*3];
35 t* b_0 = &b[ldb*0];
36 t* b_1 = &b[ldb*1];
37 t* b_2 = &b[ldb*2];
38 t* b_3 = &b[ldb*3];
39 t* b_4 = &b[ldb*4];
40 t* b_5 = &b[ldb*5];
41 c_0_0 = fma(a_0[0], b_0[0], c_0_0);
42 c_0_0 = fma(a_0[1], b_1[0], c_0_0);
43 c_0_0 = fma(a_0[2], b_2[0], c_0_0);
44 c_0_0 = fma(a_0[3], b_3[0], c_0_0);
45 c_0_0 = fma(a_0[4], b_4[0], c_0_0);
46 c_0_0 = fma(a_0[5], b_5[0], c_0_0);
47 c_0_1 = fma(a_0[0], b_0[1], c_0_1);
48 c_0_1 = fma(a_0[1], b_1[1], c_0_1);
49 c_0_1 = fma(a_0[2], b_2[1], c_0_1);
50 c_0_1 = fma(a_0[3], b_3[1], c_0_1);
51 c_0_1 = fma(a_0[4], b_4[1], c_0_1);
52 c_0_1 = fma(a_0[5], b_5[1], c_0_1);
53 c_0_2 = fma(a_0[0], b_0[2], c_0_2);
54 c_0_2 = fma(a_0[1], b_1[2], c_0_2);
55 c_0_2 = fma(a_0[2], b_2[2], c_0_2);
56 c_0_2 = fma(a_0[3], b_3[2], c_0_2);
57 c_0_2 = fma(a_0[4], b_4[2], c_0_2);
58 c_0_2 = fma(a_0[5], b_5[2], c_0_2);
59 c_0_3 = fma(a_0[0], b_0[3], c_0_3);
60 c_0_3 = fma(a_0[1], b_1[3], c_0_3);
61 c_0_3 = fma(a_0[2], b_2[3], c_0_3);
62 c_0_3 = fma(a_0[3], b_3[3], c_0_3);
63 c_0_3 = fma(a_0[4], b_4[3], c_0_3);
64 c_0_3 = fma(a_0[5], b_5[3], c_0_3);
65 c_0_4 = fma(a_0[0], b_0[4], c_0_4);
66 c_0_4 = fma(a_0[1], b_1[4], c_0_4);
67 c_0_4 = fma(a_0[2], b_2[4], c_0_4);
68 c_0_4 = fma(a_0[3], b_3[4], c_0_4);
69 c_0_4 = fma(a_0[4], b_4[4], c_0_4);
70 c_0_4 = fma(a_0[5], b_5[4], c_0_4);
71 c_1_0 = fma(a_1[0], b_0[0], c_1_0);
72 c_1_0 = fma(a_1[1], b_1[0], c_1_0);
73 c_1_0 = fma(a_1[2], b_2[0], c_1_0);
74 c_1_0 = fma(a_1[3], b_3[0], c_1_0);
75 c_1_0 = fma(a_1[4], b_4[0], c_1_0);
76 c_1_0 = fma(a_1[5], b_5[0], c_1_0);
77 c_1_1 = fma(a_1[0], b_0[1], c_1_1);
78 c_1_1 = fma(a_1[1], b_1[1], c_1_1);
79 c_1_1 = fma(a_1[2], b_2[1], c_1_1);
80 c_1_1 = fma(a_1[3], b_3[1], c_1_1);
81 c_1_1 = fma(a_1[4], b_4[1], c_1_1);
82 c_1_1 = fma(a_1[5], b_5[1], c_1_1);
83 c_1_2 = fma(a_1[0], b_0[2], c_1_2);
84 c_1_2 = fma(a_1[1], b_1[2], c_1_2);
85 c_1_2 = fma(a_1[2], b_2[2], c_1_2);
86 c_1_2 = fma(a_1[3], b_3[2], c_1_2);
87 c_1_2 = fma(a_1[4], b_4[2], c_1_2);
88 c_1_2 = fma(a_1[5], b_5[2], c_1_2);
89 c_1_3 = fma(a_1[0], b_0[3], c_1_3);
90 c_1_3 = fma(a_1[1], b_1[3], c_1_3);
91 c_1_3 = fma(a_1[2], b_2[3], c_1_3);
92 c_1_3 = fma(a_1[3], b_3[3], c_1_3);
93 c_1_3 = fma(a_1[4], b_4[3], c_1_3);
94 c_1_3 = fma(a_1[5], b_5[3], c_1_3);
95 c_1_4 = fma(a_1[0], b_0[4], c_1_4);
96 c_1_4 = fma(a_1[1], b_1[4], c_1_4);
97 c_1_4 = fma(a_1[2], b_2[4], c_1_4);
98 c_1_4 = fma(a_1[3], b_3[4], c_1_4);
99 c_1_4 = fma(a_1[4], b_4[4], c_1_4);
100 c_1_4 = fma(a_1[5], b_5[4], c_1_4);
101 c_2_0 = fma(a_2[0], b_0[0], c_2_0);
102 c_2_0 = fma(a_2[1], b_1[0], c_2_0);
103 c_2_0 = fma(a_2[2], b_2[0], c_2_0);
104 c_2_0 = fma(a_2[3], b_3[0], c_2_0);
105 c_2_0 = fma(a_2[4], b_4[0], c_2_0);
106 c_2_0 = fma(a_2[5], b_5[0], c_2_0);
107 c_2_1 = fma(a_2[0], b_0[1], c_2_1);
108 c_2_1 = fma(a_2[1], b_1[1], c_2_1);
109 c_2_1 = fma(a_2[2], b_2[1], c_2_1);
110 c_2_1 = fma(a_2[3], b_3[1], c_2_1);
111 c_2_1 = fma(a_2[4], b_4[1], c_2_1);
112 c_2_1 = fma(a_2[5], b_5[1], c_2_1);
113 c_2_2 = fma(a_2[0], b_0[2], c_2_2);
114 c_2_2 = fma(a_2[1], b_1[2], c_2_2);
115 c_2_2 = fma(a_2[2], b_2[2], c_2_2);
116 c_2_2 = fma(a_2[3], b_3[2], c_2_2);
117 c_2_2 = fma(a_2[4], b_4[2], c_2_2);
118 c_2_2 = fma(a_2[5], b_5[2], c_2_2);
119 c_2_3 = fma(a_2[0], b_0[3], c_2_3);
120 c_2_3 = fma(a_2[1], b_1[3], c_2_3);
121 c_2_3 = fma(a_2[2], b_2[3], c_2_3);
122 c_2_3 = fma(a_2[3], b_3[3], c_2_3);
123 c_2_3 = fma(a_2[4], b_4[3], c_2_3);
124 c_2_3 = fma(a_2[5], b_5[3], c_2_3);
125 c_2_4 = fma(a_2[0], b_0[4], c_2_4);
126 c_2_4 = fma(a_2[1], b_1[4], c_2_4);
127 c_2_4 = fma(a_2[2], b_2[4], c_2_4);
128 c_2_4 = fma(a_2[3], b_3[4], c_2_4);
129 c_2_4 = fma(a_2[4], b_4[4], c_2_4);
130 c_2_4 = fma(a_2[5], b_5[4], c_2_4);
131 c_3_0 = fma(a_3[0], b_0[0], c_3_0);
132 c_3_0 = fma(a_3[1], b_1[0], c_3_0);
133 c_3_0 = fma(a_3[2], b_2[0], c_3_0);
134 c_3_0 = fma(a_3[3], b_3[0], c_3_0);
135 c_3_0 = fma(a_3[4], b_4[0], c_3_0);
136 c_3_0 = fma(a_3[5], b_5[0], c_3_0);
137 c_3_1 = fma(a_3[0], b_0[1], c_3_1);
138 c_3_1 = fma(a_3[1], b_1[1], c_3_1);
139 c_3_1 = fma(a_3[2], b_2[1], c_3_1);
140 c_3_1 = fma(a_3[3], b_3[1], c_3_1);
141 c_3_1 = fma(a_3[4], b_4[1], c_3_1);
142 c_3_1 = fma(a_3[5], b_5[1], c_3_1);
143 c_3_2 = fma(a_3[0], b_0[2], c_3_2);
144 c_3_2 = fma(a_3[1], b_1[2], c_3_2);
145 c_3_2 = fma(a_3[2], b_2[2], c_3_2);
146 c_3_2 = fma(a_3[3], b_3[2], c_3_2);
147 c_3_2 = fma(a_3[4], b_4[2], c_3_2);
148 c_3_2 = fma(a_3[5], b_5[2], c_3_2);
149 c_3_3 = fma(a_3[0], b_0[3], c_3_3);
150 c_3_3 = fma(a_3[1], b_1[3], c_3_3);
151 c_3_3 = fma(a_3[2], b_2[3], c_3_3);
152 c_3_3 = fma(a_3[3], b_3[3], c_3_3);
153 c_3_3 = fma(a_3[4], b_4[3], c_3_3);
154 c_3_3 = fma(a_3[5], b_5[3], c_3_3);
155 c_3_4 = fma(a_3[0], b_0[4], c_3_4);
156 c_3_4 = fma(a_3[1], b_1[4], c_3_4);
157 c_3_4 = fma(a_3[2], b_2[4], c_3_4);
158 c_3_4 = fma(a_3[3], b_3[4], c_3_4);
159 c_3_4 = fma(a_3[4], b_4[4], c_3_4);
160 c_3_4 = fma(a_3[5], b_5[4], c_3_4);
161 }
162 for (t *a = a0 + p/RBK*RBK, *b = b0 + p/RBK*RBK*ldb; a < a0 + p; a++, b += ldb)
163 {
164 t* a_0 = &a[lda*0];
165 t* a_1 = &a[lda*1];
166 t* a_2 = &a[lda*2];
167 t* a_3 = &a[lda*3];
168 t* b_0 = &b[ldb*0];
169 c_0_0 = fma(a_0[0], b_0[0], c_0_0);
170 c_0_1 = fma(a_0[0], b_0[1], c_0_1);
171 c_0_2 = fma(a_0[0], b_0[2], c_0_2);
172 c_0_3 = fma(a_0[0], b_0[3], c_0_3);
173 c_0_4 = fma(a_0[0], b_0[4], c_0_4);
174 c_1_0 = fma(a_1[0], b_0[0], c_1_0);
175 c_1_1 = fma(a_1[0], b_0[1], c_1_1);
176 c_1_2 = fma(a_1[0], b_0[2], c_1_2);
177 c_1_3 = fma(a_1[0], b_0[3], c_1_3);
178 c_1_4 = fma(a_1[0], b_0[4], c_1_4);
179 c_2_0 = fma(a_2[0], b_0[0], c_2_0);
180 c_2_1 = fma(a_2[0], b_0[1], c_2_1);
181 c_2_2 = fma(a_2[0], b_0[2], c_2_2);
182 c_2_3 = fma(a_2[0], b_0[3], c_2_3);
183 c_2_4 = fma(a_2[0], b_0[4], c_2_4);
184 c_3_0 = fma(a_3[0], b_0[0], c_3_0);
185 c_3_1 = fma(a_3[0], b_0[1], c_3_1);
186 c_3_2 = fma(a_3[0], b_0[2], c_3_2);
187 c_3_3 = fma(a_3[0], b_0[3], c_3_3);
188 c_3_4 = fma(a_3[0], b_0[4], c_3_4);
189 }
190 c_0[0] = c_0_0;
191 c_0[1] = c_0_1;
192 c_0[2] = c_0_2;
193 c_0[3] = c_0_3;
194 c_0[4] = c_0_4;
195 c_1[0] = c_1_0;
196 c_1[1] = c_1_1;
197 c_1[2] = c_1_2;
198 c_1[3] = c_1_3;
199 c_1[4] = c_1_4;
200 c_2[0] = c_2_0;
201 c_2[1] = c_2_1;
202 c_2[2] = c_2_2;
203 c_2[3] = c_2_3;
204 c_2[4] = c_2_4;
205 c_3[0] = c_3_0;
206 c_3[1] = c_3_1;
207 c_3[2] = c_3_2;
208 c_3[3] = c_3_3;
209 c_3[4] = c_3_4;
210 }