/*
 * Add a "--with-xlen" configure argument (#16)
 * [riscv-tests.git] / benchmarks / mm / rb.h
 */
/*
 * Blocking factors.
 *
 * RBK is the unroll stride of kloop()'s main loop.  RBM and RBN equal the
 * 4-row by 5-column tile of c that kloop() updates, although kloop() does
 * not reference them by name.
 */
static const int RBM = 4;
static const int RBN = 5;
static const int RBK = 6;

/*
 * CBM, CBN and CBK are not used in this file; each is a whole multiple of
 * the matching RB* value (24 = 6*4, 25 = 5*5, 24 = 4*6).
 * NOTE(review): presumably outer (cache-level) block sizes used by the
 * including file -- confirm against the callers.
 */
static const int CBM = 24;
static const int CBN = 25;
static const int CBK = 24;
3 static inline void kloop(size_t p, t* a0, size_t lda, t* b0, size_t ldb, t* c, size_t ldc)
4 {
5 t* c_0 = &c[ldc*0];
6 t* c_1 = &c[ldc*1];
7 t* c_2 = &c[ldc*2];
8 t* c_3 = &c[ldc*3];
9 t c_0_0 = c_0[0];
10 t c_0_1 = c_0[1];
11 t c_0_2 = c_0[2];
12 t c_0_3 = c_0[3];
13 t c_0_4 = c_0[4];
14 t c_1_0 = c_1[0];
15 t c_1_1 = c_1[1];
16 t c_1_2 = c_1[2];
17 t c_1_3 = c_1[3];
18 t c_1_4 = c_1[4];
19 t c_2_0 = c_2[0];
20 t c_2_1 = c_2[1];
21 t c_2_2 = c_2[2];
22 t c_2_3 = c_2[3];
23 t c_2_4 = c_2[4];
24 t c_3_0 = c_3[0];
25 t c_3_1 = c_3[1];
26 t c_3_2 = c_3[2];
27 t c_3_3 = c_3[3];
28 t c_3_4 = c_3[4];
29 for (t *a = a0, *b = b0; a < a0 + p/RBK*RBK; a += RBK, b += RBK*ldb)
30 {
31 t* a_0 = &a[lda*0];
32 t* a_1 = &a[lda*1];
33 t* a_2 = &a[lda*2];
34 t* a_3 = &a[lda*3];
35 t* b_0 = &b[ldb*0];
36 t* b_1 = &b[ldb*1];
37 t* b_2 = &b[ldb*2];
38 t* b_3 = &b[ldb*3];
39 t* b_4 = &b[ldb*4];
40 t* b_5 = &b[ldb*5];
41 c_0_0 = fma(a_0[0], b_0[0], c_0_0);
42 c_0_1 = fma(a_0[0], b_0[1], c_0_1);
43 c_0_2 = fma(a_0[0], b_0[2], c_0_2);
44 c_0_3 = fma(a_0[0], b_0[3], c_0_3);
45 c_0_4 = fma(a_0[0], b_0[4], c_0_4);
46 c_1_0 = fma(a_1[0], b_0[0], c_1_0);
47 c_1_1 = fma(a_1[0], b_0[1], c_1_1);
48 c_1_2 = fma(a_1[0], b_0[2], c_1_2);
49 c_1_3 = fma(a_1[0], b_0[3], c_1_3);
50 c_1_4 = fma(a_1[0], b_0[4], c_1_4);
51 c_2_0 = fma(a_2[0], b_0[0], c_2_0);
52 c_2_1 = fma(a_2[0], b_0[1], c_2_1);
53 c_2_2 = fma(a_2[0], b_0[2], c_2_2);
54 c_2_3 = fma(a_2[0], b_0[3], c_2_3);
55 c_2_4 = fma(a_2[0], b_0[4], c_2_4);
56 c_3_0 = fma(a_3[0], b_0[0], c_3_0);
57 c_3_1 = fma(a_3[0], b_0[1], c_3_1);
58 c_3_2 = fma(a_3[0], b_0[2], c_3_2);
59 c_3_3 = fma(a_3[0], b_0[3], c_3_3);
60 c_3_4 = fma(a_3[0], b_0[4], c_3_4);
61 c_0_0 = fma(a_0[1], b_1[0], c_0_0);
62 c_0_1 = fma(a_0[1], b_1[1], c_0_1);
63 c_0_2 = fma(a_0[1], b_1[2], c_0_2);
64 c_0_3 = fma(a_0[1], b_1[3], c_0_3);
65 c_0_4 = fma(a_0[1], b_1[4], c_0_4);
66 c_1_0 = fma(a_1[1], b_1[0], c_1_0);
67 c_1_1 = fma(a_1[1], b_1[1], c_1_1);
68 c_1_2 = fma(a_1[1], b_1[2], c_1_2);
69 c_1_3 = fma(a_1[1], b_1[3], c_1_3);
70 c_1_4 = fma(a_1[1], b_1[4], c_1_4);
71 c_2_0 = fma(a_2[1], b_1[0], c_2_0);
72 c_2_1 = fma(a_2[1], b_1[1], c_2_1);
73 c_2_2 = fma(a_2[1], b_1[2], c_2_2);
74 c_2_3 = fma(a_2[1], b_1[3], c_2_3);
75 c_2_4 = fma(a_2[1], b_1[4], c_2_4);
76 c_3_0 = fma(a_3[1], b_1[0], c_3_0);
77 c_3_1 = fma(a_3[1], b_1[1], c_3_1);
78 c_3_2 = fma(a_3[1], b_1[2], c_3_2);
79 c_3_3 = fma(a_3[1], b_1[3], c_3_3);
80 c_3_4 = fma(a_3[1], b_1[4], c_3_4);
81 c_0_0 = fma(a_0[2], b_2[0], c_0_0);
82 c_0_1 = fma(a_0[2], b_2[1], c_0_1);
83 c_0_2 = fma(a_0[2], b_2[2], c_0_2);
84 c_0_3 = fma(a_0[2], b_2[3], c_0_3);
85 c_0_4 = fma(a_0[2], b_2[4], c_0_4);
86 c_1_0 = fma(a_1[2], b_2[0], c_1_0);
87 c_1_1 = fma(a_1[2], b_2[1], c_1_1);
88 c_1_2 = fma(a_1[2], b_2[2], c_1_2);
89 c_1_3 = fma(a_1[2], b_2[3], c_1_3);
90 c_1_4 = fma(a_1[2], b_2[4], c_1_4);
91 c_2_0 = fma(a_2[2], b_2[0], c_2_0);
92 c_2_1 = fma(a_2[2], b_2[1], c_2_1);
93 c_2_2 = fma(a_2[2], b_2[2], c_2_2);
94 c_2_3 = fma(a_2[2], b_2[3], c_2_3);
95 c_2_4 = fma(a_2[2], b_2[4], c_2_4);
96 c_3_0 = fma(a_3[2], b_2[0], c_3_0);
97 c_3_1 = fma(a_3[2], b_2[1], c_3_1);
98 c_3_2 = fma(a_3[2], b_2[2], c_3_2);
99 c_3_3 = fma(a_3[2], b_2[3], c_3_3);
100 c_3_4 = fma(a_3[2], b_2[4], c_3_4);
101 c_0_0 = fma(a_0[3], b_3[0], c_0_0);
102 c_0_1 = fma(a_0[3], b_3[1], c_0_1);
103 c_0_2 = fma(a_0[3], b_3[2], c_0_2);
104 c_0_3 = fma(a_0[3], b_3[3], c_0_3);
105 c_0_4 = fma(a_0[3], b_3[4], c_0_4);
106 c_1_0 = fma(a_1[3], b_3[0], c_1_0);
107 c_1_1 = fma(a_1[3], b_3[1], c_1_1);
108 c_1_2 = fma(a_1[3], b_3[2], c_1_2);
109 c_1_3 = fma(a_1[3], b_3[3], c_1_3);
110 c_1_4 = fma(a_1[3], b_3[4], c_1_4);
111 c_2_0 = fma(a_2[3], b_3[0], c_2_0);
112 c_2_1 = fma(a_2[3], b_3[1], c_2_1);
113 c_2_2 = fma(a_2[3], b_3[2], c_2_2);
114 c_2_3 = fma(a_2[3], b_3[3], c_2_3);
115 c_2_4 = fma(a_2[3], b_3[4], c_2_4);
116 c_3_0 = fma(a_3[3], b_3[0], c_3_0);
117 c_3_1 = fma(a_3[3], b_3[1], c_3_1);
118 c_3_2 = fma(a_3[3], b_3[2], c_3_2);
119 c_3_3 = fma(a_3[3], b_3[3], c_3_3);
120 c_3_4 = fma(a_3[3], b_3[4], c_3_4);
121 c_0_0 = fma(a_0[4], b_4[0], c_0_0);
122 c_0_1 = fma(a_0[4], b_4[1], c_0_1);
123 c_0_2 = fma(a_0[4], b_4[2], c_0_2);
124 c_0_3 = fma(a_0[4], b_4[3], c_0_3);
125 c_0_4 = fma(a_0[4], b_4[4], c_0_4);
126 c_1_0 = fma(a_1[4], b_4[0], c_1_0);
127 c_1_1 = fma(a_1[4], b_4[1], c_1_1);
128 c_1_2 = fma(a_1[4], b_4[2], c_1_2);
129 c_1_3 = fma(a_1[4], b_4[3], c_1_3);
130 c_1_4 = fma(a_1[4], b_4[4], c_1_4);
131 c_2_0 = fma(a_2[4], b_4[0], c_2_0);
132 c_2_1 = fma(a_2[4], b_4[1], c_2_1);
133 c_2_2 = fma(a_2[4], b_4[2], c_2_2);
134 c_2_3 = fma(a_2[4], b_4[3], c_2_3);
135 c_2_4 = fma(a_2[4], b_4[4], c_2_4);
136 c_3_0 = fma(a_3[4], b_4[0], c_3_0);
137 c_3_1 = fma(a_3[4], b_4[1], c_3_1);
138 c_3_2 = fma(a_3[4], b_4[2], c_3_2);
139 c_3_3 = fma(a_3[4], b_4[3], c_3_3);
140 c_3_4 = fma(a_3[4], b_4[4], c_3_4);
141 c_0_0 = fma(a_0[5], b_5[0], c_0_0);
142 c_0_1 = fma(a_0[5], b_5[1], c_0_1);
143 c_0_2 = fma(a_0[5], b_5[2], c_0_2);
144 c_0_3 = fma(a_0[5], b_5[3], c_0_3);
145 c_0_4 = fma(a_0[5], b_5[4], c_0_4);
146 c_1_0 = fma(a_1[5], b_5[0], c_1_0);
147 c_1_1 = fma(a_1[5], b_5[1], c_1_1);
148 c_1_2 = fma(a_1[5], b_5[2], c_1_2);
149 c_1_3 = fma(a_1[5], b_5[3], c_1_3);
150 c_1_4 = fma(a_1[5], b_5[4], c_1_4);
151 c_2_0 = fma(a_2[5], b_5[0], c_2_0);
152 c_2_1 = fma(a_2[5], b_5[1], c_2_1);
153 c_2_2 = fma(a_2[5], b_5[2], c_2_2);
154 c_2_3 = fma(a_2[5], b_5[3], c_2_3);
155 c_2_4 = fma(a_2[5], b_5[4], c_2_4);
156 c_3_0 = fma(a_3[5], b_5[0], c_3_0);
157 c_3_1 = fma(a_3[5], b_5[1], c_3_1);
158 c_3_2 = fma(a_3[5], b_5[2], c_3_2);
159 c_3_3 = fma(a_3[5], b_5[3], c_3_3);
160 c_3_4 = fma(a_3[5], b_5[4], c_3_4);
161 }
162 for (t *a = a0 + p/RBK*RBK, *b = b0 + p/RBK*RBK*ldb; a < a0 + p; a++, b += ldb)
163 {
164 t* a_0 = &a[lda*0];
165 t* a_1 = &a[lda*1];
166 t* a_2 = &a[lda*2];
167 t* a_3 = &a[lda*3];
168 t* b_0 = &b[ldb*0];
169 c_0_0 = fma(a_0[0], b_0[0], c_0_0);
170 c_0_1 = fma(a_0[0], b_0[1], c_0_1);
171 c_0_2 = fma(a_0[0], b_0[2], c_0_2);
172 c_0_3 = fma(a_0[0], b_0[3], c_0_3);
173 c_0_4 = fma(a_0[0], b_0[4], c_0_4);
174 c_1_0 = fma(a_1[0], b_0[0], c_1_0);
175 c_1_1 = fma(a_1[0], b_0[1], c_1_1);
176 c_1_2 = fma(a_1[0], b_0[2], c_1_2);
177 c_1_3 = fma(a_1[0], b_0[3], c_1_3);
178 c_1_4 = fma(a_1[0], b_0[4], c_1_4);
179 c_2_0 = fma(a_2[0], b_0[0], c_2_0);
180 c_2_1 = fma(a_2[0], b_0[1], c_2_1);
181 c_2_2 = fma(a_2[0], b_0[2], c_2_2);
182 c_2_3 = fma(a_2[0], b_0[3], c_2_3);
183 c_2_4 = fma(a_2[0], b_0[4], c_2_4);
184 c_3_0 = fma(a_3[0], b_0[0], c_3_0);
185 c_3_1 = fma(a_3[0], b_0[1], c_3_1);
186 c_3_2 = fma(a_3[0], b_0[2], c_3_2);
187 c_3_3 = fma(a_3[0], b_0[3], c_3_3);
188 c_3_4 = fma(a_3[0], b_0[4], c_3_4);
189 }
190 c_0[0] = c_0_0;
191 c_0[1] = c_0_1;
192 c_0[2] = c_0_2;
193 c_0[3] = c_0_3;
194 c_0[4] = c_0_4;
195 c_1[0] = c_1_0;
196 c_1[1] = c_1_1;
197 c_1[2] = c_1_2;
198 c_1[3] = c_1_3;
199 c_1[4] = c_1_4;
200 c_2[0] = c_2_0;
201 c_2[1] = c_2_1;
202 c_2[2] = c_2_2;
203 c_2[3] = c_2_3;
204 c_2[4] = c_2_4;
205 c_3[0] = c_3_0;
206 c_3[1] = c_3_1;
207 c_3[2] = c_3_2;
208 c_3[3] = c_3_3;
209 c_3[4] = c_3_4;
210 }