fb08386104d8b9205f98c7ab7f4a8461f328e094
1 // See LICENSE for license details.
// Register-level block sizes. RBK is the step of kloop's main loop over p:
// each iteration advances a by RBK and b by RBK*ldb, and a second loop then
// handles the remaining p - p/RBK*RBK steps one at a time.
// NOTE(review): RBM = 4 and RBN = 5 appear to match the 4x5 grid of c_i_j
// accumulators (i = 0..3, j = 0..4) updated in kloop; neither name is
// referenced in the visible part of the file -- confirm.
3 static const int RBM
= 4, RBN
= 5, RBK
= 6;
// Second set of block sizes. Each value is a whole multiple of its RB*
// counterpart (36 = 9*RBM, 35 = 7*RBN, 36 = 6*RBK).
// NOTE(review): presumably the outer (cache-level) tile dimensions that the
// caller of kloop iterates over; they are not referenced in the visible part
// of this file -- confirm at the use site.
4 static const int CBM
= 36, CBN
= 35, CBK
= 36;
5 static inline void kloop(size_t p
, t
* a0
, size_t lda
, t
* b0
, size_t ldb
, t
* c
, size_t ldc
)
31 for (t
*a
= a0
, *b
= b0
; a
< a0
+ p
/RBK
*RBK
; a
+= RBK
, b
+= RBK
*ldb
)
43 c_0_0
= fma(a_0
[0], b_0
[0], c_0_0
);
44 c_0_0
= fma(a_0
[1], b_1
[0], c_0_0
);
45 c_0_0
= fma(a_0
[2], b_2
[0], c_0_0
);
46 c_0_0
= fma(a_0
[3], b_3
[0], c_0_0
);
47 c_0_0
= fma(a_0
[4], b_4
[0], c_0_0
);
48 c_0_0
= fma(a_0
[5], b_5
[0], c_0_0
);
49 c_0_1
= fma(a_0
[0], b_0
[1], c_0_1
);
50 c_0_1
= fma(a_0
[1], b_1
[1], c_0_1
);
51 c_0_1
= fma(a_0
[2], b_2
[1], c_0_1
);
52 c_0_1
= fma(a_0
[3], b_3
[1], c_0_1
);
53 c_0_1
= fma(a_0
[4], b_4
[1], c_0_1
);
54 c_0_1
= fma(a_0
[5], b_5
[1], c_0_1
);
55 c_0_2
= fma(a_0
[0], b_0
[2], c_0_2
);
56 c_0_2
= fma(a_0
[1], b_1
[2], c_0_2
);
57 c_0_2
= fma(a_0
[2], b_2
[2], c_0_2
);
58 c_0_2
= fma(a_0
[3], b_3
[2], c_0_2
);
59 c_0_2
= fma(a_0
[4], b_4
[2], c_0_2
);
60 c_0_2
= fma(a_0
[5], b_5
[2], c_0_2
);
61 c_0_3
= fma(a_0
[0], b_0
[3], c_0_3
);
62 c_0_3
= fma(a_0
[1], b_1
[3], c_0_3
);
63 c_0_3
= fma(a_0
[2], b_2
[3], c_0_3
);
64 c_0_3
= fma(a_0
[3], b_3
[3], c_0_3
);
65 c_0_3
= fma(a_0
[4], b_4
[3], c_0_3
);
66 c_0_3
= fma(a_0
[5], b_5
[3], c_0_3
);
67 c_0_4
= fma(a_0
[0], b_0
[4], c_0_4
);
68 c_0_4
= fma(a_0
[1], b_1
[4], c_0_4
);
69 c_0_4
= fma(a_0
[2], b_2
[4], c_0_4
);
70 c_0_4
= fma(a_0
[3], b_3
[4], c_0_4
);
71 c_0_4
= fma(a_0
[4], b_4
[4], c_0_4
);
72 c_0_4
= fma(a_0
[5], b_5
[4], c_0_4
);
73 c_1_0
= fma(a_1
[0], b_0
[0], c_1_0
);
74 c_1_0
= fma(a_1
[1], b_1
[0], c_1_0
);
75 c_1_0
= fma(a_1
[2], b_2
[0], c_1_0
);
76 c_1_0
= fma(a_1
[3], b_3
[0], c_1_0
);
77 c_1_0
= fma(a_1
[4], b_4
[0], c_1_0
);
78 c_1_0
= fma(a_1
[5], b_5
[0], c_1_0
);
79 c_1_1
= fma(a_1
[0], b_0
[1], c_1_1
);
80 c_1_1
= fma(a_1
[1], b_1
[1], c_1_1
);
81 c_1_1
= fma(a_1
[2], b_2
[1], c_1_1
);
82 c_1_1
= fma(a_1
[3], b_3
[1], c_1_1
);
83 c_1_1
= fma(a_1
[4], b_4
[1], c_1_1
);
84 c_1_1
= fma(a_1
[5], b_5
[1], c_1_1
);
85 c_1_2
= fma(a_1
[0], b_0
[2], c_1_2
);
86 c_1_2
= fma(a_1
[1], b_1
[2], c_1_2
);
87 c_1_2
= fma(a_1
[2], b_2
[2], c_1_2
);
88 c_1_2
= fma(a_1
[3], b_3
[2], c_1_2
);
89 c_1_2
= fma(a_1
[4], b_4
[2], c_1_2
);
90 c_1_2
= fma(a_1
[5], b_5
[2], c_1_2
);
91 c_1_3
= fma(a_1
[0], b_0
[3], c_1_3
);
92 c_1_3
= fma(a_1
[1], b_1
[3], c_1_3
);
93 c_1_3
= fma(a_1
[2], b_2
[3], c_1_3
);
94 c_1_3
= fma(a_1
[3], b_3
[3], c_1_3
);
95 c_1_3
= fma(a_1
[4], b_4
[3], c_1_3
);
96 c_1_3
= fma(a_1
[5], b_5
[3], c_1_3
);
97 c_1_4
= fma(a_1
[0], b_0
[4], c_1_4
);
98 c_1_4
= fma(a_1
[1], b_1
[4], c_1_4
);
99 c_1_4
= fma(a_1
[2], b_2
[4], c_1_4
);
100 c_1_4
= fma(a_1
[3], b_3
[4], c_1_4
);
101 c_1_4
= fma(a_1
[4], b_4
[4], c_1_4
);
102 c_1_4
= fma(a_1
[5], b_5
[4], c_1_4
);
103 c_2_0
= fma(a_2
[0], b_0
[0], c_2_0
);
104 c_2_0
= fma(a_2
[1], b_1
[0], c_2_0
);
105 c_2_0
= fma(a_2
[2], b_2
[0], c_2_0
);
106 c_2_0
= fma(a_2
[3], b_3
[0], c_2_0
);
107 c_2_0
= fma(a_2
[4], b_4
[0], c_2_0
);
108 c_2_0
= fma(a_2
[5], b_5
[0], c_2_0
);
109 c_2_1
= fma(a_2
[0], b_0
[1], c_2_1
);
110 c_2_1
= fma(a_2
[1], b_1
[1], c_2_1
);
111 c_2_1
= fma(a_2
[2], b_2
[1], c_2_1
);
112 c_2_1
= fma(a_2
[3], b_3
[1], c_2_1
);
113 c_2_1
= fma(a_2
[4], b_4
[1], c_2_1
);
114 c_2_1
= fma(a_2
[5], b_5
[1], c_2_1
);
115 c_2_2
= fma(a_2
[0], b_0
[2], c_2_2
);
116 c_2_2
= fma(a_2
[1], b_1
[2], c_2_2
);
117 c_2_2
= fma(a_2
[2], b_2
[2], c_2_2
);
118 c_2_2
= fma(a_2
[3], b_3
[2], c_2_2
);
119 c_2_2
= fma(a_2
[4], b_4
[2], c_2_2
);
120 c_2_2
= fma(a_2
[5], b_5
[2], c_2_2
);
121 c_2_3
= fma(a_2
[0], b_0
[3], c_2_3
);
122 c_2_3
= fma(a_2
[1], b_1
[3], c_2_3
);
123 c_2_3
= fma(a_2
[2], b_2
[3], c_2_3
);
124 c_2_3
= fma(a_2
[3], b_3
[3], c_2_3
);
125 c_2_3
= fma(a_2
[4], b_4
[3], c_2_3
);
126 c_2_3
= fma(a_2
[5], b_5
[3], c_2_3
);
127 c_2_4
= fma(a_2
[0], b_0
[4], c_2_4
);
128 c_2_4
= fma(a_2
[1], b_1
[4], c_2_4
);
129 c_2_4
= fma(a_2
[2], b_2
[4], c_2_4
);
130 c_2_4
= fma(a_2
[3], b_3
[4], c_2_4
);
131 c_2_4
= fma(a_2
[4], b_4
[4], c_2_4
);
132 c_2_4
= fma(a_2
[5], b_5
[4], c_2_4
);
133 c_3_0
= fma(a_3
[0], b_0
[0], c_3_0
);
134 c_3_0
= fma(a_3
[1], b_1
[0], c_3_0
);
135 c_3_0
= fma(a_3
[2], b_2
[0], c_3_0
);
136 c_3_0
= fma(a_3
[3], b_3
[0], c_3_0
);
137 c_3_0
= fma(a_3
[4], b_4
[0], c_3_0
);
138 c_3_0
= fma(a_3
[5], b_5
[0], c_3_0
);
139 c_3_1
= fma(a_3
[0], b_0
[1], c_3_1
);
140 c_3_1
= fma(a_3
[1], b_1
[1], c_3_1
);
141 c_3_1
= fma(a_3
[2], b_2
[1], c_3_1
);
142 c_3_1
= fma(a_3
[3], b_3
[1], c_3_1
);
143 c_3_1
= fma(a_3
[4], b_4
[1], c_3_1
);
144 c_3_1
= fma(a_3
[5], b_5
[1], c_3_1
);
145 c_3_2
= fma(a_3
[0], b_0
[2], c_3_2
);
146 c_3_2
= fma(a_3
[1], b_1
[2], c_3_2
);
147 c_3_2
= fma(a_3
[2], b_2
[2], c_3_2
);
148 c_3_2
= fma(a_3
[3], b_3
[2], c_3_2
);
149 c_3_2
= fma(a_3
[4], b_4
[2], c_3_2
);
150 c_3_2
= fma(a_3
[5], b_5
[2], c_3_2
);
151 c_3_3
= fma(a_3
[0], b_0
[3], c_3_3
);
152 c_3_3
= fma(a_3
[1], b_1
[3], c_3_3
);
153 c_3_3
= fma(a_3
[2], b_2
[3], c_3_3
);
154 c_3_3
= fma(a_3
[3], b_3
[3], c_3_3
);
155 c_3_3
= fma(a_3
[4], b_4
[3], c_3_3
);
156 c_3_3
= fma(a_3
[5], b_5
[3], c_3_3
);
157 c_3_4
= fma(a_3
[0], b_0
[4], c_3_4
);
158 c_3_4
= fma(a_3
[1], b_1
[4], c_3_4
);
159 c_3_4
= fma(a_3
[2], b_2
[4], c_3_4
);
160 c_3_4
= fma(a_3
[3], b_3
[4], c_3_4
);
161 c_3_4
= fma(a_3
[4], b_4
[4], c_3_4
);
162 c_3_4
= fma(a_3
[5], b_5
[4], c_3_4
);
164 for (t
*a
= a0
+ p
/RBK
*RBK
, *b
= b0
+ p
/RBK
*RBK
*ldb
; a
< a0
+ p
; a
++, b
+= ldb
)
171 c_0_0
= fma(a_0
[0], b_0
[0], c_0_0
);
172 c_0_1
= fma(a_0
[0], b_0
[1], c_0_1
);
173 c_0_2
= fma(a_0
[0], b_0
[2], c_0_2
);
174 c_0_3
= fma(a_0
[0], b_0
[3], c_0_3
);
175 c_0_4
= fma(a_0
[0], b_0
[4], c_0_4
);
176 c_1_0
= fma(a_1
[0], b_0
[0], c_1_0
);
177 c_1_1
= fma(a_1
[0], b_0
[1], c_1_1
);
178 c_1_2
= fma(a_1
[0], b_0
[2], c_1_2
);
179 c_1_3
= fma(a_1
[0], b_0
[3], c_1_3
);
180 c_1_4
= fma(a_1
[0], b_0
[4], c_1_4
);
181 c_2_0
= fma(a_2
[0], b_0
[0], c_2_0
);
182 c_2_1
= fma(a_2
[0], b_0
[1], c_2_1
);
183 c_2_2
= fma(a_2
[0], b_0
[2], c_2_2
);
184 c_2_3
= fma(a_2
[0], b_0
[3], c_2_3
);
185 c_2_4
= fma(a_2
[0], b_0
[4], c_2_4
);
186 c_3_0
= fma(a_3
[0], b_0
[0], c_3_0
);
187 c_3_1
= fma(a_3
[0], b_0
[1], c_3_1
);
188 c_3_2
= fma(a_3
[0], b_0
[2], c_3_2
);
189 c_3_3
= fma(a_3
[0], b_0
[3], c_3_3
);
190 c_3_4
= fma(a_3
[0], b_0
[4], c_3_4
);