/*
 * Blocking factors.
 *
 * RBK is the unroll depth of kloop()'s main loop: each iteration advances
 * `a` by RBK elements and `b` by RBK * ldb elements, and the tail loop
 * handles the remaining p % RBK steps one at a time.
 *
 * NOTE(review): RBM and RBN look like the row and column counts of the
 * accumulator tile in kloop() (its fma statements use a_0..a_3 and column
 * indices 0..4) -- confirm against the declarations of those variables.
 */
static const int RBM = 4;
static const int RBN = 5;
static const int RBK = 6;

/*
 * NOTE(review): CBM/CBN/CBK are not referenced by kloop(); presumably they
 * are the block sizes of an outer blocking level -- TODO confirm at their
 * point of use.
 */
static const int CBM = 24;
static const int CBN = 25;
static const int CBK = 24;
3 static inline void kloop(size_t p
, t
* a0
, size_t lda
, t
* b0
, size_t ldb
, t
* c
, size_t ldc
)
29 for (t
*a
= a0
, *b
= b0
; a
< a0
+ p
/RBK
*RBK
; a
+= RBK
, b
+= RBK
*ldb
)
41 c_0_0
= fma(a_0
[0], b_0
[0], c_0_0
);
42 c_0_1
= fma(a_0
[0], b_0
[1], c_0_1
);
43 c_0_2
= fma(a_0
[0], b_0
[2], c_0_2
);
44 c_0_3
= fma(a_0
[0], b_0
[3], c_0_3
);
45 c_0_4
= fma(a_0
[0], b_0
[4], c_0_4
);
46 c_1_0
= fma(a_1
[0], b_0
[0], c_1_0
);
47 c_1_1
= fma(a_1
[0], b_0
[1], c_1_1
);
48 c_1_2
= fma(a_1
[0], b_0
[2], c_1_2
);
49 c_1_3
= fma(a_1
[0], b_0
[3], c_1_3
);
50 c_1_4
= fma(a_1
[0], b_0
[4], c_1_4
);
51 c_2_0
= fma(a_2
[0], b_0
[0], c_2_0
);
52 c_2_1
= fma(a_2
[0], b_0
[1], c_2_1
);
53 c_2_2
= fma(a_2
[0], b_0
[2], c_2_2
);
54 c_2_3
= fma(a_2
[0], b_0
[3], c_2_3
);
55 c_2_4
= fma(a_2
[0], b_0
[4], c_2_4
);
56 c_3_0
= fma(a_3
[0], b_0
[0], c_3_0
);
57 c_3_1
= fma(a_3
[0], b_0
[1], c_3_1
);
58 c_3_2
= fma(a_3
[0], b_0
[2], c_3_2
);
59 c_3_3
= fma(a_3
[0], b_0
[3], c_3_3
);
60 c_3_4
= fma(a_3
[0], b_0
[4], c_3_4
);
61 c_0_0
= fma(a_0
[1], b_1
[0], c_0_0
);
62 c_0_1
= fma(a_0
[1], b_1
[1], c_0_1
);
63 c_0_2
= fma(a_0
[1], b_1
[2], c_0_2
);
64 c_0_3
= fma(a_0
[1], b_1
[3], c_0_3
);
65 c_0_4
= fma(a_0
[1], b_1
[4], c_0_4
);
66 c_1_0
= fma(a_1
[1], b_1
[0], c_1_0
);
67 c_1_1
= fma(a_1
[1], b_1
[1], c_1_1
);
68 c_1_2
= fma(a_1
[1], b_1
[2], c_1_2
);
69 c_1_3
= fma(a_1
[1], b_1
[3], c_1_3
);
70 c_1_4
= fma(a_1
[1], b_1
[4], c_1_4
);
71 c_2_0
= fma(a_2
[1], b_1
[0], c_2_0
);
72 c_2_1
= fma(a_2
[1], b_1
[1], c_2_1
);
73 c_2_2
= fma(a_2
[1], b_1
[2], c_2_2
);
74 c_2_3
= fma(a_2
[1], b_1
[3], c_2_3
);
75 c_2_4
= fma(a_2
[1], b_1
[4], c_2_4
);
76 c_3_0
= fma(a_3
[1], b_1
[0], c_3_0
);
77 c_3_1
= fma(a_3
[1], b_1
[1], c_3_1
);
78 c_3_2
= fma(a_3
[1], b_1
[2], c_3_2
);
79 c_3_3
= fma(a_3
[1], b_1
[3], c_3_3
);
80 c_3_4
= fma(a_3
[1], b_1
[4], c_3_4
);
81 c_0_0
= fma(a_0
[2], b_2
[0], c_0_0
);
82 c_0_1
= fma(a_0
[2], b_2
[1], c_0_1
);
83 c_0_2
= fma(a_0
[2], b_2
[2], c_0_2
);
84 c_0_3
= fma(a_0
[2], b_2
[3], c_0_3
);
85 c_0_4
= fma(a_0
[2], b_2
[4], c_0_4
);
86 c_1_0
= fma(a_1
[2], b_2
[0], c_1_0
);
87 c_1_1
= fma(a_1
[2], b_2
[1], c_1_1
);
88 c_1_2
= fma(a_1
[2], b_2
[2], c_1_2
);
89 c_1_3
= fma(a_1
[2], b_2
[3], c_1_3
);
90 c_1_4
= fma(a_1
[2], b_2
[4], c_1_4
);
91 c_2_0
= fma(a_2
[2], b_2
[0], c_2_0
);
92 c_2_1
= fma(a_2
[2], b_2
[1], c_2_1
);
93 c_2_2
= fma(a_2
[2], b_2
[2], c_2_2
);
94 c_2_3
= fma(a_2
[2], b_2
[3], c_2_3
);
95 c_2_4
= fma(a_2
[2], b_2
[4], c_2_4
);
96 c_3_0
= fma(a_3
[2], b_2
[0], c_3_0
);
97 c_3_1
= fma(a_3
[2], b_2
[1], c_3_1
);
98 c_3_2
= fma(a_3
[2], b_2
[2], c_3_2
);
99 c_3_3
= fma(a_3
[2], b_2
[3], c_3_3
);
100 c_3_4
= fma(a_3
[2], b_2
[4], c_3_4
);
101 c_0_0
= fma(a_0
[3], b_3
[0], c_0_0
);
102 c_0_1
= fma(a_0
[3], b_3
[1], c_0_1
);
103 c_0_2
= fma(a_0
[3], b_3
[2], c_0_2
);
104 c_0_3
= fma(a_0
[3], b_3
[3], c_0_3
);
105 c_0_4
= fma(a_0
[3], b_3
[4], c_0_4
);
106 c_1_0
= fma(a_1
[3], b_3
[0], c_1_0
);
107 c_1_1
= fma(a_1
[3], b_3
[1], c_1_1
);
108 c_1_2
= fma(a_1
[3], b_3
[2], c_1_2
);
109 c_1_3
= fma(a_1
[3], b_3
[3], c_1_3
);
110 c_1_4
= fma(a_1
[3], b_3
[4], c_1_4
);
111 c_2_0
= fma(a_2
[3], b_3
[0], c_2_0
);
112 c_2_1
= fma(a_2
[3], b_3
[1], c_2_1
);
113 c_2_2
= fma(a_2
[3], b_3
[2], c_2_2
);
114 c_2_3
= fma(a_2
[3], b_3
[3], c_2_3
);
115 c_2_4
= fma(a_2
[3], b_3
[4], c_2_4
);
116 c_3_0
= fma(a_3
[3], b_3
[0], c_3_0
);
117 c_3_1
= fma(a_3
[3], b_3
[1], c_3_1
);
118 c_3_2
= fma(a_3
[3], b_3
[2], c_3_2
);
119 c_3_3
= fma(a_3
[3], b_3
[3], c_3_3
);
120 c_3_4
= fma(a_3
[3], b_3
[4], c_3_4
);
121 c_0_0
= fma(a_0
[4], b_4
[0], c_0_0
);
122 c_0_1
= fma(a_0
[4], b_4
[1], c_0_1
);
123 c_0_2
= fma(a_0
[4], b_4
[2], c_0_2
);
124 c_0_3
= fma(a_0
[4], b_4
[3], c_0_3
);
125 c_0_4
= fma(a_0
[4], b_4
[4], c_0_4
);
126 c_1_0
= fma(a_1
[4], b_4
[0], c_1_0
);
127 c_1_1
= fma(a_1
[4], b_4
[1], c_1_1
);
128 c_1_2
= fma(a_1
[4], b_4
[2], c_1_2
);
129 c_1_3
= fma(a_1
[4], b_4
[3], c_1_3
);
130 c_1_4
= fma(a_1
[4], b_4
[4], c_1_4
);
131 c_2_0
= fma(a_2
[4], b_4
[0], c_2_0
);
132 c_2_1
= fma(a_2
[4], b_4
[1], c_2_1
);
133 c_2_2
= fma(a_2
[4], b_4
[2], c_2_2
);
134 c_2_3
= fma(a_2
[4], b_4
[3], c_2_3
);
135 c_2_4
= fma(a_2
[4], b_4
[4], c_2_4
);
136 c_3_0
= fma(a_3
[4], b_4
[0], c_3_0
);
137 c_3_1
= fma(a_3
[4], b_4
[1], c_3_1
);
138 c_3_2
= fma(a_3
[4], b_4
[2], c_3_2
);
139 c_3_3
= fma(a_3
[4], b_4
[3], c_3_3
);
140 c_3_4
= fma(a_3
[4], b_4
[4], c_3_4
);
141 c_0_0
= fma(a_0
[5], b_5
[0], c_0_0
);
142 c_0_1
= fma(a_0
[5], b_5
[1], c_0_1
);
143 c_0_2
= fma(a_0
[5], b_5
[2], c_0_2
);
144 c_0_3
= fma(a_0
[5], b_5
[3], c_0_3
);
145 c_0_4
= fma(a_0
[5], b_5
[4], c_0_4
);
146 c_1_0
= fma(a_1
[5], b_5
[0], c_1_0
);
147 c_1_1
= fma(a_1
[5], b_5
[1], c_1_1
);
148 c_1_2
= fma(a_1
[5], b_5
[2], c_1_2
);
149 c_1_3
= fma(a_1
[5], b_5
[3], c_1_3
);
150 c_1_4
= fma(a_1
[5], b_5
[4], c_1_4
);
151 c_2_0
= fma(a_2
[5], b_5
[0], c_2_0
);
152 c_2_1
= fma(a_2
[5], b_5
[1], c_2_1
);
153 c_2_2
= fma(a_2
[5], b_5
[2], c_2_2
);
154 c_2_3
= fma(a_2
[5], b_5
[3], c_2_3
);
155 c_2_4
= fma(a_2
[5], b_5
[4], c_2_4
);
156 c_3_0
= fma(a_3
[5], b_5
[0], c_3_0
);
157 c_3_1
= fma(a_3
[5], b_5
[1], c_3_1
);
158 c_3_2
= fma(a_3
[5], b_5
[2], c_3_2
);
159 c_3_3
= fma(a_3
[5], b_5
[3], c_3_3
);
160 c_3_4
= fma(a_3
[5], b_5
[4], c_3_4
);
162 for (t
*a
= a0
+ p
/RBK
*RBK
, *b
= b0
+ p
/RBK
*RBK
*ldb
; a
< a0
+ p
; a
++, b
+= ldb
)
169 c_0_0
= fma(a_0
[0], b_0
[0], c_0_0
);
170 c_0_1
= fma(a_0
[0], b_0
[1], c_0_1
);
171 c_0_2
= fma(a_0
[0], b_0
[2], c_0_2
);
172 c_0_3
= fma(a_0
[0], b_0
[3], c_0_3
);
173 c_0_4
= fma(a_0
[0], b_0
[4], c_0_4
);
174 c_1_0
= fma(a_1
[0], b_0
[0], c_1_0
);
175 c_1_1
= fma(a_1
[0], b_0
[1], c_1_1
);
176 c_1_2
= fma(a_1
[0], b_0
[2], c_1_2
);
177 c_1_3
= fma(a_1
[0], b_0
[3], c_1_3
);
178 c_1_4
= fma(a_1
[0], b_0
[4], c_1_4
);
179 c_2_0
= fma(a_2
[0], b_0
[0], c_2_0
);
180 c_2_1
= fma(a_2
[0], b_0
[1], c_2_1
);
181 c_2_2
= fma(a_2
[0], b_0
[2], c_2_2
);
182 c_2_3
= fma(a_2
[0], b_0
[3], c_2_3
);
183 c_2_4
= fma(a_2
[0], b_0
[4], c_2_4
);
184 c_3_0
= fma(a_3
[0], b_0
[0], c_3_0
);
185 c_3_1
= fma(a_3
[0], b_0
[1], c_3_1
);
186 c_3_2
= fma(a_3
[0], b_0
[2], c_3_2
);
187 c_3_3
= fma(a_3
[0], b_0
[3], c_3_3
);
188 c_3_4
= fma(a_3
[0], b_0
[4], c_3_4
);