i386.c (ix86_rtx_costs): Make difference between x87 and SSE operations.
[gcc.git] / gcc / config / i386 / x86-tune-costs.h
1
2 /* Processor costs (relative to an add) */
3 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes.
   COSTS_N_BYTES (N) expands to (N) * 2, so under that assumption a 2-byte
   add, COSTS_N_BYTES (2), evaluates to 4 -- the same value as
   COSTS_N_INSNS (1).  */
4 #define COSTS_N_BYTES(N) ((N) * 2)
5
/* Placeholder stringop_algs initializer in which every algorithm slot is
   libcall.  Several tables below use it as their second array element.  */
6 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
7
8 static stringop_algs ix86_size_memcpy[2] = {
9 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
10 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
11 static stringop_algs ix86_size_memset[2] = {
12 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
13 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
14
/* Cost table used when tuning for size (see the comment on the opening
   brace).  Instruction costs are written with COSTS_N_BYTES, whereas the
   per-processor tables that follow use COSTS_N_INSNS.  The initializer is
   positional: each value is identified only by its place in the list, so
   entries must not be reordered.  */
15 const
16 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
17 COSTS_N_BYTES (2), /* cost of an add instruction */
18 COSTS_N_BYTES (3), /* cost of a lea instruction */
19 COSTS_N_BYTES (2), /* variable shift costs */
20 COSTS_N_BYTES (3), /* constant shift costs */
21 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
22 COSTS_N_BYTES (3), /* HI */
23 COSTS_N_BYTES (3), /* SI */
24 COSTS_N_BYTES (3), /* DI */
25 COSTS_N_BYTES (5)}, /* other */
26 0, /* cost of multiply per each bit set */
27 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
28 COSTS_N_BYTES (3), /* HI */
29 COSTS_N_BYTES (3), /* SI */
30 COSTS_N_BYTES (3), /* DI */
31 COSTS_N_BYTES (5)}, /* other */
32 COSTS_N_BYTES (3), /* cost of movsx */
33 COSTS_N_BYTES (3), /* cost of movzx */
34 0, /* "large" insn */
35 2, /* MOVE_RATIO */
36 2, /* cost for loading QImode using movzbl */
37 {2, 2, 2}, /* cost of loading integer registers
38 in QImode, HImode and SImode.
39 Relative to reg-reg move (2). */
40 {2, 2, 2}, /* cost of storing integer registers */
41 2, /* cost of reg,reg fld/fst */
42 {2, 2, 2}, /* cost of loading fp registers
43 in SFmode, DFmode and XFmode */
44 {2, 2, 2}, /* cost of storing fp registers
45 in SFmode, DFmode and XFmode */
46 3, /* cost of moving MMX register */
47 {3, 3}, /* cost of loading MMX registers
48 in SImode and DImode */
49 {3, 3}, /* cost of storing MMX registers
50 in SImode and DImode */
51 3, /* cost of moving SSE register */
52 {3, 3, 3}, /* cost of loading SSE registers
53 in SImode, DImode and TImode */
54 {3, 3, 3}, /* cost of storing SSE registers
55 in SImode, DImode and TImode */
56 3, /* MMX or SSE register to integer */
57 0, /* size of l1 cache */
58 0, /* size of l2 cache */
59 0, /* size of prefetch block */
60 0, /* number of parallel prefetches */
61 2, /* Branch cost */
/* Every x87 and SSE entry below uses the same value, COSTS_N_BYTES (2).  */
62 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
63 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
64 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
65 COSTS_N_BYTES (2), /* cost of FABS instruction. */
66 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
67 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
68
69 COSTS_N_BYTES (2), /* cost of ADDSS/SD SUBSS/SD insns. */
70 COSTS_N_BYTES (2), /* cost of MULSS instruction. */
71 COSTS_N_BYTES (2), /* cost of MULSD instruction. */
72 COSTS_N_BYTES (2), /* cost of DIVSS instruction. */
73 COSTS_N_BYTES (2), /* cost of DIVSD instruction. */
74 COSTS_N_BYTES (2), /* cost of SQRTSS instruction. */
75 COSTS_N_BYTES (2), /* cost of SQRTSD instruction. */
76 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
/* String-operation tables defined immediately before this initializer.  */
77 ix86_size_memcpy,
78 ix86_size_memset,
79 1, /* scalar_stmt_cost. */
80 1, /* scalar load_cost. */
81 1, /* scalar_store_cost. */
82 1, /* vec_stmt_cost. */
83 1, /* vec_to_scalar_cost. */
84 1, /* scalar_to_vec_cost. */
85 1, /* vec_align_load_cost. */
86 1, /* vec_unalign_load_cost. */
87 1, /* vec_store_cost. */
88 1, /* cond_taken_branch_cost. */
89 1, /* cond_not_taken_branch_cost. */
90 };
91
92 /* Processor costs (relative to an add) */
93 static stringop_algs i386_memcpy[2] = {
94 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
95 DUMMY_STRINGOP_ALGS};
96 static stringop_algs i386_memset[2] = {
97 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
98 DUMMY_STRINGOP_ALGS};
99
/* Cost table for the 386 (see the comment on the opening brace).
   Instruction costs are written with COSTS_N_INSNS.  The initializer is
   positional: each value is identified only by its place in the list, so
   entries must not be reordered.  */
100 static const
101 struct processor_costs i386_cost = { /* 386 specific costs */
102 COSTS_N_INSNS (1), /* cost of an add instruction */
103 COSTS_N_INSNS (1), /* cost of a lea instruction */
104 COSTS_N_INSNS (3), /* variable shift costs */
105 COSTS_N_INSNS (2), /* constant shift costs */
106 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
107 COSTS_N_INSNS (6), /* HI */
108 COSTS_N_INSNS (6), /* SI */
109 COSTS_N_INSNS (6), /* DI */
110 COSTS_N_INSNS (6)}, /* other */
111 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
112 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
113 COSTS_N_INSNS (23), /* HI */
114 COSTS_N_INSNS (23), /* SI */
115 COSTS_N_INSNS (23), /* DI */
116 COSTS_N_INSNS (23)}, /* other */
117 COSTS_N_INSNS (3), /* cost of movsx */
118 COSTS_N_INSNS (2), /* cost of movzx */
119 15, /* "large" insn */
120 3, /* MOVE_RATIO */
121 4, /* cost for loading QImode using movzbl */
122 {2, 4, 2}, /* cost of loading integer registers
123 in QImode, HImode and SImode.
124 Relative to reg-reg move (2). */
125 {2, 4, 2}, /* cost of storing integer registers */
126 2, /* cost of reg,reg fld/fst */
127 {8, 8, 8}, /* cost of loading fp registers
128 in SFmode, DFmode and XFmode */
129 {8, 8, 8}, /* cost of storing fp registers
130 in SFmode, DFmode and XFmode */
131 2, /* cost of moving MMX register */
132 {4, 8}, /* cost of loading MMX registers
133 in SImode and DImode */
134 {4, 8}, /* cost of storing MMX registers
135 in SImode and DImode */
136 2, /* cost of moving SSE register */
137 {4, 8, 16}, /* cost of loading SSE registers
138 in SImode, DImode and TImode */
139 {4, 8, 16}, /* cost of storing SSE registers
140 in SImode, DImode and TImode */
141 3, /* MMX or SSE register to integer */
142 0, /* size of l1 cache */
143 0, /* size of l2 cache */
144 0, /* size of prefetch block */
145 0, /* number of parallel prefetches */
146 1, /* Branch cost */
147 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
148 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
149 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
150 COSTS_N_INSNS (22), /* cost of FABS instruction. */
151 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
152 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
153
/* The SSE entries below repeat the x87 add/mul/div/sqrt values above
   (23, 27, 88, 122).  */
154 COSTS_N_INSNS (23), /* cost of ADDSS/SD SUBSS/SD insns. */
155 COSTS_N_INSNS (27), /* cost of MULSS instruction. */
156 COSTS_N_INSNS (27), /* cost of MULSD instruction. */
157 COSTS_N_INSNS (88), /* cost of DIVSS instruction. */
158 COSTS_N_INSNS (88), /* cost of DIVSD instruction. */
159 COSTS_N_INSNS (122), /* cost of SQRTSS instruction. */
160 COSTS_N_INSNS (122), /* cost of SQRTSD instruction. */
161 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
/* String-operation tables defined immediately before this initializer.  */
162 i386_memcpy,
163 i386_memset,
164 1, /* scalar_stmt_cost. */
165 1, /* scalar load_cost. */
166 1, /* scalar_store_cost. */
167 1, /* vec_stmt_cost. */
168 1, /* vec_to_scalar_cost. */
169 1, /* scalar_to_vec_cost. */
170 1, /* vec_align_load_cost. */
171 2, /* vec_unalign_load_cost. */
172 1, /* vec_store_cost. */
173 3, /* cond_taken_branch_cost. */
174 1, /* cond_not_taken_branch_cost. */
175 };
176
177 static stringop_algs i486_memcpy[2] = {
178 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
179 DUMMY_STRINGOP_ALGS};
180 static stringop_algs i486_memset[2] = {
181 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
182 DUMMY_STRINGOP_ALGS};
183
/* Cost table for the 486 (see the comment on the opening brace).
   Instruction costs are written with COSTS_N_INSNS.  The initializer is
   positional: each value is identified only by its place in the list, so
   entries must not be reordered.  */
184 static const
185 struct processor_costs i486_cost = { /* 486 specific costs */
186 COSTS_N_INSNS (1), /* cost of an add instruction */
187 COSTS_N_INSNS (1), /* cost of a lea instruction */
188 COSTS_N_INSNS (3), /* variable shift costs */
189 COSTS_N_INSNS (2), /* constant shift costs */
190 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
191 COSTS_N_INSNS (12), /* HI */
192 COSTS_N_INSNS (12), /* SI */
193 COSTS_N_INSNS (12), /* DI */
194 COSTS_N_INSNS (12)}, /* other */
/* NOTE(review): written as a plain 1 here, whereas i386_cost writes
   COSTS_N_INSNS (1) in the same position -- confirm this is intended.  */
195 1, /* cost of multiply per each bit set */
196 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
197 COSTS_N_INSNS (40), /* HI */
198 COSTS_N_INSNS (40), /* SI */
199 COSTS_N_INSNS (40), /* DI */
200 COSTS_N_INSNS (40)}, /* other */
201 COSTS_N_INSNS (3), /* cost of movsx */
202 COSTS_N_INSNS (2), /* cost of movzx */
203 15, /* "large" insn */
204 3, /* MOVE_RATIO */
205 4, /* cost for loading QImode using movzbl */
206 {2, 4, 2}, /* cost of loading integer registers
207 in QImode, HImode and SImode.
208 Relative to reg-reg move (2). */
209 {2, 4, 2}, /* cost of storing integer registers */
210 2, /* cost of reg,reg fld/fst */
211 {8, 8, 8}, /* cost of loading fp registers
212 in SFmode, DFmode and XFmode */
213 {8, 8, 8}, /* cost of storing fp registers
214 in SFmode, DFmode and XFmode */
215 2, /* cost of moving MMX register */
216 {4, 8}, /* cost of loading MMX registers
217 in SImode and DImode */
218 {4, 8}, /* cost of storing MMX registers
219 in SImode and DImode */
220 2, /* cost of moving SSE register */
221 {4, 8, 16}, /* cost of loading SSE registers
222 in SImode, DImode and TImode */
223 {4, 8, 16}, /* cost of storing SSE registers
224 in SImode, DImode and TImode */
225 3, /* MMX or SSE register to integer */
226 4, /* size of l1 cache. 486 has 8kB cache
227 shared for code and data, so 4kB is
228 not really precise. */
229 4, /* size of l2 cache */
230 0, /* size of prefetch block */
231 0, /* number of parallel prefetches */
232 1, /* Branch cost */
233 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
234 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
235 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
236 COSTS_N_INSNS (3), /* cost of FABS instruction. */
237 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
238 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
239
240 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
241 COSTS_N_INSNS (16), /* cost of MULSS instruction. */
242 COSTS_N_INSNS (16), /* cost of MULSD instruction. */
243 COSTS_N_INSNS (73), /* cost of DIVSS instruction. */
/* NOTE(review): 74 here, while FDIV and DIVSS in this table are 73 --
   confirm the difference is intended.  */
244 COSTS_N_INSNS (74), /* cost of DIVSD instruction. */
245 COSTS_N_INSNS (83), /* cost of SQRTSS instruction. */
246 COSTS_N_INSNS (83), /* cost of SQRTSD instruction. */
247 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
/* String-operation tables defined immediately before this initializer.  */
248 i486_memcpy,
249 i486_memset,
250 1, /* scalar_stmt_cost. */
251 1, /* scalar load_cost. */
252 1, /* scalar_store_cost. */
253 1, /* vec_stmt_cost. */
254 1, /* vec_to_scalar_cost. */
255 1, /* scalar_to_vec_cost. */
256 1, /* vec_align_load_cost. */
257 2, /* vec_unalign_load_cost. */
258 1, /* vec_store_cost. */
259 3, /* cond_taken_branch_cost. */
260 1, /* cond_not_taken_branch_cost. */
261 };
262
263 static stringop_algs pentium_memcpy[2] = {
264 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
265 DUMMY_STRINGOP_ALGS};
266 static stringop_algs pentium_memset[2] = {
267 {libcall, {{-1, rep_prefix_4_byte, false}}},
268 DUMMY_STRINGOP_ALGS};
269
/* Cost table named pentium_cost (presumably for the original Pentium; the
   opening line carries no description -- confirm).  Instruction costs are
   written with COSTS_N_INSNS.  The initializer is positional: each value
   is identified only by its place in the list, so entries must not be
   reordered.  */
270 static const
271 struct processor_costs pentium_cost = {
272 COSTS_N_INSNS (1), /* cost of an add instruction */
273 COSTS_N_INSNS (1), /* cost of a lea instruction */
274 COSTS_N_INSNS (4), /* variable shift costs */
275 COSTS_N_INSNS (1), /* constant shift costs */
276 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
277 COSTS_N_INSNS (11), /* HI */
278 COSTS_N_INSNS (11), /* SI */
279 COSTS_N_INSNS (11), /* DI */
280 COSTS_N_INSNS (11)}, /* other */
281 0, /* cost of multiply per each bit set */
282 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
283 COSTS_N_INSNS (25), /* HI */
284 COSTS_N_INSNS (25), /* SI */
285 COSTS_N_INSNS (25), /* DI */
286 COSTS_N_INSNS (25)}, /* other */
287 COSTS_N_INSNS (3), /* cost of movsx */
288 COSTS_N_INSNS (2), /* cost of movzx */
289 8, /* "large" insn */
290 6, /* MOVE_RATIO */
291 6, /* cost for loading QImode using movzbl */
292 {2, 4, 2}, /* cost of loading integer registers
293 in QImode, HImode and SImode.
294 Relative to reg-reg move (2). */
295 {2, 4, 2}, /* cost of storing integer registers */
296 2, /* cost of reg,reg fld/fst */
297 {2, 2, 6}, /* cost of loading fp registers
298 in SFmode, DFmode and XFmode */
299 {4, 4, 6}, /* cost of storing fp registers
300 in SFmode, DFmode and XFmode */
301 8, /* cost of moving MMX register */
302 {8, 8}, /* cost of loading MMX registers
303 in SImode and DImode */
304 {8, 8}, /* cost of storing MMX registers
305 in SImode and DImode */
306 2, /* cost of moving SSE register */
307 {4, 8, 16}, /* cost of loading SSE registers
308 in SImode, DImode and TImode */
309 {4, 8, 16}, /* cost of storing SSE registers
310 in SImode, DImode and TImode */
311 3, /* MMX or SSE register to integer */
312 8, /* size of l1 cache. */
313 8, /* size of l2 cache */
314 0, /* size of prefetch block */
315 0, /* number of parallel prefetches */
316 2, /* Branch cost */
317 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
318 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
319 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
320 COSTS_N_INSNS (1), /* cost of FABS instruction. */
321 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
322 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
323
/* The SSE entries below repeat the x87 add/mul/div/sqrt values above
   (3, 3, 39, 70).  */
324 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
325 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
326 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
327 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
328 COSTS_N_INSNS (39), /* cost of DIVSD instruction. */
329 COSTS_N_INSNS (70), /* cost of SQRTSS instruction. */
330 COSTS_N_INSNS (70), /* cost of SQRTSD instruction. */
331 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
/* String-operation tables defined immediately before this initializer.  */
332 pentium_memcpy,
333 pentium_memset,
334 1, /* scalar_stmt_cost. */
335 1, /* scalar load_cost. */
336 1, /* scalar_store_cost. */
337 1, /* vec_stmt_cost. */
338 1, /* vec_to_scalar_cost. */
339 1, /* scalar_to_vec_cost. */
340 1, /* vec_align_load_cost. */
341 2, /* vec_unalign_load_cost. */
342 1, /* vec_store_cost. */
343 3, /* cond_taken_branch_cost. */
344 1, /* cond_not_taken_branch_cost. */
345 };
346
/* Cost table named lakemont_cost.  Its values match pentium_cost except
   for the lea cost, the variable shift cost, MOVE_RATIO and the SSE
   instruction costs; it also reuses the pentium_memcpy / pentium_memset
   string-operation tables.  The initializer is positional: each value is
   identified only by its place in the list, so entries must not be
   reordered.  */
347 static const
348 struct processor_costs lakemont_cost = {
349 COSTS_N_INSNS (1), /* cost of an add instruction */
/* One unit above the cost of an add.  */
350 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
351 COSTS_N_INSNS (1), /* variable shift costs */
352 COSTS_N_INSNS (1), /* constant shift costs */
353 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
354 COSTS_N_INSNS (11), /* HI */
355 COSTS_N_INSNS (11), /* SI */
356 COSTS_N_INSNS (11), /* DI */
357 COSTS_N_INSNS (11)}, /* other */
358 0, /* cost of multiply per each bit set */
359 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
360 COSTS_N_INSNS (25), /* HI */
361 COSTS_N_INSNS (25), /* SI */
362 COSTS_N_INSNS (25), /* DI */
363 COSTS_N_INSNS (25)}, /* other */
364 COSTS_N_INSNS (3), /* cost of movsx */
365 COSTS_N_INSNS (2), /* cost of movzx */
366 8, /* "large" insn */
367 17, /* MOVE_RATIO */
368 6, /* cost for loading QImode using movzbl */
369 {2, 4, 2}, /* cost of loading integer registers
370 in QImode, HImode and SImode.
371 Relative to reg-reg move (2). */
372 {2, 4, 2}, /* cost of storing integer registers */
373 2, /* cost of reg,reg fld/fst */
374 {2, 2, 6}, /* cost of loading fp registers
375 in SFmode, DFmode and XFmode */
376 {4, 4, 6}, /* cost of storing fp registers
377 in SFmode, DFmode and XFmode */
378 8, /* cost of moving MMX register */
379 {8, 8}, /* cost of loading MMX registers
380 in SImode and DImode */
381 {8, 8}, /* cost of storing MMX registers
382 in SImode and DImode */
383 2, /* cost of moving SSE register */
384 {4, 8, 16}, /* cost of loading SSE registers
385 in SImode, DImode and TImode */
386 {4, 8, 16}, /* cost of storing SSE registers
387 in SImode, DImode and TImode */
388 3, /* MMX or SSE register to integer */
389 8, /* size of l1 cache. */
390 8, /* size of l2 cache */
391 0, /* size of prefetch block */
392 0, /* number of parallel prefetches */
393 2, /* Branch cost */
394 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
395 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
396 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
397 COSTS_N_INSNS (1), /* cost of FABS instruction. */
398 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
399 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
400
/* Unlike pentium_cost, the SSE entries below do not repeat the x87
   values above.  */
401 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
402 COSTS_N_INSNS (5), /* cost of MULSS instruction. */
403 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
404 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
405 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
406 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
407 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
408 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
/* No Lakemont-specific string-operation tables: the Pentium ones are
   shared.  */
409 pentium_memcpy,
410 pentium_memset,
411 1, /* scalar_stmt_cost. */
412 1, /* scalar load_cost. */
413 1, /* scalar_store_cost. */
414 1, /* vec_stmt_cost. */
415 1, /* vec_to_scalar_cost. */
416 1, /* scalar_to_vec_cost. */
417 1, /* vec_align_load_cost. */
418 2, /* vec_unalign_load_cost. */
419 1, /* vec_store_cost. */
420 3, /* cond_taken_branch_cost. */
421 1, /* cond_not_taken_branch_cost. */
422 };
423
424 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
425 (we ensure the alignment). For small blocks an inline loop is still a
426 noticeable win; for bigger blocks either rep movsl or rep movsb is
427 the way to go. Rep movsb apparently has a more expensive startup time in
428 the CPU, but after 4K the difference is down in the noise. */
429 static stringop_algs pentiumpro_memcpy[2] = {
430 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
431 {8192, rep_prefix_4_byte, false},
432 {-1, rep_prefix_1_byte, false}}},
433 DUMMY_STRINGOP_ALGS};
434 static stringop_algs pentiumpro_memset[2] = {
435 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
436 {8192, rep_prefix_4_byte, false},
437 {-1, libcall, false}}},
438 DUMMY_STRINGOP_ALGS};
/* Cost table named pentiumpro_cost.  Instruction costs are written with
   COSTS_N_INSNS.  The initializer is positional: each value is identified
   only by its place in the list, so entries must not be reordered.  */
439 static const
440 struct processor_costs pentiumpro_cost = {
441 COSTS_N_INSNS (1), /* cost of an add instruction */
442 COSTS_N_INSNS (1), /* cost of a lea instruction */
443 COSTS_N_INSNS (1), /* variable shift costs */
444 COSTS_N_INSNS (1), /* constant shift costs */
445 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
446 COSTS_N_INSNS (4), /* HI */
447 COSTS_N_INSNS (4), /* SI */
448 COSTS_N_INSNS (4), /* DI */
449 COSTS_N_INSNS (4)}, /* other */
450 0, /* cost of multiply per each bit set */
451 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
452 COSTS_N_INSNS (17), /* HI */
453 COSTS_N_INSNS (17), /* SI */
454 COSTS_N_INSNS (17), /* DI */
455 COSTS_N_INSNS (17)}, /* other */
456 COSTS_N_INSNS (1), /* cost of movsx */
457 COSTS_N_INSNS (1), /* cost of movzx */
458 8, /* "large" insn */
459 6, /* MOVE_RATIO */
460 2, /* cost for loading QImode using movzbl */
461 {4, 4, 4}, /* cost of loading integer registers
462 in QImode, HImode and SImode.
463 Relative to reg-reg move (2). */
464 {2, 2, 2}, /* cost of storing integer registers */
465 2, /* cost of reg,reg fld/fst */
466 {2, 2, 6}, /* cost of loading fp registers
467 in SFmode, DFmode and XFmode */
468 {4, 4, 6}, /* cost of storing fp registers
469 in SFmode, DFmode and XFmode */
470 2, /* cost of moving MMX register */
471 {2, 2}, /* cost of loading MMX registers
472 in SImode and DImode */
473 {2, 2}, /* cost of storing MMX registers
474 in SImode and DImode */
475 2, /* cost of moving SSE register */
476 {2, 2, 8}, /* cost of loading SSE registers
477 in SImode, DImode and TImode */
478 {2, 2, 8}, /* cost of storing SSE registers
479 in SImode, DImode and TImode */
480 3, /* MMX or SSE register to integer */
481 8, /* size of l1 cache. */
482 256, /* size of l2 cache */
483 32, /* size of prefetch block */
484 6, /* number of parallel prefetches */
485 2, /* Branch cost */
486 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
487 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
488 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
489 COSTS_N_INSNS (2), /* cost of FABS instruction. */
490 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
491 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
492
/* The SSE multiply, divide and square-root entries below are lower than
   the corresponding x87 entries above; only the add cost (3) is equal.  */
493 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
494 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
495 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
496 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
497 COSTS_N_INSNS (18), /* cost of DIVSD instruction. */
498 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
499 COSTS_N_INSNS (31), /* cost of SQRTSD instruction. */
500 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
/* String-operation tables defined immediately before this initializer.  */
501 pentiumpro_memcpy,
502 pentiumpro_memset,
503 1, /* scalar_stmt_cost. */
504 1, /* scalar load_cost. */
505 1, /* scalar_store_cost. */
506 1, /* vec_stmt_cost. */
507 1, /* vec_to_scalar_cost. */
508 1, /* scalar_to_vec_cost. */
509 1, /* vec_align_load_cost. */
510 2, /* vec_unalign_load_cost. */
511 1, /* vec_store_cost. */
512 3, /* cond_taken_branch_cost. */
513 1, /* cond_not_taken_branch_cost. */
514 };
515
516 static stringop_algs geode_memcpy[2] = {
517 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
518 DUMMY_STRINGOP_ALGS};
519 static stringop_algs geode_memset[2] = {
520 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
521 DUMMY_STRINGOP_ALGS};
/* Cost table named geode_cost.  Instruction costs are written with
   COSTS_N_INSNS.  The initializer is positional: each value is identified
   only by its place in the list, so entries must not be reordered.  */
522 static const
523 struct processor_costs geode_cost = {
524 COSTS_N_INSNS (1), /* cost of an add instruction */
525 COSTS_N_INSNS (1), /* cost of a lea instruction */
526 COSTS_N_INSNS (2), /* variable shift costs */
527 COSTS_N_INSNS (1), /* constant shift costs */
528 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
529 COSTS_N_INSNS (4), /* HI */
530 COSTS_N_INSNS (7), /* SI */
531 COSTS_N_INSNS (7), /* DI */
532 COSTS_N_INSNS (7)}, /* other */
533 0, /* cost of multiply per each bit set */
534 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
535 COSTS_N_INSNS (23), /* HI */
536 COSTS_N_INSNS (39), /* SI */
537 COSTS_N_INSNS (39), /* DI */
538 COSTS_N_INSNS (39)}, /* other */
539 COSTS_N_INSNS (1), /* cost of movsx */
540 COSTS_N_INSNS (1), /* cost of movzx */
541 8, /* "large" insn */
542 4, /* MOVE_RATIO */
543 1, /* cost for loading QImode using movzbl */
/* NOTE(review): the load/store values of 1 below are smaller than the
   reg-reg move reference of 2 quoted in the comment -- confirm this is
   intended.  */
544 {1, 1, 1}, /* cost of loading integer registers
545 in QImode, HImode and SImode.
546 Relative to reg-reg move (2). */
547 {1, 1, 1}, /* cost of storing integer registers */
548 1, /* cost of reg,reg fld/fst */
549 {1, 1, 1}, /* cost of loading fp registers
550 in SFmode, DFmode and XFmode */
551 {4, 6, 6}, /* cost of storing fp registers
552 in SFmode, DFmode and XFmode */
553
554 2, /* cost of moving MMX register */
555 {2, 2}, /* cost of loading MMX registers
556 in SImode and DImode */
557 {2, 2}, /* cost of storing MMX registers
558 in SImode and DImode */
559 2, /* cost of moving SSE register */
560 {2, 2, 8}, /* cost of loading SSE registers
561 in SImode, DImode and TImode */
562 {2, 2, 8}, /* cost of storing SSE registers
563 in SImode, DImode and TImode */
564 3, /* MMX or SSE register to integer */
565 64, /* size of l1 cache. */
566 128, /* size of l2 cache. */
567 32, /* size of prefetch block */
568 1, /* number of parallel prefetches */
569 1, /* Branch cost */
570 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
571 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
572 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
573 COSTS_N_INSNS (1), /* cost of FABS instruction. */
574 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
575 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
576
/* The SSE entries below repeat the x87 add/mul/div/sqrt values above
   (6, 11, 47, 54).  */
577 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
578 COSTS_N_INSNS (11), /* cost of MULSS instruction. */
579 COSTS_N_INSNS (11), /* cost of MULSD instruction. */
580 COSTS_N_INSNS (47), /* cost of DIVSS instruction. */
581 COSTS_N_INSNS (47), /* cost of DIVSD instruction. */
582 COSTS_N_INSNS (54), /* cost of SQRTSS instruction. */
583 COSTS_N_INSNS (54), /* cost of SQRTSD instruction. */
584 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
/* String-operation tables defined immediately before this initializer.  */
585 geode_memcpy,
586 geode_memset,
587 1, /* scalar_stmt_cost. */
588 1, /* scalar load_cost. */
589 1, /* scalar_store_cost. */
590 1, /* vec_stmt_cost. */
591 1, /* vec_to_scalar_cost. */
592 1, /* scalar_to_vec_cost. */
593 1, /* vec_align_load_cost. */
594 2, /* vec_unalign_load_cost. */
595 1, /* vec_store_cost. */
596 3, /* cond_taken_branch_cost. */
597 1, /* cond_not_taken_branch_cost. */
598 };
599
600 static stringop_algs k6_memcpy[2] = {
601 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
602 DUMMY_STRINGOP_ALGS};
603 static stringop_algs k6_memset[2] = {
604 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
605 DUMMY_STRINGOP_ALGS};
/* Cost table named k6_cost.  Instruction costs are written with
   COSTS_N_INSNS.  The initializer is positional: each value is identified
   only by its place in the list, so entries must not be reordered.  */
606 static const
607 struct processor_costs k6_cost = {
608 COSTS_N_INSNS (1), /* cost of an add instruction */
609 COSTS_N_INSNS (2), /* cost of a lea instruction */
610 COSTS_N_INSNS (1), /* variable shift costs */
611 COSTS_N_INSNS (1), /* constant shift costs */
612 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
613 COSTS_N_INSNS (3), /* HI */
614 COSTS_N_INSNS (3), /* SI */
615 COSTS_N_INSNS (3), /* DI */
616 COSTS_N_INSNS (3)}, /* other */
617 0, /* cost of multiply per each bit set */
618 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
619 COSTS_N_INSNS (18), /* HI */
620 COSTS_N_INSNS (18), /* SI */
621 COSTS_N_INSNS (18), /* DI */
622 COSTS_N_INSNS (18)}, /* other */
623 COSTS_N_INSNS (2), /* cost of movsx */
624 COSTS_N_INSNS (2), /* cost of movzx */
625 8, /* "large" insn */
626 4, /* MOVE_RATIO */
627 3, /* cost for loading QImode using movzbl */
628 {4, 5, 4}, /* cost of loading integer registers
629 in QImode, HImode and SImode.
630 Relative to reg-reg move (2). */
631 {2, 3, 2}, /* cost of storing integer registers */
632 4, /* cost of reg,reg fld/fst */
633 {6, 6, 6}, /* cost of loading fp registers
634 in SFmode, DFmode and XFmode */
635 {4, 4, 4}, /* cost of storing fp registers
636 in SFmode, DFmode and XFmode */
637 2, /* cost of moving MMX register */
638 {2, 2}, /* cost of loading MMX registers
639 in SImode and DImode */
640 {2, 2}, /* cost of storing MMX registers
641 in SImode and DImode */
642 2, /* cost of moving SSE register */
643 {2, 2, 8}, /* cost of loading SSE registers
644 in SImode, DImode and TImode */
645 {2, 2, 8}, /* cost of storing SSE registers
646 in SImode, DImode and TImode */
647 6, /* MMX or SSE register to integer */
648 32, /* size of l1 cache. */
649 32, /* size of l2 cache. Some models
650 have integrated l2 cache, but
651 optimizing for k6 is not important
652 enough to worry about that. */
653 32, /* size of prefetch block */
654 1, /* number of parallel prefetches */
655 1, /* Branch cost */
656 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
657 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
658 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
659 COSTS_N_INSNS (2), /* cost of FABS instruction. */
660 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
661 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
662
/* The SSE entries below repeat the x87 add/mul/div/sqrt values above
   (2, 2, 56, 56).  */
663 COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */
664 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
665 COSTS_N_INSNS (2), /* cost of MULSD instruction. */
666 COSTS_N_INSNS (56), /* cost of DIVSS instruction. */
667 COSTS_N_INSNS (56), /* cost of DIVSD instruction. */
668 COSTS_N_INSNS (56), /* cost of SQRTSS instruction. */
669 COSTS_N_INSNS (56), /* cost of SQRTSD instruction. */
670 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
/* String-operation tables defined immediately before this initializer.  */
671 k6_memcpy,
672 k6_memset,
673 1, /* scalar_stmt_cost. */
674 1, /* scalar load_cost. */
675 1, /* scalar_store_cost. */
676 1, /* vec_stmt_cost. */
677 1, /* vec_to_scalar_cost. */
678 1, /* scalar_to_vec_cost. */
679 1, /* vec_align_load_cost. */
680 2, /* vec_unalign_load_cost. */
681 1, /* vec_store_cost. */
682 3, /* cond_taken_branch_cost. */
683 1, /* cond_not_taken_branch_cost. */
684 };
685
686 /* For some reason, Athlon deals better with REP prefix (relative to loops)
687 compared to K8. Alignment becomes important after 8 bytes for memcpy and
688 128 bytes for memset. */
689 static stringop_algs athlon_memcpy[2] = {
690 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
691 DUMMY_STRINGOP_ALGS};
692 static stringop_algs athlon_memset[2] = {
693 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
694 DUMMY_STRINGOP_ALGS};
/* Cost table named athlon_cost.  Instruction costs are written with
   COSTS_N_INSNS.  The initializer is positional: each value is identified
   only by its place in the list, so entries must not be reordered.  */
695 static const
696 struct processor_costs athlon_cost = {
697 COSTS_N_INSNS (1), /* cost of an add instruction */
698 COSTS_N_INSNS (2), /* cost of a lea instruction */
699 COSTS_N_INSNS (1), /* variable shift costs */
700 COSTS_N_INSNS (1), /* constant shift costs */
701 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
702 COSTS_N_INSNS (5), /* HI */
703 COSTS_N_INSNS (5), /* SI */
704 COSTS_N_INSNS (5), /* DI */
705 COSTS_N_INSNS (5)}, /* other */
706 0, /* cost of multiply per each bit set */
707 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
708 COSTS_N_INSNS (26), /* HI */
709 COSTS_N_INSNS (42), /* SI */
710 COSTS_N_INSNS (74), /* DI */
711 COSTS_N_INSNS (74)}, /* other */
712 COSTS_N_INSNS (1), /* cost of movsx */
713 COSTS_N_INSNS (1), /* cost of movzx */
714 8, /* "large" insn */
715 9, /* MOVE_RATIO */
716 4, /* cost for loading QImode using movzbl */
717 {3, 4, 3}, /* cost of loading integer registers
718 in QImode, HImode and SImode.
719 Relative to reg-reg move (2). */
720 {3, 4, 3}, /* cost of storing integer registers */
721 4, /* cost of reg,reg fld/fst */
722 {4, 4, 12}, /* cost of loading fp registers
723 in SFmode, DFmode and XFmode */
724 {6, 6, 8}, /* cost of storing fp registers
725 in SFmode, DFmode and XFmode */
726 2, /* cost of moving MMX register */
727 {4, 4}, /* cost of loading MMX registers
728 in SImode and DImode */
729 {4, 4}, /* cost of storing MMX registers
730 in SImode and DImode */
731 2, /* cost of moving SSE register */
732 {4, 4, 6}, /* cost of loading SSE registers
733 in SImode, DImode and TImode */
734 {4, 4, 5}, /* cost of storing SSE registers
735 in SImode, DImode and TImode */
736 5, /* MMX or SSE register to integer */
737 64, /* size of l1 cache. */
738 256, /* size of l2 cache. */
739 64, /* size of prefetch block */
740 6, /* number of parallel prefetches */
741 5, /* Branch cost */
742 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
743 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
744 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
745 COSTS_N_INSNS (2), /* cost of FABS instruction. */
746 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
747 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
748
749 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
750 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
751 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
752 /* 11-16 (NOTE(review): presumably a latency range for the DIVSS entry
   that follows -- confirm) */
753 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
754 COSTS_N_INSNS (24), /* cost of DIVSD instruction. */
755 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
756 COSTS_N_INSNS (19), /* cost of SQRTSD instruction. */
757 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
/* String-operation tables defined immediately before this initializer.  */
758 athlon_memcpy,
759 athlon_memset,
760 1, /* scalar_stmt_cost. */
761 1, /* scalar load_cost. */
762 1, /* scalar_store_cost. */
763 1, /* vec_stmt_cost. */
764 1, /* vec_to_scalar_cost. */
765 1, /* scalar_to_vec_cost. */
766 1, /* vec_align_load_cost. */
767 2, /* vec_unalign_load_cost. */
768 1, /* vec_store_cost. */
769 3, /* cond_taken_branch_cost. */
770 1, /* cond_not_taken_branch_cost. */
771 };
772
773 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
774 small blocks it is better to use a loop. For large blocks, a libcall can
775 do non-temporal accesses and beat inline code considerably. */
776 static stringop_algs k8_memcpy[2] = {
777 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
778 {-1, rep_prefix_4_byte, false}}},
779 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
780 {-1, libcall, false}}}};
/* memset algorithm table for K8, referenced by k8_cost.
   NOTE(review): the two entries are presumably the 32-bit and 64-bit
   variants (only the second uses rep_prefix_8_byte), and -1 presumably marks
   the open-ended last size range; confirm against the stringop_algs
   declaration. */
781 static stringop_algs k8_memset[2] = {
782 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
783 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
784 {libcall, {{48, unrolled_loop, false},
785 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table for K8. The initializer is positional: each value is
   identified by the comment beside it and must stay in the field order of
   struct processor_costs (declared outside this file chunk). Per the note
   at the top of the file, processor costs are relative to an add. */
786 static const
787 struct processor_costs k8_cost = {
788 COSTS_N_INSNS (1), /* cost of an add instruction */
789 COSTS_N_INSNS (2), /* cost of a lea instruction */
790 COSTS_N_INSNS (1), /* variable shift costs */
791 COSTS_N_INSNS (1), /* constant shift costs */
792 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
793 COSTS_N_INSNS (4), /* HI */
794 COSTS_N_INSNS (3), /* SI */
795 COSTS_N_INSNS (4), /* DI */
796 COSTS_N_INSNS (5)}, /* other */
797 0, /* cost of multiply per each bit set */
798 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
799 COSTS_N_INSNS (26), /* HI */
800 COSTS_N_INSNS (42), /* SI */
801 COSTS_N_INSNS (74), /* DI */
802 COSTS_N_INSNS (74)}, /* other */
803 COSTS_N_INSNS (1), /* cost of movsx */
804 COSTS_N_INSNS (1), /* cost of movzx */
805 8, /* "large" insn */
806 9, /* MOVE_RATIO */
807 4, /* cost for loading QImode using movzbl */
808 {3, 4, 3}, /* cost of loading integer registers
809 in QImode, HImode and SImode.
810 Relative to reg-reg move (2). */
811 {3, 4, 3}, /* cost of storing integer registers */
812 4, /* cost of reg,reg fld/fst */
813 {4, 4, 12}, /* cost of loading fp registers
814 in SFmode, DFmode and XFmode */
815 {6, 6, 8}, /* cost of storing fp registers
816 in SFmode, DFmode and XFmode */
817 2, /* cost of moving MMX register */
818 {3, 3}, /* cost of loading MMX registers
819 in SImode and DImode */
820 {4, 4}, /* cost of storing MMX registers
821 in SImode and DImode */
822 2, /* cost of moving SSE register */
823 {4, 3, 6}, /* cost of loading SSE registers
824 in SImode, DImode and TImode */
825 {4, 4, 5}, /* cost of storing SSE registers
826 in SImode, DImode and TImode */
827 5, /* MMX or SSE register to integer */
828 64, /* size of l1 cache. */
829 512, /* size of l2 cache. */
830 64, /* size of prefetch block */
831 /* New AMD processors never drop prefetches; if they cannot be performed
832 immediately, they are queued. We set number of simultaneous prefetches
833 to a large constant to reflect this (it probably is not a good idea not
834 to limit number of prefetches at all, as their execution also takes some
835 time). */
836 100, /* number of parallel prefetches */
837 3, /* Branch cost */
838 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
839 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
840 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
841 COSTS_N_INSNS (2), /* cost of FABS instruction. */
842 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
843 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
844
845 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
846 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
847 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
848 /* 11-16 */
849 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
850 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
851 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
852 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
853 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
854 k8_memcpy,
855 k8_memset,
856 4, /* scalar_stmt_cost. */
857 2, /* scalar load_cost. */
858 2, /* scalar_store_cost. */
859 5, /* vec_stmt_cost. */
860 0, /* vec_to_scalar_cost. */
861 2, /* scalar_to_vec_cost. */
862 2, /* vec_align_load_cost. */
863 3, /* vec_unalign_load_cost. */
864 3, /* vec_store_cost. */
865 3, /* cond_taken_branch_cost. */
866 2, /* cond_not_taken_branch_cost. */
867 };
868
869 /* AMDFAM10 has an optimized REP instruction for medium sized blocks, but for
870 very small blocks it is better to use a loop. For large blocks, a libcall can
871 do non-temporal accesses and beat inline code considerably.
NOTE(review): the two entries are presumably the 32-bit and 64-bit
variants (only the second uses rep_prefix_8_byte), and -1 presumably marks
the open-ended last size range; confirm against the stringop_algs
declaration. */
872 static stringop_algs amdfam10_memcpy[2] = {
873 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
874 {-1, rep_prefix_4_byte, false}}},
875 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
876 {-1, libcall, false}}}};
/* memset algorithm table for AMDFAM10, referenced by amdfam10_cost.
   NOTE(review): the two entries are presumably the 32-bit and 64-bit
   variants (only the second uses rep_prefix_8_byte), and -1 presumably marks
   the open-ended last size range; confirm against the stringop_algs
   declaration. */
877 static stringop_algs amdfam10_memset[2] = {
878 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
879 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
880 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
881 {-1, libcall, false}}}};
/* Cost table for AMDFAM10. The initializer is positional: each value is
   identified by the comment beside it and must stay in the field order of
   struct processor_costs (declared outside this file chunk). Per the note
   at the top of the file, processor costs are relative to an add.
   NOTE(review): defined without const or static, unlike k8_cost; confirm
   whether it is modified or referenced from elsewhere. */
882 struct processor_costs amdfam10_cost = {
883 COSTS_N_INSNS (1), /* cost of an add instruction */
884 COSTS_N_INSNS (2), /* cost of a lea instruction */
885 COSTS_N_INSNS (1), /* variable shift costs */
886 COSTS_N_INSNS (1), /* constant shift costs */
887 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
888 COSTS_N_INSNS (4), /* HI */
889 COSTS_N_INSNS (3), /* SI */
890 COSTS_N_INSNS (4), /* DI */
891 COSTS_N_INSNS (5)}, /* other */
892 0, /* cost of multiply per each bit set */
893 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
894 COSTS_N_INSNS (35), /* HI */
895 COSTS_N_INSNS (51), /* SI */
896 COSTS_N_INSNS (83), /* DI */
897 COSTS_N_INSNS (83)}, /* other */
898 COSTS_N_INSNS (1), /* cost of movsx */
899 COSTS_N_INSNS (1), /* cost of movzx */
900 8, /* "large" insn */
901 9, /* MOVE_RATIO */
902 4, /* cost for loading QImode using movzbl */
903 {3, 4, 3}, /* cost of loading integer registers
904 in QImode, HImode and SImode.
905 Relative to reg-reg move (2). */
906 {3, 4, 3}, /* cost of storing integer registers */
907 4, /* cost of reg,reg fld/fst */
908 {4, 4, 12}, /* cost of loading fp registers
909 in SFmode, DFmode and XFmode */
910 {6, 6, 8}, /* cost of storing fp registers
911 in SFmode, DFmode and XFmode */
912 2, /* cost of moving MMX register */
913 {3, 3}, /* cost of loading MMX registers
914 in SImode and DImode */
915 {4, 4}, /* cost of storing MMX registers
916 in SImode and DImode */
917 2, /* cost of moving SSE register */
918 {4, 4, 3}, /* cost of loading SSE registers
919 in SImode, DImode and TImode */
920 {4, 4, 5}, /* cost of storing SSE registers
921 in SImode, DImode and TImode */
922 3, /* MMX or SSE register to integer */
923 /* On K8:
924 MOVD reg64, xmmreg Double FSTORE 4
925 MOVD reg32, xmmreg Double FSTORE 4
926 On AMDFAM10:
927 MOVD reg64, xmmreg Double FADD 3
928 1/1 1/1
929 MOVD reg32, xmmreg Double FADD 3
930 1/1 1/1 */
931 64, /* size of l1 cache. */
932 512, /* size of l2 cache. */
933 64, /* size of prefetch block */
934 /* New AMD processors never drop prefetches; if they cannot be performed
935 immediately, they are queued. We set number of simultaneous prefetches
936 to a large constant to reflect this (it probably is not a good idea not
937 to limit number of prefetches at all, as their execution also takes some
938 time). */
939 100, /* number of parallel prefetches */
940 2, /* Branch cost */
941 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
942 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
943 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
944 COSTS_N_INSNS (2), /* cost of FABS instruction. */
945 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
946 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
947
948 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
949 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
950 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
951 /* 11-16 */
952 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
953 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
954 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
955 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
956 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
957 amdfam10_memcpy,
958 amdfam10_memset,
959 4, /* scalar_stmt_cost. */
960 2, /* scalar load_cost. */
961 2, /* scalar_store_cost. */
962 6, /* vec_stmt_cost. */
963 0, /* vec_to_scalar_cost. */
964 2, /* scalar_to_vec_cost. */
965 2, /* vec_align_load_cost. */
966 2, /* vec_unalign_load_cost. */
967 2, /* vec_store_cost. */
968 2, /* cond_taken_branch_cost. */
969 1, /* cond_not_taken_branch_cost. */
970 };
971
972 /* BDVER1 has an optimized REP instruction for medium sized blocks, but for
973 very small blocks it is better to use a loop. For large blocks, a libcall
974 can do non-temporal accesses and beat inline code considerably.
NOTE(review): the two entries are presumably the 32-bit and 64-bit
variants (only the second uses rep_prefix_8_byte), and -1 presumably marks
the open-ended last size range; confirm against the stringop_algs
declaration. */
975 static stringop_algs bdver1_memcpy[2] = {
976 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
977 {-1, rep_prefix_4_byte, false}}},
978 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
979 {-1, libcall, false}}}};
/* memset algorithm table for BDVER1, referenced by bdver1_cost.
   NOTE(review): the two entries are presumably the 32-bit and 64-bit
   variants (only the second uses rep_prefix_8_byte), and -1 presumably marks
   the open-ended last size range; confirm against the stringop_algs
   declaration. */
980 static stringop_algs bdver1_memset[2] = {
981 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
982 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
983 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
984 {-1, libcall, false}}}};
985
/* Cost table for BDVER1. The initializer is positional: each value is
   identified by the comment beside it and must stay in the field order of
   struct processor_costs (declared outside this file chunk). Per the note
   at the top of the file, processor costs are relative to an add. */
986 const struct processor_costs bdver1_cost = {
987 COSTS_N_INSNS (1), /* cost of an add instruction */
988 COSTS_N_INSNS (1), /* cost of a lea instruction */
989 COSTS_N_INSNS (1), /* variable shift costs */
990 COSTS_N_INSNS (1), /* constant shift costs */
991 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
992 COSTS_N_INSNS (4), /* HI */
993 COSTS_N_INSNS (4), /* SI */
994 COSTS_N_INSNS (6), /* DI */
995 COSTS_N_INSNS (6)}, /* other */
996 0, /* cost of multiply per each bit set */
997 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
998 COSTS_N_INSNS (35), /* HI */
999 COSTS_N_INSNS (51), /* SI */
1000 COSTS_N_INSNS (83), /* DI */
1001 COSTS_N_INSNS (83)}, /* other */
1002 COSTS_N_INSNS (1), /* cost of movsx */
1003 COSTS_N_INSNS (1), /* cost of movzx */
1004 8, /* "large" insn */
1005 9, /* MOVE_RATIO */
1006 4, /* cost for loading QImode using movzbl */
1007 {5, 5, 4}, /* cost of loading integer registers
1008 in QImode, HImode and SImode.
1009 Relative to reg-reg move (2). */
1010 {4, 4, 4}, /* cost of storing integer registers */
1011 2, /* cost of reg,reg fld/fst */
1012 {5, 5, 12}, /* cost of loading fp registers
1013 in SFmode, DFmode and XFmode */
1014 {4, 4, 8}, /* cost of storing fp registers
1015 in SFmode, DFmode and XFmode */
1016 2, /* cost of moving MMX register */
1017 {4, 4}, /* cost of loading MMX registers
1018 in SImode and DImode */
1019 {4, 4}, /* cost of storing MMX registers
1020 in SImode and DImode */
1021 2, /* cost of moving SSE register */
1022 {4, 4, 4}, /* cost of loading SSE registers
1023 in SImode, DImode and TImode */
1024 {4, 4, 4}, /* cost of storing SSE registers
1025 in SImode, DImode and TImode */
1026 2, /* MMX or SSE register to integer */
1027 /* NOTE(review): the K8/AMDFAM10 note below is identical to the one in
amdfam10_cost and gives no BDVER1 figure; confirm it is still relevant here.
On K8:
1028 MOVD reg64, xmmreg Double FSTORE 4
1029 MOVD reg32, xmmreg Double FSTORE 4
1030 On AMDFAM10:
1031 MOVD reg64, xmmreg Double FADD 3
1032 1/1 1/1
1033 MOVD reg32, xmmreg Double FADD 3
1034 1/1 1/1 */
1035 16, /* size of l1 cache. */
1036 2048, /* size of l2 cache. */
1037 64, /* size of prefetch block */
1038 /* New AMD processors never drop prefetches; if they cannot be performed
1039 immediately, they are queued. We set number of simultaneous prefetches
1040 to a large constant to reflect this (it probably is not a good idea not
1041 to limit number of prefetches at all, as their execution also takes some
1042 time). */
1043 100, /* number of parallel prefetches */
1044 2, /* Branch cost */
1045 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1046 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1047 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1048 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1049 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1050 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1051
1052 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1053 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1054 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1055 /* 9-24 */
1056 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1057 /* 9-27 */
1058 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1059 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1060 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
1061 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1062 bdver1_memcpy,
1063 bdver1_memset,
1064 6, /* scalar_stmt_cost. */
1065 4, /* scalar load_cost. */
1066 4, /* scalar_store_cost. */
1067 6, /* vec_stmt_cost. */
1068 0, /* vec_to_scalar_cost. */
1069 2, /* scalar_to_vec_cost. */
1070 4, /* vec_align_load_cost. */
1071 4, /* vec_unalign_load_cost. */
1072 4, /* vec_store_cost. */
1073 4, /* cond_taken_branch_cost. */
1074 2, /* cond_not_taken_branch_cost. */
1075 };
1076
1077 /* BDVER2 has an optimized REP instruction for medium sized blocks, but for
1078 very small blocks it is better to use a loop. For large blocks, a libcall
1079 can do non-temporal accesses and beat inline code considerably. */
1080
/* memcpy algorithm table for BDVER2, referenced by bdver2_cost.
   NOTE(review): the two entries are presumably the 32-bit and 64-bit
   variants (only the second uses rep_prefix_8_byte), and -1 presumably marks
   the open-ended last size range; confirm against the stringop_algs
   declaration. */
1081 static stringop_algs bdver2_memcpy[2] = {
1082 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1083 {-1, rep_prefix_4_byte, false}}},
1084 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1085 {-1, libcall, false}}}};
/* memset algorithm table for BDVER2, referenced by bdver2_cost.
   NOTE(review): the two entries are presumably the 32-bit and 64-bit
   variants (only the second uses rep_prefix_8_byte), and -1 presumably marks
   the open-ended last size range; confirm against the stringop_algs
   declaration. */
1086 static stringop_algs bdver2_memset[2] = {
1087 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1088 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1089 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1090 {-1, libcall, false}}}};
1091
/* Cost table for BDVER2. The initializer is positional: each value is
   identified by the comment beside it and must stay in the field order of
   struct processor_costs (declared outside this file chunk). Per the note
   at the top of the file, processor costs are relative to an add. */
1092 const struct processor_costs bdver2_cost = {
1093 COSTS_N_INSNS (1), /* cost of an add instruction */
1094 COSTS_N_INSNS (1), /* cost of a lea instruction */
1095 COSTS_N_INSNS (1), /* variable shift costs */
1096 COSTS_N_INSNS (1), /* constant shift costs */
1097 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1098 COSTS_N_INSNS (4), /* HI */
1099 COSTS_N_INSNS (4), /* SI */
1100 COSTS_N_INSNS (6), /* DI */
1101 COSTS_N_INSNS (6)}, /* other */
1102 0, /* cost of multiply per each bit set */
1103 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1104 COSTS_N_INSNS (35), /* HI */
1105 COSTS_N_INSNS (51), /* SI */
1106 COSTS_N_INSNS (83), /* DI */
1107 COSTS_N_INSNS (83)}, /* other */
1108 COSTS_N_INSNS (1), /* cost of movsx */
1109 COSTS_N_INSNS (1), /* cost of movzx */
1110 8, /* "large" insn */
1111 9, /* MOVE_RATIO */
1112 4, /* cost for loading QImode using movzbl */
1113 {5, 5, 4}, /* cost of loading integer registers
1114 in QImode, HImode and SImode.
1115 Relative to reg-reg move (2). */
1116 {4, 4, 4}, /* cost of storing integer registers */
1117 2, /* cost of reg,reg fld/fst */
1118 {5, 5, 12}, /* cost of loading fp registers
1119 in SFmode, DFmode and XFmode */
1120 {4, 4, 8}, /* cost of storing fp registers
1121 in SFmode, DFmode and XFmode */
1122 2, /* cost of moving MMX register */
1123 {4, 4}, /* cost of loading MMX registers
1124 in SImode and DImode */
1125 {4, 4}, /* cost of storing MMX registers
1126 in SImode and DImode */
1127 2, /* cost of moving SSE register */
1128 {4, 4, 4}, /* cost of loading SSE registers
1129 in SImode, DImode and TImode */
1130 {4, 4, 4}, /* cost of storing SSE registers
1131 in SImode, DImode and TImode */
1132 2, /* MMX or SSE register to integer */
1133 /* NOTE(review): the K8/AMDFAM10 note below is identical to the one in
amdfam10_cost and gives no BDVER2 figure; confirm it is still relevant here.
On K8:
1134 MOVD reg64, xmmreg Double FSTORE 4
1135 MOVD reg32, xmmreg Double FSTORE 4
1136 On AMDFAM10:
1137 MOVD reg64, xmmreg Double FADD 3
1138 1/1 1/1
1139 MOVD reg32, xmmreg Double FADD 3
1140 1/1 1/1 */
1141 16, /* size of l1 cache. */
1142 2048, /* size of l2 cache. */
1143 64, /* size of prefetch block */
1144 /* New AMD processors never drop prefetches; if they cannot be performed
1145 immediately, they are queued. We set number of simultaneous prefetches
1146 to a large constant to reflect this (it probably is not a good idea not
1147 to limit number of prefetches at all, as their execution also takes some
1148 time). */
1149 100, /* number of parallel prefetches */
1150 2, /* Branch cost */
1151 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1152 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1153 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1154 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1155 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1156 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1157
1158 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1159 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1160 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1161 /* 9-24 */
1162 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1163 /* 9-27 */
1164 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1165 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1166 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
1167 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1168 bdver2_memcpy,
1169 bdver2_memset,
1170 6, /* scalar_stmt_cost. */
1171 4, /* scalar load_cost. */
1172 4, /* scalar_store_cost. */
1173 6, /* vec_stmt_cost. */
1174 0, /* vec_to_scalar_cost. */
1175 2, /* scalar_to_vec_cost. */
1176 4, /* vec_align_load_cost. */
1177 4, /* vec_unalign_load_cost. */
1178 4, /* vec_store_cost. */
1179 4, /* cond_taken_branch_cost. */
1180 2, /* cond_not_taken_branch_cost. */
1181 };
1182
1183
1184 /* BDVER3 has an optimized REP instruction for medium sized blocks, but for
1185 very small blocks it is better to use a loop. For large blocks, a libcall
1186 can do non-temporal accesses and beat inline code considerably.
NOTE(review): the two entries are presumably the 32-bit and 64-bit
variants (only the second uses rep_prefix_8_byte), and -1 presumably marks
the open-ended last size range; confirm against the stringop_algs
declaration. */
1187 static stringop_algs bdver3_memcpy[2] = {
1188 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1189 {-1, rep_prefix_4_byte, false}}},
1190 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1191 {-1, libcall, false}}}};
/* memset algorithm table for BDVER3, referenced by bdver3_cost.
   NOTE(review): the two entries are presumably the 32-bit and 64-bit
   variants (only the second uses rep_prefix_8_byte), and -1 presumably marks
   the open-ended last size range; confirm against the stringop_algs
   declaration. */
1192 static stringop_algs bdver3_memset[2] = {
1193 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1194 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1195 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1196 {-1, libcall, false}}}};
/* Cost table for BDVER3. The initializer is positional: each value is
   identified by the comment beside it and must stay in the field order of
   struct processor_costs (declared outside this file chunk). Per the note
   at the top of the file, processor costs are relative to an add.
   NOTE(review): defined without const, unlike bdver1_cost and bdver2_cost;
   confirm whether it is modified elsewhere. */
1197 struct processor_costs bdver3_cost = {
1198 COSTS_N_INSNS (1), /* cost of an add instruction */
1199 COSTS_N_INSNS (1), /* cost of a lea instruction */
1200 COSTS_N_INSNS (1), /* variable shift costs */
1201 COSTS_N_INSNS (1), /* constant shift costs */
1202 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1203 COSTS_N_INSNS (4), /* HI */
1204 COSTS_N_INSNS (4), /* SI */
1205 COSTS_N_INSNS (6), /* DI */
1206 COSTS_N_INSNS (6)}, /* other */
1207 0, /* cost of multiply per each bit set */
1208 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1209 COSTS_N_INSNS (35), /* HI */
1210 COSTS_N_INSNS (51), /* SI */
1211 COSTS_N_INSNS (83), /* DI */
1212 COSTS_N_INSNS (83)}, /* other */
1213 COSTS_N_INSNS (1), /* cost of movsx */
1214 COSTS_N_INSNS (1), /* cost of movzx */
1215 8, /* "large" insn */
1216 9, /* MOVE_RATIO */
1217 4, /* cost for loading QImode using movzbl */
1218 {5, 5, 4}, /* cost of loading integer registers
1219 in QImode, HImode and SImode.
1220 Relative to reg-reg move (2). */
1221 {4, 4, 4}, /* cost of storing integer registers */
1222 2, /* cost of reg,reg fld/fst */
1223 {5, 5, 12}, /* cost of loading fp registers
1224 in SFmode, DFmode and XFmode */
1225 {4, 4, 8}, /* cost of storing fp registers
1226 in SFmode, DFmode and XFmode */
1227 2, /* cost of moving MMX register */
1228 {4, 4}, /* cost of loading MMX registers
1229 in SImode and DImode */
1230 {4, 4}, /* cost of storing MMX registers
1231 in SImode and DImode */
1232 2, /* cost of moving SSE register */
1233 {4, 4, 4}, /* cost of loading SSE registers
1234 in SImode, DImode and TImode */
1235 {4, 4, 4}, /* cost of storing SSE registers
1236 in SImode, DImode and TImode */
1237 2, /* MMX or SSE register to integer */
1238 16, /* size of l1 cache. */
1239 2048, /* size of l2 cache. */
1240 64, /* size of prefetch block */
1241 /* New AMD processors never drop prefetches; if they cannot be performed
1242 immediately, they are queued. We set number of simultaneous prefetches
1243 to a large constant to reflect this (it probably is not a good idea not
1244 to limit number of prefetches at all, as their execution also takes some
1245 time). */
1246 100, /* number of parallel prefetches */
1247 2, /* Branch cost */
1248 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1249 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1250 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1251 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1252 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1253 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1254
1255 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1256 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1257 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1258 /* 9-24 */
1259 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1260 /* 9-27 */
1261 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1262 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1263 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
1264 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1265 bdver3_memcpy,
1266 bdver3_memset,
1267 6, /* scalar_stmt_cost. */
1268 4, /* scalar load_cost. */
1269 4, /* scalar_store_cost. */
1270 6, /* vec_stmt_cost. */
1271 0, /* vec_to_scalar_cost. */
1272 2, /* scalar_to_vec_cost. */
1273 4, /* vec_align_load_cost. */
1274 4, /* vec_unalign_load_cost. */
1275 4, /* vec_store_cost. */
1276 4, /* cond_taken_branch_cost. */
1277 2, /* cond_not_taken_branch_cost. */
1278 };
1279
1280 /* BDVER4 has an optimized REP instruction for medium sized blocks, but for
1281 very small blocks it is better to use a loop. For large blocks, a libcall
1282 can do non-temporal accesses and beat inline code considerably.
NOTE(review): the two entries are presumably the 32-bit and 64-bit
variants (only the second uses rep_prefix_8_byte), and -1 presumably marks
the open-ended last size range; confirm against the stringop_algs
declaration. */
1283 static stringop_algs bdver4_memcpy[2] = {
1284 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1285 {-1, rep_prefix_4_byte, false}}},
1286 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1287 {-1, libcall, false}}}};
/* memset algorithm table for BDVER4, referenced by bdver4_cost.
   NOTE(review): the two entries are presumably the 32-bit and 64-bit
   variants (only the second uses rep_prefix_8_byte), and -1 presumably marks
   the open-ended last size range; confirm against the stringop_algs
   declaration. */
1288 static stringop_algs bdver4_memset[2] = {
1289 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1290 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1291 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1292 {-1, libcall, false}}}};
/* Cost table for BDVER4. The initializer is positional: each value is
   identified by the comment beside it and must stay in the field order of
   struct processor_costs (declared outside this file chunk). Per the note
   at the top of the file, processor costs are relative to an add.
   NOTE(review): defined without const, unlike bdver1_cost and bdver2_cost;
   confirm whether it is modified elsewhere. */
1293 struct processor_costs bdver4_cost = {
1294 COSTS_N_INSNS (1), /* cost of an add instruction */
1295 COSTS_N_INSNS (1), /* cost of a lea instruction */
1296 COSTS_N_INSNS (1), /* variable shift costs */
1297 COSTS_N_INSNS (1), /* constant shift costs */
1298 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1299 COSTS_N_INSNS (4), /* HI */
1300 COSTS_N_INSNS (4), /* SI */
1301 COSTS_N_INSNS (6), /* DI */
1302 COSTS_N_INSNS (6)}, /* other */
1303 0, /* cost of multiply per each bit set */
1304 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1305 COSTS_N_INSNS (35), /* HI */
1306 COSTS_N_INSNS (51), /* SI */
1307 COSTS_N_INSNS (83), /* DI */
1308 COSTS_N_INSNS (83)}, /* other */
1309 COSTS_N_INSNS (1), /* cost of movsx */
1310 COSTS_N_INSNS (1), /* cost of movzx */
1311 8, /* "large" insn */
1312 9, /* MOVE_RATIO */
1313 4, /* cost for loading QImode using movzbl */
1314 {5, 5, 4}, /* cost of loading integer registers
1315 in QImode, HImode and SImode.
1316 Relative to reg-reg move (2). */
1317 {4, 4, 4}, /* cost of storing integer registers */
1318 2, /* cost of reg,reg fld/fst */
1319 {5, 5, 12}, /* cost of loading fp registers
1320 in SFmode, DFmode and XFmode */
1321 {4, 4, 8}, /* cost of storing fp registers
1322 in SFmode, DFmode and XFmode */
1323 2, /* cost of moving MMX register */
1324 {4, 4}, /* cost of loading MMX registers
1325 in SImode and DImode */
1326 {4, 4}, /* cost of storing MMX registers
1327 in SImode and DImode */
1328 2, /* cost of moving SSE register */
1329 {4, 4, 4}, /* cost of loading SSE registers
1330 in SImode, DImode and TImode */
1331 {4, 4, 4}, /* cost of storing SSE registers
1332 in SImode, DImode and TImode */
1333 2, /* MMX or SSE register to integer */
1334 16, /* size of l1 cache. */
1335 2048, /* size of l2 cache. */
1336 64, /* size of prefetch block */
1337 /* New AMD processors never drop prefetches; if they cannot be performed
1338 immediately, they are queued. We set number of simultaneous prefetches
1339 to a large constant to reflect this (it probably is not a good idea not
1340 to limit number of prefetches at all, as their execution also takes some
1341 time). */
1342 100, /* number of parallel prefetches */
1343 2, /* Branch cost */
1344 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1345 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1346 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1347 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1348 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1349 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1350
1351 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1352 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1353 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1354 /* 9-24 */
1355 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1356 /* 9-27 */
1357 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1358 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1359 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
1360 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1361 bdver4_memcpy,
1362 bdver4_memset,
1363 6, /* scalar_stmt_cost. */
1364 4, /* scalar load_cost. */
1365 4, /* scalar_store_cost. */
1366 6, /* vec_stmt_cost. */
1367 0, /* vec_to_scalar_cost. */
1368 2, /* scalar_to_vec_cost. */
1369 4, /* vec_align_load_cost. */
1370 4, /* vec_unalign_load_cost. */
1371 4, /* vec_store_cost. */
1372 4, /* cond_taken_branch_cost. */
1373 2, /* cond_not_taken_branch_cost. */
1374 };
1375
1376
1377 /* ZNVER1 has an optimized REP instruction for medium sized blocks, but for
1378 very small blocks it is better to use a loop. For large blocks, a libcall
1379 can do non-temporal accesses and beat inline code considerably.
NOTE(review): the two entries are presumably the 32-bit and 64-bit
variants (only the second uses rep_prefix_8_byte), and -1 presumably marks
the open-ended last size range; confirm against the stringop_algs
declaration. */
1380 static stringop_algs znver1_memcpy[2] = {
1381 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1382 {-1, rep_prefix_4_byte, false}}},
1383 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1384 {-1, libcall, false}}}};
/* memset algorithm table for ZNVER1, referenced by znver1_cost.
   NOTE(review): the two entries are presumably the 32-bit and 64-bit
   variants (only the second uses rep_prefix_8_byte), and -1 presumably marks
   the open-ended last size range; confirm against the stringop_algs
   declaration. */
1385 static stringop_algs znver1_memset[2] = {
1386 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1387 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1388 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1389 {-1, libcall, false}}}};
/* Cost table for ZNVER1. The initializer is positional: each value is
   identified by the comment beside it and must stay in the field order of
   struct processor_costs (declared outside this file chunk). Per the note
   at the top of the file, processor costs are relative to an add.
   NOTE(review): defined without const; confirm whether it is modified
   elsewhere. */
1390 struct processor_costs znver1_cost = {
1391 COSTS_N_INSNS (1), /* cost of an add instruction. */
1392 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1393 COSTS_N_INSNS (1), /* variable shift costs. */
1394 COSTS_N_INSNS (1), /* constant shift costs. */
1395 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1396 COSTS_N_INSNS (3), /* HI. */
1397 COSTS_N_INSNS (3), /* SI. */
1398 COSTS_N_INSNS (3), /* DI. */
1399 COSTS_N_INSNS (3)}, /* other. */
1400 0, /* cost of multiply per each bit
1401 set. */
1402 /* Depending on parameters, idiv can get faster on ryzen. This is upper
1403 bound. */
1404 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */
1405 COSTS_N_INSNS (22), /* HI. */
1406 COSTS_N_INSNS (30), /* SI. */
1407 COSTS_N_INSNS (45), /* DI. */
1408 COSTS_N_INSNS (45)}, /* other. */
1409 COSTS_N_INSNS (1), /* cost of movsx. */
1410 COSTS_N_INSNS (1), /* cost of movzx. */
1411 8, /* "large" insn. */
1412 9, /* MOVE_RATIO. */
1413 4, /* cost for loading QImode using
1414 movzbl. */
1415 {5, 5, 4}, /* cost of loading integer registers
1416 in QImode, HImode and SImode.
1417 Relative to reg-reg move (2). */
1418 {4, 4, 4}, /* cost of storing integer
1419 registers. */
1420 2, /* cost of reg,reg fld/fst. */
1421 {5, 5, 12}, /* cost of loading fp registers
1422 in SFmode, DFmode and XFmode. */
1423 {4, 4, 8}, /* cost of storing fp registers
1424 in SFmode, DFmode and XFmode. */
1425 2, /* cost of moving MMX register. */
1426 {4, 4}, /* cost of loading MMX registers
1427 in SImode and DImode. */
1428 {4, 4}, /* cost of storing MMX registers
1429 in SImode and DImode. */
1430 2, /* cost of moving SSE register. */
1431 {4, 4, 4}, /* cost of loading SSE registers
1432 in SImode, DImode and TImode. */
1433 {4, 4, 4}, /* cost of storing SSE registers
1434 in SImode, DImode and TImode. */
1435 2, /* MMX or SSE register to integer. */
1436 32, /* size of l1 cache. */
1437 512, /* size of l2 cache. */
1438 64, /* size of prefetch block. */
1439 /* New AMD processors never drop prefetches; if they cannot be performed
1440 immediately, they are queued. We set number of simultaneous prefetches
1441 to a large constant to reflect this (it probably is not a good idea not
1442 to limit number of prefetches at all, as their execution also takes some
1443 time). */
1444 100, /* number of parallel prefetches. */
1445 3, /* Branch cost. */
1446 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1447 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
1448 /* Latency of fdiv is 8-15. */
1449 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1450 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1451 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1452 /* Latency of fsqrt is 4-10. */
1453 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
1454
1455 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1456 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1457 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1458 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
1459 /* 9-13 */
1460 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1461 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
1462 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
1463 /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles
1464 and it can execute 2 integer additions and 2 multiplications thus
1465 reassociation may make sense up to a width of 6. SPEC2k6 benchmarks suggest
1466 that 4 works better than 6 probably due to register pressure.
1467
1468 Integer vector operations are taken by FP unit and execute 3 vector
1469 plus/minus operations per cycle but only one multiply. This is adjusted
1470 in ix86_reassociation_width. */
1471 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
1472 znver1_memcpy,
1473 znver1_memset,
1474 6, /* scalar_stmt_cost. */
1475 4, /* scalar load_cost. */
1476 4, /* scalar_store_cost. */
1477 6, /* vec_stmt_cost. */
1478 0, /* vec_to_scalar_cost. */
1479 2, /* scalar_to_vec_cost. */
1480 4, /* vec_align_load_cost. */
1481 4, /* vec_unalign_load_cost. */
1482 4, /* vec_store_cost. */
1483 4, /* cond_taken_branch_cost. */
1484 2, /* cond_not_taken_branch_cost. */
1485 };
1486
1487 /* BTVER1 has an optimized REP instruction for medium sized blocks, but for
1488 very small blocks it is better to use a loop. For large blocks, a libcall can
1489 do non-temporal accesses and beat inline code considerably.
NOTE(review): the two entries are presumably the 32-bit and 64-bit
variants (only the second uses rep_prefix_8_byte), and -1 presumably marks
the open-ended last size range; confirm against the stringop_algs
declaration. */
1490 static stringop_algs btver1_memcpy[2] = {
1491 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1492 {-1, rep_prefix_4_byte, false}}},
1493 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1494 {-1, libcall, false}}}};
/* memset algorithm table for BTVER1.
   NOTE(review): the two entries are presumably the 32-bit and 64-bit
   variants (only the second uses rep_prefix_8_byte), and -1 presumably marks
   the open-ended last size range; confirm against the stringop_algs
   declaration. */
1495 static stringop_algs btver1_memset[2] = {
1496 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1497 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1498 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1499 {-1, libcall, false}}}};
/* Cost table for btver1.  The initializer is positional: the trailing
   comment on each value names the field it sets.  Values wrapped in
   COSTS_N_INSNS are instruction costs relative to an add (see the note at
   the top of this file); the remaining entries are bare integers.  */
1500 const struct processor_costs btver1_cost = {
1501 COSTS_N_INSNS (1), /* cost of an add instruction */
1502 COSTS_N_INSNS (2), /* cost of a lea instruction */
1503 COSTS_N_INSNS (1), /* variable shift costs */
1504 COSTS_N_INSNS (1), /* constant shift costs */
1505 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1506 COSTS_N_INSNS (4), /* HI */
1507 COSTS_N_INSNS (3), /* SI */
1508 COSTS_N_INSNS (4), /* DI */
1509 COSTS_N_INSNS (5)}, /* other */
1510 0, /* cost of multiply per each bit set */
1511 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1512 COSTS_N_INSNS (35), /* HI */
1513 COSTS_N_INSNS (51), /* SI */
1514 COSTS_N_INSNS (83), /* DI */
1515 COSTS_N_INSNS (83)}, /* other */
1516 COSTS_N_INSNS (1), /* cost of movsx */
1517 COSTS_N_INSNS (1), /* cost of movzx */
1518 8, /* "large" insn */
1519 9, /* MOVE_RATIO */
1520 4, /* cost for loading QImode using movzbl */
1521 {3, 4, 3}, /* cost of loading integer registers
1522 in QImode, HImode and SImode.
1523 Relative to reg-reg move (2). */
1524 {3, 4, 3}, /* cost of storing integer registers */
1525 4, /* cost of reg,reg fld/fst */
1526 {4, 4, 12}, /* cost of loading fp registers
1527 in SFmode, DFmode and XFmode */
1528 {6, 6, 8}, /* cost of storing fp registers
1529 in SFmode, DFmode and XFmode */
1530 2, /* cost of moving MMX register */
1531 {3, 3}, /* cost of loading MMX registers
1532 in SImode and DImode */
1533 {4, 4}, /* cost of storing MMX registers
1534 in SImode and DImode */
1535 2, /* cost of moving SSE register */
1536 {4, 4, 3}, /* cost of loading SSE registers
1537 in SImode, DImode and TImode */
1538 {4, 4, 5}, /* cost of storing SSE registers
1539 in SImode, DImode and TImode */
1540 3, /* MMX or SSE register to integer */
1541 /* On K8:
1542 MOVD reg64, xmmreg Double FSTORE 4
1543 MOVD reg32, xmmreg Double FSTORE 4
1544 On AMDFAM10:
1545 MOVD reg64, xmmreg Double FADD 3
1546 1/1 1/1
1547 MOVD reg32, xmmreg Double FADD 3
1548 1/1 1/1
   NOTE(review): this note describes K8 and AMDFAM10, not BTVER1 --
   presumably carried over from another table; confirm it still applies.  */
1549 32, /* size of l1 cache. */
1550 512, /* size of l2 cache. */
1551 64, /* size of prefetch block */
1552 100, /* number of parallel prefetches */
1553 2, /* Branch cost */
1554 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1555 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1556 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1557 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1558 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1559 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1560
1561 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1562 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
1563 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1564 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
1565 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
1566 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
1567 COSTS_N_INSNS (48), /* cost of SQRTSD instruction. */
1568 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1569 btver1_memcpy,
1570 btver1_memset,
1571 4, /* scalar_stmt_cost. */
1572 2, /* scalar_load_cost. */
1573 2, /* scalar_store_cost. */
1574 6, /* vec_stmt_cost. */
1575 0, /* vec_to_scalar_cost. */
1576 2, /* scalar_to_vec_cost. */
1577 2, /* vec_align_load_cost. */
1578 2, /* vec_unalign_load_cost. */
1579 2, /* vec_store_cost. */
1580 2, /* cond_taken_branch_cost. */
1581 1, /* cond_not_taken_branch_cost. */
1582 };
1583
/* memcpy expansion strategies referenced by btver2_cost below.
   NOTE(review): each element looks like {alg for unknown size,
   {{max size, alg, noalign}, ...}}, with -1 as the open-ended final range;
   presumably [0] is for 32-bit and [1] for 64-bit code -- confirm against
   the stringop_algs declaration.  */
1584 static stringop_algs btver2_memcpy[2] = {
1585 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1586 {-1, rep_prefix_4_byte, false}}},
1587 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1588 {-1, libcall, false}}}};
/* memset expansion strategies referenced by btver2_cost below.
   NOTE(review): each element looks like {alg for unknown size,
   {{max size, alg, noalign}, ...}}, with -1 as the open-ended final range;
   presumably [0] is for 32-bit and [1] for 64-bit code -- confirm against
   the stringop_algs declaration.  */
1589 static stringop_algs btver2_memset[2] = {
1590 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1591 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1592 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1593 {-1, libcall, false}}}};
/* Cost table for btver2.  The initializer is positional: the trailing
   comment on each value names the field it sets.  Values wrapped in
   COSTS_N_INSNS are instruction costs relative to an add (see the note at
   the top of this file); the remaining entries are bare integers.  */
1594 const struct processor_costs btver2_cost = {
1595 COSTS_N_INSNS (1), /* cost of an add instruction */
1596 COSTS_N_INSNS (2), /* cost of a lea instruction */
1597 COSTS_N_INSNS (1), /* variable shift costs */
1598 COSTS_N_INSNS (1), /* constant shift costs */
1599 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1600 COSTS_N_INSNS (4), /* HI */
1601 COSTS_N_INSNS (3), /* SI */
1602 COSTS_N_INSNS (4), /* DI */
1603 COSTS_N_INSNS (5)}, /* other */
1604 0, /* cost of multiply per each bit set */
1605 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1606 COSTS_N_INSNS (35), /* HI */
1607 COSTS_N_INSNS (51), /* SI */
1608 COSTS_N_INSNS (83), /* DI */
1609 COSTS_N_INSNS (83)}, /* other */
1610 COSTS_N_INSNS (1), /* cost of movsx */
1611 COSTS_N_INSNS (1), /* cost of movzx */
1612 8, /* "large" insn */
1613 9, /* MOVE_RATIO */
1614 4, /* cost for loading QImode using movzbl */
1615 {3, 4, 3}, /* cost of loading integer registers
1616 in QImode, HImode and SImode.
1617 Relative to reg-reg move (2). */
1618 {3, 4, 3}, /* cost of storing integer registers */
1619 4, /* cost of reg,reg fld/fst */
1620 {4, 4, 12}, /* cost of loading fp registers
1621 in SFmode, DFmode and XFmode */
1622 {6, 6, 8}, /* cost of storing fp registers
1623 in SFmode, DFmode and XFmode */
1624 2, /* cost of moving MMX register */
1625 {3, 3}, /* cost of loading MMX registers
1626 in SImode and DImode */
1627 {4, 4}, /* cost of storing MMX registers
1628 in SImode and DImode */
1629 2, /* cost of moving SSE register */
1630 {4, 4, 3}, /* cost of loading SSE registers
1631 in SImode, DImode and TImode */
1632 {4, 4, 5}, /* cost of storing SSE registers
1633 in SImode, DImode and TImode */
1634 3, /* MMX or SSE register to integer */
1635 /* On K8:
1636 MOVD reg64, xmmreg Double FSTORE 4
1637 MOVD reg32, xmmreg Double FSTORE 4
1638 On AMDFAM10:
1639 MOVD reg64, xmmreg Double FADD 3
1640 1/1 1/1
1641 MOVD reg32, xmmreg Double FADD 3
1642 1/1 1/1
   NOTE(review): this note describes K8 and AMDFAM10, not BTVER2 --
   presumably carried over from another table; confirm it still applies.  */
1643 32, /* size of l1 cache. */
1644 2048, /* size of l2 cache. */
1645 64, /* size of prefetch block */
1646 100, /* number of parallel prefetches */
1647 2, /* Branch cost */
1648 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1649 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1650 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1651 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1652 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1653 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1654
1655 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1656 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
1657 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1658 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
1659 COSTS_N_INSNS (19), /* cost of DIVSD instruction. */
1660 COSTS_N_INSNS (16), /* cost of SQRTSS instruction. */
1661 COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */
1662 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1663 btver2_memcpy,
1664 btver2_memset,
1665 4, /* scalar_stmt_cost. */
1666 2, /* scalar_load_cost. */
1667 2, /* scalar_store_cost. */
1668 6, /* vec_stmt_cost. */
1669 0, /* vec_to_scalar_cost. */
1670 2, /* scalar_to_vec_cost. */
1671 2, /* vec_align_load_cost. */
1672 2, /* vec_unalign_load_cost. */
1673 2, /* vec_store_cost. */
1674 2, /* cond_taken_branch_cost. */
1675 1, /* cond_not_taken_branch_cost. */
1676 };
1677
/* memcpy expansion strategies referenced by pentium4_cost below.  The
   second element is the DUMMY_STRINGOP_ALGS placeholder, defined at the top
   of this file as {libcall, {{-1, libcall, false}}}.
   NOTE(review): each element looks like {alg for unknown size,
   {{max size, alg, noalign}, ...}}, with -1 as the open-ended final range;
   presumably [0] is for 32-bit and [1] for 64-bit code -- confirm against
   the stringop_algs declaration.  */
1678 static stringop_algs pentium4_memcpy[2] = {
1679 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1680 DUMMY_STRINGOP_ALGS};
/* memset expansion strategies referenced by pentium4_cost below.  The
   second element is the DUMMY_STRINGOP_ALGS placeholder, defined at the top
   of this file as {libcall, {{-1, libcall, false}}}.
   NOTE(review): each element looks like {alg for unknown size,
   {{max size, alg, noalign}, ...}}, with -1 as the open-ended final range;
   presumably [0] is for 32-bit and [1] for 64-bit code -- confirm against
   the stringop_algs declaration.  */
1681 static stringop_algs pentium4_memset[2] = {
1682 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1683 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1684 DUMMY_STRINGOP_ALGS};
1685
/* Cost table for pentium4.  The initializer is positional: the trailing
   comment on each value names the field it sets.  Values wrapped in
   COSTS_N_INSNS are instruction costs relative to an add (see the note at
   the top of this file); the remaining entries are bare integers.  */
1686 static const
1687 struct processor_costs pentium4_cost = {
1688 COSTS_N_INSNS (1), /* cost of an add instruction */
1689 COSTS_N_INSNS (3), /* cost of a lea instruction */
1690 COSTS_N_INSNS (4), /* variable shift costs */
1691 COSTS_N_INSNS (4), /* constant shift costs */
1692 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1693 COSTS_N_INSNS (15), /* HI */
1694 COSTS_N_INSNS (15), /* SI */
1695 COSTS_N_INSNS (15), /* DI */
1696 COSTS_N_INSNS (15)}, /* other */
1697 0, /* cost of multiply per each bit set */
1698 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1699 COSTS_N_INSNS (56), /* HI */
1700 COSTS_N_INSNS (56), /* SI */
1701 COSTS_N_INSNS (56), /* DI */
1702 COSTS_N_INSNS (56)}, /* other */
1703 COSTS_N_INSNS (1), /* cost of movsx */
1704 COSTS_N_INSNS (1), /* cost of movzx */
1705 16, /* "large" insn */
1706 6, /* MOVE_RATIO */
1707 2, /* cost for loading QImode using movzbl */
1708 {4, 5, 4}, /* cost of loading integer registers
1709 in QImode, HImode and SImode.
1710 Relative to reg-reg move (2). */
1711 {2, 3, 2}, /* cost of storing integer registers */
1712 2, /* cost of reg,reg fld/fst */
1713 {2, 2, 6}, /* cost of loading fp registers
1714 in SFmode, DFmode and XFmode */
1715 {4, 4, 6}, /* cost of storing fp registers
1716 in SFmode, DFmode and XFmode */
1717 2, /* cost of moving MMX register */
1718 {2, 2}, /* cost of loading MMX registers
1719 in SImode and DImode */
1720 {2, 2}, /* cost of storing MMX registers
1721 in SImode and DImode */
1722 12, /* cost of moving SSE register */
1723 {12, 12, 12}, /* cost of loading SSE registers
1724 in SImode, DImode and TImode */
1725 {2, 2, 8}, /* cost of storing SSE registers
1726 in SImode, DImode and TImode */
1727 10, /* MMX or SSE register to integer */
1728 8, /* size of l1 cache. */
1729 256, /* size of l2 cache. */
1730 64, /* size of prefetch block */
1731 6, /* number of parallel prefetches */
1732 2, /* Branch cost */
1733 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1734 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1735 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1736 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1737 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1738 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1739
1740 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1741 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1742 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1743 COSTS_N_INSNS (23), /* cost of DIVSS instruction. */
1744 COSTS_N_INSNS (38), /* cost of DIVSD instruction. */
1745 COSTS_N_INSNS (23), /* cost of SQRTSS instruction. */
1746 COSTS_N_INSNS (38), /* cost of SQRTSD instruction. */
1747 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1748 pentium4_memcpy,
1749 pentium4_memset,
1750 1, /* scalar_stmt_cost. */
1751 1, /* scalar_load_cost. */
1752 1, /* scalar_store_cost. */
1753 1, /* vec_stmt_cost. */
1754 1, /* vec_to_scalar_cost. */
1755 1, /* scalar_to_vec_cost. */
1756 1, /* vec_align_load_cost. */
1757 2, /* vec_unalign_load_cost. */
1758 1, /* vec_store_cost. */
1759 3, /* cond_taken_branch_cost. */
1760 1, /* cond_not_taken_branch_cost. */
1761 };
1762
/* memcpy expansion strategies referenced by nocona_cost below.
   NOTE(review): each element looks like {alg for unknown size,
   {{max size, alg, noalign}, ...}}, with -1 as the open-ended final range;
   presumably [0] is for 32-bit and [1] for 64-bit code -- confirm against
   the stringop_algs declaration.  */
1763 static stringop_algs nocona_memcpy[2] = {
1764 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1765 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1766 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1767
/* memset expansion strategies referenced by nocona_cost below.
   NOTE(review): each element looks like {alg for unknown size,
   {{max size, alg, noalign}, ...}}, with -1 as the open-ended final range;
   presumably [0] is for 32-bit and [1] for 64-bit code -- confirm against
   the stringop_algs declaration.  */
1768 static stringop_algs nocona_memset[2] = {
1769 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1770 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1771 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1772 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1773
/* Cost table for nocona.  The initializer is positional: the trailing
   comment on each value names the field it sets.  Values wrapped in
   COSTS_N_INSNS are instruction costs relative to an add (see the note at
   the top of this file); the remaining entries are bare integers.  */
1774 static const
1775 struct processor_costs nocona_cost = {
1776 COSTS_N_INSNS (1), /* cost of an add instruction */
1777 COSTS_N_INSNS (1), /* cost of a lea instruction */
1778 COSTS_N_INSNS (1), /* variable shift costs */
1779 COSTS_N_INSNS (1), /* constant shift costs */
1780 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1781 COSTS_N_INSNS (10), /* HI */
1782 COSTS_N_INSNS (10), /* SI */
1783 COSTS_N_INSNS (10), /* DI */
1784 COSTS_N_INSNS (10)}, /* other */
1785 0, /* cost of multiply per each bit set */
1786 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1787 COSTS_N_INSNS (66), /* HI */
1788 COSTS_N_INSNS (66), /* SI */
1789 COSTS_N_INSNS (66), /* DI */
1790 COSTS_N_INSNS (66)}, /* other */
1791 COSTS_N_INSNS (1), /* cost of movsx */
1792 COSTS_N_INSNS (1), /* cost of movzx */
1793 16, /* "large" insn */
1794 17, /* MOVE_RATIO */
1795 4, /* cost for loading QImode using movzbl */
1796 {4, 4, 4}, /* cost of loading integer registers
1797 in QImode, HImode and SImode.
1798 Relative to reg-reg move (2). */
1799 {4, 4, 4}, /* cost of storing integer registers */
1800 3, /* cost of reg,reg fld/fst */
1801 {12, 12, 12}, /* cost of loading fp registers
1802 in SFmode, DFmode and XFmode */
1803 {4, 4, 4}, /* cost of storing fp registers
1804 in SFmode, DFmode and XFmode */
1805 6, /* cost of moving MMX register */
1806 {12, 12}, /* cost of loading MMX registers
1807 in SImode and DImode */
1808 {12, 12}, /* cost of storing MMX registers
1809 in SImode and DImode */
1810 6, /* cost of moving SSE register */
1811 {12, 12, 12}, /* cost of loading SSE registers
1812 in SImode, DImode and TImode */
1813 {12, 12, 12}, /* cost of storing SSE registers
1814 in SImode, DImode and TImode */
1815 8, /* MMX or SSE register to integer */
1816 8, /* size of l1 cache. */
1817 1024, /* size of l2 cache. */
1818 64, /* size of prefetch block */
1819 8, /* number of parallel prefetches */
1820 1, /* Branch cost */
1821 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1822 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1823 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1824 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1825 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1826 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1827
1828 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
1829 COSTS_N_INSNS (7), /* cost of MULSS instruction. */
1830 COSTS_N_INSNS (7), /* cost of MULSD instruction. */
1831 COSTS_N_INSNS (32), /* cost of DIVSS instruction. */
1832 COSTS_N_INSNS (40), /* cost of DIVSD instruction. */
1833 COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */
1834 COSTS_N_INSNS (41), /* cost of SQRTSD instruction. */
1835 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1836 nocona_memcpy,
1837 nocona_memset,
1838 1, /* scalar_stmt_cost. */
1839 1, /* scalar_load_cost. */
1840 1, /* scalar_store_cost. */
1841 1, /* vec_stmt_cost. */
1842 1, /* vec_to_scalar_cost. */
1843 1, /* scalar_to_vec_cost. */
1844 1, /* vec_align_load_cost. */
1845 2, /* vec_unalign_load_cost. */
1846 1, /* vec_store_cost. */
1847 3, /* cond_taken_branch_cost. */
1848 1, /* cond_not_taken_branch_cost. */
1849 };
1850
/* memcpy expansion strategies referenced by atom_cost below.
   NOTE(review): each element looks like {alg for unknown size,
   {{max size, alg, noalign}, ...}}, with -1 as the open-ended final range;
   presumably [0] is for 32-bit and [1] for 64-bit code -- confirm against
   the stringop_algs declaration.  */
1851 static stringop_algs atom_memcpy[2] = {
1852 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1853 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1854 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset expansion strategies referenced by atom_cost below.
   NOTE(review): each element looks like {alg for unknown size,
   {{max size, alg, noalign}, ...}}, with -1 as the open-ended final range;
   presumably [0] is for 32-bit and [1] for 64-bit code -- confirm against
   the stringop_algs declaration.  */
1855 static stringop_algs atom_memset[2] = {
1856 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1857 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1858 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1859 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table for atom.  The initializer is positional: the trailing
   comment on each value names the field it sets.  Values wrapped in
   COSTS_N_INSNS are instruction costs relative to an add (see the note at
   the top of this file); the remaining entries are bare integers.  */
1860 static const
1861 struct processor_costs atom_cost = {
1862 COSTS_N_INSNS (1), /* cost of an add instruction */
1863 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1864 COSTS_N_INSNS (1), /* variable shift costs */
1865 COSTS_N_INSNS (1), /* constant shift costs */
1866 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1867 COSTS_N_INSNS (4), /* HI */
1868 COSTS_N_INSNS (3), /* SI */
1869 COSTS_N_INSNS (4), /* DI */
1870 COSTS_N_INSNS (2)}, /* other */
1871 0, /* cost of multiply per each bit set */
1872 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1873 COSTS_N_INSNS (26), /* HI */
1874 COSTS_N_INSNS (42), /* SI */
1875 COSTS_N_INSNS (74), /* DI */
1876 COSTS_N_INSNS (74)}, /* other */
1877 COSTS_N_INSNS (1), /* cost of movsx */
1878 COSTS_N_INSNS (1), /* cost of movzx */
1879 8, /* "large" insn */
1880 17, /* MOVE_RATIO */
1881 4, /* cost for loading QImode using movzbl */
1882 {4, 4, 4}, /* cost of loading integer registers
1883 in QImode, HImode and SImode.
1884 Relative to reg-reg move (2). */
1885 {4, 4, 4}, /* cost of storing integer registers */
1886 4, /* cost of reg,reg fld/fst */
1887 {12, 12, 12}, /* cost of loading fp registers
1888 in SFmode, DFmode and XFmode */
1889 {6, 6, 8}, /* cost of storing fp registers
1890 in SFmode, DFmode and XFmode */
1891 2, /* cost of moving MMX register */
1892 {8, 8}, /* cost of loading MMX registers
1893 in SImode and DImode */
1894 {8, 8}, /* cost of storing MMX registers
1895 in SImode and DImode */
1896 2, /* cost of moving SSE register */
1897 {8, 8, 8}, /* cost of loading SSE registers
1898 in SImode, DImode and TImode */
1899 {8, 8, 8}, /* cost of storing SSE registers
1900 in SImode, DImode and TImode */
1901 5, /* MMX or SSE register to integer */
1902 32, /* size of l1 cache. */
1903 256, /* size of l2 cache. */
1904 64, /* size of prefetch block */
1905 6, /* number of parallel prefetches */
1906 3, /* Branch cost */
1907 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1908 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1909 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1910 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1911 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1912 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1913
1914 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
1915 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1916 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
1917 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
1918 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
1919 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
1920 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
1921 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
1922 atom_memcpy,
1923 atom_memset,
1924 1, /* scalar_stmt_cost. */
1925 1, /* scalar_load_cost. */
1926 1, /* scalar_store_cost. */
1927 1, /* vec_stmt_cost. */
1928 1, /* vec_to_scalar_cost. */
1929 1, /* scalar_to_vec_cost. */
1930 1, /* vec_align_load_cost. */
1931 2, /* vec_unalign_load_cost. */
1932 1, /* vec_store_cost. */
1933 3, /* cond_taken_branch_cost. */
1934 1, /* cond_not_taken_branch_cost. */
1935 };
1936
/* memcpy expansion strategies referenced by slm_cost below.
   NOTE(review): each element looks like {alg for unknown size,
   {{max size, alg, noalign}, ...}}, with -1 as the open-ended final range;
   presumably [0] is for 32-bit and [1] for 64-bit code -- confirm against
   the stringop_algs declaration.  */
1937 static stringop_algs slm_memcpy[2] = {
1938 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1939 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1940 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset expansion strategies referenced by slm_cost below.
   NOTE(review): each element looks like {alg for unknown size,
   {{max size, alg, noalign}, ...}}, with -1 as the open-ended final range;
   presumably [0] is for 32-bit and [1] for 64-bit code -- confirm against
   the stringop_algs declaration.  */
1941 static stringop_algs slm_memset[2] = {
1942 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1943 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1944 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1945 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table for slm.  The initializer is positional: the trailing
   comment on each value names the field it sets.  Values wrapped in
   COSTS_N_INSNS are instruction costs relative to an add (see the note at
   the top of this file); the remaining entries are bare integers.  */
1946 static const
1947 struct processor_costs slm_cost = {
1948 COSTS_N_INSNS (1), /* cost of an add instruction */
1949 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1950 COSTS_N_INSNS (1), /* variable shift costs */
1951 COSTS_N_INSNS (1), /* constant shift costs */
1952 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1953 COSTS_N_INSNS (3), /* HI */
1954 COSTS_N_INSNS (3), /* SI */
1955 COSTS_N_INSNS (4), /* DI */
1956 COSTS_N_INSNS (2)}, /* other */
1957 0, /* cost of multiply per each bit set */
1958 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1959 COSTS_N_INSNS (26), /* HI */
1960 COSTS_N_INSNS (42), /* SI */
1961 COSTS_N_INSNS (74), /* DI */
1962 COSTS_N_INSNS (74)}, /* other */
1963 COSTS_N_INSNS (1), /* cost of movsx */
1964 COSTS_N_INSNS (1), /* cost of movzx */
1965 8, /* "large" insn */
1966 17, /* MOVE_RATIO */
1967 4, /* cost for loading QImode using movzbl */
1968 {4, 4, 4}, /* cost of loading integer registers
1969 in QImode, HImode and SImode.
1970 Relative to reg-reg move (2). */
1971 {4, 4, 4}, /* cost of storing integer registers */
1972 4, /* cost of reg,reg fld/fst */
1973 {12, 12, 12}, /* cost of loading fp registers
1974 in SFmode, DFmode and XFmode */
1975 {6, 6, 8}, /* cost of storing fp registers
1976 in SFmode, DFmode and XFmode */
1977 2, /* cost of moving MMX register */
1978 {8, 8}, /* cost of loading MMX registers
1979 in SImode and DImode */
1980 {8, 8}, /* cost of storing MMX registers
1981 in SImode and DImode */
1982 2, /* cost of moving SSE register */
1983 {8, 8, 8}, /* cost of loading SSE registers
1984 in SImode, DImode and TImode */
1985 {8, 8, 8}, /* cost of storing SSE registers
1986 in SImode, DImode and TImode */
1987 5, /* MMX or SSE register to integer */
1988 32, /* size of l1 cache. */
1989 256, /* size of l2 cache. */
1990 64, /* size of prefetch block */
1991 6, /* number of parallel prefetches */
1992 3, /* Branch cost */
1993 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1994 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1995 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1996 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1997 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1998 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1999
2000 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2001 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2002 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2003 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
2004 COSTS_N_INSNS (69), /* cost of DIVSD instruction. */
2005 COSTS_N_INSNS (20), /* cost of SQRTSS instruction. */
2006 COSTS_N_INSNS (35), /* cost of SQRTSD instruction. */
2007 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2008 slm_memcpy,
2009 slm_memset,
2010 1, /* scalar_stmt_cost. */
2011 1, /* scalar_load_cost. */
2012 1, /* scalar_store_cost. */
2013 1, /* vec_stmt_cost. */
2014 4, /* vec_to_scalar_cost. */
2015 1, /* scalar_to_vec_cost. */
2016 1, /* vec_align_load_cost. */
2017 2, /* vec_unalign_load_cost. */
2018 1, /* vec_store_cost. */
2019 3, /* cond_taken_branch_cost. */
2020 1, /* cond_not_taken_branch_cost. */
2021 };
2022
/* memcpy expansion strategies referenced by intel_cost below.
   NOTE(review): each element looks like {alg for unknown size,
   {{max size, alg, noalign}, ...}}, with -1 as the open-ended final range;
   presumably [0] is for 32-bit and [1] for 64-bit code -- confirm against
   the stringop_algs declaration.  */
2023 static stringop_algs intel_memcpy[2] = {
2024 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2025 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2026 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset expansion strategies referenced by intel_cost below.
   NOTE(review): each element looks like {alg for unknown size,
   {{max size, alg, noalign}, ...}}, with -1 as the open-ended final range;
   presumably [0] is for 32-bit and [1] for 64-bit code -- confirm against
   the stringop_algs declaration.  */
2027 static stringop_algs intel_memset[2] = {
2028 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2029 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2030 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2031 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table named intel_cost.  The initializer is positional: the trailing
   comment on each value names the field it sets.  Values wrapped in
   COSTS_N_INSNS are instruction costs relative to an add (see the note at
   the top of this file); the remaining entries are bare integers.  */
2032 static const
2033 struct processor_costs intel_cost = {
2034 COSTS_N_INSNS (1), /* cost of an add instruction */
2035 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2036 COSTS_N_INSNS (1), /* variable shift costs */
2037 COSTS_N_INSNS (1), /* constant shift costs */
2038 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2039 COSTS_N_INSNS (3), /* HI */
2040 COSTS_N_INSNS (3), /* SI */
2041 COSTS_N_INSNS (4), /* DI */
2042 COSTS_N_INSNS (2)}, /* other */
2043 0, /* cost of multiply per each bit set */
2044 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2045 COSTS_N_INSNS (26), /* HI */
2046 COSTS_N_INSNS (42), /* SI */
2047 COSTS_N_INSNS (74), /* DI */
2048 COSTS_N_INSNS (74)}, /* other */
2049 COSTS_N_INSNS (1), /* cost of movsx */
2050 COSTS_N_INSNS (1), /* cost of movzx */
2051 8, /* "large" insn */
2052 17, /* MOVE_RATIO */
2053 4, /* cost for loading QImode using movzbl */
2054 {4, 4, 4}, /* cost of loading integer registers
2055 in QImode, HImode and SImode.
2056 Relative to reg-reg move (2). */
2057 {4, 4, 4}, /* cost of storing integer registers */
2058 4, /* cost of reg,reg fld/fst */
2059 {12, 12, 12}, /* cost of loading fp registers
2060 in SFmode, DFmode and XFmode */
2061 {6, 6, 8}, /* cost of storing fp registers
2062 in SFmode, DFmode and XFmode */
2063 2, /* cost of moving MMX register */
2064 {8, 8}, /* cost of loading MMX registers
2065 in SImode and DImode */
2066 {8, 8}, /* cost of storing MMX registers
2067 in SImode and DImode */
2068 2, /* cost of moving SSE register */
2069 {8, 8, 8}, /* cost of loading SSE registers
2070 in SImode, DImode and TImode */
2071 {8, 8, 8}, /* cost of storing SSE registers
2072 in SImode, DImode and TImode */
2073 5, /* MMX or SSE register to integer */
2074 32, /* size of l1 cache. */
2075 256, /* size of l2 cache. */
2076 64, /* size of prefetch block */
2077 6, /* number of parallel prefetches */
2078 3, /* Branch cost */
2079 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2080 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2081 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2082 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2083 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2084 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2085
2086 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
2087 COSTS_N_INSNS (8), /* cost of MULSS instruction. */
2088 COSTS_N_INSNS (8), /* cost of MULSD instruction. */
2089 COSTS_N_INSNS (20), /* cost of DIVSS instruction. */
2090 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
2091 COSTS_N_INSNS (40), /* cost of SQRTSS instruction. */
2092 COSTS_N_INSNS (40), /* cost of SQRTSD instruction. */
2093 1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2094 intel_memcpy,
2095 intel_memset,
2096 1, /* scalar_stmt_cost. */
2097 1, /* scalar_load_cost. */
2098 1, /* scalar_store_cost. */
2099 1, /* vec_stmt_cost. */
2100 4, /* vec_to_scalar_cost. */
2101 1, /* scalar_to_vec_cost. */
2102 1, /* vec_align_load_cost. */
2103 2, /* vec_unalign_load_cost. */
2104 1, /* vec_store_cost. */
2105 3, /* cond_taken_branch_cost. */
2106 1, /* cond_not_taken_branch_cost. */
2107 };
2108
2109 /* Generic should produce code tuned for Core-i7 (and newer chips)
2110 and btver1 (and newer chips). */
2111
/* memcpy expansion strategies referenced by generic_cost below.
   NOTE(review): each element looks like {alg for unknown size,
   {{max size, alg, noalign}, ...}}, with -1 as the open-ended final range;
   presumably [0] is for 32-bit and [1] for 64-bit code -- confirm against
   the stringop_algs declaration.  */
2112 static stringop_algs generic_memcpy[2] = {
2113 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2114 {-1, libcall, false}}},
2115 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2116 {-1, libcall, false}}}};
/* memset expansion strategies referenced by generic_cost below.
   NOTE(review): each element looks like {alg for unknown size,
   {{max size, alg, noalign}, ...}}, with -1 as the open-ended final range;
   presumably [0] is for 32-bit and [1] for 64-bit code -- confirm against
   the stringop_algs declaration.  */
2117 static stringop_algs generic_memset[2] = {
2118 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2119 {-1, libcall, false}}},
2120 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2121 {-1, libcall, false}}}};
/* Generic cost table.  The initializer is positional: the trailing
   comment on each value names the field it sets.  Values wrapped in
   COSTS_N_INSNS are instruction costs relative to an add (see the note at
   the top of this file); the remaining entries are bare integers.  */
2122 static const
2123 struct processor_costs generic_cost = {
2124 COSTS_N_INSNS (1), /* cost of an add instruction */
2125 /* On all chips taken into consideration lea is 2 cycles and more. With
2126 this cost however our current implementation of synth_mult results in
2127 use of unnecessary temporary registers causing regression on several
2128 SPECfp benchmarks. */
2129 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2130 COSTS_N_INSNS (1), /* variable shift costs */
2131 COSTS_N_INSNS (1), /* constant shift costs */
2132 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2133 COSTS_N_INSNS (4), /* HI */
2134 COSTS_N_INSNS (3), /* SI */
2135 COSTS_N_INSNS (4), /* DI */
2136 COSTS_N_INSNS (2)}, /* other */
2137 0, /* cost of multiply per each bit set */
2138 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2139 COSTS_N_INSNS (26), /* HI */
2140 COSTS_N_INSNS (42), /* SI */
2141 COSTS_N_INSNS (74), /* DI */
2142 COSTS_N_INSNS (74)}, /* other */
2143 COSTS_N_INSNS (1), /* cost of movsx */
2144 COSTS_N_INSNS (1), /* cost of movzx */
2145 8, /* "large" insn */
2146 17, /* MOVE_RATIO */
2147 4, /* cost for loading QImode using movzbl */
2148 {4, 4, 4}, /* cost of loading integer registers
2149 in QImode, HImode and SImode.
2150 Relative to reg-reg move (2). */
2151 {4, 4, 4}, /* cost of storing integer registers */
2152 4, /* cost of reg,reg fld/fst */
2153 {12, 12, 12}, /* cost of loading fp registers
2154 in SFmode, DFmode and XFmode */
2155 {6, 6, 8}, /* cost of storing fp registers
2156 in SFmode, DFmode and XFmode */
2157 2, /* cost of moving MMX register */
2158 {8, 8}, /* cost of loading MMX registers
2159 in SImode and DImode */
2160 {8, 8}, /* cost of storing MMX registers
2161 in SImode and DImode */
2162 2, /* cost of moving SSE register */
2163 {8, 8, 8}, /* cost of loading SSE registers
2164 in SImode, DImode and TImode */
2165 {8, 8, 8}, /* cost of storing SSE registers
2166 in SImode, DImode and TImode */
2167 5, /* MMX or SSE register to integer */
2168 32, /* size of l1 cache. */
2169 512, /* size of l2 cache. */
2170 64, /* size of prefetch block */
2171 6, /* number of parallel prefetches */
2172 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
2173 value is increased to the perhaps more appropriate value of 5. */
2174 3, /* Branch cost */
2175 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2176 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2177 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2178 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2179 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2180 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2181
2182 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
2183 COSTS_N_INSNS (8), /* cost of MULSS instruction. */
2184 COSTS_N_INSNS (8), /* cost of MULSD instruction. */
2185 COSTS_N_INSNS (20), /* cost of DIVSS instruction. */
2186 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
2187 COSTS_N_INSNS (40), /* cost of SQRTSS instruction. */
2188 COSTS_N_INSNS (40), /* cost of SQRTSD instruction. */
2189 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2190 generic_memcpy,
2191 generic_memset,
2192 1, /* scalar_stmt_cost. */
2193 1, /* scalar_load_cost. */
2194 1, /* scalar_store_cost. */
2195 1, /* vec_stmt_cost. */
2196 1, /* vec_to_scalar_cost. */
2197 1, /* scalar_to_vec_cost. */
2198 1, /* vec_align_load_cost. */
2199 2, /* vec_unalign_load_cost. */
2200 1, /* vec_store_cost. */
2201 3, /* cond_taken_branch_cost. */
2202 1, /* cond_not_taken_branch_cost. */
2203 };
2204
2205 /* core_cost should produce code tuned for the Core family of CPUs.

   core_memcpy holds the memcpy expansion strategies referenced by
   core_cost below.
   NOTE(review): each element looks like {alg for unknown size,
   {{max size, alg, noalign}, ...}}, with -1 as the open-ended final range;
   presumably [0] is for 32-bit and [1] for 64-bit code -- confirm against
   the stringop_algs declaration.  Unlike the other tables in this part of
   the file, most entries here pass true as the third member.  */
2206 static stringop_algs core_memcpy[2] = {
2207 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
2208 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2209 {-1, libcall, false}}}};
/* memset expansion strategies referenced by core_cost below.
   NOTE(review): each element looks like {alg for unknown size,
   {{max size, alg, noalign}, ...}}, with -1 as the open-ended final range;
   presumably [0] is for 32-bit and [1] for 64-bit code -- confirm against
   the stringop_algs declaration.  */
2210 static stringop_algs core_memset[2] = {
2211 {libcall, {{6, loop_1_byte, true},
2212 {24, loop, true},
2213 {8192, rep_prefix_4_byte, true},
2214 {-1, libcall, false}}},
2215 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2216 {-1, libcall, false}}}};
2217
/* Cost table for the Core family.  The initializer is positional: the
   trailing comment on each value names the field it sets.  Values wrapped
   in COSTS_N_INSNS are instruction costs relative to an add (see the note
   at the top of this file); the remaining entries are bare integers.  */
2218 static const
2219 struct processor_costs core_cost = {
2220 COSTS_N_INSNS (1), /* cost of an add instruction */
2221 /* On all chips taken into consideration lea is 2 cycles and more. With
2222 this cost however our current implementation of synth_mult results in
2223 use of unnecessary temporary registers causing regression on several
2224 SPECfp benchmarks. */
2225 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2226 COSTS_N_INSNS (1), /* variable shift costs */
2227 COSTS_N_INSNS (1), /* constant shift costs */
2228 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2229 COSTS_N_INSNS (4), /* HI */
2230 COSTS_N_INSNS (3), /* SI */
2231 COSTS_N_INSNS (4), /* DI */
2232 COSTS_N_INSNS (2)}, /* other */
2233 0, /* cost of multiply per each bit set */
2234 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2235 COSTS_N_INSNS (26), /* HI */
2236 COSTS_N_INSNS (42), /* SI */
2237 COSTS_N_INSNS (74), /* DI */
2238 COSTS_N_INSNS (74)}, /* other */
2239 COSTS_N_INSNS (1), /* cost of movsx */
2240 COSTS_N_INSNS (1), /* cost of movzx */
2241 8, /* "large" insn */
2242 17, /* MOVE_RATIO */
2243 4, /* cost for loading QImode using movzbl */
2244 {4, 4, 4}, /* cost of loading integer registers
2245 in QImode, HImode and SImode.
2246 Relative to reg-reg move (2). */
2247 {4, 4, 4}, /* cost of storing integer registers */
2248 4, /* cost of reg,reg fld/fst */
2249 {12, 12, 12}, /* cost of loading fp registers
2250 in SFmode, DFmode and XFmode */
2251 {6, 6, 8}, /* cost of storing fp registers
2252 in SFmode, DFmode and XFmode */
2253 2, /* cost of moving MMX register */
2254 {8, 8}, /* cost of loading MMX registers
2255 in SImode and DImode */
2256 {8, 8}, /* cost of storing MMX registers
2257 in SImode and DImode */
2258 2, /* cost of moving SSE register */
2259 {8, 8, 8}, /* cost of loading SSE registers
2260 in SImode, DImode and TImode */
2261 {8, 8, 8}, /* cost of storing SSE registers
2262 in SImode, DImode and TImode */
2263 5, /* MMX or SSE register to integer */
2264 64, /* size of l1 cache. */
2265 512, /* size of l2 cache. */
2266 64, /* size of prefetch block */
2267 6, /* number of parallel prefetches */
2268 /* FIXME: perhaps a more appropriate value is 5. */
2269 3, /* Branch cost */
2270 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2271 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2272 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2273 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2274 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2275 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2276
2277 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2278 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2279 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2280 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
2281 COSTS_N_INSNS (32), /* cost of DIVSD instruction. */
2282 COSTS_N_INSNS (30), /* cost of SQRTSS instruction. */
2283 COSTS_N_INSNS (58), /* cost of SQRTSD instruction. */
2284 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
2285 core_memcpy,
2286 core_memset,
2287 1, /* scalar_stmt_cost. */
2288 1, /* scalar_load_cost. */
2289 1, /* scalar_store_cost. */
2290 1, /* vec_stmt_cost. */
2291 1, /* vec_to_scalar_cost. */
2292 1, /* scalar_to_vec_cost. */
2293 1, /* vec_align_load_cost. */
2294 2, /* vec_unalign_load_cost. */
2295 1, /* vec_store_cost. */
2296 3, /* cond_taken_branch_cost. */
2297 1, /* cond_not_taken_branch_cost. */
2298 };
2299