[gcc.git] / gcc / config / i386 / i386.c (amdfam10 branch)
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
3 2002, 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING. If not, write to
19 the Free Software Foundation, 51 Franklin Street, Fifth Floor,
20 Boston, MA 02110-1301, USA. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "tm.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "tm_p.h"
29 #include "regs.h"
30 #include "hard-reg-set.h"
31 #include "real.h"
32 #include "insn-config.h"
33 #include "conditions.h"
34 #include "output.h"
35 #include "insn-codes.h"
36 #include "insn-attr.h"
37 #include "flags.h"
38 #include "except.h"
39 #include "function.h"
40 #include "recog.h"
41 #include "expr.h"
42 #include "optabs.h"
43 #include "toplev.h"
44 #include "basic-block.h"
45 #include "ggc.h"
46 #include "target.h"
47 #include "target-def.h"
48 #include "langhooks.h"
49 #include "cgraph.h"
50 #include "tree-gimple.h"
51 #include "dwarf2.h"
52 #include "tm-constrs.h"
53 #include "params.h"
54
55 #ifndef CHECK_STACK_LIMIT
56 #define CHECK_STACK_LIMIT (-1)
57 #endif
58
59 /* Return the index of the given mode in the mult and division cost tables. */
60 #define MODE_INDEX(mode) \
61 ((mode) == QImode ? 0 \
62 : (mode) == HImode ? 1 \
63 : (mode) == SImode ? 2 \
64 : (mode) == DImode ? 3 \
65 : 4)
66
67 /* Processor costs (relative to an add) */
68 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
69 #define COSTS_N_BYTES(N) ((N) * 2)
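/* Illustrative sketch, not part of the original file: the per-mode cost
   arrays below are indexed through MODE_INDEX, so a query for the active
   tuning target reads roughly (field names mult_init/divide assumed from
   struct processor_costs in i386.h):

     cost = ix86_cost->mult_init[MODE_INDEX (SImode)];
     cost = ix86_cost->divide[MODE_INDEX (DImode)];

   When tuning for size the entries are byte counts from COSTS_N_BYTES
   rather than COSTS_N_INSNS latency units; with COSTS_N_INSNS assumed to
   be (N)*4, COSTS_N_BYTES (2) == 4 == COSTS_N_INSNS (1), i.e. a two-byte
   add weighs the same as one single-insn latency. */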
70
71 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
72
73 static const
74 struct processor_costs size_cost = { /* costs for tuning for size */
75 COSTS_N_BYTES (2), /* cost of an add instruction */
76 COSTS_N_BYTES (3), /* cost of a lea instruction */
77 COSTS_N_BYTES (2), /* variable shift costs */
78 COSTS_N_BYTES (3), /* constant shift costs */
79 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
80 COSTS_N_BYTES (3), /* HI */
81 COSTS_N_BYTES (3), /* SI */
82 COSTS_N_BYTES (3), /* DI */
83 COSTS_N_BYTES (5)}, /* other */
84 0, /* cost of multiply per each bit set */
85 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
86 COSTS_N_BYTES (3), /* HI */
87 COSTS_N_BYTES (3), /* SI */
88 COSTS_N_BYTES (3), /* DI */
89 COSTS_N_BYTES (5)}, /* other */
90 COSTS_N_BYTES (3), /* cost of movsx */
91 COSTS_N_BYTES (3), /* cost of movzx */
92 0, /* "large" insn */
93 2, /* MOVE_RATIO */
94 2, /* cost for loading QImode using movzbl */
95 {2, 2, 2}, /* cost of loading integer registers
96 in QImode, HImode and SImode.
97 Relative to reg-reg move (2). */
98 {2, 2, 2}, /* cost of storing integer registers */
99 2, /* cost of reg,reg fld/fst */
100 {2, 2, 2}, /* cost of loading fp registers
101 in SFmode, DFmode and XFmode */
102 {2, 2, 2}, /* cost of storing fp registers
103 in SFmode, DFmode and XFmode */
104 3, /* cost of moving MMX register */
105 {3, 3}, /* cost of loading MMX registers
106 in SImode and DImode */
107 {3, 3}, /* cost of storing MMX registers
108 in SImode and DImode */
109 3, /* cost of moving SSE register */
110 {3, 3, 3}, /* cost of loading SSE registers
111 in SImode, DImode and TImode */
112 {3, 3, 3}, /* cost of storing SSE registers
113 in SImode, DImode and TImode */
114 3, /* MMX or SSE register to integer */
115 0, /* size of prefetch block */
116 0, /* number of parallel prefetches */
117 2, /* Branch cost */
118 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
119 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
120 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
121 COSTS_N_BYTES (2), /* cost of FABS instruction. */
122 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
123 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
124 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
125 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
126 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
127 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}}
128 };
129
130 /* Processor costs (relative to an add) */
131 static const
132 struct processor_costs i386_cost = { /* 386 specific costs */
133 COSTS_N_INSNS (1), /* cost of an add instruction */
134 COSTS_N_INSNS (1), /* cost of a lea instruction */
135 COSTS_N_INSNS (3), /* variable shift costs */
136 COSTS_N_INSNS (2), /* constant shift costs */
137 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
138 COSTS_N_INSNS (6), /* HI */
139 COSTS_N_INSNS (6), /* SI */
140 COSTS_N_INSNS (6), /* DI */
141 COSTS_N_INSNS (6)}, /* other */
142 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
143 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
144 COSTS_N_INSNS (23), /* HI */
145 COSTS_N_INSNS (23), /* SI */
146 COSTS_N_INSNS (23), /* DI */
147 COSTS_N_INSNS (23)}, /* other */
148 COSTS_N_INSNS (3), /* cost of movsx */
149 COSTS_N_INSNS (2), /* cost of movzx */
150 15, /* "large" insn */
151 3, /* MOVE_RATIO */
152 4, /* cost for loading QImode using movzbl */
153 {2, 4, 2}, /* cost of loading integer registers
154 in QImode, HImode and SImode.
155 Relative to reg-reg move (2). */
156 {2, 4, 2}, /* cost of storing integer registers */
157 2, /* cost of reg,reg fld/fst */
158 {8, 8, 8}, /* cost of loading fp registers
159 in SFmode, DFmode and XFmode */
160 {8, 8, 8}, /* cost of storing fp registers
161 in SFmode, DFmode and XFmode */
162 2, /* cost of moving MMX register */
163 {4, 8}, /* cost of loading MMX registers
164 in SImode and DImode */
165 {4, 8}, /* cost of storing MMX registers
166 in SImode and DImode */
167 2, /* cost of moving SSE register */
168 {4, 8, 16}, /* cost of loading SSE registers
169 in SImode, DImode and TImode */
170 {4, 8, 16}, /* cost of storing SSE registers
171 in SImode, DImode and TImode */
172 3, /* MMX or SSE register to integer */
173 0, /* size of prefetch block */
174 0, /* number of parallel prefetches */
175 1, /* Branch cost */
176 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
177 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
178 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
179 COSTS_N_INSNS (22), /* cost of FABS instruction. */
180 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
181 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
182 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
183 DUMMY_STRINGOP_ALGS},
184 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
185 DUMMY_STRINGOP_ALGS},
186 };
187
188 static const
189 struct processor_costs i486_cost = { /* 486 specific costs */
190 COSTS_N_INSNS (1), /* cost of an add instruction */
191 COSTS_N_INSNS (1), /* cost of a lea instruction */
192 COSTS_N_INSNS (3), /* variable shift costs */
193 COSTS_N_INSNS (2), /* constant shift costs */
194 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
195 COSTS_N_INSNS (12), /* HI */
196 COSTS_N_INSNS (12), /* SI */
197 COSTS_N_INSNS (12), /* DI */
198 COSTS_N_INSNS (12)}, /* other */
199 1, /* cost of multiply per each bit set */
200 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
201 COSTS_N_INSNS (40), /* HI */
202 COSTS_N_INSNS (40), /* SI */
203 COSTS_N_INSNS (40), /* DI */
204 COSTS_N_INSNS (40)}, /* other */
205 COSTS_N_INSNS (3), /* cost of movsx */
206 COSTS_N_INSNS (2), /* cost of movzx */
207 15, /* "large" insn */
208 3, /* MOVE_RATIO */
209 4, /* cost for loading QImode using movzbl */
210 {2, 4, 2}, /* cost of loading integer registers
211 in QImode, HImode and SImode.
212 Relative to reg-reg move (2). */
213 {2, 4, 2}, /* cost of storing integer registers */
214 2, /* cost of reg,reg fld/fst */
215 {8, 8, 8}, /* cost of loading fp registers
216 in SFmode, DFmode and XFmode */
217 {8, 8, 8}, /* cost of storing fp registers
218 in SFmode, DFmode and XFmode */
219 2, /* cost of moving MMX register */
220 {4, 8}, /* cost of loading MMX registers
221 in SImode and DImode */
222 {4, 8}, /* cost of storing MMX registers
223 in SImode and DImode */
224 2, /* cost of moving SSE register */
225 {4, 8, 16}, /* cost of loading SSE registers
226 in SImode, DImode and TImode */
227 {4, 8, 16}, /* cost of storing SSE registers
228 in SImode, DImode and TImode */
229 3, /* MMX or SSE register to integer */
230 0, /* size of prefetch block */
231 0, /* number of parallel prefetches */
232 1, /* Branch cost */
233 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
234 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
235 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
236 COSTS_N_INSNS (3), /* cost of FABS instruction. */
237 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
238 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
239 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
240 DUMMY_STRINGOP_ALGS},
241 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
242 DUMMY_STRINGOP_ALGS}
243 };
244
245 static const
246 struct processor_costs pentium_cost = {
247 COSTS_N_INSNS (1), /* cost of an add instruction */
248 COSTS_N_INSNS (1), /* cost of a lea instruction */
249 COSTS_N_INSNS (4), /* variable shift costs */
250 COSTS_N_INSNS (1), /* constant shift costs */
251 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
252 COSTS_N_INSNS (11), /* HI */
253 COSTS_N_INSNS (11), /* SI */
254 COSTS_N_INSNS (11), /* DI */
255 COSTS_N_INSNS (11)}, /* other */
256 0, /* cost of multiply per each bit set */
257 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
258 COSTS_N_INSNS (25), /* HI */
259 COSTS_N_INSNS (25), /* SI */
260 COSTS_N_INSNS (25), /* DI */
261 COSTS_N_INSNS (25)}, /* other */
262 COSTS_N_INSNS (3), /* cost of movsx */
263 COSTS_N_INSNS (2), /* cost of movzx */
264 8, /* "large" insn */
265 6, /* MOVE_RATIO */
266 6, /* cost for loading QImode using movzbl */
267 {2, 4, 2}, /* cost of loading integer registers
268 in QImode, HImode and SImode.
269 Relative to reg-reg move (2). */
270 {2, 4, 2}, /* cost of storing integer registers */
271 2, /* cost of reg,reg fld/fst */
272 {2, 2, 6}, /* cost of loading fp registers
273 in SFmode, DFmode and XFmode */
274 {4, 4, 6}, /* cost of storing fp registers
275 in SFmode, DFmode and XFmode */
276 8, /* cost of moving MMX register */
277 {8, 8}, /* cost of loading MMX registers
278 in SImode and DImode */
279 {8, 8}, /* cost of storing MMX registers
280 in SImode and DImode */
281 2, /* cost of moving SSE register */
282 {4, 8, 16}, /* cost of loading SSE registers
283 in SImode, DImode and TImode */
284 {4, 8, 16}, /* cost of storing SSE registers
285 in SImode, DImode and TImode */
286 3, /* MMX or SSE register to integer */
287 0, /* size of prefetch block */
288 0, /* number of parallel prefetches */
289 2, /* Branch cost */
290 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
291 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
292 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
293 COSTS_N_INSNS (1), /* cost of FABS instruction. */
294 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
295 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
296 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
297 DUMMY_STRINGOP_ALGS},
298 {{libcall, {{-1, rep_prefix_4_byte}}},
299 DUMMY_STRINGOP_ALGS}
300 };
301
302 static const
303 struct processor_costs pentiumpro_cost = {
304 COSTS_N_INSNS (1), /* cost of an add instruction */
305 COSTS_N_INSNS (1), /* cost of a lea instruction */
306 COSTS_N_INSNS (1), /* variable shift costs */
307 COSTS_N_INSNS (1), /* constant shift costs */
308 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
309 COSTS_N_INSNS (4), /* HI */
310 COSTS_N_INSNS (4), /* SI */
311 COSTS_N_INSNS (4), /* DI */
312 COSTS_N_INSNS (4)}, /* other */
313 0, /* cost of multiply per each bit set */
314 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
315 COSTS_N_INSNS (17), /* HI */
316 COSTS_N_INSNS (17), /* SI */
317 COSTS_N_INSNS (17), /* DI */
318 COSTS_N_INSNS (17)}, /* other */
319 COSTS_N_INSNS (1), /* cost of movsx */
320 COSTS_N_INSNS (1), /* cost of movzx */
321 8, /* "large" insn */
322 6, /* MOVE_RATIO */
323 2, /* cost for loading QImode using movzbl */
324 {4, 4, 4}, /* cost of loading integer registers
325 in QImode, HImode and SImode.
326 Relative to reg-reg move (2). */
327 {2, 2, 2}, /* cost of storing integer registers */
328 2, /* cost of reg,reg fld/fst */
329 {2, 2, 6}, /* cost of loading fp registers
330 in SFmode, DFmode and XFmode */
331 {4, 4, 6}, /* cost of storing fp registers
332 in SFmode, DFmode and XFmode */
333 2, /* cost of moving MMX register */
334 {2, 2}, /* cost of loading MMX registers
335 in SImode and DImode */
336 {2, 2}, /* cost of storing MMX registers
337 in SImode and DImode */
338 2, /* cost of moving SSE register */
339 {2, 2, 8}, /* cost of loading SSE registers
340 in SImode, DImode and TImode */
341 {2, 2, 8}, /* cost of storing SSE registers
342 in SImode, DImode and TImode */
343 3, /* MMX or SSE register to integer */
344 32, /* size of prefetch block */
345 6, /* number of parallel prefetches */
346 2, /* Branch cost */
347 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
348 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
349 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
350 COSTS_N_INSNS (2), /* cost of FABS instruction. */
351 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
352 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
353 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes (we
354 ensure the alignment). For small blocks an inline loop is still a noticeable win;
355 for bigger blocks either rep movsl or rep movsb is the way to go. Rep movsb
356 apparently has a more expensive startup time in the CPU, but after 4K the
357 difference is down in the noise. */
358 {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
359 {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
360 DUMMY_STRINGOP_ALGS},
361 {{rep_prefix_4_byte, {{1024, unrolled_loop},
362 {8192, rep_prefix_4_byte}, {-1, libcall}}},
363 DUMMY_STRINGOP_ALGS}
364 };
365
366 static const
367 struct processor_costs geode_cost = {
368 COSTS_N_INSNS (1), /* cost of an add instruction */
369 COSTS_N_INSNS (1), /* cost of a lea instruction */
370 COSTS_N_INSNS (2), /* variable shift costs */
371 COSTS_N_INSNS (1), /* constant shift costs */
372 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
373 COSTS_N_INSNS (4), /* HI */
374 COSTS_N_INSNS (7), /* SI */
375 COSTS_N_INSNS (7), /* DI */
376 COSTS_N_INSNS (7)}, /* other */
377 0, /* cost of multiply per each bit set */
378 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
379 COSTS_N_INSNS (23), /* HI */
380 COSTS_N_INSNS (39), /* SI */
381 COSTS_N_INSNS (39), /* DI */
382 COSTS_N_INSNS (39)}, /* other */
383 COSTS_N_INSNS (1), /* cost of movsx */
384 COSTS_N_INSNS (1), /* cost of movzx */
385 8, /* "large" insn */
386 4, /* MOVE_RATIO */
387 1, /* cost for loading QImode using movzbl */
388 {1, 1, 1}, /* cost of loading integer registers
389 in QImode, HImode and SImode.
390 Relative to reg-reg move (2). */
391 {1, 1, 1}, /* cost of storing integer registers */
392 1, /* cost of reg,reg fld/fst */
393 {1, 1, 1}, /* cost of loading fp registers
394 in SFmode, DFmode and XFmode */
395 {4, 6, 6}, /* cost of storing fp registers
396 in SFmode, DFmode and XFmode */
397
398 1, /* cost of moving MMX register */
399 {1, 1}, /* cost of loading MMX registers
400 in SImode and DImode */
401 {1, 1}, /* cost of storing MMX registers
402 in SImode and DImode */
403 1, /* cost of moving SSE register */
404 {1, 1, 1}, /* cost of loading SSE registers
405 in SImode, DImode and TImode */
406 {1, 1, 1}, /* cost of storing SSE registers
407 in SImode, DImode and TImode */
408 1, /* MMX or SSE register to integer */
409 32, /* size of prefetch block */
410 1, /* number of parallel prefetches */
411 1, /* Branch cost */
412 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
413 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
414 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
415 COSTS_N_INSNS (1), /* cost of FABS instruction. */
416 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
417 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
418 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
419 DUMMY_STRINGOP_ALGS},
420 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
421 DUMMY_STRINGOP_ALGS}
422 };
423
424 static const
425 struct processor_costs k6_cost = {
426 COSTS_N_INSNS (1), /* cost of an add instruction */
427 COSTS_N_INSNS (2), /* cost of a lea instruction */
428 COSTS_N_INSNS (1), /* variable shift costs */
429 COSTS_N_INSNS (1), /* constant shift costs */
430 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
431 COSTS_N_INSNS (3), /* HI */
432 COSTS_N_INSNS (3), /* SI */
433 COSTS_N_INSNS (3), /* DI */
434 COSTS_N_INSNS (3)}, /* other */
435 0, /* cost of multiply per each bit set */
436 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
437 COSTS_N_INSNS (18), /* HI */
438 COSTS_N_INSNS (18), /* SI */
439 COSTS_N_INSNS (18), /* DI */
440 COSTS_N_INSNS (18)}, /* other */
441 COSTS_N_INSNS (2), /* cost of movsx */
442 COSTS_N_INSNS (2), /* cost of movzx */
443 8, /* "large" insn */
444 4, /* MOVE_RATIO */
445 3, /* cost for loading QImode using movzbl */
446 {4, 5, 4}, /* cost of loading integer registers
447 in QImode, HImode and SImode.
448 Relative to reg-reg move (2). */
449 {2, 3, 2}, /* cost of storing integer registers */
450 4, /* cost of reg,reg fld/fst */
451 {6, 6, 6}, /* cost of loading fp registers
452 in SFmode, DFmode and XFmode */
453 {4, 4, 4}, /* cost of storing fp registers
454 in SFmode, DFmode and XFmode */
455 2, /* cost of moving MMX register */
456 {2, 2}, /* cost of loading MMX registers
457 in SImode and DImode */
458 {2, 2}, /* cost of storing MMX registers
459 in SImode and DImode */
460 2, /* cost of moving SSE register */
461 {2, 2, 8}, /* cost of loading SSE registers
462 in SImode, DImode and TImode */
463 {2, 2, 8}, /* cost of storing SSE registers
464 in SImode, DImode and TImode */
465 6, /* MMX or SSE register to integer */
466 32, /* size of prefetch block */
467 1, /* number of parallel prefetches */
468 1, /* Branch cost */
469 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
470 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
471 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
472 COSTS_N_INSNS (2), /* cost of FABS instruction. */
473 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
474 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
475 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
476 DUMMY_STRINGOP_ALGS},
477 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
478 DUMMY_STRINGOP_ALGS}
479 };
480
481 static const
482 struct processor_costs athlon_cost = {
483 COSTS_N_INSNS (1), /* cost of an add instruction */
484 COSTS_N_INSNS (2), /* cost of a lea instruction */
485 COSTS_N_INSNS (1), /* variable shift costs */
486 COSTS_N_INSNS (1), /* constant shift costs */
487 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
488 COSTS_N_INSNS (5), /* HI */
489 COSTS_N_INSNS (5), /* SI */
490 COSTS_N_INSNS (5), /* DI */
491 COSTS_N_INSNS (5)}, /* other */
492 0, /* cost of multiply per each bit set */
493 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
494 COSTS_N_INSNS (26), /* HI */
495 COSTS_N_INSNS (42), /* SI */
496 COSTS_N_INSNS (74), /* DI */
497 COSTS_N_INSNS (74)}, /* other */
498 COSTS_N_INSNS (1), /* cost of movsx */
499 COSTS_N_INSNS (1), /* cost of movzx */
500 8, /* "large" insn */
501 9, /* MOVE_RATIO */
502 4, /* cost for loading QImode using movzbl */
503 {3, 4, 3}, /* cost of loading integer registers
504 in QImode, HImode and SImode.
505 Relative to reg-reg move (2). */
506 {3, 4, 3}, /* cost of storing integer registers */
507 4, /* cost of reg,reg fld/fst */
508 {4, 4, 12}, /* cost of loading fp registers
509 in SFmode, DFmode and XFmode */
510 {6, 6, 8}, /* cost of storing fp registers
511 in SFmode, DFmode and XFmode */
512 2, /* cost of moving MMX register */
513 {4, 4}, /* cost of loading MMX registers
514 in SImode and DImode */
515 {4, 4}, /* cost of storing MMX registers
516 in SImode and DImode */
517 2, /* cost of moving SSE register */
518 {4, 4, 6}, /* cost of loading SSE registers
519 in SImode, DImode and TImode */
520 {4, 4, 5}, /* cost of storing SSE registers
521 in SImode, DImode and TImode */
522 5, /* MMX or SSE register to integer */
523 64, /* size of prefetch block */
524 6, /* number of parallel prefetches */
525 5, /* Branch cost */
526 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
527 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
528 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
529 COSTS_N_INSNS (2), /* cost of FABS instruction. */
530 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
531 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
532 /* For some reason, Athlon deals better with the REP prefix (relative to loops)
533 than K8 does. Alignment becomes important after 8 bytes for memcpy and
534 128 bytes for memset. */
535 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
536 DUMMY_STRINGOP_ALGS},
537 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
538 DUMMY_STRINGOP_ALGS}
539 };
540
541 static const
542 struct processor_costs k8_cost = {
543 COSTS_N_INSNS (1), /* cost of an add instruction */
544 COSTS_N_INSNS (2), /* cost of a lea instruction */
545 COSTS_N_INSNS (1), /* variable shift costs */
546 COSTS_N_INSNS (1), /* constant shift costs */
547 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
548 COSTS_N_INSNS (4), /* HI */
549 COSTS_N_INSNS (3), /* SI */
550 COSTS_N_INSNS (4), /* DI */
551 COSTS_N_INSNS (5)}, /* other */
552 0, /* cost of multiply per each bit set */
553 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
554 COSTS_N_INSNS (26), /* HI */
555 COSTS_N_INSNS (42), /* SI */
556 COSTS_N_INSNS (74), /* DI */
557 COSTS_N_INSNS (74)}, /* other */
558 COSTS_N_INSNS (1), /* cost of movsx */
559 COSTS_N_INSNS (1), /* cost of movzx */
560 8, /* "large" insn */
561 9, /* MOVE_RATIO */
562 4, /* cost for loading QImode using movzbl */
563 {3, 4, 3}, /* cost of loading integer registers
564 in QImode, HImode and SImode.
565 Relative to reg-reg move (2). */
566 {3, 4, 3}, /* cost of storing integer registers */
567 4, /* cost of reg,reg fld/fst */
568 {4, 4, 12}, /* cost of loading fp registers
569 in SFmode, DFmode and XFmode */
570 {6, 6, 8}, /* cost of storing fp registers
571 in SFmode, DFmode and XFmode */
572 2, /* cost of moving MMX register */
573 {3, 3}, /* cost of loading MMX registers
574 in SImode and DImode */
575 {4, 4}, /* cost of storing MMX registers
576 in SImode and DImode */
577 2, /* cost of moving SSE register */
578 {4, 3, 6}, /* cost of loading SSE registers
579 in SImode, DImode and TImode */
580 {4, 4, 5}, /* cost of storing SSE registers
581 in SImode, DImode and TImode */
582 5, /* MMX or SSE register to integer */
583 64, /* size of prefetch block */
584 /* New AMD processors never drop prefetches; if they cannot be performed
585 immediately, they are queued. We set the number of simultaneous prefetches
586 to a large constant to reflect this (it is probably not a good idea not
587 to limit the number of prefetches at all, as their execution also takes
588 some time). */
589 100, /* number of parallel prefetches */
590 5, /* Branch cost */
591 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
592 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
593 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
594 COSTS_N_INSNS (2), /* cost of FABS instruction. */
595 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
596 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
597 /* K8 has optimized REP instructions for medium-sized blocks, but for very small
598 blocks it is better to use a loop. For large blocks, a libcall can do
599 nontemporal accesses and beat inline code considerably. */
600 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
601 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
602 {{libcall, {{8, loop}, {24, unrolled_loop},
603 {2048, rep_prefix_4_byte}, {-1, libcall}}},
604 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}
605 };
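/* Illustrative reading, not part of the original file, of the stringop_algs
   initializers that close each cost table (memcpy descriptors first, then
   memset). The interpretation assumed here matches how the string-operation
   expansion in this file is expected to consume them: the two sub-entries
   cover the 32-bit and 64-bit cases in that order (DUMMY_STRINGOP_ALGS
   filling an unused 64-bit slot), the leading algorithm is used when the
   block size is unknown at compile time, and each {max, alg} pair selects
   alg for known sizes up to max bytes, with -1 meaning any larger size.
   The K8 32-bit memcpy entry above therefore reads:

     unknown size          -> libcall
     size <= 6 bytes       -> inline loop
     size <= 14 bytes      -> unrolled loop
     any larger known size -> rep_prefix_4_byte (rep movsl) */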
606
607 static const struct processor_costs amdfam10_cost = {
608 COSTS_N_INSNS (1), /* cost of an add instruction */
609 COSTS_N_INSNS (2), /* cost of a lea instruction */
610 COSTS_N_INSNS (1), /* variable shift costs */
611 COSTS_N_INSNS (1), /* constant shift costs */
612 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
613 COSTS_N_INSNS (4), /* HI */
614 COSTS_N_INSNS (3), /* SI */
615 COSTS_N_INSNS (4), /* DI */
616 COSTS_N_INSNS (5)}, /* other */
617 0, /* cost of multiply per each bit set */
618 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
619 COSTS_N_INSNS (35), /* HI */
620 COSTS_N_INSNS (51), /* SI */
621 COSTS_N_INSNS (83), /* DI */
622 COSTS_N_INSNS (83)}, /* other */
623 COSTS_N_INSNS (1), /* cost of movsx */
624 COSTS_N_INSNS (1), /* cost of movzx */
625 8, /* "large" insn */
626 9, /* MOVE_RATIO */
627 4, /* cost for loading QImode using movzbl */
628 {3, 4, 3}, /* cost of loading integer registers
629 in QImode, HImode and SImode.
630 Relative to reg-reg move (2). */
631 {3, 4, 3}, /* cost of storing integer registers */
632 4, /* cost of reg,reg fld/fst */
633 {4, 4, 12}, /* cost of loading fp registers
634 in SFmode, DFmode and XFmode */
635 {6, 6, 8}, /* cost of storing fp registers
636 in SFmode, DFmode and XFmode */
637 2, /* cost of moving MMX register */
638 {3, 3}, /* cost of loading MMX registers
639 in SImode and DImode */
640 {4, 4}, /* cost of storing MMX registers
641 in SImode and DImode */
642 2, /* cost of moving SSE register */
643 {4, 4, 3}, /* cost of loading SSE registers
644 in SImode, DImode and TImode */
645 {4, 4, 5}, /* cost of storing SSE registers
646 in SImode, DImode and TImode */
647 3, /* MMX or SSE register to integer */
648 /* On K8
649 MOVD reg64, xmmreg Double FSTORE 4
650 MOVD reg32, xmmreg Double FSTORE 4
651 On AMDFAM10
652 MOVD reg64, xmmreg Double FADD 3
653 1/1 1/1
654 MOVD reg32, xmmreg Double FADD 3
655 1/1 1/1 */
656 64, /* size of prefetch block */
657 /* New AMD processors never drop prefetches; if they cannot be performed
658 immediately, they are queued. We set the number of simultaneous prefetches
659 to a large constant to reflect this (it is probably not a good idea not
660 to limit the number of prefetches at all, as their execution also takes
661 some time). */
662 100, /* number of parallel prefetches */
663 5, /* Branch cost */
664 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
665 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
666 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
667 COSTS_N_INSNS (2), /* cost of FABS instruction. */
668 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
669 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
670
671 /* AMDFAM10 has optimized REP instructions for medium-sized blocks, but for
672 very small blocks it is better to use a loop. For large blocks, a libcall
673 can do nontemporal accesses and beat inline code considerably. */
674 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
675 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
676 {{libcall, {{8, loop}, {24, unrolled_loop},
677 {2048, rep_prefix_4_byte}, {-1, libcall}}},
678 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}
679 };
680
681 static const
682 struct processor_costs pentium4_cost = {
683 COSTS_N_INSNS (1), /* cost of an add instruction */
684 COSTS_N_INSNS (3), /* cost of a lea instruction */
685 COSTS_N_INSNS (4), /* variable shift costs */
686 COSTS_N_INSNS (4), /* constant shift costs */
687 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
688 COSTS_N_INSNS (15), /* HI */
689 COSTS_N_INSNS (15), /* SI */
690 COSTS_N_INSNS (15), /* DI */
691 COSTS_N_INSNS (15)}, /* other */
692 0, /* cost of multiply per each bit set */
693 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
694 COSTS_N_INSNS (56), /* HI */
695 COSTS_N_INSNS (56), /* SI */
696 COSTS_N_INSNS (56), /* DI */
697 COSTS_N_INSNS (56)}, /* other */
698 COSTS_N_INSNS (1), /* cost of movsx */
699 COSTS_N_INSNS (1), /* cost of movzx */
700 16, /* "large" insn */
701 6, /* MOVE_RATIO */
702 2, /* cost for loading QImode using movzbl */
703 {4, 5, 4}, /* cost of loading integer registers
704 in QImode, HImode and SImode.
705 Relative to reg-reg move (2). */
706 {2, 3, 2}, /* cost of storing integer registers */
707 2, /* cost of reg,reg fld/fst */
708 {2, 2, 6}, /* cost of loading fp registers
709 in SFmode, DFmode and XFmode */
710 {4, 4, 6}, /* cost of storing fp registers
711 in SFmode, DFmode and XFmode */
712 2, /* cost of moving MMX register */
713 {2, 2}, /* cost of loading MMX registers
714 in SImode and DImode */
715 {2, 2}, /* cost of storing MMX registers
716 in SImode and DImode */
717 12, /* cost of moving SSE register */
718 {12, 12, 12}, /* cost of loading SSE registers
719 in SImode, DImode and TImode */
720 {2, 2, 8}, /* cost of storing SSE registers
721 in SImode, DImode and TImode */
722 10, /* MMX or SSE register to integer */
723 64, /* size of prefetch block */
724 6, /* number of parallel prefetches */
725 2, /* Branch cost */
726 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
727 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
728 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
729 COSTS_N_INSNS (2), /* cost of FABS instruction. */
730 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
731 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
732 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
733 DUMMY_STRINGOP_ALGS},
734 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
735 {-1, libcall}}},
736 DUMMY_STRINGOP_ALGS},
737 };
738
739 static const
740 struct processor_costs nocona_cost = {
741 COSTS_N_INSNS (1), /* cost of an add instruction */
742 COSTS_N_INSNS (1), /* cost of a lea instruction */
743 COSTS_N_INSNS (1), /* variable shift costs */
744 COSTS_N_INSNS (1), /* constant shift costs */
745 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
746 COSTS_N_INSNS (10), /* HI */
747 COSTS_N_INSNS (10), /* SI */
748 COSTS_N_INSNS (10), /* DI */
749 COSTS_N_INSNS (10)}, /* other */
750 0, /* cost of multiply per each bit set */
751 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
752 COSTS_N_INSNS (66), /* HI */
753 COSTS_N_INSNS (66), /* SI */
754 COSTS_N_INSNS (66), /* DI */
755 COSTS_N_INSNS (66)}, /* other */
756 COSTS_N_INSNS (1), /* cost of movsx */
757 COSTS_N_INSNS (1), /* cost of movzx */
758 16, /* "large" insn */
759 17, /* MOVE_RATIO */
760 4, /* cost for loading QImode using movzbl */
761 {4, 4, 4}, /* cost of loading integer registers
762 in QImode, HImode and SImode.
763 Relative to reg-reg move (2). */
764 {4, 4, 4}, /* cost of storing integer registers */
765 3, /* cost of reg,reg fld/fst */
766 {12, 12, 12}, /* cost of loading fp registers
767 in SFmode, DFmode and XFmode */
768 {4, 4, 4}, /* cost of storing fp registers
769 in SFmode, DFmode and XFmode */
770 6, /* cost of moving MMX register */
771 {12, 12}, /* cost of loading MMX registers
772 in SImode and DImode */
773 {12, 12}, /* cost of storing MMX registers
774 in SImode and DImode */
775 6, /* cost of moving SSE register */
776 {12, 12, 12}, /* cost of loading SSE registers
777 in SImode, DImode and TImode */
778 {12, 12, 12}, /* cost of storing SSE registers
779 in SImode, DImode and TImode */
780 8, /* MMX or SSE register to integer */
781 128, /* size of prefetch block */
782 8, /* number of parallel prefetches */
783 1, /* Branch cost */
784 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
785 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
786 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
787 COSTS_N_INSNS (3), /* cost of FABS instruction. */
788 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
789 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
790 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
791 {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
792 {100000, unrolled_loop}, {-1, libcall}}}},
793 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
794 {-1, libcall}}},
795 {libcall, {{24, loop}, {64, unrolled_loop},
796 {8192, rep_prefix_8_byte}, {-1, libcall}}}}
797 };
798
799 static const
800 struct processor_costs core2_cost = {
801 COSTS_N_INSNS (1), /* cost of an add instruction */
802 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
803 COSTS_N_INSNS (1), /* variable shift costs */
804 COSTS_N_INSNS (1), /* constant shift costs */
805 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
806 COSTS_N_INSNS (3), /* HI */
807 COSTS_N_INSNS (3), /* SI */
808 COSTS_N_INSNS (3), /* DI */
809 COSTS_N_INSNS (3)}, /* other */
810 0, /* cost of multiply per each bit set */
811 {COSTS_N_INSNS (22), /* cost of a divide/mod for QI */
812 COSTS_N_INSNS (22), /* HI */
813 COSTS_N_INSNS (22), /* SI */
814 COSTS_N_INSNS (22), /* DI */
815 COSTS_N_INSNS (22)}, /* other */
816 COSTS_N_INSNS (1), /* cost of movsx */
817 COSTS_N_INSNS (1), /* cost of movzx */
818 8, /* "large" insn */
819 16, /* MOVE_RATIO */
820 2, /* cost for loading QImode using movzbl */
821 {6, 6, 6}, /* cost of loading integer registers
822 in QImode, HImode and SImode.
823 Relative to reg-reg move (2). */
824 {4, 4, 4}, /* cost of storing integer registers */
825 2, /* cost of reg,reg fld/fst */
826 {6, 6, 6}, /* cost of loading fp registers
827 in SFmode, DFmode and XFmode */
828 {4, 4, 4}, /* cost of storing fp registers in SFmode, DFmode and XFmode */
829 2, /* cost of moving MMX register */
830 {6, 6}, /* cost of loading MMX registers
831 in SImode and DImode */
832 {4, 4}, /* cost of storing MMX registers
833 in SImode and DImode */
834 2, /* cost of moving SSE register */
835 {6, 6, 6}, /* cost of loading SSE registers
836 in SImode, DImode and TImode */
837 {4, 4, 4}, /* cost of storing SSE registers
838 in SImode, DImode and TImode */
839 2, /* MMX or SSE register to integer */
840 128, /* size of prefetch block */
841 8, /* number of parallel prefetches */
842 3, /* Branch cost */
843 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
844 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
845 COSTS_N_INSNS (32), /* cost of FDIV instruction. */
846 COSTS_N_INSNS (1), /* cost of FABS instruction. */
847 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
848 COSTS_N_INSNS (58), /* cost of FSQRT instruction. */
849 {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
850 {libcall, {{32, loop}, {64, rep_prefix_4_byte},
851 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
852 {{libcall, {{8, loop}, {15, unrolled_loop},
853 {2048, rep_prefix_4_byte}, {-1, libcall}}},
854 {libcall, {{24, loop}, {32, unrolled_loop},
855 {8192, rep_prefix_8_byte}, {-1, libcall}}}}
856 };
857
858 /* Generic64 should produce code tuned for Nocona and K8. */
859 static const
860 struct processor_costs generic64_cost = {
861 COSTS_N_INSNS (1), /* cost of an add instruction */
862 /* On all chips taken into consideration, lea takes 2 cycles or more. With
863 this cost, however, our current implementation of synth_mult results in the
864 use of unnecessary temporary registers, causing regressions on several
865 SPECfp benchmarks. */
866 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
867 COSTS_N_INSNS (1), /* variable shift costs */
868 COSTS_N_INSNS (1), /* constant shift costs */
869 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
870 COSTS_N_INSNS (4), /* HI */
871 COSTS_N_INSNS (3), /* SI */
872 COSTS_N_INSNS (4), /* DI */
873 COSTS_N_INSNS (2)}, /* other */
874 0, /* cost of multiply per each bit set */
875 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
876 COSTS_N_INSNS (26), /* HI */
877 COSTS_N_INSNS (42), /* SI */
878 COSTS_N_INSNS (74), /* DI */
879 COSTS_N_INSNS (74)}, /* other */
880 COSTS_N_INSNS (1), /* cost of movsx */
881 COSTS_N_INSNS (1), /* cost of movzx */
882 8, /* "large" insn */
883 17, /* MOVE_RATIO */
884 4, /* cost for loading QImode using movzbl */
885 {4, 4, 4}, /* cost of loading integer registers
886 in QImode, HImode and SImode.
887 Relative to reg-reg move (2). */
888 {4, 4, 4}, /* cost of storing integer registers */
889 4, /* cost of reg,reg fld/fst */
890 {12, 12, 12}, /* cost of loading fp registers
891 in SFmode, DFmode and XFmode */
892 {6, 6, 8}, /* cost of storing fp registers
893 in SFmode, DFmode and XFmode */
894 2, /* cost of moving MMX register */
895 {8, 8}, /* cost of loading MMX registers
896 in SImode and DImode */
897 {8, 8}, /* cost of storing MMX registers
898 in SImode and DImode */
899 2, /* cost of moving SSE register */
900 {8, 8, 8}, /* cost of loading SSE registers
901 in SImode, DImode and TImode */
902 {8, 8, 8}, /* cost of storing SSE registers
903 in SImode, DImode and TImode */
904 5, /* MMX or SSE register to integer */
905 64, /* size of prefetch block */
906 6, /* number of parallel prefetches */
907 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
908 value is increased to the perhaps more appropriate value of 5. */
909 3, /* Branch cost */
910 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
911 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
912 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
913 COSTS_N_INSNS (8), /* cost of FABS instruction. */
914 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
915 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
916 {DUMMY_STRINGOP_ALGS,
917 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
918 {DUMMY_STRINGOP_ALGS,
919 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}
920 };
921
922 /* Generic32 should produce code tuned for Athlon, PPro, Pentium4, Nocona and K8. */
923 static const
924 struct processor_costs generic32_cost = {
925 COSTS_N_INSNS (1), /* cost of an add instruction */
926 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
927 COSTS_N_INSNS (1), /* variable shift costs */
928 COSTS_N_INSNS (1), /* constant shift costs */
929 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
930 COSTS_N_INSNS (4), /* HI */
931 COSTS_N_INSNS (3), /* SI */
932 COSTS_N_INSNS (4), /* DI */
933 COSTS_N_INSNS (2)}, /* other */
934 0, /* cost of multiply per each bit set */
935 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
936 COSTS_N_INSNS (26), /* HI */
937 COSTS_N_INSNS (42), /* SI */
938 COSTS_N_INSNS (74), /* DI */
939 COSTS_N_INSNS (74)}, /* other */
940 COSTS_N_INSNS (1), /* cost of movsx */
941 COSTS_N_INSNS (1), /* cost of movzx */
942 8, /* "large" insn */
943 17, /* MOVE_RATIO */
944 4, /* cost for loading QImode using movzbl */
945 {4, 4, 4}, /* cost of loading integer registers
946 in QImode, HImode and SImode.
947 Relative to reg-reg move (2). */
948 {4, 4, 4}, /* cost of storing integer registers */
949 4, /* cost of reg,reg fld/fst */
950 {12, 12, 12}, /* cost of loading fp registers
951 in SFmode, DFmode and XFmode */
952 {6, 6, 8}, /* cost of storing fp registers
953 in SFmode, DFmode and XFmode */
954 2, /* cost of moving MMX register */
955 {8, 8}, /* cost of loading MMX registers
956 in SImode and DImode */
957 {8, 8}, /* cost of storing MMX registers
958 in SImode and DImode */
959 2, /* cost of moving SSE register */
960 {8, 8, 8}, /* cost of loading SSE registers
961 in SImode, DImode and TImode */
962 {8, 8, 8}, /* cost of storing SSE registers
963 in SImode, DImode and TImode */
964 5, /* MMX or SSE register to integer */
965 64, /* size of prefetch block */
966 6, /* number of parallel prefetches */
967 3, /* Branch cost */
968 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
969 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
970 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
971 COSTS_N_INSNS (8), /* cost of FABS instruction. */
972 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
973 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
974 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
975 DUMMY_STRINGOP_ALGS},
976 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
977 DUMMY_STRINGOP_ALGS},
978 };
979
980 const struct processor_costs *ix86_cost = &pentium_cost;
981
982 /* Processor feature/optimization bitmasks. */
983 #define m_386 (1<<PROCESSOR_I386)
984 #define m_486 (1<<PROCESSOR_I486)
985 #define m_PENT (1<<PROCESSOR_PENTIUM)
986 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
987 #define m_GEODE (1<<PROCESSOR_GEODE)
988 #define m_K6_GEODE (m_K6 | m_GEODE)
989 #define m_K6 (1<<PROCESSOR_K6)
990 #define m_ATHLON (1<<PROCESSOR_ATHLON)
991 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
992 #define m_K8 (1<<PROCESSOR_K8)
993 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
994 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
995 #define m_NOCONA (1<<PROCESSOR_NOCONA)
996 #define m_CORE2 (1<<PROCESSOR_CORE2)
997 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
998 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
999 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1000 #define m_ATHLON_K8_AMDFAM10 (m_K8 | m_ATHLON | m_AMDFAM10)
1001
1002 /* Generic instruction choice should be a common subset of the supported CPUs
1003 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1004
1005 /* Leave does not affect Nocona SPEC2000 results negatively, so enabling it for
1006 Generic64 seems like a good code-size tradeoff. We can't enable it for 32-bit
1007 generic because it does not work well with PPro-based chips. */
1008 const int x86_use_leave = m_386 | m_K6_GEODE | m_ATHLON_K8_AMDFAM10 | m_CORE2
1009 | m_GENERIC64;
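/* Illustrative sketch, not part of the original file: each of these tuning
   bitmasks is consumed by testing the bit of the processor currently being
   tuned for, roughly

     if (x86_use_leave & (1 << ix86_tune))
       ... the epilogue may use the "leave" instruction ...

   with i386.h assumed to wrap such tests in TARGET_* convenience macros
   (e.g. a TARGET_USE_LEAVE built from x86_use_leave and ix86_tune). */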
1010 const int x86_push_memory = m_386 | m_K6_GEODE | m_ATHLON_K8_AMDFAM10 | m_PENT4
1011 | m_NOCONA | m_CORE2 | m_GENERIC;
1012 const int x86_zero_extend_with_and = m_486 | m_PENT;
1013 /* Enable zero-extending integer registers to avoid partial-register dependencies. */
1014 const int x86_movx = m_ATHLON_K8_AMDFAM10 | m_PPRO | m_PENT4 | m_NOCONA
1015 | m_CORE2 | m_GENERIC | m_GEODE /* m_386 | m_K6 */;
1016 const int x86_double_with_add = ~m_386;
1017 const int x86_use_bit_test = m_386;
1018 const int x86_unroll_strlen = m_486 | m_PENT | m_PPRO | m_ATHLON_K8_AMDFAM10
1019 | m_K6 | m_CORE2 | m_GENERIC;
1020 const int x86_cmove = m_PPRO | m_GEODE | m_ATHLON_K8_AMDFAM10 | m_PENT4
1021 | m_NOCONA;
1022 const int x86_3dnow_a = m_ATHLON_K8_AMDFAM10;
1023 const int x86_deep_branch = m_PPRO | m_K6_GEODE | m_ATHLON_K8_AMDFAM10
1024 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
1025 /* Branch hints were put in P4 based on simulation results. But
1026 after P4 was made, no performance benefit was observed with
1027 branch hints. They also increase the code size. As a result,
1028 icc never generates branch hints. */
1029 const int x86_branch_hints = 0;
1030 const int x86_use_sahf = m_PPRO | m_K6_GEODE | m_PENT4 | m_NOCONA | m_GENERIC32;
1031 /*m_GENERIC | m_ATHLON_K8 ? */
1032 /* We probably ought to watch for partial register stalls on the Generic32
1033 compilation setting as well. However, in the current implementation the
1034 partial register stalls are not eliminated very well - they can
1035 be introduced via subregs synthesized by combine and can happen
1036 in caller/callee saving sequences.
1037 Because this option pays back little on PPro-based chips and conflicts
1038 with the partial register dependencies used by Athlon/P4-based chips, it is
1039 better to leave it off for generic32 for now. */
1040 const int x86_partial_reg_stall = m_PPRO;
1041 const int x86_partial_flag_reg_stall = m_CORE2 | m_GENERIC;
1042 const int x86_use_himode_fiop = m_386 | m_486 | m_K6_GEODE;
1043 const int x86_use_simode_fiop = ~(m_PPRO | m_ATHLON_K8_AMDFAM10 | m_PENT
1044 | m_CORE2 | m_GENERIC);
1045 const int x86_use_mov0 = m_K6;
1046 const int x86_use_cltd = ~(m_PENT | m_K6 | m_CORE2 | m_GENERIC);
1047 const int x86_read_modify_write = ~m_PENT;
1048 const int x86_read_modify = ~(m_PENT | m_PPRO);
1049 const int x86_split_long_moves = m_PPRO;
1050 const int x86_promote_QImode = m_K6_GEODE | m_PENT | m_386 | m_486
1051 | m_ATHLON_K8_AMDFAM10 | m_CORE2 | m_GENERIC;
1052 /* m_PENT4 ? */
1053 const int x86_fast_prefix = ~(m_PENT | m_486 | m_386);
1054 const int x86_single_stringop = m_386 | m_PENT4 | m_NOCONA;
1055 const int x86_qimode_math = ~(0);
1056 const int x86_promote_qi_regs = 0;
1057 /* On PPro this flag is meant to avoid partial register stalls. Just like
1058 x86_partial_reg_stall, this option might be considered for Generic32
1059 if our scheme for avoiding partial stalls were more effective. */
1060 const int x86_himode_math = ~(m_PPRO);
1061 const int x86_promote_hi_regs = m_PPRO;
1062 /* Enable if add/sub rsp is preferred over 1 or 2 push/pop */
1063 const int x86_sub_esp_4 = m_ATHLON_K8_AMDFAM10 | m_PPRO | m_PENT4 | m_NOCONA
1064 | m_CORE2 | m_GENERIC;
1065 const int x86_sub_esp_8 = m_ATHLON_K8_AMDFAM10 | m_PPRO | m_386 | m_486
1066 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
1067 const int x86_add_esp_4 = m_ATHLON_K8_AMDFAM10 | m_K6_GEODE | m_PENT4 | m_NOCONA
1068 | m_CORE2 | m_GENERIC;
1069 const int x86_add_esp_8 = m_ATHLON_K8_AMDFAM10 | m_PPRO | m_K6_GEODE | m_386
1070 | m_486 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
1071 /* Enable if integer moves are preferred for DFmode copies */
1072 const int x86_integer_DFmode_moves = ~(m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA
1073 | m_PPRO | m_CORE2 | m_GENERIC | m_GEODE);
1074 const int x86_partial_reg_dependency = m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA
1075 | m_CORE2 | m_GENERIC;
1076 const int x86_memory_mismatch_stall = m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA
1077 | m_CORE2 | m_GENERIC;
1078 /* If ACCUMULATE_OUTGOING_ARGS is enabled, the maximum amount of space required
1079 for outgoing arguments will be computed and placed into the variable
1080 `current_function_outgoing_args_size'. No space will be pushed onto the stack
1081 for each call; instead, the function prologue should increase the stack frame
1082 size by this amount. Setting both PUSH_ARGS and ACCUMULATE_OUTGOING_ARGS is
1083 not proper. */
1084 const int x86_accumulate_outgoing_args = m_ATHLON_K8_AMDFAM10 | m_PENT4
1085 | m_NOCONA | m_PPRO | m_CORE2
1086 | m_GENERIC;
1087 const int x86_prologue_using_move = m_ATHLON_K8 | m_PPRO | m_CORE2 | m_GENERIC;
1088 const int x86_epilogue_using_move = m_ATHLON_K8 | m_PPRO | m_CORE2 | m_GENERIC;
1089 const int x86_shift1 = ~m_486;
1090 const int x86_arch_always_fancy_math_387 = m_PENT | m_PPRO
1091 | m_ATHLON_K8_AMDFAM10 | m_PENT4
1092 | m_NOCONA | m_CORE2 | m_GENERIC;
1093 /* In the Generic model we have a conflict here between PPro/Pentium4-based chips
1094 that treat 128-bit SSE registers as single units and K8-based chips that
1095 divide SSE registers into two 64-bit halves.
1096 x86_sse_partial_reg_dependency promotes all store destinations to 128 bits
1097 to allow register renaming on 128-bit SSE units, but usually results in one
1098 extra microop on 64-bit SSE units. Experimental results show that disabling
1099 this option on P4 brings over a 20% SPECfp regression, while enabling it on
1100 K8 brings roughly a 2.4% regression that can be partly masked by careful
1101 scheduling of moves. */
1102 const int x86_sse_partial_reg_dependency = m_PENT4 | m_NOCONA | m_PPRO | m_CORE2
1103 | m_GENERIC | m_AMDFAM10;
1104 /* Set for machines where the type and dependencies are resolved on SSE
1105 register parts instead of whole registers, so we may maintain just the
1106 lower part of scalar values in the proper format, leaving the upper part
1107 undefined. */
1108 const int x86_sse_split_regs = m_ATHLON_K8;
1109 /* Code generation for scalar reg-reg moves of single and double precision data:
1110 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
1111 movaps reg, reg
1112 else
1113 movss reg, reg
1114 if (x86_sse_partial_reg_dependency == true)
1115 movapd reg, reg
1116 else
1117 movsd reg, reg
1118
1119 Code generation for scalar loads of double precision data:
1120 if (x86_sse_split_regs == true)
1121 movlpd mem, reg (gas syntax)
1122 else
1123 movsd mem, reg
1124
1125 Code generation for unaligned packed loads of single precision data
1126 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
1127 if (x86_sse_unaligned_move_optimal)
1128 movups mem, reg
1129
1130 if (x86_sse_partial_reg_dependency == true)
1131 {
1132 xorps reg, reg
1133 movlps mem, reg
1134 movhps mem+8, reg
1135 }
1136 else
1137 {
1138 movlps mem, reg
1139 movhps mem+8, reg
1140 }
1141
1142 Code generation for unaligned packed loads of double precision data
1143 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
1144 if (x86_sse_unaligned_move_optimal)
1145 movupd mem, reg
1146
1147 if (x86_sse_split_regs == true)
1148 {
1149 movlpd mem, reg
1150 movhpd mem+8, reg
1151 }
1152 else
1153 {
1154 movsd mem, reg
1155 movhpd mem+8, reg
1156 }
1157 */
1158 const int x86_sse_unaligned_move_optimal = m_AMDFAM10;
1159 const int x86_sse_typeless_stores = m_ATHLON_K8_AMDFAM10;
1160 const int x86_sse_load0_by_pxor = m_PPRO | m_PENT4 | m_NOCONA;
1161 const int x86_use_ffreep = m_ATHLON_K8_AMDFAM10;
1162 const int x86_use_incdec = ~(m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC);
1163
1164 /* ??? Allowing interunit moves makes it all too easy for the compiler to put
1165 integer data in xmm registers, which results in pretty abysmal code. */
1166 const int x86_inter_unit_moves = 0 /* ~(m_ATHLON_K8) */;
1167
1168 const int x86_ext_80387_constants = m_K6_GEODE | m_ATHLON_K8 | m_PENT4
1169 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC;
1170 /* Some CPU cores are not able to predict more than 4 branch instructions in
1171 the 16 byte window. */
1172 const int x86_four_jump_limit = m_PPRO | m_ATHLON_K8_AMDFAM10 | m_PENT4
1173 | m_NOCONA | m_CORE2 | m_GENERIC;
1174 const int x86_schedule = m_PPRO | m_ATHLON_K8_AMDFAM10 | m_K6_GEODE | m_PENT
1175 | m_CORE2 | m_GENERIC;
1176 const int x86_use_bt = m_ATHLON_K8_AMDFAM10;
1177 /* Compare and exchange was added for 80486. */
1178 const int x86_cmpxchg = ~m_386;
1179 /* Compare and exchange 8 bytes was added for pentium. */
1180 const int x86_cmpxchg8b = ~(m_386 | m_486);
1181 /* Exchange and add was added for 80486. */
1182 const int x86_xadd = ~m_386;
1183 /* Byteswap was added for 80486. */
1184 const int x86_bswap = ~m_386;
1185 const int x86_pad_returns = m_ATHLON_K8_AMDFAM10 | m_CORE2 | m_GENERIC;
1186
1187 static enum stringop_alg stringop_alg = no_stringop;
1188
1189 /* In case the average insn count for a single function invocation is
1190 lower than this constant, emit fast (but longer) prologue and
1191 epilogue code. */
1192 #define FAST_PROLOGUE_INSN_COUNT 20
1193
1194 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively. */
1195 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
1196 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
1197 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
1198
1199 /* Array of the smallest class containing reg number REGNO, indexed by
1200 REGNO. Used by REGNO_REG_CLASS in i386.h. */
1201
1202 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
1203 {
1204 /* ax, dx, cx, bx */
1205 AREG, DREG, CREG, BREG,
1206 /* si, di, bp, sp */
1207 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
1208 /* FP registers */
1209 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
1210 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
1211 /* arg pointer */
1212 NON_Q_REGS,
1213 /* flags, fpsr, fpcr, frame */
1214 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
1215 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
1216 SSE_REGS, SSE_REGS,
1217 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
1218 MMX_REGS, MMX_REGS,
1219 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
1220 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
1221 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
1222 SSE_REGS, SSE_REGS,
1223 };
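/* Illustrative sketch, not part of the original file: REGNO_REG_CLASS in
   i386.h is assumed to index this table directly, so for example

     REGNO_REG_CLASS (0) == AREG        %eax
     REGNO_REG_CLASS (7) == NON_Q_REGS  %esp, no 8-bit low part

   which is how the rest of the compiler learns that only %eax..%ebx have
   directly addressable QImode subregisters in 32-bit mode. */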
1224
1225 /* The "default" register map used in 32bit mode. */
1226
1227 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
1228 {
1229 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
1230 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
1231 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
1232 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
1233 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
1234 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
1235 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
1236 };
1237
1238 static int const x86_64_int_parameter_registers[6] =
1239 {
1240 5 /*RDI*/, 4 /*RSI*/, 1 /*RDX*/, 2 /*RCX*/,
1241 FIRST_REX_INT_REG /*R8 */, FIRST_REX_INT_REG + 1 /*R9 */
1242 };
1243
1244 static int const x86_64_int_return_registers[4] =
1245 {
1246 0 /*RAX*/, 1 /*RDX*/, 5 /*RDI*/, 4 /*RSI*/
1247 };
1248
1249 /* The "default" register map used in 64bit mode. */
1250 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
1251 {
1252 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
1253 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
1254 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
1255 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
1256 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
1257 8,9,10,11,12,13,14,15, /* extended integer registers */
1258 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
1259 };
1260
1261 /* Define the register numbers to be used in Dwarf debugging information.
1262 The SVR4 reference port C compiler uses the following register numbers
1263 in its Dwarf output code:
1264 0 for %eax (gcc regno = 0)
1265 1 for %ecx (gcc regno = 2)
1266 2 for %edx (gcc regno = 1)
1267 3 for %ebx (gcc regno = 3)
1268 4 for %esp (gcc regno = 7)
1269 5 for %ebp (gcc regno = 6)
1270 6 for %esi (gcc regno = 4)
1271 7 for %edi (gcc regno = 5)
1272 The following three DWARF register numbers are never generated by
1273 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
1274 believes these numbers have these meanings.
1275 8 for %eip (no gcc equivalent)
1276 9 for %eflags (gcc regno = 17)
1277 10 for %trapno (no gcc equivalent)
1278 It is not at all clear how we should number the FP stack registers
1279 for the x86 architecture. If the version of SDB on x86/svr4 were
1280 a bit less brain dead with respect to floating-point then we would
1281 have a precedent to follow with respect to DWARF register numbers
1282 for x86 FP registers, but the SDB on x86/svr4 is so completely
1283 broken with respect to FP registers that it is hardly worth thinking
1284 of it as something to strive for compatibility with.
1285 The version of x86/svr4 SDB I have at the moment does (partially)
1286 seem to believe that DWARF register number 11 is associated with
1287 the x86 register %st(0), but that's about all. Higher DWARF
1288 register numbers don't seem to be associated with anything in
1289 particular, and even for DWARF regno 11, SDB only seems to
1290 understand that it should say that a variable lives in %st(0) (when
1291 asked via an `=' command) if we said it was in DWARF regno 11,
1292 but SDB still prints garbage when asked for the value of the
1293 variable in question (via a `/' command).
1294 (Also note that the labels SDB prints for various FP stack regs
1295 when doing an `x' command are all wrong.)
1296 Note that these problems generally don't affect the native SVR4
1297 C compiler because it doesn't allow the use of -O with -g and
1298 because when it is *not* optimizing, it allocates a memory
1299 location for each floating-point variable, and the memory
1300 location is what gets described in the DWARF AT_location
1301 attribute for the variable in question.
1302 Regardless of the severe mental illness of the x86/svr4 SDB, we
1303 do something sensible here and we use the following DWARF
1304 register numbers. Note that these are all stack-top-relative
1305 numbers.
1306 11 for %st(0) (gcc regno = 8)
1307 12 for %st(1) (gcc regno = 9)
1308 13 for %st(2) (gcc regno = 10)
1309 14 for %st(3) (gcc regno = 11)
1310 15 for %st(4) (gcc regno = 12)
1311 16 for %st(5) (gcc regno = 13)
1312 17 for %st(6) (gcc regno = 14)
1313 18 for %st(7) (gcc regno = 15)
1314 */
1315 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
1316 {
1317 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
1318 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
1319 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
1320 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
1321 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
1322 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
1323 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
1324 };
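/* For reference: these maps are consumed through the DBX_REGISTER_NUMBER
   target macro, which dwarf2out.c and dbxout.c use to translate a gcc hard
   register number into a debug register number.  A config header selects a
   map roughly along these lines (the exact conditional is configuration
   specific, so treat this as a sketch):

     #define DBX_REGISTER_NUMBER(n) \
       (TARGET_64BIT ? dbx64_register_map[(n)] : svr4_dbx_register_map[(n)])
*/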
1325
1326 /* Test and compare insns in i386.md store the information needed to
1327 generate branch and scc insns here. */
1328
1329 rtx ix86_compare_op0 = NULL_RTX;
1330 rtx ix86_compare_op1 = NULL_RTX;
1331 rtx ix86_compare_emitted = NULL_RTX;
1332
1333 /* Size of the register save area. */
1334 #define X86_64_VARARGS_SIZE (REGPARM_MAX * UNITS_PER_WORD + SSE_REGPARM_MAX * 16)
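/* With the usual 64-bit values (REGPARM_MAX == 6, SSE_REGPARM_MAX == 8,
   UNITS_PER_WORD == 8) this comes to 6*8 + 8*16 = 176 bytes, the size of
   the register save area that va_start spills the argument registers to.  */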
1335
1336 /* Define the structure for the machine field in struct function. */
1337
1338 struct stack_local_entry GTY(())
1339 {
1340 unsigned short mode;
1341 unsigned short n;
1342 rtx rtl;
1343 struct stack_local_entry *next;
1344 };
1345
1346 /* Structure describing stack frame layout.
1347 Stack grows downward:
1348
1349 [arguments]
1350 <- ARG_POINTER
1351 saved pc
1352
1353 saved frame pointer if frame_pointer_needed
1354 <- HARD_FRAME_POINTER
1355 [saved regs]
1356
1357 [padding1] \
1358 )
1359 [va_arg registers] (
1360 > to_allocate <- FRAME_POINTER
1361 [frame] (
1362 )
1363 [padding2] /
1364 */
1365 struct ix86_frame
1366 {
1367 int nregs;
1368 int padding1;
1369 int va_arg_size;
1370 HOST_WIDE_INT frame;
1371 int padding2;
1372 int outgoing_arguments_size;
1373 int red_zone_size;
1374
1375 HOST_WIDE_INT to_allocate;
1376 /* The offsets relative to ARG_POINTER. */
1377 HOST_WIDE_INT frame_pointer_offset;
1378 HOST_WIDE_INT hard_frame_pointer_offset;
1379 HOST_WIDE_INT stack_pointer_offset;
1380
1381 /* When save_regs_using_mov is set, emit prologue using
1382 move instead of push instructions. */
1383 bool save_regs_using_mov;
1384 };
1385
1386 /* Code model option. */
1387 enum cmodel ix86_cmodel;
1388 /* Asm dialect. */
1389 enum asm_dialect ix86_asm_dialect = ASM_ATT;
1390 /* TLS dialects. */
1391 enum tls_dialect ix86_tls_dialect = TLS_DIALECT_GNU;
1392
1393 /* Which unit we are generating floating point math for. */
1394 enum fpmath_unit ix86_fpmath;
1395
1396 /* Which cpu are we scheduling for. */
1397 enum processor_type ix86_tune;
1398 /* Which instruction set architecture to use. */
1399 enum processor_type ix86_arch;
1400
1401 /* true if sse prefetch instruction is not NOOP. */
1402 int x86_prefetch_sse;
1403
1404 /* true if cmpxchg16b is supported. */
1405 int x86_cmpxchg16b;
1406
1407 /* ix86_regparm_string as a number */
1408 static int ix86_regparm;
1409
1410 /* -mstackrealign option */
1411 extern int ix86_force_align_arg_pointer;
1412 static const char ix86_force_align_arg_pointer_string[] = "force_align_arg_pointer";
1413
1414 /* Preferred alignment for stack boundary in bits. */
1415 unsigned int ix86_preferred_stack_boundary;
1416
1417 /* Values 1-5: see jump.c */
1418 int ix86_branch_cost;
1419
1420 /* Variables of this size or smaller stay in the normal data/bss sections;
1421 larger ones go to the ldata/lbss sections under the medium code model. */
1422
1423 int ix86_section_threshold = 65536;
1424
1425 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
1426 char internal_label_prefix[16];
1427 int internal_label_prefix_len;
1428 \f
1429 static bool ix86_handle_option (size_t, const char *, int);
1430 static void output_pic_addr_const (FILE *, rtx, int);
1431 static void put_condition_code (enum rtx_code, enum machine_mode,
1432 int, int, FILE *);
1433 static const char *get_some_local_dynamic_name (void);
1434 static int get_some_local_dynamic_name_1 (rtx *, void *);
1435 static rtx ix86_expand_int_compare (enum rtx_code, rtx, rtx);
1436 static enum rtx_code ix86_prepare_fp_compare_args (enum rtx_code, rtx *,
1437 rtx *);
1438 static bool ix86_fixed_condition_code_regs (unsigned int *, unsigned int *);
1439 static enum machine_mode ix86_cc_modes_compatible (enum machine_mode,
1440 enum machine_mode);
1441 static rtx get_thread_pointer (int);
1442 static rtx legitimize_tls_address (rtx, enum tls_model, int);
1443 static void get_pc_thunk_name (char [32], unsigned int);
1444 static rtx gen_push (rtx);
1445 static int ix86_flags_dependent (rtx, rtx, enum attr_type);
1446 static int ix86_agi_dependent (rtx, rtx, enum attr_type);
1447 static struct machine_function * ix86_init_machine_status (void);
1448 static int ix86_split_to_parts (rtx, rtx *, enum machine_mode);
1449 static int ix86_nsaved_regs (void);
1450 static void ix86_emit_save_regs (void);
1451 static void ix86_emit_save_regs_using_mov (rtx, HOST_WIDE_INT);
1452 static void ix86_emit_restore_regs_using_mov (rtx, HOST_WIDE_INT, int);
1453 static void ix86_output_function_epilogue (FILE *, HOST_WIDE_INT);
1454 static HOST_WIDE_INT ix86_GOT_alias_set (void);
1455 static void ix86_adjust_counter (rtx, HOST_WIDE_INT);
1456 static void ix86_expand_strlensi_unroll_1 (rtx, rtx, rtx);
1457 static int ix86_issue_rate (void);
1458 static int ix86_adjust_cost (rtx, rtx, rtx, int);
1459 static int ia32_multipass_dfa_lookahead (void);
1460 static void ix86_init_mmx_sse_builtins (void);
1461 static rtx x86_this_parameter (tree);
1462 static void x86_output_mi_thunk (FILE *, tree, HOST_WIDE_INT,
1463 HOST_WIDE_INT, tree);
1464 static bool x86_can_output_mi_thunk (tree, HOST_WIDE_INT, HOST_WIDE_INT, tree);
1465 static void x86_file_start (void);
1466 static void ix86_reorg (void);
1467 static bool ix86_expand_carry_flag_compare (enum rtx_code, rtx, rtx, rtx*);
1468 static tree ix86_build_builtin_va_list (void);
1469 static void ix86_setup_incoming_varargs (CUMULATIVE_ARGS *, enum machine_mode,
1470 tree, int *, int);
1471 static tree ix86_gimplify_va_arg (tree, tree, tree *, tree *);
1472 static bool ix86_scalar_mode_supported_p (enum machine_mode);
1473 static bool ix86_vector_mode_supported_p (enum machine_mode);
1474
1475 static int ix86_address_cost (rtx);
1476 static bool ix86_cannot_force_const_mem (rtx);
1477 static rtx ix86_delegitimize_address (rtx);
1478
1479 static void i386_output_dwarf_dtprel (FILE *, int, rtx) ATTRIBUTE_UNUSED;
1480
1481 struct builtin_description;
1482 static rtx ix86_expand_sse_comi (const struct builtin_description *,
1483 tree, rtx);
1484 static rtx ix86_expand_sse_compare (const struct builtin_description *,
1485 tree, rtx);
1486 static rtx ix86_expand_unop1_builtin (enum insn_code, tree, rtx);
1487 static rtx ix86_expand_unop_builtin (enum insn_code, tree, rtx, int);
1488 static rtx ix86_expand_binop_builtin (enum insn_code, tree, rtx);
1489 static rtx ix86_expand_store_builtin (enum insn_code, tree);
1490 static rtx safe_vector_operand (rtx, enum machine_mode);
1491 static rtx ix86_expand_fp_compare (enum rtx_code, rtx, rtx, rtx, rtx *, rtx *);
1492 static int ix86_fp_comparison_arithmetics_cost (enum rtx_code code);
1493 static int ix86_fp_comparison_fcomi_cost (enum rtx_code code);
1494 static int ix86_fp_comparison_sahf_cost (enum rtx_code code);
1495 static int ix86_fp_comparison_cost (enum rtx_code code);
1496 static unsigned int ix86_select_alt_pic_regnum (void);
1497 static int ix86_save_reg (unsigned int, int);
1498 static void ix86_compute_frame_layout (struct ix86_frame *);
1499 static int ix86_comp_type_attributes (tree, tree);
1500 static int ix86_function_regparm (tree, tree);
1501 const struct attribute_spec ix86_attribute_table[];
1502 static bool ix86_function_ok_for_sibcall (tree, tree);
1503 static tree ix86_handle_cconv_attribute (tree *, tree, tree, int, bool *);
1504 static int ix86_value_regno (enum machine_mode, tree, tree);
1505 static bool contains_128bit_aligned_vector_p (tree);
1506 static rtx ix86_struct_value_rtx (tree, int);
1507 static bool ix86_ms_bitfield_layout_p (tree);
1508 static tree ix86_handle_struct_attribute (tree *, tree, tree, int, bool *);
1509 static int extended_reg_mentioned_1 (rtx *, void *);
1510 static bool ix86_rtx_costs (rtx, int, int, int *);
1511 static int min_insn_size (rtx);
1512 static tree ix86_md_asm_clobbers (tree outputs, tree inputs, tree clobbers);
1513 static bool ix86_must_pass_in_stack (enum machine_mode mode, tree type);
1514 static bool ix86_pass_by_reference (CUMULATIVE_ARGS *, enum machine_mode,
1515 tree, bool);
1516 static void ix86_init_builtins (void);
1517 static rtx ix86_expand_builtin (tree, rtx, rtx, enum machine_mode, int);
1518 static tree ix86_builtin_vectorized_function (enum built_in_function, tree, tree);
1519 static const char *ix86_mangle_fundamental_type (tree);
1520 static tree ix86_stack_protect_fail (void);
1521 static rtx ix86_internal_arg_pointer (void);
1522 static void ix86_dwarf_handle_frame_unspec (const char *, rtx, int);
1523
1524 /* This function is only used on Solaris. */
1525 static void i386_solaris_elf_named_section (const char *, unsigned int, tree)
1526 ATTRIBUTE_UNUSED;
1527
1528 /* Register class used for passing the given 64-bit part of the argument.
1529 These represent classes as documented by the psABI, with the exception
1530 of the SSESF and SSEDF classes, which are basically the SSE class; gcc just
1531 uses SFmode or DFmode moves instead of DImode to avoid reformatting penalties.
1532
1533 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
1534 whenever possible (the upper half contains only padding).
1535 */
1536 enum x86_64_reg_class
1537 {
1538 X86_64_NO_CLASS,
1539 X86_64_INTEGER_CLASS,
1540 X86_64_INTEGERSI_CLASS,
1541 X86_64_SSE_CLASS,
1542 X86_64_SSESF_CLASS,
1543 X86_64_SSEDF_CLASS,
1544 X86_64_SSEUP_CLASS,
1545 X86_64_X87_CLASS,
1546 X86_64_X87UP_CLASS,
1547 X86_64_COMPLEX_X87_CLASS,
1548 X86_64_MEMORY_CLASS
1549 };
1550 static const char * const x86_64_reg_class_name[] = {
1551 "no", "integer", "integerSI", "sse", "sseSF", "sseDF",
1552 "sseup", "x87", "x87up", "cplx87", "no"
1553 };
1554
1555 #define MAX_CLASSES 4
1556
1557 /* Table of constants used by fldpi, fldln2, etc.... */
1558 static REAL_VALUE_TYPE ext_80387_constants_table [5];
1559 static bool ext_80387_constants_init = 0;
1560 static void init_ext_80387_constants (void);
1561 static bool ix86_in_large_data_p (tree) ATTRIBUTE_UNUSED;
1562 static void ix86_encode_section_info (tree, rtx, int) ATTRIBUTE_UNUSED;
1563 static void x86_64_elf_unique_section (tree decl, int reloc) ATTRIBUTE_UNUSED;
1564 static section *x86_64_elf_select_section (tree decl, int reloc,
1565 unsigned HOST_WIDE_INT align)
1566 ATTRIBUTE_UNUSED;
1567 \f
1568 /* Initialize the GCC target structure. */
1569 #undef TARGET_ATTRIBUTE_TABLE
1570 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
1571 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
1572 # undef TARGET_MERGE_DECL_ATTRIBUTES
1573 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
1574 #endif
1575
1576 #undef TARGET_COMP_TYPE_ATTRIBUTES
1577 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
1578
1579 #undef TARGET_INIT_BUILTINS
1580 #define TARGET_INIT_BUILTINS ix86_init_builtins
1581 #undef TARGET_EXPAND_BUILTIN
1582 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
1583 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
1584 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION ix86_builtin_vectorized_function
1585
1586 #undef TARGET_ASM_FUNCTION_EPILOGUE
1587 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
1588
1589 #undef TARGET_ENCODE_SECTION_INFO
1590 #ifndef SUBTARGET_ENCODE_SECTION_INFO
1591 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
1592 #else
1593 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
1594 #endif
1595
1596 #undef TARGET_ASM_OPEN_PAREN
1597 #define TARGET_ASM_OPEN_PAREN ""
1598 #undef TARGET_ASM_CLOSE_PAREN
1599 #define TARGET_ASM_CLOSE_PAREN ""
1600
1601 #undef TARGET_ASM_ALIGNED_HI_OP
1602 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
1603 #undef TARGET_ASM_ALIGNED_SI_OP
1604 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
1605 #ifdef ASM_QUAD
1606 #undef TARGET_ASM_ALIGNED_DI_OP
1607 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
1608 #endif
1609
1610 #undef TARGET_ASM_UNALIGNED_HI_OP
1611 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
1612 #undef TARGET_ASM_UNALIGNED_SI_OP
1613 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
1614 #undef TARGET_ASM_UNALIGNED_DI_OP
1615 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
1616
1617 #undef TARGET_SCHED_ADJUST_COST
1618 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
1619 #undef TARGET_SCHED_ISSUE_RATE
1620 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
1621 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
1622 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
1623 ia32_multipass_dfa_lookahead
1624
1625 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
1626 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
1627
1628 #ifdef HAVE_AS_TLS
1629 #undef TARGET_HAVE_TLS
1630 #define TARGET_HAVE_TLS true
1631 #endif
1632 #undef TARGET_CANNOT_FORCE_CONST_MEM
1633 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
1634 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
1635 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_rtx_true
1636
1637 #undef TARGET_DELEGITIMIZE_ADDRESS
1638 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
1639
1640 #undef TARGET_MS_BITFIELD_LAYOUT_P
1641 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
1642
1643 #if TARGET_MACHO
1644 #undef TARGET_BINDS_LOCAL_P
1645 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
1646 #endif
1647
1648 #undef TARGET_ASM_OUTPUT_MI_THUNK
1649 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
1650 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
1651 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
1652
1653 #undef TARGET_ASM_FILE_START
1654 #define TARGET_ASM_FILE_START x86_file_start
1655
1656 #undef TARGET_DEFAULT_TARGET_FLAGS
1657 #define TARGET_DEFAULT_TARGET_FLAGS \
1658 (TARGET_DEFAULT \
1659 | TARGET_64BIT_DEFAULT \
1660 | TARGET_SUBTARGET_DEFAULT \
1661 | TARGET_TLS_DIRECT_SEG_REFS_DEFAULT)
1662
1663 #undef TARGET_HANDLE_OPTION
1664 #define TARGET_HANDLE_OPTION ix86_handle_option
1665
1666 #undef TARGET_RTX_COSTS
1667 #define TARGET_RTX_COSTS ix86_rtx_costs
1668 #undef TARGET_ADDRESS_COST
1669 #define TARGET_ADDRESS_COST ix86_address_cost
1670
1671 #undef TARGET_FIXED_CONDITION_CODE_REGS
1672 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
1673 #undef TARGET_CC_MODES_COMPATIBLE
1674 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
1675
1676 #undef TARGET_MACHINE_DEPENDENT_REORG
1677 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
1678
1679 #undef TARGET_BUILD_BUILTIN_VA_LIST
1680 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
1681
1682 #undef TARGET_MD_ASM_CLOBBERS
1683 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
1684
1685 #undef TARGET_PROMOTE_PROTOTYPES
1686 #define TARGET_PROMOTE_PROTOTYPES hook_bool_tree_true
1687 #undef TARGET_STRUCT_VALUE_RTX
1688 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
1689 #undef TARGET_SETUP_INCOMING_VARARGS
1690 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
1691 #undef TARGET_MUST_PASS_IN_STACK
1692 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
1693 #undef TARGET_PASS_BY_REFERENCE
1694 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
1695 #undef TARGET_INTERNAL_ARG_POINTER
1696 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
1697 #undef TARGET_DWARF_HANDLE_FRAME_UNSPEC
1698 #define TARGET_DWARF_HANDLE_FRAME_UNSPEC ix86_dwarf_handle_frame_unspec
1699
1700 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
1701 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
1702
1703 #undef TARGET_SCALAR_MODE_SUPPORTED_P
1704 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
1705
1706 #undef TARGET_VECTOR_MODE_SUPPORTED_P
1707 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
1708
1709 #ifdef HAVE_AS_TLS
1710 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
1711 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
1712 #endif
1713
1714 #ifdef SUBTARGET_INSERT_ATTRIBUTES
1715 #undef TARGET_INSERT_ATTRIBUTES
1716 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
1717 #endif
1718
1719 #undef TARGET_MANGLE_FUNDAMENTAL_TYPE
1720 #define TARGET_MANGLE_FUNDAMENTAL_TYPE ix86_mangle_fundamental_type
1721
1722 #undef TARGET_STACK_PROTECT_FAIL
1723 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
1724
1725 #undef TARGET_FUNCTION_VALUE
1726 #define TARGET_FUNCTION_VALUE ix86_function_value
1727
1728 struct gcc_target targetm = TARGET_INITIALIZER;
1729
1730 \f
1731 /* The svr4 ABI for the i386 says that records and unions are returned
1732 in memory. */
1733 #ifndef DEFAULT_PCC_STRUCT_RETURN
1734 #define DEFAULT_PCC_STRUCT_RETURN 1
1735 #endif
1736
1737 /* Implement TARGET_HANDLE_OPTION. */
1738
1739 static bool
1740 ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value)
1741 {
1742 switch (code)
1743 {
1744 case OPT_m3dnow:
1745 if (!value)
1746 {
1747 target_flags &= ~MASK_3DNOW_A;
1748 target_flags_explicit |= MASK_3DNOW_A;
1749 }
1750 return true;
1751
1752 case OPT_mmmx:
1753 if (!value)
1754 {
1755 target_flags &= ~(MASK_3DNOW | MASK_3DNOW_A);
1756 target_flags_explicit |= MASK_3DNOW | MASK_3DNOW_A;
1757 }
1758 return true;
1759
1760 case OPT_msse:
1761 if (!value)
1762 {
1763 target_flags &= ~(MASK_SSE2 | MASK_SSE3 | MASK_SSE4A);
1764 target_flags_explicit |= MASK_SSE2 | MASK_SSE3 | MASK_SSE4A;
1765 }
1766 return true;
1767
1768 case OPT_msse2:
1769 if (!value)
1770 {
1771 target_flags &= ~(MASK_SSE3 | MASK_SSE4A);
1772 target_flags_explicit |= MASK_SSE3 | MASK_SSE4A;
1773 }
1774 return true;
1775
1776 case OPT_msse3:
1777 if (!value)
1778 {
1779 target_flags &= ~MASK_SSE4A;
1780 target_flags_explicit |= MASK_SSE4A;
1781 }
1782 return true;
1783
1784 default:
1785 return true;
1786 }
1787 }
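/* Note that the handlers above only cascade in the "disable" direction; the
   matching "enable" implications are applied later in override_options.  As
   a sketch of the intended behavior:

     gcc -msse3 -mno-sse2 foo.c

   first sets MASK_SSE3, then the OPT_msse2 handler clears MASK_SSE3 and
   MASK_SSE4A again and records them as explicitly set, so neither SSE2 nor
   SSE3 code generation remains enabled.  */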
1788
1789 /* Sometimes certain combinations of command options do not make
1790 sense on a particular target machine. You can define a macro
1791 `OVERRIDE_OPTIONS' to take account of this. This macro, if
1792 defined, is executed once just after all the command options have
1793 been parsed.
1794
1795 Don't use this macro to turn on various extra optimizations for
1796 `-O'. That is what `OPTIMIZATION_OPTIONS' is for. */
1797
1798 void
1799 override_options (void)
1800 {
1801 int i;
1802 int ix86_tune_defaulted = 0;
1803
1804 /* Comes from final.c -- no real reason to change it. */
1805 #define MAX_CODE_ALIGN 16
1806
1807 static struct ptt
1808 {
1809 const struct processor_costs *cost; /* Processor costs */
1810 const int target_enable; /* Target flags to enable. */
1811 const int target_disable; /* Target flags to disable. */
1812 const int align_loop; /* Default alignments. */
1813 const int align_loop_max_skip;
1814 const int align_jump;
1815 const int align_jump_max_skip;
1816 const int align_func;
1817 }
1818 const processor_target_table[PROCESSOR_max] =
1819 {
1820 {&i386_cost, 0, 0, 4, 3, 4, 3, 4},
1821 {&i486_cost, 0, 0, 16, 15, 16, 15, 16},
1822 {&pentium_cost, 0, 0, 16, 7, 16, 7, 16},
1823 {&pentiumpro_cost, 0, 0, 16, 15, 16, 7, 16},
1824 {&geode_cost, 0, 0, 0, 0, 0, 0, 0},
1825 {&k6_cost, 0, 0, 32, 7, 32, 7, 32},
1826 {&athlon_cost, 0, 0, 16, 7, 16, 7, 16},
1827 {&pentium4_cost, 0, 0, 0, 0, 0, 0, 0},
1828 {&k8_cost, 0, 0, 16, 7, 16, 7, 16},
1829 {&nocona_cost, 0, 0, 0, 0, 0, 0, 0},
1830 {&core2_cost, 0, 0, 16, 7, 16, 7, 16},
1831 {&generic32_cost, 0, 0, 16, 7, 16, 7, 16},
1832 {&generic64_cost, 0, 0, 16, 7, 16, 7, 16},
1833 {&amdfam10_cost, 0, 0, 32, 7, 32, 7, 32}
1834 };
1835
1836 static const char * const cpu_names[] = TARGET_CPU_DEFAULT_NAMES;
1837 static struct pta
1838 {
1839 const char *const name; /* processor name or nickname. */
1840 const enum processor_type processor;
1841 const enum pta_flags
1842 {
1843 PTA_SSE = 1,
1844 PTA_SSE2 = 2,
1845 PTA_SSE3 = 4,
1846 PTA_MMX = 8,
1847 PTA_PREFETCH_SSE = 16,
1848 PTA_3DNOW = 32,
1849 PTA_3DNOW_A = 64,
1850 PTA_64BIT = 128,
1851 PTA_SSSE3 = 256,
1852 PTA_CX16 = 512,
1853 PTA_POPCNT = 1024,
1854 PTA_ABM = 2048,
1855 PTA_SSE4A = 4096
1856 } flags;
1857 }
1858 const processor_alias_table[] =
1859 {
1860 {"i386", PROCESSOR_I386, 0},
1861 {"i486", PROCESSOR_I486, 0},
1862 {"i586", PROCESSOR_PENTIUM, 0},
1863 {"pentium", PROCESSOR_PENTIUM, 0},
1864 {"pentium-mmx", PROCESSOR_PENTIUM, PTA_MMX},
1865 {"winchip-c6", PROCESSOR_I486, PTA_MMX},
1866 {"winchip2", PROCESSOR_I486, PTA_MMX | PTA_3DNOW},
1867 {"c3", PROCESSOR_I486, PTA_MMX | PTA_3DNOW},
1868 {"c3-2", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_PREFETCH_SSE | PTA_SSE},
1869 {"i686", PROCESSOR_PENTIUMPRO, 0},
1870 {"pentiumpro", PROCESSOR_PENTIUMPRO, 0},
1871 {"pentium2", PROCESSOR_PENTIUMPRO, PTA_MMX},
1872 {"pentium3", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_SSE | PTA_PREFETCH_SSE},
1873 {"pentium3m", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_SSE | PTA_PREFETCH_SSE},
1874 {"pentium-m", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_SSE | PTA_PREFETCH_SSE | PTA_SSE2},
1875 {"pentium4", PROCESSOR_PENTIUM4, PTA_SSE | PTA_SSE2
1876 | PTA_MMX | PTA_PREFETCH_SSE},
1877 {"pentium4m", PROCESSOR_PENTIUM4, PTA_SSE | PTA_SSE2
1878 | PTA_MMX | PTA_PREFETCH_SSE},
1879 {"prescott", PROCESSOR_NOCONA, PTA_SSE | PTA_SSE2 | PTA_SSE3
1880 | PTA_MMX | PTA_PREFETCH_SSE},
1881 {"nocona", PROCESSOR_NOCONA, PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_64BIT
1882 | PTA_MMX | PTA_PREFETCH_SSE | PTA_CX16},
1883 {"core2", PROCESSOR_CORE2, PTA_SSE | PTA_SSE2 | PTA_SSE3
1884 | PTA_64BIT | PTA_MMX
1885 | PTA_PREFETCH_SSE | PTA_CX16},
1886 {"geode", PROCESSOR_GEODE, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
1887 | PTA_3DNOW_A},
1888 {"k6", PROCESSOR_K6, PTA_MMX},
1889 {"k6-2", PROCESSOR_K6, PTA_MMX | PTA_3DNOW},
1890 {"k6-3", PROCESSOR_K6, PTA_MMX | PTA_3DNOW},
1891 {"athlon", PROCESSOR_ATHLON, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
1892 | PTA_3DNOW_A},
1893 {"athlon-tbird", PROCESSOR_ATHLON, PTA_MMX | PTA_PREFETCH_SSE
1894 | PTA_3DNOW | PTA_3DNOW_A},
1895 {"athlon-4", PROCESSOR_ATHLON, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
1896 | PTA_3DNOW_A | PTA_SSE},
1897 {"athlon-xp", PROCESSOR_ATHLON, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
1898 | PTA_3DNOW_A | PTA_SSE},
1899 {"athlon-mp", PROCESSOR_ATHLON, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
1900 | PTA_3DNOW_A | PTA_SSE},
1901 {"x86-64", PROCESSOR_K8, PTA_MMX | PTA_PREFETCH_SSE | PTA_64BIT
1902 | PTA_SSE | PTA_SSE2 },
1903 {"k8", PROCESSOR_K8, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW | PTA_64BIT
1904 | PTA_3DNOW_A | PTA_SSE | PTA_SSE2},
1905 {"opteron", PROCESSOR_K8, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW | PTA_64BIT
1906 | PTA_3DNOW_A | PTA_SSE | PTA_SSE2},
1907 {"athlon64", PROCESSOR_K8, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW | PTA_64BIT
1908 | PTA_3DNOW_A | PTA_SSE | PTA_SSE2},
1909 {"athlon-fx", PROCESSOR_K8, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW | PTA_64BIT
1910 | PTA_3DNOW_A | PTA_SSE | PTA_SSE2},
1911 {"amdfam10", PROCESSOR_AMDFAM10, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
1912 | PTA_64BIT | PTA_3DNOW_A | PTA_SSE
1913 | PTA_SSE2 | PTA_SSE3 | PTA_POPCNT
1914 | PTA_ABM | PTA_SSE4A | PTA_CX16},
1915 {"generic32", PROCESSOR_GENERIC32, 0 /* flags are only used for -march switch. */ },
1916 {"generic64", PROCESSOR_GENERIC64, PTA_64BIT /* flags are only used for -march switch. */ },
1917 };
1918
1919 int const pta_size = ARRAY_SIZE (processor_alias_table);
1920
1921 #ifdef SUBTARGET_OVERRIDE_OPTIONS
1922 SUBTARGET_OVERRIDE_OPTIONS;
1923 #endif
1924
1925 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
1926 SUBSUBTARGET_OVERRIDE_OPTIONS;
1927 #endif
1928
1929 /* -fPIC is the default for 64-bit Mach-O (Darwin). */
1930 if (TARGET_MACHO && TARGET_64BIT)
1931 flag_pic = 2;
1932
1933 /* Set the default values for switches whose default depends on TARGET_64BIT
1934 in case they weren't overwritten by command line options. */
1935 if (TARGET_64BIT)
1936 {
1937 /* Mach-O doesn't support omitting the frame pointer for now. */
1938 if (flag_omit_frame_pointer == 2)
1939 flag_omit_frame_pointer = (TARGET_MACHO ? 0 : 1);
1940 if (flag_asynchronous_unwind_tables == 2)
1941 flag_asynchronous_unwind_tables = 1;
1942 if (flag_pcc_struct_return == 2)
1943 flag_pcc_struct_return = 0;
1944 }
1945 else
1946 {
1947 if (flag_omit_frame_pointer == 2)
1948 flag_omit_frame_pointer = 0;
1949 if (flag_asynchronous_unwind_tables == 2)
1950 flag_asynchronous_unwind_tables = 0;
1951 if (flag_pcc_struct_return == 2)
1952 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
1953 }
1954
1955 /* Need to check -mtune=generic first. */
1956 if (ix86_tune_string)
1957 {
1958 if (!strcmp (ix86_tune_string, "generic")
1959 || !strcmp (ix86_tune_string, "i686")
1960 /* As special support for cross compilers we read -mtune=native
1961 as -mtune=generic. With native compilers we won't see
1962 -mtune=native, as it will have been rewritten by the driver. */
1963 || !strcmp (ix86_tune_string, "native"))
1964 {
1965 if (TARGET_64BIT)
1966 ix86_tune_string = "generic64";
1967 else
1968 ix86_tune_string = "generic32";
1969 }
1970 else if (!strncmp (ix86_tune_string, "generic", 7))
1971 error ("bad value (%s) for -mtune= switch", ix86_tune_string);
1972 }
1973 else
1974 {
1975 if (ix86_arch_string)
1976 ix86_tune_string = ix86_arch_string;
1977 if (!ix86_tune_string)
1978 {
1979 ix86_tune_string = cpu_names [TARGET_CPU_DEFAULT];
1980 ix86_tune_defaulted = 1;
1981 }
1982
1983 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
1984 need to use a sensible tune option. */
1985 if (!strcmp (ix86_tune_string, "generic")
1986 || !strcmp (ix86_tune_string, "x86-64")
1987 || !strcmp (ix86_tune_string, "i686"))
1988 {
1989 if (TARGET_64BIT)
1990 ix86_tune_string = "generic64";
1991 else
1992 ix86_tune_string = "generic32";
1993 }
1994 }
1995 if (ix86_stringop_string)
1996 {
1997 if (!strcmp (ix86_stringop_string, "rep_byte"))
1998 stringop_alg = rep_prefix_1_byte;
1999 else if (!strcmp (ix86_stringop_string, "libcall"))
2000 stringop_alg = libcall;
2001 else if (!strcmp (ix86_stringop_string, "rep_4byte"))
2002 stringop_alg = rep_prefix_4_byte;
2003 else if (!strcmp (ix86_stringop_string, "rep_8byte"))
2004 stringop_alg = rep_prefix_8_byte;
2005 else if (!strcmp (ix86_stringop_string, "byte_loop"))
2006 stringop_alg = loop_1_byte;
2007 else if (!strcmp (ix86_stringop_string, "loop"))
2008 stringop_alg = loop;
2009 else if (!strcmp (ix86_stringop_string, "unrolled_loop"))
2010 stringop_alg = unrolled_loop;
2011 else
2012 error ("bad value (%s) for -mstringop-strategy= switch", ix86_stringop_string);
2013 }
2014 if (!strcmp (ix86_tune_string, "x86-64"))
2015 warning (OPT_Wdeprecated, "-mtune=x86-64 is deprecated. Use -mtune=k8 or "
2016 "-mtune=generic instead as appropriate.");
2017
2018 if (!ix86_arch_string)
2019 ix86_arch_string = TARGET_64BIT ? "x86-64" : "i386";
2020 if (!strcmp (ix86_arch_string, "generic"))
2021 error ("generic CPU can be used only for -mtune= switch");
2022 if (!strncmp (ix86_arch_string, "generic", 7))
2023 error ("bad value (%s) for -march= switch", ix86_arch_string);
2024
2025 if (ix86_cmodel_string != 0)
2026 {
2027 if (!strcmp (ix86_cmodel_string, "small"))
2028 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
2029 else if (!strcmp (ix86_cmodel_string, "medium"))
2030 ix86_cmodel = flag_pic ? CM_MEDIUM_PIC : CM_MEDIUM;
2031 else if (flag_pic)
2032 sorry ("code model %s not supported in PIC mode", ix86_cmodel_string);
2033 else if (!strcmp (ix86_cmodel_string, "32"))
2034 ix86_cmodel = CM_32;
2035 else if (!strcmp (ix86_cmodel_string, "kernel") && !flag_pic)
2036 ix86_cmodel = CM_KERNEL;
2037 else if (!strcmp (ix86_cmodel_string, "large") && !flag_pic)
2038 ix86_cmodel = CM_LARGE;
2039 else
2040 error ("bad value (%s) for -mcmodel= switch", ix86_cmodel_string);
2041 }
2042 else
2043 {
2044 ix86_cmodel = CM_32;
2045 if (TARGET_64BIT)
2046 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
2047 }
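/* So, for example: -m32 always gets CM_32; -m64 defaults to CM_SMALL, or
   CM_SMALL_PIC when PIC is in effect; and -m64 -mcmodel=kernel selects
   CM_KERNEL (the combination with PIC is rejected with a sorry above).  */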
2048 if (ix86_asm_string != 0)
2049 {
2050 if (! TARGET_MACHO
2051 && !strcmp (ix86_asm_string, "intel"))
2052 ix86_asm_dialect = ASM_INTEL;
2053 else if (!strcmp (ix86_asm_string, "att"))
2054 ix86_asm_dialect = ASM_ATT;
2055 else
2056 error ("bad value (%s) for -masm= switch", ix86_asm_string);
2057 }
2058 if ((TARGET_64BIT == 0) != (ix86_cmodel == CM_32))
2059 error ("code model %qs not supported in the %s bit mode",
2060 ix86_cmodel_string, TARGET_64BIT ? "64" : "32");
2061 if (ix86_cmodel == CM_LARGE)
2062 sorry ("code model %<large%> not supported yet");
2063 if ((TARGET_64BIT != 0) != ((target_flags & MASK_64BIT) != 0))
2064 sorry ("%i-bit mode not compiled in",
2065 (target_flags & MASK_64BIT) ? 64 : 32);
2066
2067 for (i = 0; i < pta_size; i++)
2068 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
2069 {
2070 ix86_arch = processor_alias_table[i].processor;
2071 /* Default cpu tuning to the architecture. */
2072 ix86_tune = ix86_arch;
2073 if (processor_alias_table[i].flags & PTA_MMX
2074 && !(target_flags_explicit & MASK_MMX))
2075 target_flags |= MASK_MMX;
2076 if (processor_alias_table[i].flags & PTA_3DNOW
2077 && !(target_flags_explicit & MASK_3DNOW))
2078 target_flags |= MASK_3DNOW;
2079 if (processor_alias_table[i].flags & PTA_3DNOW_A
2080 && !(target_flags_explicit & MASK_3DNOW_A))
2081 target_flags |= MASK_3DNOW_A;
2082 if (processor_alias_table[i].flags & PTA_SSE
2083 && !(target_flags_explicit & MASK_SSE))
2084 target_flags |= MASK_SSE;
2085 if (processor_alias_table[i].flags & PTA_SSE2
2086 && !(target_flags_explicit & MASK_SSE2))
2087 target_flags |= MASK_SSE2;
2088 if (processor_alias_table[i].flags & PTA_SSE3
2089 && !(target_flags_explicit & MASK_SSE3))
2090 target_flags |= MASK_SSE3;
2091 if (processor_alias_table[i].flags & PTA_SSSE3
2092 && !(target_flags_explicit & MASK_SSSE3))
2093 target_flags |= MASK_SSSE3;
2094 if (processor_alias_table[i].flags & PTA_PREFETCH_SSE)
2095 x86_prefetch_sse = true;
2096 if (processor_alias_table[i].flags & PTA_CX16)
2097 x86_cmpxchg16b = true;
2098 if (processor_alias_table[i].flags & PTA_POPCNT
2099 && !(target_flags_explicit & MASK_POPCNT))
2100 target_flags |= MASK_POPCNT;
2101 if (processor_alias_table[i].flags & PTA_ABM
2102 && !(target_flags_explicit & MASK_ABM))
2103 target_flags |= MASK_ABM;
2104 if (processor_alias_table[i].flags & PTA_SSE4A
2105 && !(target_flags_explicit & MASK_SSE4A))
2106 target_flags |= MASK_SSE4A;
2107 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
2108 error ("CPU you selected does not support x86-64 "
2109 "instruction set");
2110 break;
2111 }
2112
2113 if (i == pta_size)
2114 error ("bad value (%s) for -march= switch", ix86_arch_string);
2115
2116 for (i = 0; i < pta_size; i++)
2117 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
2118 {
2119 ix86_tune = processor_alias_table[i].processor;
2120 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
2121 {
2122 if (ix86_tune_defaulted)
2123 {
2124 ix86_tune_string = "x86-64";
2125 for (i = 0; i < pta_size; i++)
2126 if (! strcmp (ix86_tune_string,
2127 processor_alias_table[i].name))
2128 break;
2129 ix86_tune = processor_alias_table[i].processor;
2130 }
2131 else
2132 error ("CPU you selected does not support x86-64 "
2133 "instruction set");
2134 }
2135 /* Intel CPUs have always interpreted SSE prefetch instructions as
2136 NOPs; so, we can enable SSE prefetch instructions even when
2137 -mtune (rather than -march) points us to a processor that has them.
2138 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
2139 higher processors. */
2140 if (TARGET_CMOVE && (processor_alias_table[i].flags & PTA_PREFETCH_SSE))
2141 x86_prefetch_sse = true;
2142 break;
2143 }
2144 if (i == pta_size)
2145 error ("bad value (%s) for -mtune= switch", ix86_tune_string);
2146
2147 if (optimize_size)
2148 ix86_cost = &size_cost;
2149 else
2150 ix86_cost = processor_target_table[ix86_tune].cost;
2151 target_flags |= processor_target_table[ix86_tune].target_enable;
2152 target_flags &= ~processor_target_table[ix86_tune].target_disable;
2153
2154 /* Arrange to set up i386_stack_locals for all functions. */
2155 init_machine_status = ix86_init_machine_status;
2156
2157 /* Validate -mregparm= value. */
2158 if (ix86_regparm_string)
2159 {
2160 i = atoi (ix86_regparm_string);
2161 if (i < 0 || i > REGPARM_MAX)
2162 error ("-mregparm=%d is not between 0 and %d", i, REGPARM_MAX);
2163 else
2164 ix86_regparm = i;
2165 }
2166 else
2167 if (TARGET_64BIT)
2168 ix86_regparm = REGPARM_MAX;
2169
2170 /* If the user has provided any of the -malign-* options,
2171 warn and use that value only if -falign-* is not set.
2172 Remove this code in GCC 3.2 or later. */
2173 if (ix86_align_loops_string)
2174 {
2175 warning (0, "-malign-loops is obsolete, use -falign-loops");
2176 if (align_loops == 0)
2177 {
2178 i = atoi (ix86_align_loops_string);
2179 if (i < 0 || i > MAX_CODE_ALIGN)
2180 error ("-malign-loops=%d is not between 0 and %d", i, MAX_CODE_ALIGN);
2181 else
2182 align_loops = 1 << i;
2183 }
2184 }
2185
2186 if (ix86_align_jumps_string)
2187 {
2188 warning (0, "-malign-jumps is obsolete, use -falign-jumps");
2189 if (align_jumps == 0)
2190 {
2191 i = atoi (ix86_align_jumps_string);
2192 if (i < 0 || i > MAX_CODE_ALIGN)
2193 error ("-malign-loops=%d is not between 0 and %d", i, MAX_CODE_ALIGN);
2194 else
2195 align_jumps = 1 << i;
2196 }
2197 }
2198
2199 if (ix86_align_funcs_string)
2200 {
2201 warning (0, "-malign-functions is obsolete, use -falign-functions");
2202 if (align_functions == 0)
2203 {
2204 i = atoi (ix86_align_funcs_string);
2205 if (i < 0 || i > MAX_CODE_ALIGN)
2206 error ("-malign-loops=%d is not between 0 and %d", i, MAX_CODE_ALIGN);
2207 else
2208 align_functions = 1 << i;
2209 }
2210 }
2211
2212 /* Default align_* from the processor table. */
2213 if (align_loops == 0)
2214 {
2215 align_loops = processor_target_table[ix86_tune].align_loop;
2216 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
2217 }
2218 if (align_jumps == 0)
2219 {
2220 align_jumps = processor_target_table[ix86_tune].align_jump;
2221 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
2222 }
2223 if (align_functions == 0)
2224 {
2225 align_functions = processor_target_table[ix86_tune].align_func;
2226 }
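/* A concrete example of the defaulting above: -march=k8 (which also selects
   k8 tuning when no -mtune is given) with no -falign-* options picks up the
   k8 row of processor_target_table, i.e. 16-byte loop and jump alignment
   with a max skip of 7 and 16-byte function alignment, whereas an explicit
   -malign-loops=N requests 1<<N bytes.  */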
2227
2228 /* Validate -mbranch-cost= value, or provide default. */
2229 ix86_branch_cost = ix86_cost->branch_cost;
2230 if (ix86_branch_cost_string)
2231 {
2232 i = atoi (ix86_branch_cost_string);
2233 if (i < 0 || i > 5)
2234 error ("-mbranch-cost=%d is not between 0 and 5", i);
2235 else
2236 ix86_branch_cost = i;
2237 }
2238 if (ix86_section_threshold_string)
2239 {
2240 i = atoi (ix86_section_threshold_string);
2241 if (i < 0)
2242 error ("-mlarge-data-threshold=%d is negative", i);
2243 else
2244 ix86_section_threshold = i;
2245 }
2246
2247 if (ix86_tls_dialect_string)
2248 {
2249 if (strcmp (ix86_tls_dialect_string, "gnu") == 0)
2250 ix86_tls_dialect = TLS_DIALECT_GNU;
2251 else if (strcmp (ix86_tls_dialect_string, "gnu2") == 0)
2252 ix86_tls_dialect = TLS_DIALECT_GNU2;
2253 else if (strcmp (ix86_tls_dialect_string, "sun") == 0)
2254 ix86_tls_dialect = TLS_DIALECT_SUN;
2255 else
2256 error ("bad value (%s) for -mtls-dialect= switch",
2257 ix86_tls_dialect_string);
2258 }
2259
2260 /* Keep nonleaf frame pointers. */
2261 if (flag_omit_frame_pointer)
2262 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
2263 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
2264 flag_omit_frame_pointer = 1;
2265
2266 /* If we're doing fast math, we don't care about comparison order
2267 wrt NaNs. This lets us use a shorter comparison sequence. */
2268 if (flag_finite_math_only)
2269 target_flags &= ~MASK_IEEE_FP;
2270
2271 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
2272 since the insns won't need emulation. */
2273 if (x86_arch_always_fancy_math_387 & (1 << ix86_arch))
2274 target_flags &= ~MASK_NO_FANCY_MATH_387;
2275
2276 /* Likewise, if the target doesn't have a 387, or we've specified
2277 software floating point, don't use 387 inline intrinsics. */
2278 if (!TARGET_80387)
2279 target_flags |= MASK_NO_FANCY_MATH_387;
2280
2281 /* Turn on SSE3 builtins for -mssse3. */
2282 if (TARGET_SSSE3)
2283 target_flags |= MASK_SSE3;
2284
2285 /* Turn on SSE3 builtins for -msse4a. */
2286 if (TARGET_SSE4A)
2287 target_flags |= MASK_SSE3;
2288
2289 /* Turn on SSE2 builtins for -msse3. */
2290 if (TARGET_SSE3)
2291 target_flags |= MASK_SSE2;
2292
2293 /* Turn on SSE builtins for -msse2. */
2294 if (TARGET_SSE2)
2295 target_flags |= MASK_SSE;
2296
2297 /* Turn on MMX builtins for -msse. */
2298 if (TARGET_SSE)
2299 {
2300 target_flags |= MASK_MMX & ~target_flags_explicit;
2301 x86_prefetch_sse = true;
2302 }
2303
2304 /* Turn on MMX builtins for 3Dnow. */
2305 if (TARGET_3DNOW)
2306 target_flags |= MASK_MMX;
2307
2308 /* Turn on POPCNT builtins for -mabm. */
2309 if (TARGET_ABM)
2310 target_flags |= MASK_POPCNT;
2311
2312 if (TARGET_64BIT)
2313 {
2314 if (TARGET_ALIGN_DOUBLE)
2315 error ("-malign-double makes no sense in the 64bit mode");
2316 if (TARGET_RTD)
2317 error ("-mrtd calling convention not supported in the 64bit mode");
2318
2319 /* Enable by default the SSE and MMX builtins. Do allow the user to
2320 explicitly disable any of these. In particular, disabling SSE and
2321 MMX for kernel code is extremely useful. */
2322 target_flags
2323 |= ((MASK_SSE2 | MASK_SSE | MASK_MMX | MASK_128BIT_LONG_DOUBLE)
2324 & ~target_flags_explicit);
2325 }
2326 else
2327 {
2328 /* The i386 ABI does not specify a red zone. It still makes sense to use it
2329 when the programmer takes care to keep the stack from being destroyed. */
2330 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
2331 target_flags |= MASK_NO_RED_ZONE;
2332 }
2333
2334 /* Validate -mpreferred-stack-boundary= value, or provide default.
2335 The default of 128 bits is for the Pentium III's SSE __m128. We do not
2336 lower it for optimize_size, since otherwise object files compiled
2337 with -Os and -On could not be mixed. */
2338 ix86_preferred_stack_boundary = 128;
2339 if (ix86_preferred_stack_boundary_string)
2340 {
2341 i = atoi (ix86_preferred_stack_boundary_string);
2342 if (i < (TARGET_64BIT ? 4 : 2) || i > 12)
2343 error ("-mpreferred-stack-boundary=%d is not between %d and 12", i,
2344 TARGET_64BIT ? 4 : 2);
2345 else
2346 ix86_preferred_stack_boundary = (1 << i) * BITS_PER_UNIT;
2347 }
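/* The option value is the log2 of the alignment in bytes, so
   -mpreferred-stack-boundary=4 yields (1 << 4) * BITS_PER_UNIT = 128 bits,
   i.e. 16-byte alignment; 32-bit targets accept values down to 2 (4 bytes),
   64-bit targets down to 4.  */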
2348
2349 /* Accept -msseregparm only if at least SSE support is enabled. */
2350 if (TARGET_SSEREGPARM
2351 && ! TARGET_SSE)
2352 error ("-msseregparm used without SSE enabled");
2353
2354 ix86_fpmath = TARGET_FPMATH_DEFAULT;
2355
2356 if (ix86_fpmath_string != 0)
2357 {
2358 if (! strcmp (ix86_fpmath_string, "387"))
2359 ix86_fpmath = FPMATH_387;
2360 else if (! strcmp (ix86_fpmath_string, "sse"))
2361 {
2362 if (!TARGET_SSE)
2363 {
2364 warning (0, "SSE instruction set disabled, using 387 arithmetics");
2365 ix86_fpmath = FPMATH_387;
2366 }
2367 else
2368 ix86_fpmath = FPMATH_SSE;
2369 }
2370 else if (! strcmp (ix86_fpmath_string, "387,sse")
2371 || ! strcmp (ix86_fpmath_string, "sse,387"))
2372 {
2373 if (!TARGET_SSE)
2374 {
2375 warning (0, "SSE instruction set disabled, using 387 arithmetics");
2376 ix86_fpmath = FPMATH_387;
2377 }
2378 else if (!TARGET_80387)
2379 {
2380 warning (0, "387 instruction set disabled, using SSE arithmetics");
2381 ix86_fpmath = FPMATH_SSE;
2382 }
2383 else
2384 ix86_fpmath = FPMATH_SSE | FPMATH_387;
2385 }
2386 else
2387 error ("bad value (%s) for -mfpmath= switch", ix86_fpmath_string);
2388 }
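/* To summarize the -mfpmath= handling: "sse,387" (or "387,sse") with both
   units available yields FPMATH_SSE | FPMATH_387; if SSE is disabled it
   falls back to FPMATH_387 with a warning, and with -mno-80387 it falls
   back to FPMATH_SSE.  Plain "sse" likewise degrades to 387 when SSE is
   unavailable.  */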
2389
2390 /* If the i387 is disabled, then do not return values in it. */
2391 if (!TARGET_80387)
2392 target_flags &= ~MASK_FLOAT_RETURNS;
2393
2394 if ((x86_accumulate_outgoing_args & TUNEMASK)
2395 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
2396 && !optimize_size)
2397 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
2398
2399 /* ??? Unwind info is not correct around the CFG unless either a frame
2400 pointer is present or M_A_O_A is set. Fixing this requires rewriting
2401 unwind info generation to be aware of the CFG and propagating states
2402 around edges. */
2403 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
2404 || flag_exceptions || flag_non_call_exceptions)
2405 && flag_omit_frame_pointer
2406 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
2407 {
2408 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
2409 warning (0, "unwind tables currently require either a frame pointer "
2410 "or -maccumulate-outgoing-args for correctness");
2411 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
2412 }
2413
2414 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
2415 {
2416 char *p;
2417 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
2418 p = strchr (internal_label_prefix, 'X');
2419 internal_label_prefix_len = p - internal_label_prefix;
2420 *p = '\0';
2421 }
2422
2423 /* When no scheduling description is available, disable the scheduler passes
2424 so they don't slow down compilation or make x87 code slower. */
2425 if (!TARGET_SCHEDULE)
2426 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
2427
2428 if (!PARAM_SET_P (PARAM_SIMULTANEOUS_PREFETCHES))
2429 set_param_value ("simultaneous-prefetches",
2430 ix86_cost->simultaneous_prefetches);
2431 if (!PARAM_SET_P (PARAM_L1_CACHE_LINE_SIZE))
2432 set_param_value ("l1-cache-line-size", ix86_cost->prefetch_block);
2433 }
2434 \f
2435 /* Switch to the appropriate section for output of DECL.
2436 DECL is either a `VAR_DECL' node or a constant of some sort.
2437 RELOC indicates whether forming the initial value of DECL requires
2438 link-time relocations. */
2439
2440 static section *
2441 x86_64_elf_select_section (tree decl, int reloc,
2442 unsigned HOST_WIDE_INT align)
2443 {
2444 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
2445 && ix86_in_large_data_p (decl))
2446 {
2447 const char *sname = NULL;
2448 unsigned int flags = SECTION_WRITE;
2449 switch (categorize_decl_for_section (decl, reloc, flag_pic))
2450 {
2451 case SECCAT_DATA:
2452 sname = ".ldata";
2453 break;
2454 case SECCAT_DATA_REL:
2455 sname = ".ldata.rel";
2456 break;
2457 case SECCAT_DATA_REL_LOCAL:
2458 sname = ".ldata.rel.local";
2459 break;
2460 case SECCAT_DATA_REL_RO:
2461 sname = ".ldata.rel.ro";
2462 break;
2463 case SECCAT_DATA_REL_RO_LOCAL:
2464 sname = ".ldata.rel.ro.local";
2465 break;
2466 case SECCAT_BSS:
2467 sname = ".lbss";
2468 flags |= SECTION_BSS;
2469 break;
2470 case SECCAT_RODATA:
2471 case SECCAT_RODATA_MERGE_STR:
2472 case SECCAT_RODATA_MERGE_STR_INIT:
2473 case SECCAT_RODATA_MERGE_CONST:
2474 sname = ".lrodata";
2475 flags = 0;
2476 break;
2477 case SECCAT_SRODATA:
2478 case SECCAT_SDATA:
2479 case SECCAT_SBSS:
2480 gcc_unreachable ();
2481 case SECCAT_TEXT:
2482 case SECCAT_TDATA:
2483 case SECCAT_TBSS:
2484 /* We don't split these for the medium model. Place them into the
2485 default sections and hope for the best. */
2486 break;
2487 }
2488 if (sname)
2489 {
2490 /* We might get called with string constants, but get_named_section
2491 doesn't like them as they are not DECLs. Also, we need to set
2492 flags in that case. */
2493 if (!DECL_P (decl))
2494 return get_section (sname, flags, NULL);
2495 return get_named_section (decl, sname, reloc);
2496 }
2497 }
2498 return default_elf_select_section (decl, reloc, align);
2499 }
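/* Example of the effect (variable names hypothetical): under -mcmodel=medium,
   a writable initialized array larger than -mlarge-data-threshold ends up in
   .ldata (or one of the .ldata.rel* variants when it needs relocations)
   instead of .data, and read-only data of the same size goes to .lrodata.  */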
2500
2501 /* Build up a unique section name, expressed as a
2502 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
2503 RELOC indicates whether the initial value of DECL requires
2504 link-time relocations. */
2505
2506 static void
2507 x86_64_elf_unique_section (tree decl, int reloc)
2508 {
2509 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
2510 && ix86_in_large_data_p (decl))
2511 {
2512 const char *prefix = NULL;
2513 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
2514 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
2515
2516 switch (categorize_decl_for_section (decl, reloc, flag_pic))
2517 {
2518 case SECCAT_DATA:
2519 case SECCAT_DATA_REL:
2520 case SECCAT_DATA_REL_LOCAL:
2521 case SECCAT_DATA_REL_RO:
2522 case SECCAT_DATA_REL_RO_LOCAL:
2523 prefix = one_only ? ".gnu.linkonce.ld." : ".ldata.";
2524 break;
2525 case SECCAT_BSS:
2526 prefix = one_only ? ".gnu.linkonce.lb." : ".lbss.";
2527 break;
2528 case SECCAT_RODATA:
2529 case SECCAT_RODATA_MERGE_STR:
2530 case SECCAT_RODATA_MERGE_STR_INIT:
2531 case SECCAT_RODATA_MERGE_CONST:
2532 prefix = one_only ? ".gnu.linkonce.lr." : ".lrodata.";
2533 break;
2534 case SECCAT_SRODATA:
2535 case SECCAT_SDATA:
2536 case SECCAT_SBSS:
2537 gcc_unreachable ();
2538 case SECCAT_TEXT:
2539 case SECCAT_TDATA:
2540 case SECCAT_TBSS:
2541 /* We don't split these for the medium model. Place them into the
2542 default sections and hope for the best. */
2543 break;
2544 }
2545 if (prefix)
2546 {
2547 const char *name;
2548 size_t nlen, plen;
2549 char *string;
2550 plen = strlen (prefix);
2551
2552 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
2553 name = targetm.strip_name_encoding (name);
2554 nlen = strlen (name);
2555
2556 string = alloca (nlen + plen + 1);
2557 memcpy (string, prefix, plen);
2558 memcpy (string + plen, name, nlen + 1);
2559
2560 DECL_SECTION_NAME (decl) = build_string (nlen + plen, string);
2561 return;
2562 }
2563 }
2564 default_unique_section (decl, reloc);
2565 }
2566
2567 #ifdef COMMON_ASM_OP
2568 /* This says how to output assembler code to declare an
2569 uninitialized external linkage data object.
2570
2571 For the medium model on x86-64 we need to use the .largecomm directive
2572 for large objects. */
2573 void
2574 x86_elf_aligned_common (FILE *file,
2575 const char *name, unsigned HOST_WIDE_INT size,
2576 int align)
2577 {
2578 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
2579 && size > (unsigned int)ix86_section_threshold)
2580 fprintf (file, ".largecomm\t");
2581 else
2582 fprintf (file, "%s", COMMON_ASM_OP);
2583 assemble_name (file, name);
2584 fprintf (file, ","HOST_WIDE_INT_PRINT_UNSIGNED",%u\n",
2585 size, align / BITS_PER_UNIT);
2586 }
2587 #endif
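/* For illustration (object name hypothetical): with -mcmodel=medium and the
   default -mlarge-data-threshold of 65536, a 128 KiB common object named
   big_buf with 32-byte alignment is emitted as

     .largecomm  big_buf,131072,32

   while smaller objects keep using the regular COMMON_ASM_OP form.  */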
2588 /* Utility function for targets to use in implementing
2589 ASM_OUTPUT_ALIGNED_BSS. */
2590
2591 void
2592 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
2593 const char *name, unsigned HOST_WIDE_INT size,
2594 int align)
2595 {
2596 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
2597 && size > (unsigned int)ix86_section_threshold)
2598 switch_to_section (get_named_section (decl, ".lbss", 0));
2599 else
2600 switch_to_section (bss_section);
2601 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
2602 #ifdef ASM_DECLARE_OBJECT_NAME
2603 last_assemble_variable_decl = decl;
2604 ASM_DECLARE_OBJECT_NAME (file, name, decl);
2605 #else
2606 /* The standard thing is just to output a label for the object. */
2607 ASM_OUTPUT_LABEL (file, name);
2608 #endif /* ASM_DECLARE_OBJECT_NAME */
2609 ASM_OUTPUT_SKIP (file, size ? size : 1);
2610 }
2611 \f
2612 void
2613 optimization_options (int level, int size ATTRIBUTE_UNUSED)
2614 {
2615 /* For -O2 and beyond, turn off -fschedule-insns by default. It tends to
2616 make the shortage of registers even worse. */
2617 #ifdef INSN_SCHEDULING
2618 if (level > 1)
2619 flag_schedule_insns = 0;
2620 #endif
2621
2622 if (TARGET_MACHO)
2623 /* The Darwin libraries never set errno, so we might as well
2624 avoid calling them when that's the only reason we would. */
2625 flag_errno_math = 0;
2626
2627 /* The default values of these switches depend on TARGET_64BIT,
2628 which is not known at this point. Mark these values with 2 and
2629 let the user override them. If no command line option
2630 specifies them, we will set the defaults in override_options. */
2631 if (optimize >= 1)
2632 flag_omit_frame_pointer = 2;
2633 flag_pcc_struct_return = 2;
2634 flag_asynchronous_unwind_tables = 2;
2635 #ifdef SUBTARGET_OPTIMIZATION_OPTIONS
2636 SUBTARGET_OPTIMIZATION_OPTIONS;
2637 #endif
2638 }
2639 \f
2640 /* Table of valid machine attributes. */
2641 const struct attribute_spec ix86_attribute_table[] =
2642 {
2643 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler } */
2644 /* Stdcall attribute says callee is responsible for popping arguments
2645 if they are not variable. */
2646 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute },
2647 /* Fastcall attribute says callee is responsible for popping arguments
2648 if they are not variable. */
2649 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute },
2650 /* Cdecl attribute says the callee is a normal C declaration */
2651 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute },
2652 /* Regparm attribute specifies how many integer arguments are to be
2653 passed in registers. */
2654 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute },
2655 /* Sseregparm attribute says we are using x86_64 calling conventions
2656 for FP arguments. */
2657 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute },
2658 /* force_align_arg_pointer says this function realigns the stack at entry. */
2659 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
2660 false, true, true, ix86_handle_cconv_attribute },
2661 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
2662 { "dllimport", 0, 0, false, false, false, handle_dll_attribute },
2663 { "dllexport", 0, 0, false, false, false, handle_dll_attribute },
2664 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute },
2665 #endif
2666 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute },
2667 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute },
2668 #ifdef SUBTARGET_ATTRIBUTE_TABLE
2669 SUBTARGET_ATTRIBUTE_TABLE,
2670 #endif
2671 { NULL, 0, 0, false, false, false, NULL }
2672 };
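/* The table above is what accepts user-level declarations such as these
   (examples only, not part of the compiler):

     int  __attribute__((regparm (3))) sum3 (int, int, int);
     void __attribute__((fastcall))    callback (int);
     struct __attribute__((ms_struct)) S { char c; int i; };
*/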
2673
2674 /* Decide whether we can make a sibling call to a function. DECL is the
2675 declaration of the function being targeted by the call and EXP is the
2676 CALL_EXPR representing the call. */
2677
2678 static bool
2679 ix86_function_ok_for_sibcall (tree decl, tree exp)
2680 {
2681 tree func;
2682 rtx a, b;
2683
2684 /* If we are generating position-independent code, we cannot sibcall
2685 optimize any indirect call, or a direct call to a global function,
2686 as the PLT requires %ebx be live. */
2687 if (!TARGET_64BIT && flag_pic && (!decl || !targetm.binds_local_p (decl)))
2688 return false;
2689
2690 if (decl)
2691 func = decl;
2692 else
2693 {
2694 func = TREE_TYPE (TREE_OPERAND (exp, 0));
2695 if (POINTER_TYPE_P (func))
2696 func = TREE_TYPE (func);
2697 }
2698
2699 /* Check that the return value locations are the same. For example,
2700 if we are returning floats on the 80387 register stack, we cannot
2701 make a sibcall from a function that doesn't return a float to a
2702 function that does or, conversely, from a function that does return
2703 a float to a function that doesn't; the necessary stack adjustment
2704 would not be executed. This is also the place we notice
2705 differences in the return value ABI. Note that it is ok for one
2706 of the functions to have void return type as long as the return
2707 value of the other is passed in a register. */
2708 a = ix86_function_value (TREE_TYPE (exp), func, false);
2709 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
2710 cfun->decl, false);
2711 if (STACK_REG_P (a) || STACK_REG_P (b))
2712 {
2713 if (!rtx_equal_p (a, b))
2714 return false;
2715 }
2716 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
2717 ;
2718 else if (!rtx_equal_p (a, b))
2719 return false;
2720
2721 /* If this call is indirect, we'll need to be able to use a call-clobbered
2722 register for the address of the target function. Make sure that all
2723 such registers are not used for passing parameters. */
2724 if (!decl && !TARGET_64BIT)
2725 {
2726 tree type;
2727
2728 /* We're looking at the CALL_EXPR, we need the type of the function. */
2729 type = TREE_OPERAND (exp, 0); /* pointer expression */
2730 type = TREE_TYPE (type); /* pointer type */
2731 type = TREE_TYPE (type); /* function type */
2732
2733 if (ix86_function_regparm (type, NULL) >= 3)
2734 {
2735 /* ??? Need to count the actual number of registers to be used,
2736 not the possible number of registers. Fix later. */
2737 return false;
2738 }
2739 }
2740
2741 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
2742 /* Dllimport'd functions are also called indirectly. */
2743 if (decl && DECL_DLLIMPORT_P (decl)
2744 && ix86_function_regparm (TREE_TYPE (decl), NULL) >= 3)
2745 return false;
2746 #endif
2747
2748 /* If we force-aligned the stack, then sibcalling would unalign the
2749 stack, which may break the called function. */
2750 if (cfun->machine->force_align_arg_pointer)
2751 return false;
2752
2753 /* Otherwise okay. That also includes certain types of indirect calls. */
2754 return true;
2755 }
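/* Illustration of the PIC restriction above: in 32-bit PIC code a tail call
   through a function pointer, or to a global function, is not turned into a
   sibcall because the PLT sequence needs %ebx live, whereas a call to a
   local (static) function still can be.  */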
2756
2757 /* Handle "cdecl", "stdcall", "fastcall", "regparm" and "sseregparm"
2758 calling convention attributes;
2759 arguments as in struct attribute_spec.handler. */
2760
2761 static tree
2762 ix86_handle_cconv_attribute (tree *node, tree name,
2763 tree args,
2764 int flags ATTRIBUTE_UNUSED,
2765 bool *no_add_attrs)
2766 {
2767 if (TREE_CODE (*node) != FUNCTION_TYPE
2768 && TREE_CODE (*node) != METHOD_TYPE
2769 && TREE_CODE (*node) != FIELD_DECL
2770 && TREE_CODE (*node) != TYPE_DECL)
2771 {
2772 warning (OPT_Wattributes, "%qs attribute only applies to functions",
2773 IDENTIFIER_POINTER (name));
2774 *no_add_attrs = true;
2775 return NULL_TREE;
2776 }
2777
2778 /* Can combine regparm with all attributes but fastcall. */
2779 if (is_attribute_p ("regparm", name))
2780 {
2781 tree cst;
2782
2783 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
2784 {
2785 error ("fastcall and regparm attributes are not compatible");
2786 }
2787
2788 cst = TREE_VALUE (args);
2789 if (TREE_CODE (cst) != INTEGER_CST)
2790 {
2791 warning (OPT_Wattributes,
2792 "%qs attribute requires an integer constant argument",
2793 IDENTIFIER_POINTER (name));
2794 *no_add_attrs = true;
2795 }
2796 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
2797 {
2798 warning (OPT_Wattributes, "argument to %qs attribute larger than %d",
2799 IDENTIFIER_POINTER (name), REGPARM_MAX);
2800 *no_add_attrs = true;
2801 }
2802
2803 if (!TARGET_64BIT
2804 && lookup_attribute (ix86_force_align_arg_pointer_string,
2805 TYPE_ATTRIBUTES (*node))
2806 && compare_tree_int (cst, REGPARM_MAX-1))
2807 {
2808 error ("%s functions limited to %d register parameters",
2809 ix86_force_align_arg_pointer_string, REGPARM_MAX-1);
2810 }
2811
2812 return NULL_TREE;
2813 }
2814
2815 if (TARGET_64BIT)
2816 {
2817 warning (OPT_Wattributes, "%qs attribute ignored",
2818 IDENTIFIER_POINTER (name));
2819 *no_add_attrs = true;
2820 return NULL_TREE;
2821 }
2822
2823 /* Fastcall can be combined only with sseregparm; cdecl, stdcall and regparm are rejected below. */
2824 if (is_attribute_p ("fastcall", name))
2825 {
2826 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
2827 {
2828 error ("fastcall and cdecl attributes are not compatible");
2829 }
2830 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
2831 {
2832 error ("fastcall and stdcall attributes are not compatible");
2833 }
2834 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
2835 {
2836 error ("fastcall and regparm attributes are not compatible");
2837 }
2838 }
2839
2840 /* Stdcall can be combined with regparm and sseregparm; combining it
2841 with cdecl or fastcall is rejected below. */
2842 else if (is_attribute_p ("stdcall", name))
2843 {
2844 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
2845 {
2846 error ("stdcall and cdecl attributes are not compatible");
2847 }
2848 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
2849 {
2850 error ("stdcall and fastcall attributes are not compatible");
2851 }
2852 }
2853
2854 /* Can combine cdecl with regparm and sseregparm. */
2855 else if (is_attribute_p ("cdecl", name))
2856 {
2857 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
2858 {
2859 error ("stdcall and cdecl attributes are not compatible");
2860 }
2861 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
2862 {
2863 error ("fastcall and cdecl attributes are not compatible");
2864 }
2865 }
2866
2867 /* Can combine sseregparm with all attributes. */
2868
2869 return NULL_TREE;
2870 }
2871
2872 /* Return 0 if the attributes for two types are incompatible, 1 if they
2873 are compatible, and 2 if they are nearly compatible (which causes a
2874 warning to be generated). */
2875
2876 static int
2877 ix86_comp_type_attributes (tree type1, tree type2)
2878 {
2879 /* Check for mismatch of non-default calling convention. */
2880 const char *const rtdstr = TARGET_RTD ? "cdecl" : "stdcall";
2881
2882 if (TREE_CODE (type1) != FUNCTION_TYPE)
2883 return 1;
2884
2885 /* Check for mismatched fastcall/regparm types. */
2886 if ((!lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type1))
2887 != !lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type2)))
2888 || (ix86_function_regparm (type1, NULL)
2889 != ix86_function_regparm (type2, NULL)))
2890 return 0;
2891
2892 /* Check for mismatched sseregparm types. */
2893 if (!lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type1))
2894 != !lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type2)))
2895 return 0;
2896
2897 /* Check for a mismatched return convention (cdecl vs stdcall). */
2898 if (!lookup_attribute (rtdstr, TYPE_ATTRIBUTES (type1))
2899 != !lookup_attribute (rtdstr, TYPE_ATTRIBUTES (type2)))
2900 return 0;
2901
2902 return 1;
2903 }
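
/* Illustrative example (user-level code, not part of this file): taking the
   address of a __attribute__((fastcall)) function and assigning it to a
   plain function pointer of otherwise identical type makes the comparison
   above return 0, so the front end treats the two pointer types as
   incompatible.  */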
2904 \f
2905 /* Return the regparm value for a function with the indicated TYPE and DECL.
2906 DECL may be NULL when calling function indirectly
2907 or considering a libcall. */
2908
2909 static int
2910 ix86_function_regparm (tree type, tree decl)
2911 {
2912 tree attr;
2913 int regparm = ix86_regparm;
2914 bool user_convention = false;
2915
2916 if (!TARGET_64BIT)
2917 {
2918 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
2919 if (attr)
2920 {
2921 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
2922 user_convention = true;
2923 }
2924
2925 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type)))
2926 {
2927 regparm = 2;
2928 user_convention = true;
2929 }
2930
2931 /* Use register calling convention for local functions when possible. */
2932 if (!TARGET_64BIT && !user_convention && decl
2933 && flag_unit_at_a_time && !profile_flag)
2934 {
2935 struct cgraph_local_info *i = cgraph_local_info (decl);
2936 if (i && i->local)
2937 {
2938 int local_regparm, globals = 0, regno;
2939
2940 /* Make sure no regparm register is taken by a global register
2941 variable. */
2942 for (local_regparm = 0; local_regparm < 3; local_regparm++)
2943 if (global_regs[local_regparm])
2944 break;
2945 /* We can't use regparm(3) for nested functions, as these use the
2946 static chain pointer in the third argument. */
2947 if (local_regparm == 3
2948 && decl_function_context (decl)
2949 && !DECL_NO_STATIC_CHAIN (decl))
2950 local_regparm = 2;
2951 /* If the function realigns its stack pointer, the
2952 prologue will clobber %ecx. If we've already
2953 generated code for the callee, the callee
2954 DECL_STRUCT_FUNCTION is gone, so we fall back to
2955 scanning the attributes for the self-realigning
2956 property. */
2957 if ((DECL_STRUCT_FUNCTION (decl)
2958 && DECL_STRUCT_FUNCTION (decl)->machine->force_align_arg_pointer)
2959 || (!DECL_STRUCT_FUNCTION (decl)
2960 && lookup_attribute (ix86_force_align_arg_pointer_string,
2961 TYPE_ATTRIBUTES (TREE_TYPE (decl)))))
2962 local_regparm = 2;
2963 /* Each global register variable increases register pressure,
2964 so the more global reg vars there are, the smaller the benefit of the
2965 regparm optimization, unless the user requests it explicitly. */
2966 for (regno = 0; regno < 6; regno++)
2967 if (global_regs[regno])
2968 globals++;
2969 local_regparm
2970 = globals < local_regparm ? local_regparm - globals : 0;
2971
2972 if (local_regparm > regparm)
2973 regparm = local_regparm;
2974 }
2975 }
2976 }
2977 return regparm;
2978 }
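
/* Illustrative example (user-level code, not part of this file): with

     int __attribute__((regparm(3))) add3 (int a, int b, int c);

   the three integer arguments are passed in %eax, %edx and %ecx instead of
   on the stack, which is the value picked up from the regparm attribute
   above.  Local functions may be promoted to the same convention
   automatically when unit-at-a-time is in effect.  */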
2979
2980 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
2981 DFmode (2) arguments in SSE registers for a function with the
2982 indicated TYPE and DECL. DECL may be NULL when calling function
2983 indirectly or considering a libcall. Otherwise return 0. */
2984
2985 static int
2986 ix86_function_sseregparm (tree type, tree decl)
2987 {
2988 /* Use SSE registers to pass SFmode and DFmode arguments if requested
2989 by the sseregparm attribute. */
2990 if (TARGET_SSEREGPARM
2991 || (type
2992 && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
2993 {
2994 if (!TARGET_SSE)
2995 {
2996 if (decl)
2997 error ("Calling %qD with attribute sseregparm without "
2998 "SSE/SSE2 enabled", decl);
2999 else
3000 error ("Calling %qT with attribute sseregparm without "
3001 "SSE/SSE2 enabled", type);
3002 return 0;
3003 }
3004
3005 return 2;
3006 }
3007
3008 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
3009 (and DFmode for SSE2) arguments in SSE registers,
3010 even for 32-bit targets. */
3011 if (!TARGET_64BIT && decl
3012 && TARGET_SSE_MATH && flag_unit_at_a_time && !profile_flag)
3013 {
3014 struct cgraph_local_info *i = cgraph_local_info (decl);
3015 if (i && i->local)
3016 return TARGET_SSE2 ? 2 : 1;
3017 }
3018
3019 return 0;
3020 }
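
/* Illustrative example (user-level code, not part of this file): with

     double __attribute__((sseregparm)) scale (double x, double y);

   SFmode and DFmode arguments are passed in SSE registers on ia32 (the
   function above then returns 2), provided SSE is enabled; otherwise the
   error above is emitted.  */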
3021
3022 /* Return true if EAX is live at the start of the function. Used by
3023 ix86_expand_prologue to determine if we need special help before
3024 calling allocate_stack_worker. */
3025
3026 static bool
3027 ix86_eax_live_at_start_p (void)
3028 {
3029 /* Cheat. Don't bother working forward from ix86_function_regparm
3030 to the function type to whether an actual argument is located in
3031 eax. Instead just look at cfg info, which is still close enough
3032 to correct at this point. This gives false positives for broken
3033 functions that might use uninitialized data that happens to be
3034 allocated in eax, but who cares? */
3035 return REGNO_REG_SET_P (ENTRY_BLOCK_PTR->il.rtl->global_live_at_end, 0);
3036 }
3037
3038 /* Value is the number of bytes of arguments automatically
3039 popped when returning from a subroutine call.
3040 FUNDECL is the declaration node of the function (as a tree),
3041 FUNTYPE is the data type of the function (as a tree),
3042 or for a library call it is an identifier node for the subroutine name.
3043 SIZE is the number of bytes of arguments passed on the stack.
3044
3045 On the 80386, the RTD insn may be used to pop them if the number
3046 of args is fixed, but if the number is variable then the caller
3047 must pop them all. RTD can't be used for library calls now
3048 because the library is compiled with the Unix compiler.
3049 Use of RTD is a selectable option, since it is incompatible with
3050 standard Unix calling sequences. If the option is not selected,
3051 the caller must always pop the args.
3052
3053 The attribute stdcall is equivalent to RTD on a per module basis. */
3054
3055 int
3056 ix86_return_pops_args (tree fundecl, tree funtype, int size)
3057 {
3058 int rtd = TARGET_RTD && (!fundecl || TREE_CODE (fundecl) != IDENTIFIER_NODE);
3059
3060 /* Cdecl functions override -mrtd, and never pop the stack. */
3061 if (! lookup_attribute ("cdecl", TYPE_ATTRIBUTES (funtype))) {
3062
3063 /* Stdcall and fastcall functions will pop the stack if not
3064 variable args. */
3065 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (funtype))
3066 || lookup_attribute ("fastcall", TYPE_ATTRIBUTES (funtype)))
3067 rtd = 1;
3068
3069 if (rtd
3070 && (TYPE_ARG_TYPES (funtype) == NULL_TREE
3071 || (TREE_VALUE (tree_last (TYPE_ARG_TYPES (funtype)))
3072 == void_type_node)))
3073 return size;
3074 }
3075
3076 /* Lose any fake structure return argument if it is passed on the stack. */
3077 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
3078 && !TARGET_64BIT
3079 && !KEEP_AGGREGATE_RETURN_POINTER)
3080 {
3081 int nregs = ix86_function_regparm (funtype, fundecl);
3082
3083 if (!nregs)
3084 return GET_MODE_SIZE (Pmode);
3085 }
3086
3087 return 0;
3088 }
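
/* Illustrative example (user-level code, not part of this file): for

     void __attribute__((stdcall)) cb (int a, int b);

   the callee pops its 8 bytes of stack arguments (via "ret $8"), so the
   function above returns SIZE; a cdecl or variadic function returns 0 and
   the caller pops the arguments instead.  */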
3089 \f
3090 /* Argument support functions. */
3091
3092 /* Return true when register may be used to pass function parameters. */
3093 bool
3094 ix86_function_arg_regno_p (int regno)
3095 {
3096 int i;
3097 if (!TARGET_64BIT)
3098 {
3099 if (TARGET_MACHO)
3100 return (regno < REGPARM_MAX
3101 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
3102 else
3103 return (regno < REGPARM_MAX
3104 || (TARGET_MMX && MMX_REGNO_P (regno)
3105 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
3106 || (TARGET_SSE && SSE_REGNO_P (regno)
3107 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
3108 }
3109
3110 if (TARGET_MACHO)
3111 {
3112 if (SSE_REGNO_P (regno) && TARGET_SSE)
3113 return true;
3114 }
3115 else
3116 {
3117 if (TARGET_SSE && SSE_REGNO_P (regno)
3118 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
3119 return true;
3120 }
3121 /* RAX is used as hidden argument to va_arg functions. */
3122 if (!regno)
3123 return true;
3124 for (i = 0; i < REGPARM_MAX; i++)
3125 if (regno == x86_64_int_parameter_registers[i])
3126 return true;
3127 return false;
3128 }
3129
3130 /* Return true if we do not know how to pass TYPE solely in registers. */
3131
3132 static bool
3133 ix86_must_pass_in_stack (enum machine_mode mode, tree type)
3134 {
3135 if (must_pass_in_stack_var_size_or_pad (mode, type))
3136 return true;
3137
3138 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
3139 The layout_type routine is crafty and tries to trick us into passing
3140 currently unsupported vector types on the stack by using TImode. */
3141 return (!TARGET_64BIT && mode == TImode
3142 && type && TREE_CODE (type) != VECTOR_TYPE);
3143 }
3144
3145 /* Initialize a variable CUM of type CUMULATIVE_ARGS
3146 for a call to a function whose data type is FNTYPE.
3147 For a library call, FNTYPE is 0. */
3148
3149 void
3150 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
3151 tree fntype, /* tree ptr for function decl */
3152 rtx libname, /* SYMBOL_REF of library name or 0 */
3153 tree fndecl)
3154 {
3155 static CUMULATIVE_ARGS zero_cum;
3156 tree param, next_param;
3157
3158 if (TARGET_DEBUG_ARG)
3159 {
3160 fprintf (stderr, "\ninit_cumulative_args (");
3161 if (fntype)
3162 fprintf (stderr, "fntype code = %s, ret code = %s",
3163 tree_code_name[(int) TREE_CODE (fntype)],
3164 tree_code_name[(int) TREE_CODE (TREE_TYPE (fntype))]);
3165 else
3166 fprintf (stderr, "no fntype");
3167
3168 if (libname)
3169 fprintf (stderr, ", libname = %s", XSTR (libname, 0));
3170 }
3171
3172 *cum = zero_cum;
3173
3174 /* Set up the number of registers to use for passing arguments. */
3175 cum->nregs = ix86_regparm;
3176 if (TARGET_SSE)
3177 cum->sse_nregs = SSE_REGPARM_MAX;
3178 if (TARGET_MMX)
3179 cum->mmx_nregs = MMX_REGPARM_MAX;
3180 cum->warn_sse = true;
3181 cum->warn_mmx = true;
3182 cum->maybe_vaarg = false;
3183
3184 /* Use ecx and edx registers if function has fastcall attribute,
3185 else look for regparm information. */
3186 if (fntype && !TARGET_64BIT)
3187 {
3188 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)))
3189 {
3190 cum->nregs = 2;
3191 cum->fastcall = 1;
3192 }
3193 else
3194 cum->nregs = ix86_function_regparm (fntype, fndecl);
3195 }
3196
3197 /* Set up the number of SSE registers used for passing SFmode
3198 and DFmode arguments. Warn for mismatching ABI. */
3199 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl);
3200
3201 /* Determine if this function has variable arguments. This is
3202 indicated by the last argument being 'void_type_node' if there
3203 are no variable arguments. If there are variable arguments, then
3204 we won't pass anything in registers in 32-bit mode. */
3205
3206 if (cum->nregs || cum->mmx_nregs || cum->sse_nregs)
3207 {
3208 for (param = (fntype) ? TYPE_ARG_TYPES (fntype) : 0;
3209 param != 0; param = next_param)
3210 {
3211 next_param = TREE_CHAIN (param);
3212 if (next_param == 0 && TREE_VALUE (param) != void_type_node)
3213 {
3214 if (!TARGET_64BIT)
3215 {
3216 cum->nregs = 0;
3217 cum->sse_nregs = 0;
3218 cum->mmx_nregs = 0;
3219 cum->warn_sse = 0;
3220 cum->warn_mmx = 0;
3221 cum->fastcall = 0;
3222 cum->float_in_sse = 0;
3223 }
3224 cum->maybe_vaarg = true;
3225 }
3226 }
3227 }
3228 if ((!fntype && !libname)
3229 || (fntype && !TYPE_ARG_TYPES (fntype)))
3230 cum->maybe_vaarg = true;
3231
3232 if (TARGET_DEBUG_ARG)
3233 fprintf (stderr, ", nregs=%d )\n", cum->nregs);
3234
3235 return;
3236 }
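
/* Illustrative example: for a 32-bit call through a prototype such as
   "int printf (const char *, ...)", the argument list does not end in
   void_type_node, so the loop above clears nregs and the SSE/MMX counters;
   every argument then goes on the stack and maybe_vaarg is recorded.  */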
3237
3238 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
3239 But in the case of vector types, it is some vector mode.
3240
3241 When we have only some of our vector isa extensions enabled, then there
3242 are some modes for which vector_mode_supported_p is false. For these
3243 modes, the generic vector support in gcc will choose some non-vector mode
3244 in order to implement the type. By computing the natural mode, we'll
3245 select the proper ABI location for the operand and not depend on whatever
3246 the middle-end decides to do with these vector types. */
3247
3248 static enum machine_mode
3249 type_natural_mode (tree type)
3250 {
3251 enum machine_mode mode = TYPE_MODE (type);
3252
3253 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
3254 {
3255 HOST_WIDE_INT size = int_size_in_bytes (type);
3256 if ((size == 8 || size == 16)
3257 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
3258 && TYPE_VECTOR_SUBPARTS (type) > 1)
3259 {
3260 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
3261
3262 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
3263 mode = MIN_MODE_VECTOR_FLOAT;
3264 else
3265 mode = MIN_MODE_VECTOR_INT;
3266
3267 /* Get the mode which has this inner mode and number of units. */
3268 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
3269 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
3270 && GET_MODE_INNER (mode) == innermode)
3271 return mode;
3272
3273 gcc_unreachable ();
3274 }
3275 }
3276
3277 return mode;
3278 }
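
/* Illustrative example (user-level code, not part of this file): for

     typedef int v4si __attribute__((vector_size (16)));

   TYPE_MODE may end up as a non-vector mode when SSE is disabled, but
   type_natural_mode still computes V4SImode, so the ABI location chosen
   for such an argument does not depend on which ISA extensions are
   enabled.  */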
3279
3280 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
3281 this may not agree with the mode that the type system has chosen for the
3282 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
3283 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
3284
3285 static rtx
3286 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
3287 unsigned int regno)
3288 {
3289 rtx tmp;
3290
3291 if (orig_mode != BLKmode)
3292 tmp = gen_rtx_REG (orig_mode, regno);
3293 else
3294 {
3295 tmp = gen_rtx_REG (mode, regno);
3296 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
3297 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
3298 }
3299
3300 return tmp;
3301 }
3302
3303 /* x86-64 register passing implementation. See the x86-64 ABI for details. The
3304 goal of this code is to classify each 8-byte chunk of an incoming argument by
3305 register class and assign registers accordingly. */
3306
3307 /* Return the union class of CLASS1 and CLASS2.
3308 See the x86-64 PS ABI for details. */
3309
3310 static enum x86_64_reg_class
3311 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
3312 {
3313 /* Rule #1: If both classes are equal, this is the resulting class. */
3314 if (class1 == class2)
3315 return class1;
3316
3317 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
3318 the other class. */
3319 if (class1 == X86_64_NO_CLASS)
3320 return class2;
3321 if (class2 == X86_64_NO_CLASS)
3322 return class1;
3323
3324 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
3325 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
3326 return X86_64_MEMORY_CLASS;
3327
3328 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
3329 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
3330 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
3331 return X86_64_INTEGERSI_CLASS;
3332 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
3333 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
3334 return X86_64_INTEGER_CLASS;
3335
3336 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
3337 MEMORY is used. */
3338 if (class1 == X86_64_X87_CLASS
3339 || class1 == X86_64_X87UP_CLASS
3340 || class1 == X86_64_COMPLEX_X87_CLASS
3341 || class2 == X86_64_X87_CLASS
3342 || class2 == X86_64_X87UP_CLASS
3343 || class2 == X86_64_COMPLEX_X87_CLASS)
3344 return X86_64_MEMORY_CLASS;
3345
3346 /* Rule #6: Otherwise class SSE is used. */
3347 return X86_64_SSE_CLASS;
3348 }
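
/* Illustrative example of the rules above: merging X86_64_INTEGERSI_CLASS
   with X86_64_SSESF_CLASS yields X86_64_INTEGERSI_CLASS (rule #4), while
   merging X86_64_SSE_CLASS with X86_64_X87_CLASS yields
   X86_64_MEMORY_CLASS (rule #5).  */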
3349
3350 /* Classify the argument of type TYPE and mode MODE.
3351 CLASSES will be filled by the register class used to pass each word
3352 of the operand. The number of words is returned. In case the parameter
3353 should be passed in memory, 0 is returned. As a special case for zero
3354 sized containers, classes[0] will be NO_CLASS and 1 is returned.
3355
3356 BIT_OFFSET is used internally for handling records; it specifies the
3357 offset of the argument in bits, taken modulo 256 to avoid overflow cases.
3358
3359 See the x86-64 PS ABI for details.
3360 */
3361
3362 static int
3363 classify_argument (enum machine_mode mode, tree type,
3364 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
3365 {
3366 HOST_WIDE_INT bytes =
3367 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
3368 int words = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
3369
3370 /* Variable sized entities are always passed/returned in memory. */
3371 if (bytes < 0)
3372 return 0;
3373
3374 if (mode != VOIDmode
3375 && targetm.calls.must_pass_in_stack (mode, type))
3376 return 0;
3377
3378 if (type && AGGREGATE_TYPE_P (type))
3379 {
3380 int i;
3381 tree field;
3382 enum x86_64_reg_class subclasses[MAX_CLASSES];
3383
3384 /* On x86-64 we pass structures larger than 16 bytes on the stack. */
3385 if (bytes > 16)
3386 return 0;
3387
3388 for (i = 0; i < words; i++)
3389 classes[i] = X86_64_NO_CLASS;
3390
3391 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
3392 signal the memory class, so handle this as a special case. */
3393 if (!words)
3394 {
3395 classes[0] = X86_64_NO_CLASS;
3396 return 1;
3397 }
3398
3399 /* Classify each field of record and merge classes. */
3400 switch (TREE_CODE (type))
3401 {
3402 case RECORD_TYPE:
3403 /* And now merge the fields of structure. */
3404 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
3405 {
3406 if (TREE_CODE (field) == FIELD_DECL)
3407 {
3408 int num;
3409
3410 if (TREE_TYPE (field) == error_mark_node)
3411 continue;
3412
3413 /* Bitfields are always classified as integer. Handle them
3414 early, since later code would consider them to be
3415 misaligned integers. */
3416 if (DECL_BIT_FIELD (field))
3417 {
3418 for (i = (int_bit_position (field) + (bit_offset % 64)) / 8 / 8;
3419 i < ((int_bit_position (field) + (bit_offset % 64))
3420 + tree_low_cst (DECL_SIZE (field), 0)
3421 + 63) / 8 / 8; i++)
3422 classes[i] =
3423 merge_classes (X86_64_INTEGER_CLASS,
3424 classes[i]);
3425 }
3426 else
3427 {
3428 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
3429 TREE_TYPE (field), subclasses,
3430 (int_bit_position (field)
3431 + bit_offset) % 256);
3432 if (!num)
3433 return 0;
3434 for (i = 0; i < num; i++)
3435 {
3436 int pos =
3437 (int_bit_position (field) + (bit_offset % 64)) / 8 / 8;
3438 classes[i + pos] =
3439 merge_classes (subclasses[i], classes[i + pos]);
3440 }
3441 }
3442 }
3443 }
3444 break;
3445
3446 case ARRAY_TYPE:
3447 /* Arrays are handled as small records. */
3448 {
3449 int num;
3450 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
3451 TREE_TYPE (type), subclasses, bit_offset);
3452 if (!num)
3453 return 0;
3454
3455 /* The partial classes are now full classes. */
3456 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
3457 subclasses[0] = X86_64_SSE_CLASS;
3458 if (subclasses[0] == X86_64_INTEGERSI_CLASS && bytes != 4)
3459 subclasses[0] = X86_64_INTEGER_CLASS;
3460
3461 for (i = 0; i < words; i++)
3462 classes[i] = subclasses[i % num];
3463
3464 break;
3465 }
3466 case UNION_TYPE:
3467 case QUAL_UNION_TYPE:
3468 /* Unions are similar to RECORD_TYPE but the offset is always 0. */
3470 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
3471 {
3472 if (TREE_CODE (field) == FIELD_DECL)
3473 {
3474 int num;
3475
3476 if (TREE_TYPE (field) == error_mark_node)
3477 continue;
3478
3479 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
3480 TREE_TYPE (field), subclasses,
3481 bit_offset);
3482 if (!num)
3483 return 0;
3484 for (i = 0; i < num; i++)
3485 classes[i] = merge_classes (subclasses[i], classes[i]);
3486 }
3487 }
3488 break;
3489
3490 default:
3491 gcc_unreachable ();
3492 }
3493
3494 /* Final merger cleanup. */
3495 for (i = 0; i < words; i++)
3496 {
3497 /* If one class is MEMORY, everything should be passed in
3498 memory. */
3499 if (classes[i] == X86_64_MEMORY_CLASS)
3500 return 0;
3501
3502 /* The X86_64_SSEUP_CLASS should be always preceded by
3503 X86_64_SSE_CLASS. */
3504 if (classes[i] == X86_64_SSEUP_CLASS
3505 && (i == 0 || classes[i - 1] != X86_64_SSE_CLASS))
3506 classes[i] = X86_64_SSE_CLASS;
3507
3508 /* X86_64_X87UP_CLASS should be preceded by X86_64_X87_CLASS. */
3509 if (classes[i] == X86_64_X87UP_CLASS
3510 && (i == 0 || classes[i - 1] != X86_64_X87_CLASS))
3511 classes[i] = X86_64_SSE_CLASS;
3512 }
3513 return words;
3514 }
3515
3516 /* Compute alignment needed. We align all types to natural boundaries with
3517 exception of XFmode that is aligned to 64bits. */
3518 if (mode != VOIDmode && mode != BLKmode)
3519 {
3520 int mode_alignment = GET_MODE_BITSIZE (mode);
3521
3522 if (mode == XFmode)
3523 mode_alignment = 128;
3524 else if (mode == XCmode)
3525 mode_alignment = 256;
3526 if (COMPLEX_MODE_P (mode))
3527 mode_alignment /= 2;
3528 /* Misaligned fields are always returned in memory. */
3529 if (bit_offset % mode_alignment)
3530 return 0;
3531 }
3532
3533 /* For V1xx modes, just use the base mode. */
3534 if (VECTOR_MODE_P (mode)
3535 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
3536 mode = GET_MODE_INNER (mode);
3537
3538 /* Classification of atomic types. */
3539 switch (mode)
3540 {
3541 case SDmode:
3542 case DDmode:
3543 classes[0] = X86_64_SSE_CLASS;
3544 return 1;
3545 case TDmode:
3546 classes[0] = X86_64_SSE_CLASS;
3547 classes[1] = X86_64_SSEUP_CLASS;
3548 return 2;
3549 case DImode:
3550 case SImode:
3551 case HImode:
3552 case QImode:
3553 case CSImode:
3554 case CHImode:
3555 case CQImode:
3556 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
3557 classes[0] = X86_64_INTEGERSI_CLASS;
3558 else
3559 classes[0] = X86_64_INTEGER_CLASS;
3560 return 1;
3561 case CDImode:
3562 case TImode:
3563 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
3564 return 2;
3565 case CTImode:
3566 return 0;
3567 case SFmode:
3568 if (!(bit_offset % 64))
3569 classes[0] = X86_64_SSESF_CLASS;
3570 else
3571 classes[0] = X86_64_SSE_CLASS;
3572 return 1;
3573 case DFmode:
3574 classes[0] = X86_64_SSEDF_CLASS;
3575 return 1;
3576 case XFmode:
3577 classes[0] = X86_64_X87_CLASS;
3578 classes[1] = X86_64_X87UP_CLASS;
3579 return 2;
3580 case TFmode:
3581 classes[0] = X86_64_SSE_CLASS;
3582 classes[1] = X86_64_SSEUP_CLASS;
3583 return 2;
3584 case SCmode:
3585 classes[0] = X86_64_SSE_CLASS;
3586 return 1;
3587 case DCmode:
3588 classes[0] = X86_64_SSEDF_CLASS;
3589 classes[1] = X86_64_SSEDF_CLASS;
3590 return 2;
3591 case XCmode:
3592 classes[0] = X86_64_COMPLEX_X87_CLASS;
3593 return 1;
3594 case TCmode:
3595 /* This mode is larger than 16 bytes. */
3596 return 0;
3597 case V4SFmode:
3598 case V4SImode:
3599 case V16QImode:
3600 case V8HImode:
3601 case V2DFmode:
3602 case V2DImode:
3603 classes[0] = X86_64_SSE_CLASS;
3604 classes[1] = X86_64_SSEUP_CLASS;
3605 return 2;
3606 case V2SFmode:
3607 case V2SImode:
3608 case V4HImode:
3609 case V8QImode:
3610 classes[0] = X86_64_SSE_CLASS;
3611 return 1;
3612 case BLKmode:
3613 case VOIDmode:
3614 return 0;
3615 default:
3616 gcc_assert (VECTOR_MODE_P (mode));
3617
3618 if (bytes > 16)
3619 return 0;
3620
3621 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
3622
3623 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
3624 classes[0] = X86_64_INTEGERSI_CLASS;
3625 else
3626 classes[0] = X86_64_INTEGER_CLASS;
3627 classes[1] = X86_64_INTEGER_CLASS;
3628 return 1 + (bytes > 8);
3629 }
3630 }
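
/* Illustrative example of the classification above: for

     struct s { double d; int i; };

   the first eightbyte is classified as an SSE class (the double) and the
   second as an integer class (the int), so the struct is passed in one
   SSE register and one general purpose register, as the x86-64 psABI
   requires.  */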
3631
3632 /* Examine the argument and set the number of registers required in each
3633 class. Return 0 iff the parameter should be passed in memory. */
3634 static int
3635 examine_argument (enum machine_mode mode, tree type, int in_return,
3636 int *int_nregs, int *sse_nregs)
3637 {
3638 enum x86_64_reg_class class[MAX_CLASSES];
3639 int n = classify_argument (mode, type, class, 0);
3640
3641 *int_nregs = 0;
3642 *sse_nregs = 0;
3643 if (!n)
3644 return 0;
3645 for (n--; n >= 0; n--)
3646 switch (class[n])
3647 {
3648 case X86_64_INTEGER_CLASS:
3649 case X86_64_INTEGERSI_CLASS:
3650 (*int_nregs)++;
3651 break;
3652 case X86_64_SSE_CLASS:
3653 case X86_64_SSESF_CLASS:
3654 case X86_64_SSEDF_CLASS:
3655 (*sse_nregs)++;
3656 break;
3657 case X86_64_NO_CLASS:
3658 case X86_64_SSEUP_CLASS:
3659 break;
3660 case X86_64_X87_CLASS:
3661 case X86_64_X87UP_CLASS:
3662 if (!in_return)
3663 return 0;
3664 break;
3665 case X86_64_COMPLEX_X87_CLASS:
3666 return in_return ? 2 : 0;
3667 case X86_64_MEMORY_CLASS:
3668 gcc_unreachable ();
3669 }
3670 return 1;
3671 }
3672
3673 /* Construct container for the argument used by GCC interface. See
3674 FUNCTION_ARG for the detailed description. */
3675
3676 static rtx
3677 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
3678 tree type, int in_return, int nintregs, int nsseregs,
3679 const int *intreg, int sse_regno)
3680 {
3681 /* The following variables hold the static issued_error state. */
3682 static bool issued_sse_arg_error;
3683 static bool issued_sse_ret_error;
3684 static bool issued_x87_ret_error;
3685
3686 enum machine_mode tmpmode;
3687 int bytes =
3688 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
3689 enum x86_64_reg_class class[MAX_CLASSES];
3690 int n;
3691 int i;
3692 int nexps = 0;
3693 int needed_sseregs, needed_intregs;
3694 rtx exp[MAX_CLASSES];
3695 rtx ret;
3696
3697 n = classify_argument (mode, type, class, 0);
3698 if (TARGET_DEBUG_ARG)
3699 {
3700 if (!n)
3701 fprintf (stderr, "Memory class\n");
3702 else
3703 {
3704 fprintf (stderr, "Classes:");
3705 for (i = 0; i < n; i++)
3706 {
3707 fprintf (stderr, " %s", x86_64_reg_class_name[class[i]]);
3708 }
3709 fprintf (stderr, "\n");
3710 }
3711 }
3712 if (!n)
3713 return NULL;
3714 if (!examine_argument (mode, type, in_return, &needed_intregs,
3715 &needed_sseregs))
3716 return NULL;
3717 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
3718 return NULL;
3719
3720 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
3721 some less clueful developer tries to use floating-point anyway. */
3722 if (needed_sseregs && !TARGET_SSE)
3723 {
3724 if (in_return)
3725 {
3726 if (!issued_sse_ret_error)
3727 {
3728 error ("SSE register return with SSE disabled");
3729 issued_sse_ret_error = true;
3730 }
3731 }
3732 else if (!issued_sse_arg_error)
3733 {
3734 error ("SSE register argument with SSE disabled");
3735 issued_sse_arg_error = true;
3736 }
3737 return NULL;
3738 }
3739
3740 /* Likewise, error if the ABI requires us to return values in the
3741 x87 registers and the user specified -mno-80387. */
3742 if (!TARGET_80387 && in_return)
3743 for (i = 0; i < n; i++)
3744 if (class[i] == X86_64_X87_CLASS
3745 || class[i] == X86_64_X87UP_CLASS
3746 || class[i] == X86_64_COMPLEX_X87_CLASS)
3747 {
3748 if (!issued_x87_ret_error)
3749 {
3750 error ("x87 register return with x87 disabled");
3751 issued_x87_ret_error = true;
3752 }
3753 return NULL;
3754 }
3755
3756 /* First construct the simple cases. Avoid SCmode, since we want to use
3757 a single register to pass this type. */
3758 if (n == 1 && mode != SCmode)
3759 switch (class[0])
3760 {
3761 case X86_64_INTEGER_CLASS:
3762 case X86_64_INTEGERSI_CLASS:
3763 return gen_rtx_REG (mode, intreg[0]);
3764 case X86_64_SSE_CLASS:
3765 case X86_64_SSESF_CLASS:
3766 case X86_64_SSEDF_CLASS:
3767 return gen_reg_or_parallel (mode, orig_mode, SSE_REGNO (sse_regno));
3768 case X86_64_X87_CLASS:
3769 case X86_64_COMPLEX_X87_CLASS:
3770 return gen_rtx_REG (mode, FIRST_STACK_REG);
3771 case X86_64_NO_CLASS:
3772 /* Zero sized array, struct or class. */
3773 return NULL;
3774 default:
3775 gcc_unreachable ();
3776 }
3777 if (n == 2 && class[0] == X86_64_SSE_CLASS && class[1] == X86_64_SSEUP_CLASS
3778 && mode != BLKmode)
3779 return gen_rtx_REG (mode, SSE_REGNO (sse_regno));
3780 if (n == 2
3781 && class[0] == X86_64_X87_CLASS && class[1] == X86_64_X87UP_CLASS)
3782 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
3783 if (n == 2 && class[0] == X86_64_INTEGER_CLASS
3784 && class[1] == X86_64_INTEGER_CLASS
3785 && (mode == CDImode || mode == TImode || mode == TFmode)
3786 && intreg[0] + 1 == intreg[1])
3787 return gen_rtx_REG (mode, intreg[0]);
3788
3789 /* Otherwise figure out the entries of the PARALLEL. */
3790 for (i = 0; i < n; i++)
3791 {
3792 switch (class[i])
3793 {
3794 case X86_64_NO_CLASS:
3795 break;
3796 case X86_64_INTEGER_CLASS:
3797 case X86_64_INTEGERSI_CLASS:
3798 /* Merge TImodes on aligned occasions here too. */
3799 if (i * 8 + 8 > bytes)
3800 tmpmode = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
3801 else if (class[i] == X86_64_INTEGERSI_CLASS)
3802 tmpmode = SImode;
3803 else
3804 tmpmode = DImode;
3805 /* We've requested a size (e.g. 24 bits) we have no integer mode for. Use DImode. */
3806 if (tmpmode == BLKmode)
3807 tmpmode = DImode;
3808 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
3809 gen_rtx_REG (tmpmode, *intreg),
3810 GEN_INT (i*8));
3811 intreg++;
3812 break;
3813 case X86_64_SSESF_CLASS:
3814 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
3815 gen_rtx_REG (SFmode,
3816 SSE_REGNO (sse_regno)),
3817 GEN_INT (i*8));
3818 sse_regno++;
3819 break;
3820 case X86_64_SSEDF_CLASS:
3821 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
3822 gen_rtx_REG (DFmode,
3823 SSE_REGNO (sse_regno)),
3824 GEN_INT (i*8));
3825 sse_regno++;
3826 break;
3827 case X86_64_SSE_CLASS:
3828 if (i < n - 1 && class[i + 1] == X86_64_SSEUP_CLASS)
3829 tmpmode = TImode;
3830 else
3831 tmpmode = DImode;
3832 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
3833 gen_rtx_REG (tmpmode,
3834 SSE_REGNO (sse_regno)),
3835 GEN_INT (i*8));
3836 if (tmpmode == TImode)
3837 i++;
3838 sse_regno++;
3839 break;
3840 default:
3841 gcc_unreachable ();
3842 }
3843 }
3844
3845 /* Empty aligned struct, union or class. */
3846 if (nexps == 0)
3847 return NULL;
3848
3849 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
3850 for (i = 0; i < nexps; i++)
3851 XVECEXP (ret, 0, i) = exp [i];
3852 return ret;
3853 }
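
/* Illustrative example: for the struct { double d; int i; } case mentioned
   above, passed as the first argument, the loop builds a PARALLEL along the
   lines of

     (parallel [(expr_list (reg:DF xmm0) (const_int 0))
                (expr_list (reg:DI di)   (const_int 8))])

   i.e. the double at offset 0 in an SSE register and the remaining
   eightbyte at offset 8 in an integer register.  */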
3854
3855 /* Update the data in CUM to advance over an argument
3856 of mode MODE and data type TYPE.
3857 (TYPE is null for libcalls where that information may not be available.) */
3858
3859 void
3860 function_arg_advance (CUMULATIVE_ARGS *cum, enum machine_mode mode,
3861 tree type, int named)
3862 {
3863 int bytes =
3864 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
3865 int words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
3866
3867 if (type)
3868 mode = type_natural_mode (type);
3869
3870 if (TARGET_DEBUG_ARG)
3871 fprintf (stderr, "function_adv (sz=%d, wds=%2d, nregs=%d, ssenregs=%d, "
3872 "mode=%s, named=%d)\n\n",
3873 words, cum->words, cum->nregs, cum->sse_nregs,
3874 GET_MODE_NAME (mode), named);
3875
3876 if (TARGET_64BIT)
3877 {
3878 int int_nregs, sse_nregs;
3879 if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs))
3880 cum->words += words;
3881 else if (sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
3882 {
3883 cum->nregs -= int_nregs;
3884 cum->sse_nregs -= sse_nregs;
3885 cum->regno += int_nregs;
3886 cum->sse_regno += sse_nregs;
3887 }
3888 else
3889 cum->words += words;
3890 }
3891 else
3892 {
3893 switch (mode)
3894 {
3895 default:
3896 break;
3897
3898 case BLKmode:
3899 if (bytes < 0)
3900 break;
3901 /* FALLTHRU */
3902
3903 case DImode:
3904 case SImode:
3905 case HImode:
3906 case QImode:
3907 cum->words += words;
3908 cum->nregs -= words;
3909 cum->regno += words;
3910
3911 if (cum->nregs <= 0)
3912 {
3913 cum->nregs = 0;
3914 cum->regno = 0;
3915 }
3916 break;
3917
3918 case DFmode:
3919 if (cum->float_in_sse < 2)
3920 break;
3921 case SFmode:
3922 if (cum->float_in_sse < 1)
3923 break;
3924 /* FALLTHRU */
3925
3926 case TImode:
3927 case V16QImode:
3928 case V8HImode:
3929 case V4SImode:
3930 case V2DImode:
3931 case V4SFmode:
3932 case V2DFmode:
3933 if (!type || !AGGREGATE_TYPE_P (type))
3934 {
3935 cum->sse_words += words;
3936 cum->sse_nregs -= 1;
3937 cum->sse_regno += 1;
3938 if (cum->sse_nregs <= 0)
3939 {
3940 cum->sse_nregs = 0;
3941 cum->sse_regno = 0;
3942 }
3943 }
3944 break;
3945
3946 case V8QImode:
3947 case V4HImode:
3948 case V2SImode:
3949 case V2SFmode:
3950 if (!type || !AGGREGATE_TYPE_P (type))
3951 {
3952 cum->mmx_words += words;
3953 cum->mmx_nregs -= 1;
3954 cum->mmx_regno += 1;
3955 if (cum->mmx_nregs <= 0)
3956 {
3957 cum->mmx_nregs = 0;
3958 cum->mmx_regno = 0;
3959 }
3960 }
3961 break;
3962 }
3963 }
3964 }
3965
3966 /* Define where to put the arguments to a function.
3967 Value is zero to push the argument on the stack,
3968 or a hard register in which to store the argument.
3969
3970 MODE is the argument's machine mode.
3971 TYPE is the data type of the argument (as a tree).
3972 This is null for libcalls where that information may
3973 not be available.
3974 CUM is a variable of type CUMULATIVE_ARGS which gives info about
3975 the preceding args and about the function being called.
3976 NAMED is nonzero if this argument is a named parameter
3977 (otherwise it is an extra parameter matching an ellipsis). */
3978
3979 rtx
3980 function_arg (CUMULATIVE_ARGS *cum, enum machine_mode orig_mode,
3981 tree type, int named)
3982 {
3983 enum machine_mode mode = orig_mode;
3984 rtx ret = NULL_RTX;
3985 int bytes =
3986 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
3987 int words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
3988 static bool warnedsse, warnedmmx;
3989
3990 /* To simplify the code below, represent vector types with a vector mode
3991 even if MMX/SSE are not active. */
3992 if (type && TREE_CODE (type) == VECTOR_TYPE)
3993 mode = type_natural_mode (type);
3994
3995 /* Handle a hidden AL argument containing number of registers for varargs
3996 x86-64 functions. For i386 ABI just return constm1_rtx to avoid
3997 any AL settings. */
3998 if (mode == VOIDmode)
3999 {
4000 if (TARGET_64BIT)
4001 return GEN_INT (cum->maybe_vaarg
4002 ? (cum->sse_nregs < 0
4003 ? SSE_REGPARM_MAX
4004 : cum->sse_regno)
4005 : -1);
4006 else
4007 return constm1_rtx;
4008 }
4009 if (TARGET_64BIT)
4010 ret = construct_container (mode, orig_mode, type, 0, cum->nregs,
4011 cum->sse_nregs,
4012 &x86_64_int_parameter_registers [cum->regno],
4013 cum->sse_regno);
4014 else
4015 switch (mode)
4016 {
4017 /* For now, pass fp/complex values on the stack. */
4018 default:
4019 break;
4020
4021 case BLKmode:
4022 if (bytes < 0)
4023 break;
4024 /* FALLTHRU */
4025 case DImode:
4026 case SImode:
4027 case HImode:
4028 case QImode:
4029 if (words <= cum->nregs)
4030 {
4031 int regno = cum->regno;
4032
4033 /* Fastcall allocates the first two DWORD (SImode) or
4034 smaller arguments to ECX and EDX. */
4035 if (cum->fastcall)
4036 {
4037 if (mode == BLKmode || mode == DImode)
4038 break;
4039
4040 /* ECX, not EAX, is the first allocated register. */
4041 if (regno == 0)
4042 regno = 2;
4043 }
4044 ret = gen_rtx_REG (mode, regno);
4045 }
4046 break;
4047 case DFmode:
4048 if (cum->float_in_sse < 2)
4049 break;
4050 case SFmode:
4051 if (cum->float_in_sse < 1)
4052 break;
4053 /* FALLTHRU */
4054 case TImode:
4055 case V16QImode:
4056 case V8HImode:
4057 case V4SImode:
4058 case V2DImode:
4059 case V4SFmode:
4060 case V2DFmode:
4061 if (!type || !AGGREGATE_TYPE_P (type))
4062 {
4063 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
4064 {
4065 warnedsse = true;
4066 warning (0, "SSE vector argument without SSE enabled "
4067 "changes the ABI");
4068 }
4069 if (cum->sse_nregs)
4070 ret = gen_reg_or_parallel (mode, orig_mode,
4071 cum->sse_regno + FIRST_SSE_REG);
4072 }
4073 break;
4074 case V8QImode:
4075 case V4HImode:
4076 case V2SImode:
4077 case V2SFmode:
4078 if (!type || !AGGREGATE_TYPE_P (type))
4079 {
4080 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
4081 {
4082 warnedmmx = true;
4083 warning (0, "MMX vector argument without MMX enabled "
4084 "changes the ABI");
4085 }
4086 if (cum->mmx_nregs)
4087 ret = gen_reg_or_parallel (mode, orig_mode,
4088 cum->mmx_regno + FIRST_MMX_REG);
4089 }
4090 break;
4091 }
4092
4093 if (TARGET_DEBUG_ARG)
4094 {
4095 fprintf (stderr,
4096 "function_arg (size=%d, wds=%2d, nregs=%d, mode=%4s, named=%d, ",
4097 words, cum->words, cum->nregs, GET_MODE_NAME (mode), named);
4098
4099 if (ret)
4100 print_simple_rtl (stderr, ret);
4101 else
4102 fprintf (stderr, ", stack");
4103
4104 fprintf (stderr, " )\n");
4105 }
4106
4107 return ret;
4108 }
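
/* Illustrative example (user-level code, not part of this file): with

     void __attribute__((fastcall)) poke (int slot, int value);

   the two SImode arguments are passed in %ecx and %edx (see the
   cum->fastcall handling above); any further or larger arguments go on
   the stack.  */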
4109
4110 /* A C expression that indicates when an argument must be passed by
4111 reference. If nonzero for an argument, a copy of that argument is
4112 made in memory and a pointer to the argument is passed instead of
4113 the argument itself. The pointer is passed in whatever way is
4114 appropriate for passing a pointer to that type. */
4115
4116 static bool
4117 ix86_pass_by_reference (CUMULATIVE_ARGS *cum ATTRIBUTE_UNUSED,
4118 enum machine_mode mode ATTRIBUTE_UNUSED,
4119 tree type, bool named ATTRIBUTE_UNUSED)
4120 {
4121 if (!TARGET_64BIT)
4122 return 0;
4123
4124 if (type && int_size_in_bytes (type) == -1)
4125 {
4126 if (TARGET_DEBUG_ARG)
4127 fprintf (stderr, "function_arg_pass_by_reference\n");
4128 return 1;
4129 }
4130
4131 return 0;
4132 }
4133
4134 /* Return true when TYPE should be 128bit aligned for 32bit argument passing
4135 ABI. Only called if TARGET_SSE. */
4136 static bool
4137 contains_128bit_aligned_vector_p (tree type)
4138 {
4139 enum machine_mode mode = TYPE_MODE (type);
4140 if (SSE_REG_MODE_P (mode)
4141 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
4142 return true;
4143 if (TYPE_ALIGN (type) < 128)
4144 return false;
4145
4146 if (AGGREGATE_TYPE_P (type))
4147 {
4148 /* Walk the aggregates recursively. */
4149 switch (TREE_CODE (type))
4150 {
4151 case RECORD_TYPE:
4152 case UNION_TYPE:
4153 case QUAL_UNION_TYPE:
4154 {
4155 tree field;
4156
4157 /* Walk all the structure fields. */
4158 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
4159 {
4160 if (TREE_CODE (field) == FIELD_DECL
4161 && contains_128bit_aligned_vector_p (TREE_TYPE (field)))
4162 return true;
4163 }
4164 break;
4165 }
4166
4167 case ARRAY_TYPE:
4168 /* Just in case some languages pass arrays by value. */
4169 if (contains_128bit_aligned_vector_p (TREE_TYPE (type)))
4170 return true;
4171 break;
4172
4173 default:
4174 gcc_unreachable ();
4175 }
4176 }
4177 return false;
4178 }
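
/* Illustrative example (user-level code, not part of this file): a struct
   such as

     struct pkt { __m128i header; int len; };

   contains a 128-bit aligned vector member, so the walk above returns true
   and ix86_function_arg_boundary below gives the argument 128-bit
   alignment even in 32-bit mode.  */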
4179
4180 /* Gives the alignment boundary, in bits, of an argument with the
4181 specified mode and type. */
4182
4183 int
4184 ix86_function_arg_boundary (enum machine_mode mode, tree type)
4185 {
4186 int align;
4187 if (type)
4188 align = TYPE_ALIGN (type);
4189 else
4190 align = GET_MODE_ALIGNMENT (mode);
4191 if (align < PARM_BOUNDARY)
4192 align = PARM_BOUNDARY;
4193 if (!TARGET_64BIT)
4194 {
4195 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
4196 make an exception for SSE modes since these require 128bit
4197 alignment.
4198
4199 The handling here differs from field_alignment. ICC aligns MMX
4200 arguments to 4 byte boundaries, while structure fields are aligned
4201 to 8 byte boundaries. */
4202 if (!TARGET_SSE)
4203 align = PARM_BOUNDARY;
4204 else if (!type)
4205 {
4206 if (!SSE_REG_MODE_P (mode))
4207 align = PARM_BOUNDARY;
4208 }
4209 else
4210 {
4211 if (!contains_128bit_aligned_vector_p (type))
4212 align = PARM_BOUNDARY;
4213 }
4214 }
4215 if (align > 128)
4216 align = 128;
4217 return align;
4218 }
4219
4220 /* Return true if N is a possible register number of function value. */
4221 bool
4222 ix86_function_value_regno_p (int regno)
4223 {
4224 if (TARGET_MACHO)
4225 {
4226 if (!TARGET_64BIT)
4227 {
4228 return ((regno) == 0
4229 || ((regno) == FIRST_FLOAT_REG && TARGET_FLOAT_RETURNS_IN_80387)
4230 || ((regno) == FIRST_SSE_REG && TARGET_SSE));
4231 }
4232 return ((regno) == 0 || (regno) == FIRST_FLOAT_REG
4233 || ((regno) == FIRST_SSE_REG && TARGET_SSE)
4234 || ((regno) == FIRST_FLOAT_REG && TARGET_FLOAT_RETURNS_IN_80387));
4235 }
4236 else
4237 {
4238 if (regno == 0
4239 || (regno == FIRST_FLOAT_REG && TARGET_FLOAT_RETURNS_IN_80387)
4240 || (regno == FIRST_SSE_REG && TARGET_SSE))
4241 return true;
4242
4243 if (!TARGET_64BIT
4244 && (regno == FIRST_MMX_REG && TARGET_MMX))
4245 return true;
4246
4247 return false;
4248 }
4249 }
4250
4251 /* Define how to find the value returned by a function.
4252 VALTYPE is the data type of the value (as a tree).
4253 If the precise function being called is known, FUNC is its FUNCTION_DECL;
4254 otherwise, FUNC is 0. */
4255 rtx
4256 ix86_function_value (tree valtype, tree fntype_or_decl,
4257 bool outgoing ATTRIBUTE_UNUSED)
4258 {
4259 enum machine_mode natmode = type_natural_mode (valtype);
4260
4261 if (TARGET_64BIT)
4262 {
4263 rtx ret = construct_container (natmode, TYPE_MODE (valtype), valtype,
4264 1, REGPARM_MAX, SSE_REGPARM_MAX,
4265 x86_64_int_return_registers, 0);
4266 /* For zero sized structures, construct_container returns NULL, but we
4267 need to keep the rest of the compiler happy by returning a meaningful value. */
4268 if (!ret)
4269 ret = gen_rtx_REG (TYPE_MODE (valtype), 0);
4270 return ret;
4271 }
4272 else
4273 {
4274 tree fn = NULL_TREE, fntype;
4275 if (fntype_or_decl
4276 && DECL_P (fntype_or_decl))
4277 fn = fntype_or_decl;
4278 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
4279 return gen_rtx_REG (TYPE_MODE (valtype),
4280 ix86_value_regno (natmode, fn, fntype));
4281 }
4282 }
4283
4284 /* Return true iff type is returned in memory. */
4285 int
4286 ix86_return_in_memory (tree type)
4287 {
4288 int needed_intregs, needed_sseregs, size;
4289 enum machine_mode mode = type_natural_mode (type);
4290
4291 if (TARGET_64BIT)
4292 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
4293
4294 if (mode == BLKmode)
4295 return 1;
4296
4297 size = int_size_in_bytes (type);
4298
4299 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
4300 return 0;
4301
4302 if (VECTOR_MODE_P (mode) || mode == TImode)
4303 {
4304 /* User-created vectors small enough to fit in EAX. */
4305 if (size < 8)
4306 return 0;
4307
4308 /* MMX/3dNow values are returned in MM0,
4309 except when it doesn't exist. */
4310 if (size == 8)
4311 return (TARGET_MMX ? 0 : 1);
4312
4313 /* SSE values are returned in XMM0, except when it doesn't exist. */
4314 if (size == 16)
4315 return (TARGET_SSE ? 0 : 1);
4316 }
4317
4318 if (mode == XFmode)
4319 return 0;
4320
4321 if (mode == TDmode)
4322 return 1;
4323
4324 if (size > 12)
4325 return 1;
4326 return 0;
4327 }
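
/* Illustrative example: in 32-bit mode a 16-byte aggregate such as

     struct rect { int x, y, w, h; };

   has BLKmode and is therefore returned in memory through a hidden
   pointer, whereas an 8-byte __m64 value is returned in %mm0 when MMX is
   enabled and in memory otherwise, as handled above.  */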
4328
4329 /* When returning SSE vector types, we have a choice of either
4330 (1) being ABI incompatible with a -march switch, or
4331 (2) generating an error.
4332 Given no good solution, I think the safest thing is one warning.
4333 The user won't be able to use -Werror, but....
4334
4335 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
4336 called in response to actually generating a caller or callee that
4337 uses such a type. As opposed to RETURN_IN_MEMORY, which is called
4338 via aggregate_value_p for general type probing from tree-ssa. */
4339
4340 static rtx
4341 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
4342 {
4343 static bool warnedsse, warnedmmx;
4344
4345 if (type)
4346 {
4347 /* Look at the return type of the function, not the function type. */
4348 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
4349
4350 if (!TARGET_SSE && !warnedsse)
4351 {
4352 if (mode == TImode
4353 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
4354 {
4355 warnedsse = true;
4356 warning (0, "SSE vector return without SSE enabled "
4357 "changes the ABI");
4358 }
4359 }
4360
4361 if (!TARGET_MMX && !warnedmmx)
4362 {
4363 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
4364 {
4365 warnedmmx = true;
4366 warning (0, "MMX vector return without MMX enabled "
4367 "changes the ABI");
4368 }
4369 }
4370 }
4371
4372 return NULL;
4373 }
4374
4375 /* Define how to find the value returned by a library function
4376 assuming the value has mode MODE. */
4377 rtx
4378 ix86_libcall_value (enum machine_mode mode)
4379 {
4380 if (TARGET_64BIT)
4381 {
4382 switch (mode)
4383 {
4384 case SFmode:
4385 case SCmode:
4386 case DFmode:
4387 case DCmode:
4388 case TFmode:
4389 case SDmode:
4390 case DDmode:
4391 case TDmode:
4392 return gen_rtx_REG (mode, FIRST_SSE_REG);
4393 case XFmode:
4394 case XCmode:
4395 return gen_rtx_REG (mode, FIRST_FLOAT_REG);
4396 case TCmode:
4397 return NULL;
4398 default:
4399 return gen_rtx_REG (mode, 0);
4400 }
4401 }
4402 else
4403 return gen_rtx_REG (mode, ix86_value_regno (mode, NULL, NULL));
4404 }
4405
4406 /* Given a mode, return the register to use for a return value. */
4407
4408 static int
4409 ix86_value_regno (enum machine_mode mode, tree func, tree fntype)
4410 {
4411 gcc_assert (!TARGET_64BIT);
4412
4413 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
4414 we normally prevent this case when mmx is not available. However
4415 some ABIs may require the result to be returned like DImode. */
4416 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
4417 return TARGET_MMX ? FIRST_MMX_REG : 0;
4418
4419 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
4420 we prevent this case when sse is not available. However some ABIs
4421 may require the result to be returned like integer TImode. */
4422 if (mode == TImode || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
4423 return TARGET_SSE ? FIRST_SSE_REG : 0;
4424
4425 /* Decimal floating point values can go in %eax, unlike other float modes. */
4426 if (DECIMAL_FLOAT_MODE_P (mode))
4427 return 0;
4428
4429 /* Most things go in %eax, except (unless -mno-fp-ret-in-387) fp values. */
4430 if (!SCALAR_FLOAT_MODE_P (mode) || !TARGET_FLOAT_RETURNS_IN_80387)
4431 return 0;
4432
4433 /* Floating point return values in %st(0), except for local functions when
4434 SSE math is enabled or for functions with sseregparm attribute. */
4435 if ((func || fntype)
4436 && (mode == SFmode || mode == DFmode))
4437 {
4438 int sse_level = ix86_function_sseregparm (fntype, func);
4439 if ((sse_level >= 1 && mode == SFmode)
4440 || (sse_level == 2 && mode == DFmode))
4441 return FIRST_SSE_REG;
4442 }
4443
4444 return FIRST_FLOAT_REG;
4445 }
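
/* Illustrative example: a plain "double f (void)" returns its value in
   %st(0) on ia32, but with the sseregparm attribute (and SSE enabled), or
   for a local function when SSE2 math is in use, the same value comes back
   in %xmm0 instead.  */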
4446 \f
4447 /* Create the va_list data type. */
4448
4449 static tree
4450 ix86_build_builtin_va_list (void)
4451 {
4452 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
4453
4454 /* For i386 we use a plain pointer to the argument area. */
4455 if (!TARGET_64BIT)
4456 return build_pointer_type (char_type_node);
4457
4458 record = (*lang_hooks.types.make_type) (RECORD_TYPE);
4459 type_decl = build_decl (TYPE_DECL, get_identifier ("__va_list_tag"), record);
4460
4461 f_gpr = build_decl (FIELD_DECL, get_identifier ("gp_offset"),
4462 unsigned_type_node);
4463 f_fpr = build_decl (FIELD_DECL, get_identifier ("fp_offset"),
4464 unsigned_type_node);
4465 f_ovf = build_decl (FIELD_DECL, get_identifier ("overflow_arg_area"),
4466 ptr_type_node);
4467 f_sav = build_decl (FIELD_DECL, get_identifier ("reg_save_area"),
4468 ptr_type_node);
4469
4470 va_list_gpr_counter_field = f_gpr;
4471 va_list_fpr_counter_field = f_fpr;
4472
4473 DECL_FIELD_CONTEXT (f_gpr) = record;
4474 DECL_FIELD_CONTEXT (f_fpr) = record;
4475 DECL_FIELD_CONTEXT (f_ovf) = record;
4476 DECL_FIELD_CONTEXT (f_sav) = record;
4477
4478 TREE_CHAIN (record) = type_decl;
4479 TYPE_NAME (record) = type_decl;
4480 TYPE_FIELDS (record) = f_gpr;
4481 TREE_CHAIN (f_gpr) = f_fpr;
4482 TREE_CHAIN (f_fpr) = f_ovf;
4483 TREE_CHAIN (f_ovf) = f_sav;
4484
4485 layout_type (record);
4486
4487 /* The correct type is an array type of one element. */
4488 return build_array_type (record, build_index_type (size_zero_node));
4489 }
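
/* Illustrative sketch: the record built above corresponds to the usual
   x86-64 va_list layout, roughly

     typedef struct __va_list_tag {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } __builtin_va_list[1];

   while 32-bit targets keep the plain pointer form returned early on.  */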
4490
4491 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
4492
4493 static void
4494 ix86_setup_incoming_varargs (CUMULATIVE_ARGS *cum, enum machine_mode mode,
4495 tree type, int *pretend_size ATTRIBUTE_UNUSED,
4496 int no_rtl)
4497 {
4498 CUMULATIVE_ARGS next_cum;
4499 rtx save_area = NULL_RTX, mem;
4500 rtx label;
4501 rtx label_ref;
4502 rtx tmp_reg;
4503 rtx nsse_reg;
4504 int set;
4505 tree fntype;
4506 int stdarg_p;
4507 int i;
4508
4509 if (!TARGET_64BIT)
4510 return;
4511
4512 if (! cfun->va_list_gpr_size && ! cfun->va_list_fpr_size)
4513 return;
4514
4515 /* Indicate to allocate space on the stack for varargs save area. */
4516 ix86_save_varrargs_registers = 1;
4517
4518 cfun->stack_alignment_needed = 128;
4519
4520 fntype = TREE_TYPE (current_function_decl);
4521 stdarg_p = (TYPE_ARG_TYPES (fntype) != 0
4522 && (TREE_VALUE (tree_last (TYPE_ARG_TYPES (fntype)))
4523 != void_type_node));
4524
4525 /* For varargs, we do not want to skip the dummy va_dcl argument.
4526 For stdargs, we do want to skip the last named argument. */
4527 next_cum = *cum;
4528 if (stdarg_p)
4529 function_arg_advance (&next_cum, mode, type, 1);
4530
4531 if (!no_rtl)
4532 save_area = frame_pointer_rtx;
4533
4534 set = get_varargs_alias_set ();
4535
4536 for (i = next_cum.regno;
4537 i < ix86_regparm
4538 && i < next_cum.regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
4539 i++)
4540 {
4541 mem = gen_rtx_MEM (Pmode,
4542 plus_constant (save_area, i * UNITS_PER_WORD));
4543 MEM_NOTRAP_P (mem) = 1;
4544 set_mem_alias_set (mem, set);
4545 emit_move_insn (mem, gen_rtx_REG (Pmode,
4546 x86_64_int_parameter_registers[i]));
4547 }
4548
4549 if (next_cum.sse_nregs && cfun->va_list_fpr_size)
4550 {
4551 /* Now emit code to save SSE registers. The AX parameter contains the number
4552 of SSE parameter registers used to call this function. We use the
4553 sse_prologue_save insn template, which produces a computed jump across
4554 the SSE saves. We need some preparation work to get this working. */
4555
4556 label = gen_label_rtx ();
4557 label_ref = gen_rtx_LABEL_REF (Pmode, label);
4558
4559 /* Compute the address to jump to:
4560 label - eax*4 + nnamed_sse_arguments*4 */
4561 tmp_reg = gen_reg_rtx (Pmode);
4562 nsse_reg = gen_reg_rtx (Pmode);
4563 emit_insn (gen_zero_extendqidi2 (nsse_reg, gen_rtx_REG (QImode, 0)));
4564 emit_insn (gen_rtx_SET (VOIDmode, tmp_reg,
4565 gen_rtx_MULT (Pmode, nsse_reg,
4566 GEN_INT (4))));
4567 if (next_cum.sse_regno)
4568 emit_move_insn
4569 (nsse_reg,
4570 gen_rtx_CONST (DImode,
4571 gen_rtx_PLUS (DImode,
4572 label_ref,
4573 GEN_INT (next_cum.sse_regno * 4))));
4574 else
4575 emit_move_insn (nsse_reg, label_ref);
4576 emit_insn (gen_subdi3 (nsse_reg, nsse_reg, tmp_reg));
4577
4578 /* Compute the address of the memory block we save into. We always use a
4579 pointer pointing 127 bytes after the first byte to store - this is needed
4580 to keep the instruction size limited to 4 bytes. */
4581 tmp_reg = gen_reg_rtx (Pmode);
4582 emit_insn (gen_rtx_SET (VOIDmode, tmp_reg,
4583 plus_constant (save_area,
4584 8 * REGPARM_MAX + 127)));
4585 mem = gen_rtx_MEM (BLKmode, plus_constant (tmp_reg, -127));
4586 MEM_NOTRAP_P (mem) = 1;
4587 set_mem_alias_set (mem, set);
4588 set_mem_align (mem, BITS_PER_WORD);
4589
4590 /* And finally do the dirty job! */
4591 emit_insn (gen_sse_prologue_save (mem, nsse_reg,
4592 GEN_INT (next_cum.sse_regno), label));
4593 }
4594
4595 }
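
/* Illustrative layout of the save area set up above (x86-64): six 8-byte
   slots for %rdi, %rsi, %rdx, %rcx, %r8 and %r9 at offsets 0..40, followed
   by 16-byte slots for %xmm0..%xmm7 starting at offset 8 * REGPARM_MAX
   (48).  The gp_offset and fp_offset fields of the va_list index into this
   block.  */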
4596
4597 /* Implement va_start. */
4598
4599 void
4600 ix86_va_start (tree valist, rtx nextarg)
4601 {
4602 HOST_WIDE_INT words, n_gpr, n_fpr;
4603 tree f_gpr, f_fpr, f_ovf, f_sav;
4604 tree gpr, fpr, ovf, sav, t;
4605 tree type;
4606
4607 /* Only 64bit target needs something special. */
4608 if (!TARGET_64BIT)
4609 {
4610 std_expand_builtin_va_start (valist, nextarg);
4611 return;
4612 }
4613
4614 f_gpr = TYPE_FIELDS (TREE_TYPE (va_list_type_node));
4615 f_fpr = TREE_CHAIN (f_gpr);
4616 f_ovf = TREE_CHAIN (f_fpr);
4617 f_sav = TREE_CHAIN (f_ovf);
4618
4619 valist = build1 (INDIRECT_REF, TREE_TYPE (TREE_TYPE (valist)), valist);
4620 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), valist, f_gpr, NULL_TREE);
4621 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
4622 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
4623 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
4624
4625 /* Count number of gp and fp argument registers used. */
4626 words = current_function_args_info.words;
4627 n_gpr = current_function_args_info.regno;
4628 n_fpr = current_function_args_info.sse_regno;
4629
4630 if (TARGET_DEBUG_ARG)
4631 fprintf (stderr, "va_start: words = %d, n_gpr = %d, n_fpr = %d\n",
4632 (int) words, (int) n_gpr, (int) n_fpr);
4633
4634 if (cfun->va_list_gpr_size)
4635 {
4636 type = TREE_TYPE (gpr);
4637 t = build2 (GIMPLE_MODIFY_STMT, type, gpr,
4638 build_int_cst (type, n_gpr * 8));
4639 TREE_SIDE_EFFECTS (t) = 1;
4640 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
4641 }
4642
4643 if (cfun->va_list_fpr_size)
4644 {
4645 type = TREE_TYPE (fpr);
4646 t = build2 (GIMPLE_MODIFY_STMT, type, fpr,
4647 build_int_cst (type, n_fpr * 16 + 8*REGPARM_MAX));
4648 TREE_SIDE_EFFECTS (t) = 1;
4649 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
4650 }
4651
4652 /* Find the overflow area. */
4653 type = TREE_TYPE (ovf);
4654 t = make_tree (type, virtual_incoming_args_rtx);
4655 if (words != 0)
4656 t = build2 (PLUS_EXPR, type, t,
4657 build_int_cst (type, words * UNITS_PER_WORD));
4658 t = build2 (GIMPLE_MODIFY_STMT, type, ovf, t);
4659 TREE_SIDE_EFFECTS (t) = 1;
4660 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
4661
4662 if (cfun->va_list_gpr_size || cfun->va_list_fpr_size)
4663 {
4664 /* Find the register save area.
4665 The prologue of the function saves it right above the stack frame. */
4666 type = TREE_TYPE (sav);
4667 t = make_tree (type, frame_pointer_rtx);
4668 t = build2 (GIMPLE_MODIFY_STMT, type, sav, t);
4669 TREE_SIDE_EFFECTS (t) = 1;
4670 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
4671 }
4672 }
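
/* Illustrative example: for "int f (int a, ...)" on x86-64, one named
   integer argument has been consumed, so the code above sets gp_offset to
   8 and fp_offset to 8 * REGPARM_MAX (48, just past the integer save
   slots), points overflow_arg_area at the first stack argument and
   reg_save_area at the block saved by ix86_setup_incoming_varargs.  */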
4673
4674 /* Implement va_arg. */
4675
4676 tree
4677 ix86_gimplify_va_arg (tree valist, tree type, tree *pre_p, tree *post_p)
4678 {
4679 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
4680 tree f_gpr, f_fpr, f_ovf, f_sav;
4681 tree gpr, fpr, ovf, sav, t;
4682 int size, rsize;
4683 tree lab_false, lab_over = NULL_TREE;
4684 tree addr, t2;
4685 rtx container;
4686 int indirect_p = 0;
4687 tree ptrtype;
4688 enum machine_mode nat_mode;
4689
4690 /* Only 64bit target needs something special. */
4691 if (!TARGET_64BIT)
4692 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
4693
4694 f_gpr = TYPE_FIELDS (TREE_TYPE (va_list_type_node));
4695 f_fpr = TREE_CHAIN (f_gpr);
4696 f_ovf = TREE_CHAIN (f_fpr);
4697 f_sav = TREE_CHAIN (f_ovf);
4698
4699 valist = build_va_arg_indirect_ref (valist);
4700 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), valist, f_gpr, NULL_TREE);
4701 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
4702 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
4703 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
4704
4705 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
4706 if (indirect_p)
4707 type = build_pointer_type (type);
4708 size = int_size_in_bytes (type);
4709 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
4710
4711 nat_mode = type_natural_mode (type);
4712 container = construct_container (nat_mode, TYPE_MODE (type), type, 0,
4713 REGPARM_MAX, SSE_REGPARM_MAX, intreg, 0);
4714
4715 /* Pull the value out of the saved registers. */
4716
4717 addr = create_tmp_var (ptr_type_node, "addr");
4718 DECL_POINTER_ALIAS_SET (addr) = get_varargs_alias_set ();
4719
4720 if (container)
4721 {
4722 int needed_intregs, needed_sseregs;
4723 bool need_temp;
4724 tree int_addr, sse_addr;
4725
4726 lab_false = create_artificial_label ();
4727 lab_over = create_artificial_label ();
4728
4729 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
4730
4731 need_temp = (!REG_P (container)
4732 && ((needed_intregs && TYPE_ALIGN (type) > 64)
4733 || TYPE_ALIGN (type) > 128));
4734
4735 /* In case we are passing a structure, verify that it is a consecutive block
4736 on the register save area. If not, we need to do moves. */
4737 if (!need_temp && !REG_P (container))
4738 {
4739 /* Verify that all registers are strictly consecutive */
4740 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
4741 {
4742 int i;
4743
4744 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
4745 {
4746 rtx slot = XVECEXP (container, 0, i);
4747 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
4748 || INTVAL (XEXP (slot, 1)) != i * 16)
4749 need_temp = 1;
4750 }
4751 }
4752 else
4753 {
4754 int i;
4755
4756 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
4757 {
4758 rtx slot = XVECEXP (container, 0, i);
4759 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
4760 || INTVAL (XEXP (slot, 1)) != i * 8)
4761 need_temp = 1;
4762 }
4763 }
4764 }
4765 if (!need_temp)
4766 {
4767 int_addr = addr;
4768 sse_addr = addr;
4769 }
4770 else
4771 {
4772 int_addr = create_tmp_var (ptr_type_node, "int_addr");
4773 DECL_POINTER_ALIAS_SET (int_addr) = get_varargs_alias_set ();
4774 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
4775 DECL_POINTER_ALIAS_SET (sse_addr) = get_varargs_alias_set ();
4776 }
4777
4778 /* First ensure that we fit completely in registers. */
4779 if (needed_intregs)
4780 {
4781 t = build_int_cst (TREE_TYPE (gpr),
4782 (REGPARM_MAX - needed_intregs + 1) * 8);
4783 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
4784 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
4785 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
4786 gimplify_and_add (t, pre_p);
4787 }
4788 if (needed_sseregs)
4789 {
4790 t = build_int_cst (TREE_TYPE (fpr),
4791 (SSE_REGPARM_MAX - needed_sseregs + 1) * 16
4792 + REGPARM_MAX * 8);
4793 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
4794 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
4795 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
4796 gimplify_and_add (t, pre_p);
4797 }
4798
4799 /* Compute index to start of area used for integer regs. */
4800 if (needed_intregs)
4801 {
4802 /* int_addr = gpr + sav; */
4803 t = fold_convert (ptr_type_node, gpr);
4804 t = build2 (PLUS_EXPR, ptr_type_node, sav, t);
4805 t = build2 (GIMPLE_MODIFY_STMT, void_type_node, int_addr, t);
4806 gimplify_and_add (t, pre_p);
4807 }
4808 if (needed_sseregs)
4809 {
4810 /* sse_addr = fpr + sav; */
4811 t = fold_convert (ptr_type_node, fpr);
4812 t = build2 (PLUS_EXPR, ptr_type_node, sav, t);
4813 t = build2 (GIMPLE_MODIFY_STMT, void_type_node, sse_addr, t);
4814 gimplify_and_add (t, pre_p);
4815 }
4816 if (need_temp)
4817 {
4818 int i;
4819 tree temp = create_tmp_var (type, "va_arg_tmp");
4820
4821 /* addr = &temp; */
4822 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
4823 t = build2 (GIMPLE_MODIFY_STMT, void_type_node, addr, t);
4824 gimplify_and_add (t, pre_p);
4825
4826 for (i = 0; i < XVECLEN (container, 0); i++)
4827 {
4828 rtx slot = XVECEXP (container, 0, i);
4829 rtx reg = XEXP (slot, 0);
4830 enum machine_mode mode = GET_MODE (reg);
4831 tree piece_type = lang_hooks.types.type_for_mode (mode, 1);
4832 tree addr_type = build_pointer_type (piece_type);
4833 tree src_addr, src;
4834 int src_offset;
4835 tree dest_addr, dest;
4836
4837 if (SSE_REGNO_P (REGNO (reg)))
4838 {
4839 src_addr = sse_addr;
4840 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
4841 }
4842 else
4843 {
4844 src_addr = int_addr;
4845 src_offset = REGNO (reg) * 8;
4846 }
4847 src_addr = fold_convert (addr_type, src_addr);
4848 src_addr = fold (build2 (PLUS_EXPR, addr_type, src_addr,
4849 size_int (src_offset)));
4850 src = build_va_arg_indirect_ref (src_addr);
4851
4852 dest_addr = fold_convert (addr_type, addr);
4853 dest_addr = fold (build2 (PLUS_EXPR, addr_type, dest_addr,
4854 size_int (INTVAL (XEXP (slot, 1)))));
4855 dest = build_va_arg_indirect_ref (dest_addr);
4856
4857 t = build2 (GIMPLE_MODIFY_STMT, void_type_node, dest, src);
4858 gimplify_and_add (t, pre_p);
4859 }
4860 }
4861
4862 if (needed_intregs)
4863 {
4864 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
4865 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
4866 t = build2 (GIMPLE_MODIFY_STMT, TREE_TYPE (gpr), gpr, t);
4867 gimplify_and_add (t, pre_p);
4868 }
4869 if (needed_sseregs)
4870 {
4871 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
4872 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
4873 t = build2 (GIMPLE_MODIFY_STMT, TREE_TYPE (fpr), fpr, t);
4874 gimplify_and_add (t, pre_p);
4875 }
4876
4877 t = build1 (GOTO_EXPR, void_type_node, lab_over);
4878 gimplify_and_add (t, pre_p);
4879
4880 t = build1 (LABEL_EXPR, void_type_node, lab_false);
4881 append_to_statement_list (t, pre_p);
4882 }
4883
4884 /* ... otherwise out of the overflow area. */
4885
4886 /* Care for on-stack alignment if needed. */
4887 if (FUNCTION_ARG_BOUNDARY (VOIDmode, type) <= 64
4888 || integer_zerop (TYPE_SIZE (type)))
4889 t = ovf;
4890 else
4891 {
4892 HOST_WIDE_INT align = FUNCTION_ARG_BOUNDARY (VOIDmode, type) / 8;
4893 t = build2 (PLUS_EXPR, TREE_TYPE (ovf), ovf,
4894 build_int_cst (TREE_TYPE (ovf), align - 1));
4895 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
4896 build_int_cst (TREE_TYPE (t), -align));
4897 }
4898 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
4899
4900 t2 = build2 (GIMPLE_MODIFY_STMT, void_type_node, addr, t);
4901 gimplify_and_add (t2, pre_p);
4902
4903 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
4904 build_int_cst (TREE_TYPE (t), rsize * UNITS_PER_WORD));
4905 t = build2 (GIMPLE_MODIFY_STMT, TREE_TYPE (ovf), ovf, t);
4906 gimplify_and_add (t, pre_p);
4907
4908 if (container)
4909 {
4910 t = build1 (LABEL_EXPR, void_type_node, lab_over);
4911 append_to_statement_list (t, pre_p);
4912 }
4913
4914 ptrtype = build_pointer_type (type);
4915 addr = fold_convert (ptrtype, addr);
4916
4917 if (indirect_p)
4918 addr = build_va_arg_indirect_ref (addr);
4919 return build_va_arg_indirect_ref (addr);
4920 }
4921 \f
4922 /* Return nonzero if OPNUM's MEM should be matched
4923 in movabs* patterns. */
4924
4925 int
4926 ix86_check_movabs (rtx insn, int opnum)
4927 {
4928 rtx set, mem;
4929
4930 set = PATTERN (insn);
4931 if (GET_CODE (set) == PARALLEL)
4932 set = XVECEXP (set, 0, 0);
4933 gcc_assert (GET_CODE (set) == SET);
4934 mem = XEXP (set, opnum);
4935 while (GET_CODE (mem) == SUBREG)
4936 mem = SUBREG_REG (mem);
4937 gcc_assert (MEM_P (mem));
4938 return (volatile_ok || !MEM_VOLATILE_P (mem));
4939 }
4940 \f
4941 /* Initialize the table of extra 80387 mathematical constants. */
4942
4943 static void
4944 init_ext_80387_constants (void)
4945 {
4946 static const char * cst[5] =
4947 {
4948 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
4949 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
4950 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
4951 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
4952 "3.1415926535897932385128089594061862044", /* 4: fldpi */
4953 };
4954 int i;
4955
4956 for (i = 0; i < 5; i++)
4957 {
4958 real_from_string (&ext_80387_constants_table[i], cst[i]);
4959 /* Ensure each constant is rounded to XFmode precision. */
4960 real_convert (&ext_80387_constants_table[i],
4961 XFmode, &ext_80387_constants_table[i]);
4962 }
4963
4964 ext_80387_constants_init = 1;
4965 }
4966
4967 /* Return true if the constant is something that can be loaded with
4968 a special instruction. */
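/* The return value selects the instruction (see
   standard_80387_constant_opcode below): 0 none, 1 fldz, 2 fld1, 3 fldlg2,
   4 fldln2, 5 fldl2e, 6 fldl2t, 7 fldpi, and 8/9 for -0.0/-1.0, which are
   split into a load followed by fchs.  */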
4969
4970 int
4971 standard_80387_constant_p (rtx x)
4972 {
4973 REAL_VALUE_TYPE r;
4974
4975 if (GET_CODE (x) != CONST_DOUBLE || !FLOAT_MODE_P (GET_MODE (x)))
4976 return -1;
4977
4978 if (x == CONST0_RTX (GET_MODE (x)))
4979 return 1;
4980 if (x == CONST1_RTX (GET_MODE (x)))
4981 return 2;
4982
4983 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
4984
4985 /* For XFmode constants, try to find a special 80387 instruction when
4986 optimizing for size or on those CPUs that benefit from them. */
4987 if (GET_MODE (x) == XFmode
4988 && (optimize_size || x86_ext_80387_constants & TUNEMASK))
4989 {
4990 int i;
4991
4992 if (! ext_80387_constants_init)
4993 init_ext_80387_constants ();
4994
4995 for (i = 0; i < 5; i++)
4996 if (real_identical (&r, &ext_80387_constants_table[i]))
4997 return i + 3;
4998 }
4999
5000 /* A load of the constant -0.0 or -1.0 will be split into an
5001 fldz;fchs or fld1;fchs sequence. */
5002 if (real_isnegzero (&r))
5003 return 8;
5004 if (real_identical (&r, &dconstm1))
5005 return 9;
5006
5007 return 0;
5008 }
5009
5010 /* Return the opcode of the special instruction to be used to load
5011 the constant X. */
5012
5013 const char *
5014 standard_80387_constant_opcode (rtx x)
5015 {
5016 switch (standard_80387_constant_p (x))
5017 {
5018 case 1:
5019 return "fldz";
5020 case 2:
5021 return "fld1";
5022 case 3:
5023 return "fldlg2";
5024 case 4:
5025 return "fldln2";
5026 case 5:
5027 return "fldl2e";
5028 case 6:
5029 return "fldl2t";
5030 case 7:
5031 return "fldpi";
5032 case 8:
5033 case 9:
5034 return "#";
5035 default:
5036 gcc_unreachable ();
5037 }
5038 }
5039
5040 /* Return the CONST_DOUBLE representing the 80387 constant that is
5041 loaded by the specified special instruction. The argument IDX
5042 matches the return value from standard_80387_constant_p. */
5043
5044 rtx
5045 standard_80387_constant_rtx (int idx)
5046 {
5047 int i;
5048
5049 if (! ext_80387_constants_init)
5050 init_ext_80387_constants ();
5051
5052 switch (idx)
5053 {
5054 case 3:
5055 case 4:
5056 case 5:
5057 case 6:
5058 case 7:
5059 i = idx - 3;
5060 break;
5061
5062 default:
5063 gcc_unreachable ();
5064 }
5065
5066 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
5067 XFmode);
5068 }
5069
5070 /* Return 1 if MODE is a valid mode for SSE. */
5071 static int
5072 standard_sse_mode_p (enum machine_mode mode)
5073 {
5074 switch (mode)
5075 {
5076 case V16QImode:
5077 case V8HImode:
5078 case V4SImode:
5079 case V2DImode:
5080 case V4SFmode:
5081 case V2DFmode:
5082 return 1;
5083
5084 default:
5085 return 0;
5086 }
5087 }
5088
5089 /* Return nonzero if X is an FP or vector constant we can load into an SSE
5090 register without using memory: 1 for all-zeros, 2 for all-ones (SSE2 only). */
5091 int
5092 standard_sse_constant_p (rtx x)
5093 {
5094 enum machine_mode mode = GET_MODE (x);
5095
5096 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
5097 return 1;
5098 if (vector_all_ones_operand (x, mode)
5099 && standard_sse_mode_p (mode))
5100 return TARGET_SSE2 ? 2 : -1;
5101
5102 return 0;
5103 }
5104
5105 /* Return the opcode of the special instruction to be used to load
5106 the constant X. */
5107
5108 const char *
5109 standard_sse_constant_opcode (rtx insn, rtx x)
5110 {
5111 switch (standard_sse_constant_p (x))
5112 {
5113 case 1:
5114 if (get_attr_mode (insn) == MODE_V4SF)
5115 return "xorps\t%0, %0";
5116 else if (get_attr_mode (insn) == MODE_V2DF)
5117 return "xorpd\t%0, %0";
5118 else
5119 return "pxor\t%0, %0";
5120 case 2:
5121 return "pcmpeqd\t%0, %0";
5122 }
5123 gcc_unreachable ();
5124 }
5125
5126 /* Return 1 if OP contains a symbol reference. */
5127
5128 int
5129 symbolic_reference_mentioned_p (rtx op)
5130 {
5131 const char *fmt;
5132 int i;
5133
5134 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
5135 return 1;
5136
5137 fmt = GET_RTX_FORMAT (GET_CODE (op));
5138 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
5139 {
5140 if (fmt[i] == 'E')
5141 {
5142 int j;
5143
5144 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
5145 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
5146 return 1;
5147 }
5148
5149 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
5150 return 1;
5151 }
5152
5153 return 0;
5154 }
5155
5156 /* Return 1 if it is appropriate to emit `ret' instructions in the
5157 body of a function. Do this only if the epilogue is simple, needing a
5158 couple of insns. Prior to reloading, we can't tell how many registers
5159 must be saved, so return 0 then. Return 0 if there is no frame
5160 marker to de-allocate. */
5161
5162 int
5163 ix86_can_use_return_insn_p (void)
5164 {
5165 struct ix86_frame frame;
5166
5167 if (! reload_completed || frame_pointer_needed)
5168 return 0;
5169
5170 /* Don't allow more than 32K bytes of arguments to be popped, since
5171 that's all we handle with one instruction. */
5172 if (current_function_pops_args
5173 && current_function_args_size >= 32768)
5174 return 0;
5175
5176 ix86_compute_frame_layout (&frame);
5177 return frame.to_allocate == 0 && frame.nregs == 0;
5178 }
5179 \f
5180 /* Value should be nonzero if functions must have frame pointers.
5181 Zero means the frame pointer need not be set up (and parms may
5182 be accessed via the stack pointer) in functions that seem suitable. */
5183
5184 int
5185 ix86_frame_pointer_required (void)
5186 {
5187 /* If we accessed previous frames, then the generated code expects
5188 to be able to access the saved ebp value in our frame. */
5189 if (cfun->machine->accesses_prev_frame)
5190 return 1;
5191
5192 /* Several x86 OSes need a frame pointer for other reasons,
5193 usually pertaining to setjmp. */
5194 if (SUBTARGET_FRAME_POINTER_REQUIRED)
5195 return 1;
5196
5197 /* In override_options, TARGET_OMIT_LEAF_FRAME_POINTER turns off
5198 the frame pointer by default. Turn it back on now if we've not
5199 got a leaf function. */
5200 if (TARGET_OMIT_LEAF_FRAME_POINTER
5201 && (!current_function_is_leaf
5202 || ix86_current_function_calls_tls_descriptor))
5203 return 1;
5204
5205 if (current_function_profile)
5206 return 1;
5207
5208 return 0;
5209 }
5210
5211 /* Record that the current function accesses previous call frames. */
5212
5213 void
5214 ix86_setup_frame_addresses (void)
5215 {
5216 cfun->machine->accesses_prev_frame = 1;
5217 }
5218 \f
5219 #if (defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)) || TARGET_MACHO
5220 # define USE_HIDDEN_LINKONCE 1
5221 #else
5222 # define USE_HIDDEN_LINKONCE 0
5223 #endif
5224
5225 static int pic_labels_used;
5226
5227 /* Fills in the label name that should be used for a pc thunk for
5228 the given register. */
5229
5230 static void
5231 get_pc_thunk_name (char name[32], unsigned int regno)
5232 {
5233 gcc_assert (!TARGET_64BIT);
5234
5235 if (USE_HIDDEN_LINKONCE)
5236 sprintf (name, "__i686.get_pc_thunk.%s", reg_names[regno]);
5237 else
5238 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
5239 }
5240
5241
5242 /* This function emits, for each PIC register that was used with -fpic, a
5243 thunk that loads the register with its own return address and returns. */
5244
5245 void
5246 ix86_file_end (void)
5247 {
5248 rtx xops[2];
5249 int regno;
5250
5251 for (regno = 0; regno < 8; ++regno)
5252 {
5253 char name[32];
5254
5255 if (! ((pic_labels_used >> regno) & 1))
5256 continue;
5257
5258 get_pc_thunk_name (name, regno);
5259
5260 #if TARGET_MACHO
5261 if (TARGET_MACHO)
5262 {
5263 switch_to_section (darwin_sections[text_coal_section]);
5264 fputs ("\t.weak_definition\t", asm_out_file);
5265 assemble_name (asm_out_file, name);
5266 fputs ("\n\t.private_extern\t", asm_out_file);
5267 assemble_name (asm_out_file, name);
5268 fputs ("\n", asm_out_file);
5269 ASM_OUTPUT_LABEL (asm_out_file, name);
5270 }
5271 else
5272 #endif
5273 if (USE_HIDDEN_LINKONCE)
5274 {
5275 tree decl;
5276
5277 decl = build_decl (FUNCTION_DECL, get_identifier (name),
5278 error_mark_node);
5279 TREE_PUBLIC (decl) = 1;
5280 TREE_STATIC (decl) = 1;
5281 DECL_ONE_ONLY (decl) = 1;
5282
5283 (*targetm.asm_out.unique_section) (decl, 0);
5284 switch_to_section (get_named_section (decl, NULL, 0));
5285
5286 (*targetm.asm_out.globalize_label) (asm_out_file, name);
5287 fputs ("\t.hidden\t", asm_out_file);
5288 assemble_name (asm_out_file, name);
5289 fputc ('\n', asm_out_file);
5290 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
5291 }
5292 else
5293 {
5294 switch_to_section (text_section);
5295 ASM_OUTPUT_LABEL (asm_out_file, name);
5296 }
5297
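      /* The thunk body emitted below is simply a load of the return address
	 into the selected register followed by a return, e.g. for %ebx:

	     movl (%esp), %ebx
	     ret  */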
5298 xops[0] = gen_rtx_REG (SImode, regno);
5299 xops[1] = gen_rtx_MEM (SImode, stack_pointer_rtx);
5300 output_asm_insn ("mov{l}\t{%1, %0|%0, %1}", xops);
5301 output_asm_insn ("ret", xops);
5302 }
5303
5304 if (NEED_INDICATE_EXEC_STACK)
5305 file_end_indicate_exec_stack ();
5306 }
5307
5308 /* Emit code for the SET_GOT patterns. */
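/* Without deep branch prediction (or without PIC at all) the GOT pointer is
   set up with an inline call/pop pair; otherwise we call the per-register pc
   thunk emitted by ix86_file_end.  Schematically, the non-thunk PIC sequence
   is (a sketch of the output, not a literal template; the GOT symbol is
   normally _GLOBAL_OFFSET_TABLE_):

	call	1f
   1:	popl	%reg
	addl	$_GLOBAL_OFFSET_TABLE_+[.-1b], %reg  */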
5309
5310 const char *
5311 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
5312 {
5313 rtx xops[3];
5314
5315 xops[0] = dest;
5316 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
5317
5318 if (! TARGET_DEEP_BRANCH_PREDICTION || !flag_pic)
5319 {
5320 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
5321
5322 if (!flag_pic)
5323 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
5324 else
5325 output_asm_insn ("call\t%a2", xops);
5326
5327 #if TARGET_MACHO
5328 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
5329 is what will be referenced by the Mach-O PIC subsystem. */
5330 if (!label)
5331 ASM_OUTPUT_LABEL (asm_out_file, machopic_function_base_name ());
5332 #endif
5333
5334 (*targetm.asm_out.internal_label) (asm_out_file, "L",
5335 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
5336
5337 if (flag_pic)
5338 output_asm_insn ("pop{l}\t%0", xops);
5339 }
5340 else
5341 {
5342 char name[32];
5343 get_pc_thunk_name (name, REGNO (dest));
5344 pic_labels_used |= 1 << REGNO (dest);
5345
5346 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
5347 xops[2] = gen_rtx_MEM (QImode, xops[2]);
5348 output_asm_insn ("call\t%X2", xops);
5349 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
5350 is what will be referenced by the Mach-O PIC subsystem. */
5351 #if TARGET_MACHO
5352 if (!label)
5353 ASM_OUTPUT_LABEL (asm_out_file, machopic_function_base_name ());
5354 else
5355 targetm.asm_out.internal_label (asm_out_file, "L",
5356 CODE_LABEL_NUMBER (label));
5357 #endif
5358 }
5359
5360 if (TARGET_MACHO)
5361 return "";
5362
5363 if (!flag_pic || TARGET_DEEP_BRANCH_PREDICTION)
5364 output_asm_insn ("add{l}\t{%1, %0|%0, %1}", xops);
5365 else
5366 output_asm_insn ("add{l}\t{%1+[.-%a2], %0|%0, %1+(.-%a2)}", xops);
5367
5368 return "";
5369 }
5370
5371 /* Generate a "push" pattern for input ARG. */
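/* On a 32-bit target (Pmode == SImode) this produces RTL of the form
       (set (mem:SI (pre_dec:SI (reg:SI sp))) ARG)
   i.e. an ordinary push of ARG (a sketch; the modes follow Pmode).  */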
5372
5373 static rtx
5374 gen_push (rtx arg)
5375 {
5376 return gen_rtx_SET (VOIDmode,
5377 gen_rtx_MEM (Pmode,
5378 gen_rtx_PRE_DEC (Pmode,
5379 stack_pointer_rtx)),
5380 arg);
5381 }
5382
5383 /* Return the number of an unused call-clobbered register that is available
5384 for the entire function, or INVALID_REGNUM if there is none. */
5385
5386 static unsigned int
5387 ix86_select_alt_pic_regnum (void)
5388 {
5389 if (current_function_is_leaf && !current_function_profile
5390 && !ix86_current_function_calls_tls_descriptor)
5391 {
5392 int i;
5393 for (i = 2; i >= 0; --i)
5394 if (!regs_ever_live[i])
5395 return i;
5396 }
5397
5398 return INVALID_REGNUM;
5399 }
5400
5401 /* Return 1 if we need to save REGNO. */
5402 static int
5403 ix86_save_reg (unsigned int regno, int maybe_eh_return)
5404 {
5405 if (pic_offset_table_rtx
5406 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
5407 && (regs_ever_live[REAL_PIC_OFFSET_TABLE_REGNUM]
5408 || current_function_profile
5409 || current_function_calls_eh_return
5410 || current_function_uses_const_pool))
5411 {
5412 if (ix86_select_alt_pic_regnum () != INVALID_REGNUM)
5413 return 0;
5414 return 1;
5415 }
5416
5417 if (current_function_calls_eh_return && maybe_eh_return)
5418 {
5419 unsigned i;
5420 for (i = 0; ; i++)
5421 {
5422 unsigned test = EH_RETURN_DATA_REGNO (i);
5423 if (test == INVALID_REGNUM)
5424 break;
5425 if (test == regno)
5426 return 1;
5427 }
5428 }
5429
5430 if (cfun->machine->force_align_arg_pointer
5431 && regno == REGNO (cfun->machine->force_align_arg_pointer))
5432 return 1;
5433
5434 return (regs_ever_live[regno]
5435 && !call_used_regs[regno]
5436 && !fixed_regs[regno]
5437 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
5438 }
5439
5440 /* Return number of registers to be saved on the stack. */
5441
5442 static int
5443 ix86_nsaved_regs (void)
5444 {
5445 int nregs = 0;
5446 int regno;
5447
5448 for (regno = FIRST_PSEUDO_REGISTER - 1; regno >= 0; regno--)
5449 if (ix86_save_reg (regno, true))
5450 nregs++;
5451 return nregs;
5452 }
5453
5454 /* Return the offset between two registers, one to be eliminated, and the other
5455 its replacement, at the start of a routine. */
5456
5457 HOST_WIDE_INT
5458 ix86_initial_elimination_offset (int from, int to)
5459 {
5460 struct ix86_frame frame;
5461 ix86_compute_frame_layout (&frame);
5462
5463 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5464 return frame.hard_frame_pointer_offset;
5465 else if (from == FRAME_POINTER_REGNUM
5466 && to == HARD_FRAME_POINTER_REGNUM)
5467 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
5468 else
5469 {
5470 gcc_assert (to == STACK_POINTER_REGNUM);
5471
5472 if (from == ARG_POINTER_REGNUM)
5473 return frame.stack_pointer_offset;
5474
5475 gcc_assert (from == FRAME_POINTER_REGNUM);
5476 return frame.stack_pointer_offset - frame.frame_pointer_offset;
5477 }
5478 }
5479
5480 /* Fill structure ix86_frame about frame of currently computed function. */
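/* A rough sketch of the frame regions computed below, from the top of the
   frame (incoming return address) downwards; a summary of this function only,
   with offsets measured from the frame top:

	return address and saved frame pointer   (ends at hard_frame_pointer_offset)
	register save area (nregs words)
	va_arg register save area
	padding1                                  (ends at frame_pointer_offset)
	local variables
	outgoing arguments area
	padding2                                  (ends at stack_pointer_offset)  */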
5481
5482 static void
5483 ix86_compute_frame_layout (struct ix86_frame *frame)
5484 {
5485 HOST_WIDE_INT total_size;
5486 unsigned int stack_alignment_needed;
5487 HOST_WIDE_INT offset;
5488 unsigned int preferred_alignment;
5489 HOST_WIDE_INT size = get_frame_size ();
5490
5491 frame->nregs = ix86_nsaved_regs ();
5492 total_size = size;
5493
5494 stack_alignment_needed = cfun->stack_alignment_needed / BITS_PER_UNIT;
5495 preferred_alignment = cfun->preferred_stack_boundary / BITS_PER_UNIT;
5496
5497 /* During reload iteration the number of registers saved can change.
5498 Recompute the value as needed. Do not recompute when the number of registers
5499 didn't change, as reload calls this function multiple times and does not
5500 expect the decision to change within a single iteration. */
5501 if (!optimize_size
5502 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
5503 {
5504 int count = frame->nregs;
5505
5506 cfun->machine->use_fast_prologue_epilogue_nregs = count;
5507 /* The fast prologue uses moves instead of pushes to save registers. This
5508 is significantly longer, but it also executes faster, as modern hardware
5509 can execute the moves in parallel but cannot do so for push/pop.
5510
5511 Be careful about choosing which prologue to emit: when the function takes
5512 many instructions to execute, we may as well use the slow version, likewise
5513 when the function is known to be outside a hot spot (this is known only
5514 with profile feedback). Weight the size of the function by the number of
5515 registers to save, as it is cheap to use one or two push instructions but
5516 very slow to use many of them. */
5517 if (count)
5518 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
5519 if (cfun->function_frequency < FUNCTION_FREQUENCY_NORMAL
5520 || (flag_branch_probabilities
5521 && cfun->function_frequency < FUNCTION_FREQUENCY_HOT))
5522 cfun->machine->use_fast_prologue_epilogue = false;
5523 else
5524 cfun->machine->use_fast_prologue_epilogue
5525 = !expensive_function_p (count);
5526 }
5527 if (TARGET_PROLOGUE_USING_MOVE
5528 && cfun->machine->use_fast_prologue_epilogue)
5529 frame->save_regs_using_mov = true;
5530 else
5531 frame->save_regs_using_mov = false;
5532
5533
5534 /* Skip return address and saved base pointer. */
5535 offset = frame_pointer_needed ? UNITS_PER_WORD * 2 : UNITS_PER_WORD;
5536
5537 frame->hard_frame_pointer_offset = offset;
5538
5539 /* Do some sanity checking of stack_alignment_needed and
5540 preferred_alignment, since the i386 port is the only one using these
5541 features, and they may break easily. */
5542
5543 gcc_assert (!size || stack_alignment_needed);
5544 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
5545 gcc_assert (preferred_alignment <= PREFERRED_STACK_BOUNDARY / BITS_PER_UNIT);
5546 gcc_assert (stack_alignment_needed
5547 <= PREFERRED_STACK_BOUNDARY / BITS_PER_UNIT);
5548
5549 if (stack_alignment_needed < STACK_BOUNDARY / BITS_PER_UNIT)
5550 stack_alignment_needed = STACK_BOUNDARY / BITS_PER_UNIT;
5551
5552 /* Register save area */
5553 offset += frame->nregs * UNITS_PER_WORD;
5554
5555 /* Va-arg area */
5556 if (ix86_save_varrargs_registers)
5557 {
5558 offset += X86_64_VARARGS_SIZE;
5559 frame->va_arg_size = X86_64_VARARGS_SIZE;
5560 }
5561 else
5562 frame->va_arg_size = 0;
5563
5564 /* Align start of frame for local function. */
5565 frame->padding1 = ((offset + stack_alignment_needed - 1)
5566 & -stack_alignment_needed) - offset;
5567
5568 offset += frame->padding1;
5569
5570 /* Frame pointer points here. */
5571 frame->frame_pointer_offset = offset;
5572
5573 offset += size;
5574
5575 /* Add the outgoing arguments area. It can be skipped if we eliminated
5576 all the function calls as dead code.
5577 Skipping is however impossible when the function calls alloca, as the
5578 alloca expander assumes that the last current_function_outgoing_args_size
5579 bytes of the stack frame are unused. */
5580 if (ACCUMULATE_OUTGOING_ARGS
5581 && (!current_function_is_leaf || current_function_calls_alloca
5582 || ix86_current_function_calls_tls_descriptor))
5583 {
5584 offset += current_function_outgoing_args_size;
5585 frame->outgoing_arguments_size = current_function_outgoing_args_size;
5586 }
5587 else
5588 frame->outgoing_arguments_size = 0;
5589
5590 /* Align stack boundary. Only needed if we're calling another function
5591 or using alloca. */
5592 if (!current_function_is_leaf || current_function_calls_alloca
5593 || ix86_current_function_calls_tls_descriptor)
5594 frame->padding2 = ((offset + preferred_alignment - 1)
5595 & -preferred_alignment) - offset;
5596 else
5597 frame->padding2 = 0;
5598
5599 offset += frame->padding2;
5600
5601 /* We've reached end of stack frame. */
5602 frame->stack_pointer_offset = offset;
5603
5604 /* Size the prologue needs to allocate. */
5605 frame->to_allocate =
5606 (size + frame->padding1 + frame->padding2
5607 + frame->outgoing_arguments_size + frame->va_arg_size);
5608
5609 if ((!frame->to_allocate && frame->nregs <= 1)
5610 || (TARGET_64BIT && frame->to_allocate >= (HOST_WIDE_INT) 0x80000000))
5611 frame->save_regs_using_mov = false;
5612
5613 if (TARGET_RED_ZONE && current_function_sp_is_unchanging
5614 && current_function_is_leaf
5615 && !ix86_current_function_calls_tls_descriptor)
5616 {
5617 frame->red_zone_size = frame->to_allocate;
5618 if (frame->save_regs_using_mov)
5619 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
5620 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
5621 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
5622 }
5623 else
5624 frame->red_zone_size = 0;
5625 frame->to_allocate -= frame->red_zone_size;
5626 frame->stack_pointer_offset -= frame->red_zone_size;
5627 #if 0
5628 fprintf (stderr, "\n");
5629 fprintf (stderr, "nregs: %ld\n", (long)frame->nregs);
5630 fprintf (stderr, "size: %ld\n", (long)size);
5631 fprintf (stderr, "alignment1: %ld\n", (long)stack_alignment_needed);
5632 fprintf (stderr, "padding1: %ld\n", (long)frame->padding1);
5633 fprintf (stderr, "va_arg: %ld\n", (long)frame->va_arg_size);
5634 fprintf (stderr, "padding2: %ld\n", (long)frame->padding2);
5635 fprintf (stderr, "to_allocate: %ld\n", (long)frame->to_allocate);
5636 fprintf (stderr, "red_zone_size: %ld\n", (long)frame->red_zone_size);
5637 fprintf (stderr, "frame_pointer_offset: %ld\n", (long)frame->frame_pointer_offset);
5638 fprintf (stderr, "hard_frame_pointer_offset: %ld\n",
5639 (long)frame->hard_frame_pointer_offset);
5640 fprintf (stderr, "stack_pointer_offset: %ld\n", (long)frame->stack_pointer_offset);
5641 fprintf (stderr, "current_function_is_leaf: %ld\n", (long)current_function_is_leaf);
5642 fprintf (stderr, "current_function_calls_alloca: %ld\n", (long)current_function_calls_alloca);
5643 fprintf (stderr, "x86_current_function_calls_tls_descriptor: %ld\n", (long)ix86_current_function_calls_tls_descriptor);
5644 #endif
5645 }
5646
5647 /* Emit code to save registers in the prologue. */
5648
5649 static void
5650 ix86_emit_save_regs (void)
5651 {
5652 unsigned int regno;
5653 rtx insn;
5654
5655 for (regno = FIRST_PSEUDO_REGISTER; regno-- > 0; )
5656 if (ix86_save_reg (regno, true))
5657 {
5658 insn = emit_insn (gen_push (gen_rtx_REG (Pmode, regno)));
5659 RTX_FRAME_RELATED_P (insn) = 1;
5660 }
5661 }
5662
5663 /* Emit code to save registers using MOV insns. The first register
5664 is saved at POINTER + OFFSET. */
5665 static void
5666 ix86_emit_save_regs_using_mov (rtx pointer, HOST_WIDE_INT offset)
5667 {
5668 unsigned int regno;
5669 rtx insn;
5670
5671 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
5672 if (ix86_save_reg (regno, true))
5673 {
5674 insn = emit_move_insn (adjust_address (gen_rtx_MEM (Pmode, pointer),
5675 Pmode, offset),
5676 gen_rtx_REG (Pmode, regno));
5677 RTX_FRAME_RELATED_P (insn) = 1;
5678 offset += UNITS_PER_WORD;
5679 }
5680 }
5681
5682 /* Expand prologue or epilogue stack adjustment.
5683 The pattern exists to put a dependency on all ebp-based memory accesses.
5684 STYLE should be negative if instructions should be marked as frame related,
5685 zero if the %r11 register is live and cannot be freely used, and positive
5686 otherwise. */
5687
5688 static void
5689 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset, int style)
5690 {
5691 rtx insn;
5692
5693 if (! TARGET_64BIT)
5694 insn = emit_insn (gen_pro_epilogue_adjust_stack_1 (dest, src, offset));
5695 else if (x86_64_immediate_operand (offset, DImode))
5696 insn = emit_insn (gen_pro_epilogue_adjust_stack_rex64 (dest, src, offset));
5697 else
5698 {
5699 rtx r11;
5700 /* r11 is used by indirect sibcall return as well, set before the
5701 epilogue and used after the epilogue. ATM indirect sibcall
5702 shouldn't be used together with huge frame sizes in one
5703 function because of the frame_size check in sibcall.c. */
5704 gcc_assert (style);
5705 r11 = gen_rtx_REG (DImode, R11_REG);
5706 insn = emit_insn (gen_rtx_SET (DImode, r11, offset));
5707 if (style < 0)
5708 RTX_FRAME_RELATED_P (insn) = 1;
5709 insn = emit_insn (gen_pro_epilogue_adjust_stack_rex64_2 (dest, src, r11,
5710 offset));
5711 }
5712 if (style < 0)
5713 RTX_FRAME_RELATED_P (insn) = 1;
5714 }
5715
5716 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
5717
5718 static rtx
5719 ix86_internal_arg_pointer (void)
5720 {
5721 bool has_force_align_arg_pointer =
5722 (0 != lookup_attribute (ix86_force_align_arg_pointer_string,
5723 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))));
5724 if ((FORCE_PREFERRED_STACK_BOUNDARY_IN_MAIN
5725 && DECL_NAME (current_function_decl)
5726 && MAIN_NAME_P (DECL_NAME (current_function_decl))
5727 && DECL_FILE_SCOPE_P (current_function_decl))
5728 || ix86_force_align_arg_pointer
5729 || has_force_align_arg_pointer)
5730 {
5731 /* Nested functions can't realign the stack due to a register
5732 conflict. */
5733 if (DECL_CONTEXT (current_function_decl)
5734 && TREE_CODE (DECL_CONTEXT (current_function_decl)) == FUNCTION_DECL)
5735 {
5736 if (ix86_force_align_arg_pointer)
5737 warning (0, "-mstackrealign ignored for nested functions");
5738 if (has_force_align_arg_pointer)
5739 error ("%s not supported for nested functions",
5740 ix86_force_align_arg_pointer_string);
5741 return virtual_incoming_args_rtx;
5742 }
5743 cfun->machine->force_align_arg_pointer = gen_rtx_REG (Pmode, 2);
5744 return copy_to_reg (cfun->machine->force_align_arg_pointer);
5745 }
5746 else
5747 return virtual_incoming_args_rtx;
5748 }
5749
5750 /* Handle the TARGET_DWARF_HANDLE_FRAME_UNSPEC hook.
5751 This is called from dwarf2out.c to emit call frame instructions
5752 for frame-related insns containing UNSPECs and UNSPEC_VOLATILEs. */
5753 static void
5754 ix86_dwarf_handle_frame_unspec (const char *label, rtx pattern, int index)
5755 {
5756 rtx unspec = SET_SRC (pattern);
5757 gcc_assert (GET_CODE (unspec) == UNSPEC);
5758
5759 switch (index)
5760 {
5761 case UNSPEC_REG_SAVE:
5762 dwarf2out_reg_save_reg (label, XVECEXP (unspec, 0, 0),
5763 SET_DEST (pattern));
5764 break;
5765 case UNSPEC_DEF_CFA:
5766 dwarf2out_def_cfa (label, REGNO (SET_DEST (pattern)),
5767 INTVAL (XVECEXP (unspec, 0, 0)));
5768 break;
5769 default:
5770 gcc_unreachable ();
5771 }
5772 }
5773
5774 /* Expand the prologue into a bunch of separate insns. */
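/* For the common frame-pointer case the insns emitted below amount to the
   classic sequence (a sketch only; the exact insns depend on the layout from
   ix86_compute_frame_layout and the chosen register-save strategy):

	push	%ebp
	mov	%esp, %ebp
	push	<callee-saved regs>	(or movs, when save_regs_using_mov)
	sub	$frame.to_allocate, %esp
	<set up the PIC register if needed>  */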
5775
5776 void
5777 ix86_expand_prologue (void)
5778 {
5779 rtx insn;
5780 bool pic_reg_used;
5781 struct ix86_frame frame;
5782 HOST_WIDE_INT allocate;
5783
5784 ix86_compute_frame_layout (&frame);
5785
5786 if (cfun->machine->force_align_arg_pointer)
5787 {
5788 rtx x, y;
5789
5790 /* Grab the argument pointer. */
5791 x = plus_constant (stack_pointer_rtx, 4);
5792 y = cfun->machine->force_align_arg_pointer;
5793 insn = emit_insn (gen_rtx_SET (VOIDmode, y, x));
5794 RTX_FRAME_RELATED_P (insn) = 1;
5795
5796 /* The unwind info consists of two parts: install the fafp as the cfa,
5797 and record the fafp as the "save register" of the stack pointer.
5798 The latter is there so that the unwinder can see where it should
5799 restore the stack pointer across the and insn that aligns the stack. */
5800 x = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx), UNSPEC_DEF_CFA);
5801 x = gen_rtx_SET (VOIDmode, y, x);
5802 RTX_FRAME_RELATED_P (x) = 1;
5803 y = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, stack_pointer_rtx),
5804 UNSPEC_REG_SAVE);
5805 y = gen_rtx_SET (VOIDmode, cfun->machine->force_align_arg_pointer, y);
5806 RTX_FRAME_RELATED_P (y) = 1;
5807 x = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, x, y));
5808 x = gen_rtx_EXPR_LIST (REG_FRAME_RELATED_EXPR, x, NULL);
5809 REG_NOTES (insn) = x;
5810
5811 /* Align the stack. */
5812 emit_insn (gen_andsi3 (stack_pointer_rtx, stack_pointer_rtx,
5813 GEN_INT (-16)));
5814
5815 /* And here we cheat like madmen with the unwind info. We force the
5816 cfa register back to sp+4, which is exactly what it was at the
5817 start of the function. Re-pushing the return address results in
5818 the return at the same spot relative to the cfa, and thus is
5819 correct wrt the unwind info. */
5820 x = cfun->machine->force_align_arg_pointer;
5821 x = gen_frame_mem (Pmode, plus_constant (x, -4));
5822 insn = emit_insn (gen_push (x));
5823 RTX_FRAME_RELATED_P (insn) = 1;
5824
5825 x = GEN_INT (4);
5826 x = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, x), UNSPEC_DEF_CFA);
5827 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
5828 x = gen_rtx_EXPR_LIST (REG_FRAME_RELATED_EXPR, x, NULL);
5829 REG_NOTES (insn) = x;
5830 }
5831
5832 /* Note: AT&T enter does NOT have reversed args. Enter is probably
5833 slower on all targets. Also sdb doesn't like it. */
5834
5835 if (frame_pointer_needed)
5836 {
5837 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
5838 RTX_FRAME_RELATED_P (insn) = 1;
5839
5840 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
5841 RTX_FRAME_RELATED_P (insn) = 1;
5842 }
5843
5844 allocate = frame.to_allocate;
5845
5846 if (!frame.save_regs_using_mov)
5847 ix86_emit_save_regs ();
5848 else
5849 allocate += frame.nregs * UNITS_PER_WORD;
5850
5851 /* When using the red zone we may start saving registers before allocating
5852 the stack frame, saving one cycle of the prologue. */
5853 if (TARGET_RED_ZONE && frame.save_regs_using_mov)
5854 ix86_emit_save_regs_using_mov (frame_pointer_needed ? hard_frame_pointer_rtx
5855 : stack_pointer_rtx,
5856 -frame.nregs * UNITS_PER_WORD);
5857
5858 if (allocate == 0)
5859 ;
5860 else if (! TARGET_STACK_PROBE || allocate < CHECK_STACK_LIMIT)
5861 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
5862 GEN_INT (-allocate), -1);
5863 else
5864 {
5865 /* Only valid for Win32. */
5866 rtx eax = gen_rtx_REG (SImode, 0);
5867 bool eax_live = ix86_eax_live_at_start_p ();
5868 rtx t;
5869
5870 gcc_assert (!TARGET_64BIT);
5871
5872 if (eax_live)
5873 {
5874 emit_insn (gen_push (eax));
5875 allocate -= 4;
5876 }
5877
5878 emit_move_insn (eax, GEN_INT (allocate));
5879
5880 insn = emit_insn (gen_allocate_stack_worker (eax));
5881 RTX_FRAME_RELATED_P (insn) = 1;
5882 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (-allocate));
5883 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
5884 REG_NOTES (insn) = gen_rtx_EXPR_LIST (REG_FRAME_RELATED_EXPR,
5885 t, REG_NOTES (insn));
5886
5887 if (eax_live)
5888 {
5889 if (frame_pointer_needed)
5890 t = plus_constant (hard_frame_pointer_rtx,
5891 allocate
5892 - frame.to_allocate
5893 - frame.nregs * UNITS_PER_WORD);
5894 else
5895 t = plus_constant (stack_pointer_rtx, allocate);
5896 emit_move_insn (eax, gen_rtx_MEM (SImode, t));
5897 }
5898 }
5899
5900 if (frame.save_regs_using_mov && !TARGET_RED_ZONE)
5901 {
5902 if (!frame_pointer_needed || !frame.to_allocate)
5903 ix86_emit_save_regs_using_mov (stack_pointer_rtx, frame.to_allocate);
5904 else
5905 ix86_emit_save_regs_using_mov (hard_frame_pointer_rtx,
5906 -frame.nregs * UNITS_PER_WORD);
5907 }
5908
5909 pic_reg_used = false;
5910 if (pic_offset_table_rtx
5911 && (regs_ever_live[REAL_PIC_OFFSET_TABLE_REGNUM]
5912 || current_function_profile))
5913 {
5914 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
5915
5916 if (alt_pic_reg_used != INVALID_REGNUM)
5917 REGNO (pic_offset_table_rtx) = alt_pic_reg_used;
5918
5919 pic_reg_used = true;
5920 }
5921
5922 if (pic_reg_used)
5923 {
5924 if (TARGET_64BIT)
5925 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
5926 else
5927 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
5928
5929 /* Even with accurate pre-reload life analysis, we can wind up
5930 deleting all references to the pic register after reload.
5931 Consider if cross-jumping unifies two sides of a branch
5932 controlled by a comparison vs the only read from a global.
5933 In which case, allow the set_got to be deleted, though we're
5934 too late to do anything about the ebx save in the prologue. */
5935 REG_NOTES (insn) = gen_rtx_EXPR_LIST (REG_MAYBE_DEAD, const0_rtx, NULL);
5936 }
5937
5938 /* Prevent function calls from being scheduled before the call to mcount.
5939 In the pic_reg_used case, make sure that the got load isn't deleted. */
5940 if (current_function_profile)
5941 emit_insn (gen_blockage (pic_reg_used ? pic_offset_table_rtx : const0_rtx));
5942 }
5943
5944 /* Emit code to restore saved registers using MOV insns. First register
5945 is restored from POINTER + OFFSET. */
5946 static void
5947 ix86_emit_restore_regs_using_mov (rtx pointer, HOST_WIDE_INT offset,
5948 int maybe_eh_return)
5949 {
5950 int regno;
5951 rtx base_address = gen_rtx_MEM (Pmode, pointer);
5952
5953 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
5954 if (ix86_save_reg (regno, maybe_eh_return))
5955 {
5956 /* Ensure that adjust_address won't be forced to produce a pointer
5957 outside the range allowed by the x86-64 instruction set. */
5958 if (TARGET_64BIT && offset != trunc_int_for_mode (offset, SImode))
5959 {
5960 rtx r11;
5961
5962 r11 = gen_rtx_REG (DImode, R11_REG);
5963 emit_move_insn (r11, GEN_INT (offset));
5964 emit_insn (gen_adddi3 (r11, r11, pointer));
5965 base_address = gen_rtx_MEM (Pmode, r11);
5966 offset = 0;
5967 }
5968 emit_move_insn (gen_rtx_REG (Pmode, regno),
5969 adjust_address (base_address, Pmode, offset));
5970 offset += UNITS_PER_WORD;
5971 }
5972 }
5973
5974 /* Restore function stack, frame, and registers. */
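/* STYLE identifies the caller's context: 0 for sibcall epilogues (no return
   insn is emitted), 2 for eh_return epilogues (the stack is additionally
   adjusted by EH_RETURN_STACKADJ_RTX), and other values for the normal
   epilogue.  */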
5975
5976 void
5977 ix86_expand_epilogue (int style)
5978 {
5979 int regno;
5980 int sp_valid = !frame_pointer_needed || current_function_sp_is_unchanging;
5981 struct ix86_frame frame;
5982 HOST_WIDE_INT offset;
5983
5984 ix86_compute_frame_layout (&frame);
5985
5986 /* Calculate start of saved registers relative to ebp. Special care
5987 must be taken for the normal return case of a function using
5988 eh_return: the eax and edx registers are marked as saved, but not
5989 restored along this path. */
5990 offset = frame.nregs;
5991 if (current_function_calls_eh_return && style != 2)
5992 offset -= 2;
5993 offset *= -UNITS_PER_WORD;
5994
5995 /* If we're only restoring one register and sp is not valid, then use a
5996 move instruction to restore the register, since it's less work than
5997 reloading sp and popping the register.
5998
5999 The default code results in a stack adjustment using an add/lea instruction,
6000 while this code results in a LEAVE instruction (or discrete equivalent),
6001 so it is profitable in some other cases as well, especially when there
6002 are no registers to restore. We also use this code when TARGET_USE_LEAVE
6003 and there is exactly one register to pop. This heuristic may need some
6004 tuning in the future. */
6005 if ((!sp_valid && frame.nregs <= 1)
6006 || (TARGET_EPILOGUE_USING_MOVE
6007 && cfun->machine->use_fast_prologue_epilogue
6008 && (frame.nregs > 1 || frame.to_allocate))
6009 || (frame_pointer_needed && !frame.nregs && frame.to_allocate)
6010 || (frame_pointer_needed && TARGET_USE_LEAVE
6011 && cfun->machine->use_fast_prologue_epilogue
6012 && frame.nregs == 1)
6013 || current_function_calls_eh_return)
6014 {
6015 /* Restore registers. We can use ebp or esp to address the memory
6016 locations. If both are available, default to ebp, since offsets
6017 are known to be small. The only exception is when esp points directly
6018 to the end of the block of saved registers, where we may simplify the
6019 addressing mode. */
6020
6021 if (!frame_pointer_needed || (sp_valid && !frame.to_allocate))
6022 ix86_emit_restore_regs_using_mov (stack_pointer_rtx,
6023 frame.to_allocate, style == 2);
6024 else
6025 ix86_emit_restore_regs_using_mov (hard_frame_pointer_rtx,
6026 offset, style == 2);
6027
6028 /* eh_return epilogues need %ecx added to the stack pointer. */
6029 if (style == 2)
6030 {
6031 rtx tmp, sa = EH_RETURN_STACKADJ_RTX;
6032
6033 if (frame_pointer_needed)
6034 {
6035 tmp = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
6036 tmp = plus_constant (tmp, UNITS_PER_WORD);
6037 emit_insn (gen_rtx_SET (VOIDmode, sa, tmp));
6038
6039 tmp = gen_rtx_MEM (Pmode, hard_frame_pointer_rtx);
6040 emit_move_insn (hard_frame_pointer_rtx, tmp);
6041
6042 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
6043 const0_rtx, style);
6044 }
6045 else
6046 {
6047 tmp = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
6048 tmp = plus_constant (tmp, (frame.to_allocate
6049 + frame.nregs * UNITS_PER_WORD));
6050 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, tmp));
6051 }
6052 }
6053 else if (!frame_pointer_needed)
6054 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
6055 GEN_INT (frame.to_allocate
6056 + frame.nregs * UNITS_PER_WORD),
6057 style);
6058 /* If not an i386, mov & pop is faster than "leave". */
6059 else if (TARGET_USE_LEAVE || optimize_size
6060 || !cfun->machine->use_fast_prologue_epilogue)
6061 emit_insn (TARGET_64BIT ? gen_leave_rex64 () : gen_leave ());
6062 else
6063 {
6064 pro_epilogue_adjust_stack (stack_pointer_rtx,
6065 hard_frame_pointer_rtx,
6066 const0_rtx, style);
6067 if (TARGET_64BIT)
6068 emit_insn (gen_popdi1 (hard_frame_pointer_rtx));
6069 else
6070 emit_insn (gen_popsi1 (hard_frame_pointer_rtx));
6071 }
6072 }
6073 else
6074 {
6075 /* First step is to deallocate the stack frame so that we can
6076 pop the registers. */
6077 if (!sp_valid)
6078 {
6079 gcc_assert (frame_pointer_needed);
6080 pro_epilogue_adjust_stack (stack_pointer_rtx,
6081 hard_frame_pointer_rtx,
6082 GEN_INT (offset), style);
6083 }
6084 else if (frame.to_allocate)
6085 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
6086 GEN_INT (frame.to_allocate), style);
6087
6088 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
6089 if (ix86_save_reg (regno, false))
6090 {
6091 if (TARGET_64BIT)
6092 emit_insn (gen_popdi1 (gen_rtx_REG (Pmode, regno)));
6093 else
6094 emit_insn (gen_popsi1 (gen_rtx_REG (Pmode, regno)));
6095 }
6096 if (frame_pointer_needed)
6097 {
6098 /* Leave results in shorter dependency chains on CPUs that are
6099 able to grok it fast. */
6100 if (TARGET_USE_LEAVE)
6101 emit_insn (TARGET_64BIT ? gen_leave_rex64 () : gen_leave ());
6102 else if (TARGET_64BIT)
6103 emit_insn (gen_popdi1 (hard_frame_pointer_rtx));
6104 else
6105 emit_insn (gen_popsi1 (hard_frame_pointer_rtx));
6106 }
6107 }
6108
6109 if (cfun->machine->force_align_arg_pointer)
6110 {
6111 emit_insn (gen_addsi3 (stack_pointer_rtx,
6112 cfun->machine->force_align_arg_pointer,
6113 GEN_INT (-4)));
6114 }
6115
6116 /* Sibcall epilogues don't want a return instruction. */
6117 if (style == 0)
6118 return;
6119
6120 if (current_function_pops_args && current_function_args_size)
6121 {
6122 rtx popc = GEN_INT (current_function_pops_args);
6123
6124 /* i386 can only pop 64K bytes. If asked to pop more, pop
6125 return address, do explicit add, and jump indirectly to the
6126 caller. */
6127
6128 if (current_function_pops_args >= 65536)
6129 {
6130 rtx ecx = gen_rtx_REG (SImode, 2);
6131
6132 /* There is no "pascal" calling convention in 64bit ABI. */
6133 gcc_assert (!TARGET_64BIT);
6134
6135 emit_insn (gen_popsi1 (ecx));
6136 emit_insn (gen_addsi3 (stack_pointer_rtx, stack_pointer_rtx, popc));
6137 emit_jump_insn (gen_return_indirect_internal (ecx));
6138 }
6139 else
6140 emit_jump_insn (gen_return_pop_internal (popc));
6141 }
6142 else
6143 emit_jump_insn (gen_return_internal ());
6144 }
6145
6146 /* Reset state that may have been modified while compiling the current function. */
6147
6148 static void
6149 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
6150 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
6151 {
6152 if (pic_offset_table_rtx)
6153 REGNO (pic_offset_table_rtx) = REAL_PIC_OFFSET_TABLE_REGNUM;
6154 #if TARGET_MACHO
6155 /* Mach-O doesn't support labels at the end of objects, so if
6156 it looks like we might want one, insert a NOP. */
6157 {
6158 rtx insn = get_last_insn ();
6159 while (insn
6160 && NOTE_P (insn)
6161 && NOTE_LINE_NUMBER (insn) != NOTE_INSN_DELETED_LABEL)
6162 insn = PREV_INSN (insn);
6163 if (insn
6164 && (LABEL_P (insn)
6165 || (NOTE_P (insn)
6166 && NOTE_LINE_NUMBER (insn) == NOTE_INSN_DELETED_LABEL)))
6167 fputs ("\tnop\n", file);
6168 }
6169 #endif
6170
6171 }
6172 \f
6173 /* Extract the parts of an RTL expression that is a valid memory address
6174 for an instruction. Return 0 if the structure of the address is
6175 grossly off. Return -1 if the address contains ASHIFT, so it is not
6176 strictly valid but is still used for computing the length of a lea instruction. */
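/* For example, the canonical address
       (plus (plus (mult (reg %esi) (const_int 4)) (reg %ebx)) (const_int 12))
   decomposes into index = %esi, scale = 4, base = %ebx, disp = 12, i.e. the
   operand 12(%ebx,%esi,4).  (Illustrative example only.)  */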
6177
6178 int
6179 ix86_decompose_address (rtx addr, struct ix86_address *out)
6180 {
6181 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
6182 rtx base_reg, index_reg;
6183 HOST_WIDE_INT scale = 1;
6184 rtx scale_rtx = NULL_RTX;
6185 int retval = 1;
6186 enum ix86_address_seg seg = SEG_DEFAULT;
6187
6188 if (REG_P (addr) || GET_CODE (addr) == SUBREG)
6189 base = addr;
6190 else if (GET_CODE (addr) == PLUS)
6191 {
6192 rtx addends[4], op;
6193 int n = 0, i;
6194
6195 op = addr;
6196 do
6197 {
6198 if (n >= 4)
6199 return 0;
6200 addends[n++] = XEXP (op, 1);
6201 op = XEXP (op, 0);
6202 }
6203 while (GET_CODE (op) == PLUS);
6204 if (n >= 4)
6205 return 0;
6206 addends[n] = op;
6207
6208 for (i = n; i >= 0; --i)
6209 {
6210 op = addends[i];
6211 switch (GET_CODE (op))
6212 {
6213 case MULT:
6214 if (index)
6215 return 0;
6216 index = XEXP (op, 0);
6217 scale_rtx = XEXP (op, 1);
6218 break;
6219
6220 case UNSPEC:
6221 if (XINT (op, 1) == UNSPEC_TP
6222 && TARGET_TLS_DIRECT_SEG_REFS
6223 && seg == SEG_DEFAULT)
6224 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
6225 else
6226 return 0;
6227 break;
6228
6229 case REG:
6230 case SUBREG:
6231 if (!base)
6232 base = op;
6233 else if (!index)
6234 index = op;
6235 else
6236 return 0;
6237 break;
6238
6239 case CONST:
6240 case CONST_INT:
6241 case SYMBOL_REF:
6242 case LABEL_REF:
6243 if (disp)
6244 return 0;
6245 disp = op;
6246 break;
6247
6248 default:
6249 return 0;
6250 }
6251 }
6252 }
6253 else if (GET_CODE (addr) == MULT)
6254 {
6255 index = XEXP (addr, 0); /* index*scale */
6256 scale_rtx = XEXP (addr, 1);
6257 }
6258 else if (GET_CODE (addr) == ASHIFT)
6259 {
6260 rtx tmp;
6261
6262 /* We're called for lea too, which implements ashift on occasion. */
6263 index = XEXP (addr, 0);
6264 tmp = XEXP (addr, 1);
6265 if (!CONST_INT_P (tmp))
6266 return 0;
6267 scale = INTVAL (tmp);
6268 if ((unsigned HOST_WIDE_INT) scale > 3)
6269 return 0;
6270 scale = 1 << scale;
6271 retval = -1;
6272 }
6273 else
6274 disp = addr; /* displacement */
6275
6276 /* Extract the integral value of scale. */
6277 if (scale_rtx)
6278 {
6279 if (!CONST_INT_P (scale_rtx))
6280 return 0;
6281 scale = INTVAL (scale_rtx);
6282 }
6283
6284 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
6285 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
6286
6287 /* Allow the arg pointer and stack pointer as index if there is no scaling. */
6288 if (base_reg && index_reg && scale == 1
6289 && (index_reg == arg_pointer_rtx
6290 || index_reg == frame_pointer_rtx
6291 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
6292 {
6293 rtx tmp;
6294 tmp = base, base = index, index = tmp;
6295 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
6296 }
6297
6298 /* Special case: %ebp cannot be encoded as a base without a displacement. */
6299 if ((base_reg == hard_frame_pointer_rtx
6300 || base_reg == frame_pointer_rtx
6301 || base_reg == arg_pointer_rtx) && !disp)
6302 disp = const0_rtx;
6303
6304 /* Special case: on K6, [%esi] forces the instruction to be vector decoded.
6305 Avoid this by transforming it to [%esi+0]. */
6306 if (ix86_tune == PROCESSOR_K6 && !optimize_size
6307 && base_reg && !index_reg && !disp
6308 && REG_P (base_reg)
6309 && REGNO_REG_CLASS (REGNO (base_reg)) == SIREG)
6310 disp = const0_rtx;
6311
6312 /* Special case: encode reg+reg instead of reg*2. */
6313 if (!base && index && scale && scale == 2)
6314 base = index, base_reg = index_reg, scale = 1;
6315
6316 /* Special case: scaling cannot be encoded without base or displacement. */
6317 if (!base && !disp && index && scale != 1)
6318 disp = const0_rtx;
6319
6320 out->base = base;
6321 out->index = index;
6322 out->disp = disp;
6323 out->scale = scale;
6324 out->seg = seg;
6325
6326 return retval;
6327 }
6328 \f
6329 /* Return the cost of the memory address X.
6330 For i386, it is better to use a complex address than let gcc copy
6331 the address into a reg and make a new pseudo. But not if the address
6332 requires two regs - that would mean more pseudos with longer
6333 lifetimes. */
6334 static int
6335 ix86_address_cost (rtx x)
6336 {
6337 struct ix86_address parts;
6338 int cost = 1;
6339 int ok = ix86_decompose_address (x, &parts);
6340
6341 gcc_assert (ok);
6342
6343 if (parts.base && GET_CODE (parts.base) == SUBREG)
6344 parts.base = SUBREG_REG (parts.base);
6345 if (parts.index && GET_CODE (parts.index) == SUBREG)
6346 parts.index = SUBREG_REG (parts.index);
6347
6348 /* More complex memory references are better. */
6349 if (parts.disp && parts.disp != const0_rtx)
6350 cost--;
6351 if (parts.seg != SEG_DEFAULT)
6352 cost--;
6353
6354 /* Attempt to minimize number of registers in the address. */
6355 if ((parts.base
6356 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
6357 || (parts.index
6358 && (!REG_P (parts.index)
6359 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
6360 cost++;
6361
6362 if (parts.base
6363 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
6364 && parts.index
6365 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
6366 && parts.base != parts.index)
6367 cost++;
6368
6369 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
6370 since its predecode logic can't detect the length of instructions
6371 and decoding degenerates to vector decoding. Increase the cost of such
6372 addresses here. The penalty is minimally 2 cycles. It may be worthwhile
6373 to split such addresses or even refuse such addresses at all.
6374
6375 The following addressing modes are affected:
6376 [base+scale*index]
6377 [scale*index+disp]
6378 [base+index]
6379
6380 The first and last cases may be avoidable by explicitly coding the zero
6381 into the memory address, but I don't have an AMD-K6 machine handy to check
6382 this theory. */
6383
6384 if (TARGET_K6
6385 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
6386 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
6387 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
6388 cost += 10;
6389
6390 return cost;
6391 }
6392 \f
6393 /* If X is a machine specific address (i.e. a symbol or label being
6394 referenced as a displacement from the GOT implemented using an
6395 UNSPEC), then return the base term. Otherwise return X. */
6396
6397 rtx
6398 ix86_find_base_term (rtx x)
6399 {
6400 rtx term;
6401
6402 if (TARGET_64BIT)
6403 {
6404 if (GET_CODE (x) != CONST)
6405 return x;
6406 term = XEXP (x, 0);
6407 if (GET_CODE (term) == PLUS
6408 && (CONST_INT_P (XEXP (term, 1))
6409 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
6410 term = XEXP (term, 0);
6411 if (GET_CODE (term) != UNSPEC
6412 || XINT (term, 1) != UNSPEC_GOTPCREL)
6413 return x;
6414
6415 term = XVECEXP (term, 0, 0);
6416
6417 if (GET_CODE (term) != SYMBOL_REF
6418 && GET_CODE (term) != LABEL_REF)
6419 return x;
6420
6421 return term;
6422 }
6423
6424 term = ix86_delegitimize_address (x);
6425
6426 if (GET_CODE (term) != SYMBOL_REF
6427 && GET_CODE (term) != LABEL_REF)
6428 return x;
6429
6430 return term;
6431 }
6432
6433 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O, as
6434 this is used to form addresses to local data when -fPIC is in
6435 use. */
6436
6437 static bool
6438 darwin_local_data_pic (rtx disp)
6439 {
6440 if (GET_CODE (disp) == MINUS)
6441 {
6442 if (GET_CODE (XEXP (disp, 0)) == LABEL_REF
6443 || GET_CODE (XEXP (disp, 0)) == SYMBOL_REF)
6444 if (GET_CODE (XEXP (disp, 1)) == SYMBOL_REF)
6445 {
6446 const char *sym_name = XSTR (XEXP (disp, 1), 0);
6447 if (! strcmp (sym_name, "<pic base>"))
6448 return true;
6449 }
6450 }
6451
6452 return false;
6453 }
6454 \f
6455 /* Determine if a given RTX is a valid constant. We already know this
6456 satisfies CONSTANT_P. */
6457
6458 bool
6459 legitimate_constant_p (rtx x)
6460 {
6461 switch (GET_CODE (x))
6462 {
6463 case CONST:
6464 x = XEXP (x, 0);
6465
6466 if (GET_CODE (x) == PLUS)
6467 {
6468 if (!CONST_INT_P (XEXP (x, 1)))
6469 return false;
6470 x = XEXP (x, 0);
6471 }
6472
6473 if (TARGET_MACHO && darwin_local_data_pic (x))
6474 return true;
6475
6476 /* Only some unspecs are valid as "constants". */
6477 if (GET_CODE (x) == UNSPEC)
6478 switch (XINT (x, 1))
6479 {
6480 case UNSPEC_GOTOFF:
6481 return TARGET_64BIT;
6482 case UNSPEC_TPOFF:
6483 case UNSPEC_NTPOFF:
6484 x = XVECEXP (x, 0, 0);
6485 return (GET_CODE (x) == SYMBOL_REF
6486 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
6487 case UNSPEC_DTPOFF:
6488 x = XVECEXP (x, 0, 0);
6489 return (GET_CODE (x) == SYMBOL_REF
6490 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
6491 default:
6492 return false;
6493 }
6494
6495 /* We must have drilled down to a symbol. */
6496 if (GET_CODE (x) == LABEL_REF)
6497 return true;
6498 if (GET_CODE (x) != SYMBOL_REF)
6499 return false;
6500 /* FALLTHRU */
6501
6502 case SYMBOL_REF:
6503 /* TLS symbols are never valid. */
6504 if (SYMBOL_REF_TLS_MODEL (x))
6505 return false;
6506 break;
6507
6508 case CONST_DOUBLE:
6509 if (GET_MODE (x) == TImode
6510 && x != CONST0_RTX (TImode)
6511 && !TARGET_64BIT)
6512 return false;
6513 break;
6514
6515 case CONST_VECTOR:
6516 if (x == CONST0_RTX (GET_MODE (x)))
6517 return true;
6518 return false;
6519
6520 default:
6521 break;
6522 }
6523
6524 /* Otherwise we handle everything else in the move patterns. */
6525 return true;
6526 }
6527
6528 /* Determine if it's legal to put X into the constant pool. This
6529 is not possible for the address of thread-local symbols, which
6530 is checked above. */
6531
6532 static bool
6533 ix86_cannot_force_const_mem (rtx x)
6534 {
6535 /* We can always put integral constants and vectors in memory. */
6536 switch (GET_CODE (x))
6537 {
6538 case CONST_INT:
6539 case CONST_DOUBLE:
6540 case CONST_VECTOR:
6541 return false;
6542
6543 default:
6544 break;
6545 }
6546 return !legitimate_constant_p (x);
6547 }
6548
6549 /* Determine if a given RTX is a valid constant address. */
6550
6551 bool
6552 constant_address_p (rtx x)
6553 {
6554 return CONSTANT_P (x) && legitimate_address_p (Pmode, x, 1);
6555 }
6556
6557 /* Nonzero if the constant value X is a legitimate general operand
6558 when generating PIC code. It is given that flag_pic is on and
6559 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
6560
6561 bool
6562 legitimate_pic_operand_p (rtx x)
6563 {
6564 rtx inner;
6565
6566 switch (GET_CODE (x))
6567 {
6568 case CONST:
6569 inner = XEXP (x, 0);
6570 if (GET_CODE (inner) == PLUS
6571 && CONST_INT_P (XEXP (inner, 1)))
6572 inner = XEXP (inner, 0);
6573
6574 /* Only some unspecs are valid as "constants". */
6575 if (GET_CODE (inner) == UNSPEC)
6576 switch (XINT (inner, 1))
6577 {
6578 case UNSPEC_GOTOFF:
6579 return TARGET_64BIT;
6580 case UNSPEC_TPOFF:
6581 x = XVECEXP (inner, 0, 0);
6582 return (GET_CODE (x) == SYMBOL_REF
6583 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
6584 default:
6585 return false;
6586 }
6587 /* FALLTHRU */
6588
6589 case SYMBOL_REF:
6590 case LABEL_REF:
6591 return legitimate_pic_address_disp_p (x);
6592
6593 default:
6594 return true;
6595 }
6596 }
6597
6598 /* Determine if a given CONST RTX is a valid memory displacement
6599 in PIC mode. */
6600
6601 int
6602 legitimate_pic_address_disp_p (rtx disp)
6603 {
6604 bool saw_plus;
6605
6606 /* In 64bit mode we can allow direct addresses of symbols and labels
6607 when they are not dynamic symbols. */
6608 if (TARGET_64BIT)
6609 {
6610 rtx op0 = disp, op1;
6611
6612 switch (GET_CODE (disp))
6613 {
6614 case LABEL_REF:
6615 return true;
6616
6617 case CONST:
6618 if (GET_CODE (XEXP (disp, 0)) != PLUS)
6619 break;
6620 op0 = XEXP (XEXP (disp, 0), 0);
6621 op1 = XEXP (XEXP (disp, 0), 1);
6622 if (!CONST_INT_P (op1)
6623 || INTVAL (op1) >= 16*1024*1024
6624 || INTVAL (op1) < -16*1024*1024)
6625 break;
6626 if (GET_CODE (op0) == LABEL_REF)
6627 return true;
6628 if (GET_CODE (op0) != SYMBOL_REF)
6629 break;
6630 /* FALLTHRU */
6631
6632 case SYMBOL_REF:
6633 /* TLS references should always be enclosed in UNSPEC. */
6634 if (SYMBOL_REF_TLS_MODEL (op0))
6635 return false;
6636 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0))
6637 return true;
6638 break;
6639
6640 default:
6641 break;
6642 }
6643 }
6644 if (GET_CODE (disp) != CONST)
6645 return 0;
6646 disp = XEXP (disp, 0);
6647
6648 if (TARGET_64BIT)
6649 {
6650 /* It is unsafe to allow PLUS expressions here; this limits the allowed
6651 distance of GOT references. We should not need these anyway. */
6652 if (GET_CODE (disp) != UNSPEC
6653 || (XINT (disp, 1) != UNSPEC_GOTPCREL
6654 && XINT (disp, 1) != UNSPEC_GOTOFF))
6655 return 0;
6656
6657 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
6658 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
6659 return 0;
6660 return 1;
6661 }
6662
6663 saw_plus = false;
6664 if (GET_CODE (disp) == PLUS)
6665 {
6666 if (!CONST_INT_P (XEXP (disp, 1)))
6667 return 0;
6668 disp = XEXP (disp, 0);
6669 saw_plus = true;
6670 }
6671
6672 if (TARGET_MACHO && darwin_local_data_pic (disp))
6673 return 1;
6674
6675 if (GET_CODE (disp) != UNSPEC)
6676 return 0;
6677
6678 switch (XINT (disp, 1))
6679 {
6680 case UNSPEC_GOT:
6681 if (saw_plus)
6682 return false;
6683 return GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF;
6684 case UNSPEC_GOTOFF:
6685 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
6686 While the ABI also specifies a 32bit relocation, we do not produce
6687 it in the small PIC model at all. */
6688 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
6689 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
6690 && !TARGET_64BIT)
6691 return local_symbolic_operand (XVECEXP (disp, 0, 0), Pmode);
6692 return false;
6693 case UNSPEC_GOTTPOFF:
6694 case UNSPEC_GOTNTPOFF:
6695 case UNSPEC_INDNTPOFF:
6696 if (saw_plus)
6697 return false;
6698 disp = XVECEXP (disp, 0, 0);
6699 return (GET_CODE (disp) == SYMBOL_REF
6700 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
6701 case UNSPEC_NTPOFF:
6702 disp = XVECEXP (disp, 0, 0);
6703 return (GET_CODE (disp) == SYMBOL_REF
6704 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
6705 case UNSPEC_DTPOFF:
6706 disp = XVECEXP (disp, 0, 0);
6707 return (GET_CODE (disp) == SYMBOL_REF
6708 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
6709 }
6710
6711 return 0;
6712 }
6713
6714 /* GO_IF_LEGITIMATE_ADDRESS recognizes an RTL expression that is a valid
6715 memory address for an instruction. The MODE argument is the machine mode
6716 for the MEM expression that wants to use this address.
6717
6718 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
6719 convert common non-canonical forms to canonical form so that they will
6720 be recognized. */
6721
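/* For illustration: ix86_decompose_address breaks an address into
   base + index*scale + disp; the AT&T operand 8(%ebx,%ecx,4), for example,
   decomposes into base %ebx, index %ecx, scale 4 and displacement 8.  The
   checks below then reject, among other things, a scale other than
   1, 2, 4 or 8 and base or index registers that are not in Pmode.  */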
6722 int
6723 legitimate_address_p (enum machine_mode mode, rtx addr, int strict)
6724 {
6725 struct ix86_address parts;
6726 rtx base, index, disp;
6727 HOST_WIDE_INT scale;
6728 const char *reason = NULL;
6729 rtx reason_rtx = NULL_RTX;
6730
6731 if (TARGET_DEBUG_ADDR)
6732 {
6733 fprintf (stderr,
6734 "\n======\nGO_IF_LEGITIMATE_ADDRESS, mode = %s, strict = %d\n",
6735 GET_MODE_NAME (mode), strict);
6736 debug_rtx (addr);
6737 }
6738
6739 if (ix86_decompose_address (addr, &parts) <= 0)
6740 {
6741 reason = "decomposition failed";
6742 goto report_error;
6743 }
6744
6745 base = parts.base;
6746 index = parts.index;
6747 disp = parts.disp;
6748 scale = parts.scale;
6749
6750 /* Validate base register.
6751
6752 Don't allow SUBREGs that span more than a word here. They can lead to spill
6753 failures when the base is one word out of a two word structure, which is
6754 represented internally as a DImode int. */
6755
6756 if (base)
6757 {
6758 rtx reg;
6759 reason_rtx = base;
6760
6761 if (REG_P (base))
6762 reg = base;
6763 else if (GET_CODE (base) == SUBREG
6764 && REG_P (SUBREG_REG (base))
6765 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (base)))
6766 <= UNITS_PER_WORD)
6767 reg = SUBREG_REG (base);
6768 else
6769 {
6770 reason = "base is not a register";
6771 goto report_error;
6772 }
6773
6774 if (GET_MODE (base) != Pmode)
6775 {
6776 reason = "base is not in Pmode";
6777 goto report_error;
6778 }
6779
6780 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
6781 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
6782 {
6783 reason = "base is not valid";
6784 goto report_error;
6785 }
6786 }
6787
6788 /* Validate index register.
6789
6790 Don't allow SUBREGs that span more than a word here -- same as above. */
6791
6792 if (index)
6793 {
6794 rtx reg;
6795 reason_rtx = index;
6796
6797 if (REG_P (index))
6798 reg = index;
6799 else if (GET_CODE (index) == SUBREG
6800 && REG_P (SUBREG_REG (index))
6801 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (index)))
6802 <= UNITS_PER_WORD)
6803 reg = SUBREG_REG (index);
6804 else
6805 {
6806 reason = "index is not a register";
6807 goto report_error;
6808 }
6809
6810 if (GET_MODE (index) != Pmode)
6811 {
6812 reason = "index is not in Pmode";
6813 goto report_error;
6814 }
6815
6816 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
6817 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
6818 {
6819 reason = "index is not valid";
6820 goto report_error;
6821 }
6822 }
6823
6824 /* Validate scale factor. */
6825 if (scale != 1)
6826 {
6827 reason_rtx = GEN_INT (scale);
6828 if (!index)
6829 {
6830 reason = "scale without index";
6831 goto report_error;
6832 }
6833
6834 if (scale != 2 && scale != 4 && scale != 8)
6835 {
6836 reason = "scale is not a valid multiplier";
6837 goto report_error;
6838 }
6839 }
6840
6841 /* Validate displacement. */
6842 if (disp)
6843 {
6844 reason_rtx = disp;
6845
6846 if (GET_CODE (disp) == CONST
6847 && GET_CODE (XEXP (disp, 0)) == UNSPEC)
6848 switch (XINT (XEXP (disp, 0), 1))
6849 {
6850 /* Refuse GOTOFF and GOT in 64bit mode since they are always 64bit when
6851 used. While the ABI also specifies 32bit relocations, we do not
6852 produce them at all and use IP-relative addressing instead. */
6853 case UNSPEC_GOT:
6854 case UNSPEC_GOTOFF:
6855 gcc_assert (flag_pic);
6856 if (!TARGET_64BIT)
6857 goto is_legitimate_pic;
6858 reason = "64bit address unspec";
6859 goto report_error;
6860
6861 case UNSPEC_GOTPCREL:
6862 gcc_assert (flag_pic);
6863 goto is_legitimate_pic;
6864
6865 case UNSPEC_GOTTPOFF:
6866 case UNSPEC_GOTNTPOFF:
6867 case UNSPEC_INDNTPOFF:
6868 case UNSPEC_NTPOFF:
6869 case UNSPEC_DTPOFF:
6870 break;
6871
6872 default:
6873 reason = "invalid address unspec";
6874 goto report_error;
6875 }
6876
6877 else if (SYMBOLIC_CONST (disp)
6878 && (flag_pic
6879 || (TARGET_MACHO
6880 #if TARGET_MACHO
6881 && MACHOPIC_INDIRECT
6882 && !machopic_operand_p (disp)
6883 #endif
6884 )))
6885 {
6886
6887 is_legitimate_pic:
6888 if (TARGET_64BIT && (index || base))
6889 {
6890 /* foo@dtpoff(%rX) is ok. */
6891 if (GET_CODE (disp) != CONST
6892 || GET_CODE (XEXP (disp, 0)) != PLUS
6893 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
6894 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
6895 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
6896 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
6897 {
6898 reason = "non-constant pic memory reference";
6899 goto report_error;
6900 }
6901 }
6902 else if (! legitimate_pic_address_disp_p (disp))
6903 {
6904 reason = "displacement is an invalid pic construct";
6905 goto report_error;
6906 }
6907
6908 /* This code used to verify that a symbolic pic displacement
6909 includes the pic_offset_table_rtx register.
6910
6911 While this is a good idea, unfortunately these constructs may
6912 be created by the "adds using lea" optimization for incorrect
6913 code like:
6914
6915 int a;
6916 int foo(int i)
6917 {
6918 return *(&a+i);
6919 }
6920
6921 This code is nonsensical, but results in addressing the
6922 GOT table with a pic_offset_table_rtx base. We can't
6923 just refuse it easily, since it gets matched by the
6924 "addsi3" pattern, which later gets split to an lea when
6925 the output register differs from the input. While this
6926 could be handled by a separate addsi pattern for this case
6927 that never results in an lea, disabling this test seems
6928 to be the easier and correct fix for the crash. */
6929 }
6930 else if (GET_CODE (disp) != LABEL_REF
6931 && !CONST_INT_P (disp)
6932 && (GET_CODE (disp) != CONST
6933 || !legitimate_constant_p (disp))
6934 && (GET_CODE (disp) != SYMBOL_REF
6935 || !legitimate_constant_p (disp)))
6936 {
6937 reason = "displacement is not constant";
6938 goto report_error;
6939 }
6940 else if (TARGET_64BIT
6941 && !x86_64_immediate_operand (disp, VOIDmode))
6942 {
6943 reason = "displacement is out of range";
6944 goto report_error;
6945 }
6946 }
6947
6948 /* Everything looks valid. */
6949 if (TARGET_DEBUG_ADDR)
6950 fprintf (stderr, "Success.\n");
6951 return TRUE;
6952
6953 report_error:
6954 if (TARGET_DEBUG_ADDR)
6955 {
6956 fprintf (stderr, "Error: %s\n", reason);
6957 debug_rtx (reason_rtx);
6958 }
6959 return FALSE;
6960 }
6961 \f
6962 /* Return a unique alias set for the GOT. */
6963
6964 static HOST_WIDE_INT
6965 ix86_GOT_alias_set (void)
6966 {
6967 static HOST_WIDE_INT set = -1;
6968 if (set == -1)
6969 set = new_alias_set ();
6970 return set;
6971 }
6972
6973 /* Return a legitimate reference for ORIG (an address) using the
6974 register REG. If REG is 0, a new pseudo is generated.
6975
6976 There are two types of references that must be handled:
6977
6978 1. Global data references must load the address from the GOT, via
6979 the PIC reg. An insn is emitted to do this load, and the reg is
6980 returned.
6981
6982 2. Static data references, constant pool addresses, and code labels
6983 compute the address as an offset from the GOT, whose base is in
6984 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
6985 differentiate them from global data objects. The returned
6986 address is the PIC reg + an unspec constant.
6987
6988 GO_IF_LEGITIMATE_ADDRESS rejects symbolic references unless the PIC
6989 reg also appears in the address. */
6990
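/* As a concrete but illustrative example on ia32 with -fpic, case 1
   typically ends up as a load through the GOT,

       movl foo@GOT(%ebx), %eax

   while case 2 becomes an offset from the PIC register,

       leal bar@GOTOFF(%ebx), %eax

   (the exact sequences depend on the target and code model).  */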
6991 static rtx
6992 legitimize_pic_address (rtx orig, rtx reg)
6993 {
6994 rtx addr = orig;
6995 rtx new = orig;
6996 rtx base;
6997
6998 #if TARGET_MACHO
6999 if (TARGET_MACHO && !TARGET_64BIT)
7000 {
7001 if (reg == 0)
7002 reg = gen_reg_rtx (Pmode);
7003 /* Use the generic Mach-O PIC machinery. */
7004 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
7005 }
7006 #endif
7007
7008 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
7009 new = addr;
7010 else if (TARGET_64BIT
7011 && ix86_cmodel != CM_SMALL_PIC
7012 && local_symbolic_operand (addr, Pmode))
7013 {
7014 rtx tmpreg;
7015 /* This symbol may be referenced via a displacement from the PIC
7016 base address (@GOTOFF). */
7017
7018 if (reload_in_progress)
7019 regs_ever_live[PIC_OFFSET_TABLE_REGNUM] = 1;
7020 if (GET_CODE (addr) == CONST)
7021 addr = XEXP (addr, 0);
7022 if (GET_CODE (addr) == PLUS)
7023 {
7024 new = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)), UNSPEC_GOTOFF);
7025 new = gen_rtx_PLUS (Pmode, new, XEXP (addr, 1));
7026 }
7027 else
7028 new = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
7029 new = gen_rtx_CONST (Pmode, new);
7030 if (!reg)
7031 tmpreg = gen_reg_rtx (Pmode);
7032 else
7033 tmpreg = reg;
7034 emit_move_insn (tmpreg, new);
7035
7036 if (reg != 0)
7037 {
7038 new = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
7039 tmpreg, 1, OPTAB_DIRECT);
7040 new = reg;
7041 }
7042 else new = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
7043 }
7044 else if (!TARGET_64BIT && local_symbolic_operand (addr, Pmode))
7045 {
7046 /* This symbol may be referenced via a displacement from the PIC
7047 base address (@GOTOFF). */
7048
7049 if (reload_in_progress)
7050 regs_ever_live[PIC_OFFSET_TABLE_REGNUM] = 1;
7051 if (GET_CODE (addr) == CONST)
7052 addr = XEXP (addr, 0);
7053 if (GET_CODE (addr) == PLUS)
7054 {
7055 new = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)), UNSPEC_GOTOFF);
7056 new = gen_rtx_PLUS (Pmode, new, XEXP (addr, 1));
7057 }
7058 else
7059 new = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
7060 new = gen_rtx_CONST (Pmode, new);
7061 new = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new);
7062
7063 if (reg != 0)
7064 {
7065 emit_move_insn (reg, new);
7066 new = reg;
7067 }
7068 }
7069 else if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
7070 {
7071 if (TARGET_64BIT)
7072 {
7073 new = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
7074 new = gen_rtx_CONST (Pmode, new);
7075 new = gen_const_mem (Pmode, new);
7076 set_mem_alias_set (new, ix86_GOT_alias_set ());
7077
7078 if (reg == 0)
7079 reg = gen_reg_rtx (Pmode);
7080 /* Use gen_movsi directly, otherwise the address is loaded
7081 into a register for CSE. We don't want to CSE these addresses;
7082 instead we CSE addresses from the GOT table, so skip this. */
7083 emit_insn (gen_movsi (reg, new));
7084 new = reg;
7085 }
7086 else
7087 {
7088 /* This symbol must be referenced via a load from the
7089 Global Offset Table (@GOT). */
7090
7091 if (reload_in_progress)
7092 regs_ever_live[PIC_OFFSET_TABLE_REGNUM] = 1;
7093 new = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
7094 new = gen_rtx_CONST (Pmode, new);
7095 new = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new);
7096 new = gen_const_mem (Pmode, new);
7097 set_mem_alias_set (new, ix86_GOT_alias_set ());
7098
7099 if (reg == 0)
7100 reg = gen_reg_rtx (Pmode);
7101 emit_move_insn (reg, new);
7102 new = reg;
7103 }
7104 }
7105 else
7106 {
7107 if (CONST_INT_P (addr)
7108 && !x86_64_immediate_operand (addr, VOIDmode))
7109 {
7110 if (reg)
7111 {
7112 emit_move_insn (reg, addr);
7113 new = reg;
7114 }
7115 else
7116 new = force_reg (Pmode, addr);
7117 }
7118 else if (GET_CODE (addr) == CONST)
7119 {
7120 addr = XEXP (addr, 0);
7121
7122 /* We must match stuff we generate before. Assume the only
7123 unspecs that can get here are ours. Not that we could do
7124 anything with them anyway.... */
7125 if (GET_CODE (addr) == UNSPEC
7126 || (GET_CODE (addr) == PLUS
7127 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
7128 return orig;
7129 gcc_assert (GET_CODE (addr) == PLUS);
7130 }
7131 if (GET_CODE (addr) == PLUS)
7132 {
7133 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
7134
7135 /* Check first to see if this is a constant offset from a @GOTOFF
7136 symbol reference. */
7137 if (local_symbolic_operand (op0, Pmode)
7138 && CONST_INT_P (op1))
7139 {
7140 if (!TARGET_64BIT)
7141 {
7142 if (reload_in_progress)
7143 regs_ever_live[PIC_OFFSET_TABLE_REGNUM] = 1;
7144 new = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
7145 UNSPEC_GOTOFF);
7146 new = gen_rtx_PLUS (Pmode, new, op1);
7147 new = gen_rtx_CONST (Pmode, new);
7148 new = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new);
7149
7150 if (reg != 0)
7151 {
7152 emit_move_insn (reg, new);
7153 new = reg;
7154 }
7155 }
7156 else
7157 {
7158 if (INTVAL (op1) < -16*1024*1024
7159 || INTVAL (op1) >= 16*1024*1024)
7160 {
7161 if (!x86_64_immediate_operand (op1, Pmode))
7162 op1 = force_reg (Pmode, op1);
7163 new = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
7164 }
7165 }
7166 }
7167 else
7168 {
7169 base = legitimize_pic_address (XEXP (addr, 0), reg);
7170 new = legitimize_pic_address (XEXP (addr, 1),
7171 base == reg ? NULL_RTX : reg);
7172
7173 if (CONST_INT_P (new))
7174 new = plus_constant (base, INTVAL (new));
7175 else
7176 {
7177 if (GET_CODE (new) == PLUS && CONSTANT_P (XEXP (new, 1)))
7178 {
7179 base = gen_rtx_PLUS (Pmode, base, XEXP (new, 0));
7180 new = XEXP (new, 1);
7181 }
7182 new = gen_rtx_PLUS (Pmode, base, new);
7183 }
7184 }
7185 }
7186 }
7187 return new;
7188 }
7189 \f
7190 /* Load the thread pointer. If TO_REG is true, force it into a register. */
7191
7192 static rtx
7193 get_thread_pointer (int to_reg)
7194 {
7195 rtx tp, reg, insn;
7196
7197 tp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
7198 if (!to_reg)
7199 return tp;
7200
7201 reg = gen_reg_rtx (Pmode);
7202 insn = gen_rtx_SET (VOIDmode, reg, tp);
7203 insn = emit_insn (insn);
7204
7205 return reg;
7206 }
7207
7208 /* A subroutine of legitimize_address and ix86_expand_move. FOR_MOV is
7209 false if we expect this to be used for a memory address and true if
7210 we expect to load the address into a register. */
7211
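/* Roughly, and only as an illustration (32bit GNU TLS), the access
   sequences produced for two of the models look like

       global dynamic:  leal x@tlsgd(,%ebx,1), %eax; call ___tls_get_addr
       local exec:      movl %gs:0, %eax; addl $x@ntpoff, %eax

   (the exact code depends on TARGET_64BIT, TARGET_GNU2_TLS, flag_pic and
   the other flags tested below).  */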
7212 static rtx
7213 legitimize_tls_address (rtx x, enum tls_model model, int for_mov)
7214 {
7215 rtx dest, base, off, pic, tp;
7216 int type;
7217
7218 switch (model)
7219 {
7220 case TLS_MODEL_GLOBAL_DYNAMIC:
7221 dest = gen_reg_rtx (Pmode);
7222 tp = TARGET_GNU2_TLS ? get_thread_pointer (1) : 0;
7223
7224 if (TARGET_64BIT && ! TARGET_GNU2_TLS)
7225 {
7226 rtx rax = gen_rtx_REG (Pmode, 0), insns;
7227
7228 start_sequence ();
7229 emit_call_insn (gen_tls_global_dynamic_64 (rax, x));
7230 insns = get_insns ();
7231 end_sequence ();
7232
7233 emit_libcall_block (insns, dest, rax, x);
7234 }
7235 else if (TARGET_64BIT && TARGET_GNU2_TLS)
7236 emit_insn (gen_tls_global_dynamic_64 (dest, x));
7237 else
7238 emit_insn (gen_tls_global_dynamic_32 (dest, x));
7239
7240 if (TARGET_GNU2_TLS)
7241 {
7242 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
7243
7244 set_unique_reg_note (get_last_insn (), REG_EQUIV, x);
7245 }
7246 break;
7247
7248 case TLS_MODEL_LOCAL_DYNAMIC:
7249 base = gen_reg_rtx (Pmode);
7250 tp = TARGET_GNU2_TLS ? get_thread_pointer (1) : 0;
7251
7252 if (TARGET_64BIT && ! TARGET_GNU2_TLS)
7253 {
7254 rtx rax = gen_rtx_REG (Pmode, 0), insns, note;
7255
7256 start_sequence ();
7257 emit_call_insn (gen_tls_local_dynamic_base_64 (rax));
7258 insns = get_insns ();
7259 end_sequence ();
7260
7261 note = gen_rtx_EXPR_LIST (VOIDmode, const0_rtx, NULL);
7262 note = gen_rtx_EXPR_LIST (VOIDmode, ix86_tls_get_addr (), note);
7263 emit_libcall_block (insns, base, rax, note);
7264 }
7265 else if (TARGET_64BIT && TARGET_GNU2_TLS)
7266 emit_insn (gen_tls_local_dynamic_base_64 (base));
7267 else
7268 emit_insn (gen_tls_local_dynamic_base_32 (base));
7269
7270 if (TARGET_GNU2_TLS)
7271 {
7272 rtx x = ix86_tls_module_base ();
7273
7274 set_unique_reg_note (get_last_insn (), REG_EQUIV,
7275 gen_rtx_MINUS (Pmode, x, tp));
7276 }
7277
7278 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
7279 off = gen_rtx_CONST (Pmode, off);
7280
7281 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
7282
7283 if (TARGET_GNU2_TLS)
7284 {
7285 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
7286
7287 set_unique_reg_note (get_last_insn (), REG_EQUIV, x);
7288 }
7289
7290 break;
7291
7292 case TLS_MODEL_INITIAL_EXEC:
7293 if (TARGET_64BIT)
7294 {
7295 pic = NULL;
7296 type = UNSPEC_GOTNTPOFF;
7297 }
7298 else if (flag_pic)
7299 {
7300 if (reload_in_progress)
7301 regs_ever_live[PIC_OFFSET_TABLE_REGNUM] = 1;
7302 pic = pic_offset_table_rtx;
7303 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
7304 }
7305 else if (!TARGET_ANY_GNU_TLS)
7306 {
7307 pic = gen_reg_rtx (Pmode);
7308 emit_insn (gen_set_got (pic));
7309 type = UNSPEC_GOTTPOFF;
7310 }
7311 else
7312 {
7313 pic = NULL;
7314 type = UNSPEC_INDNTPOFF;
7315 }
7316
7317 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), type);
7318 off = gen_rtx_CONST (Pmode, off);
7319 if (pic)
7320 off = gen_rtx_PLUS (Pmode, pic, off);
7321 off = gen_const_mem (Pmode, off);
7322 set_mem_alias_set (off, ix86_GOT_alias_set ());
7323
7324 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
7325 {
7326 base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
7327 off = force_reg (Pmode, off);
7328 return gen_rtx_PLUS (Pmode, base, off);
7329 }
7330 else
7331 {
7332 base = get_thread_pointer (true);
7333 dest = gen_reg_rtx (Pmode);
7334 emit_insn (gen_subsi3 (dest, base, off));
7335 }
7336 break;
7337
7338 case TLS_MODEL_LOCAL_EXEC:
7339 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
7340 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
7341 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
7342 off = gen_rtx_CONST (Pmode, off);
7343
7344 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
7345 {
7346 base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
7347 return gen_rtx_PLUS (Pmode, base, off);
7348 }
7349 else
7350 {
7351 base = get_thread_pointer (true);
7352 dest = gen_reg_rtx (Pmode);
7353 emit_insn (gen_subsi3 (dest, base, off));
7354 }
7355 break;
7356
7357 default:
7358 gcc_unreachable ();
7359 }
7360
7361 return dest;
7362 }
7363
7364 /* Try machine-dependent ways of modifying an illegitimate address
7365 to be legitimate. If we find one, return the new, valid address.
7366 This macro is used in only one place: `memory_address' in explow.c.
7367
7368 OLDX is the address as it was before break_out_memory_refs was called.
7369 In some cases it is useful to look at this to decide what needs to be done.
7370
7371 MODE and WIN are passed so that this macro can use
7372 GO_IF_LEGITIMATE_ADDRESS.
7373
7374 It is always safe for this macro to do nothing. It exists to recognize
7375 opportunities to optimize the output.
7376
7377 For the 80386, we handle X+REG by loading X into a register R and
7378 using R+REG. R will go in a general reg and indexing will be used.
7379 However, if REG is a broken-out memory address or multiplication,
7380 nothing needs to be done because REG can certainly go in a general reg.
7381
7382 When -fpic is used, special handling is needed for symbolic references.
7383 See comments by legitimize_pic_address in i386.c for details. */
7384
7385 rtx
7386 legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, enum machine_mode mode)
7387 {
7388 int changed = 0;
7389 unsigned log;
7390
7391 if (TARGET_DEBUG_ADDR)
7392 {
7393 fprintf (stderr, "\n==========\nLEGITIMIZE_ADDRESS, mode = %s\n",
7394 GET_MODE_NAME (mode));
7395 debug_rtx (x);
7396 }
7397
7398 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
7399 if (log)
7400 return legitimize_tls_address (x, log, false);
7401 if (GET_CODE (x) == CONST
7402 && GET_CODE (XEXP (x, 0)) == PLUS
7403 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
7404 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
7405 {
7406 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0), log, false);
7407 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
7408 }
7409
7410 if (flag_pic && SYMBOLIC_CONST (x))
7411 return legitimize_pic_address (x, 0);
7412
7413 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
7414 if (GET_CODE (x) == ASHIFT
7415 && CONST_INT_P (XEXP (x, 1))
7416 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
7417 {
7418 changed = 1;
7419 log = INTVAL (XEXP (x, 1));
7420 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
7421 GEN_INT (1 << log));
7422 }
7423
7424 if (GET_CODE (x) == PLUS)
7425 {
7426 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
7427
7428 if (GET_CODE (XEXP (x, 0)) == ASHIFT
7429 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7430 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
7431 {
7432 changed = 1;
7433 log = INTVAL (XEXP (XEXP (x, 0), 1));
7434 XEXP (x, 0) = gen_rtx_MULT (Pmode,
7435 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
7436 GEN_INT (1 << log));
7437 }
7438
7439 if (GET_CODE (XEXP (x, 1)) == ASHIFT
7440 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
7441 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
7442 {
7443 changed = 1;
7444 log = INTVAL (XEXP (XEXP (x, 1), 1));
7445 XEXP (x, 1) = gen_rtx_MULT (Pmode,
7446 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
7447 GEN_INT (1 << log));
7448 }
7449
7450 /* Put multiply first if it isn't already. */
7451 if (GET_CODE (XEXP (x, 1)) == MULT)
7452 {
7453 rtx tmp = XEXP (x, 0);
7454 XEXP (x, 0) = XEXP (x, 1);
7455 XEXP (x, 1) = tmp;
7456 changed = 1;
7457 }
7458
7459 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
7460 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
7461 created by virtual register instantiation, register elimination, and
7462 similar optimizations. */
7463 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
7464 {
7465 changed = 1;
7466 x = gen_rtx_PLUS (Pmode,
7467 gen_rtx_PLUS (Pmode, XEXP (x, 0),
7468 XEXP (XEXP (x, 1), 0)),
7469 XEXP (XEXP (x, 1), 1));
7470 }
7471
7472 /* Canonicalize
7473 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
7474 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
7475 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
7476 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
7477 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
7478 && CONSTANT_P (XEXP (x, 1)))
7479 {
7480 rtx constant;
7481 rtx other = NULL_RTX;
7482
7483 if (CONST_INT_P (XEXP (x, 1)))
7484 {
7485 constant = XEXP (x, 1);
7486 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
7487 }
7488 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
7489 {
7490 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
7491 other = XEXP (x, 1);
7492 }
7493 else
7494 constant = 0;
7495
7496 if (constant)
7497 {
7498 changed = 1;
7499 x = gen_rtx_PLUS (Pmode,
7500 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
7501 XEXP (XEXP (XEXP (x, 0), 1), 0)),
7502 plus_constant (other, INTVAL (constant)));
7503 }
7504 }
7505
7506 if (changed && legitimate_address_p (mode, x, FALSE))
7507 return x;
7508
7509 if (GET_CODE (XEXP (x, 0)) == MULT)
7510 {
7511 changed = 1;
7512 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
7513 }
7514
7515 if (GET_CODE (XEXP (x, 1)) == MULT)
7516 {
7517 changed = 1;
7518 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
7519 }
7520
7521 if (changed
7522 && REG_P (XEXP (x, 1))
7523 && REG_P (XEXP (x, 0)))
7524 return x;
7525
7526 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
7527 {
7528 changed = 1;
7529 x = legitimize_pic_address (x, 0);
7530 }
7531
7532 if (changed && legitimate_address_p (mode, x, FALSE))
7533 return x;
7534
7535 if (REG_P (XEXP (x, 0)))
7536 {
7537 rtx temp = gen_reg_rtx (Pmode);
7538 rtx val = force_operand (XEXP (x, 1), temp);
7539 if (val != temp)
7540 emit_move_insn (temp, val);
7541
7542 XEXP (x, 1) = temp;
7543 return x;
7544 }
7545
7546 else if (REG_P (XEXP (x, 1)))
7547 {
7548 rtx temp = gen_reg_rtx (Pmode);
7549 rtx val = force_operand (XEXP (x, 0), temp);
7550 if (val != temp)
7551 emit_move_insn (temp, val);
7552
7553 XEXP (x, 0) = temp;
7554 return x;
7555 }
7556 }
7557
7558 return x;
7559 }
7560 \f
7561 /* Print an integer constant expression in assembler syntax. Addition
7562 and subtraction are the only arithmetic that may appear in these
7563 expressions. FILE is the stdio stream to write to, X is the rtx, and
7564 CODE is the operand print code from the output string. */
7565
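/* For example, (unspec [(symbol_ref "foo")] UNSPEC_GOTOFF) is printed as
   "foo@GOTOFF" and (unspec [(symbol_ref "bar")] UNSPEC_GOTPCREL) as
   "bar@GOTPCREL(%rip)"; see the UNSPEC case below.  */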
7566 static void
7567 output_pic_addr_const (FILE *file, rtx x, int code)
7568 {
7569 char buf[256];
7570
7571 switch (GET_CODE (x))
7572 {
7573 case PC:
7574 gcc_assert (flag_pic);
7575 putc ('.', file);
7576 break;
7577
7578 case SYMBOL_REF:
7579 output_addr_const (file, x);
7580 if (!TARGET_MACHO && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
7581 fputs ("@PLT", file);
7582 break;
7583
7584 case LABEL_REF:
7585 x = XEXP (x, 0);
7586 /* FALLTHRU */
7587 case CODE_LABEL:
7588 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
7589 assemble_name (asm_out_file, buf);
7590 break;
7591
7592 case CONST_INT:
7593 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
7594 break;
7595
7596 case CONST:
7597 /* This used to output parentheses around the expression,
7598 but that does not work on the 386 (either ATT or BSD assembler). */
7599 output_pic_addr_const (file, XEXP (x, 0), code);
7600 break;
7601
7602 case CONST_DOUBLE:
7603 if (GET_MODE (x) == VOIDmode)
7604 {
7605 /* We can use %d if the number is <32 bits and positive. */
7606 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
7607 fprintf (file, "0x%lx%08lx",
7608 (unsigned long) CONST_DOUBLE_HIGH (x),
7609 (unsigned long) CONST_DOUBLE_LOW (x));
7610 else
7611 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
7612 }
7613 else
7614 /* We can't handle floating point constants;
7615 PRINT_OPERAND must handle them. */
7616 output_operand_lossage ("floating constant misused");
7617 break;
7618
7619 case PLUS:
7620 /* Some assemblers need integer constants to appear first. */
7621 if (CONST_INT_P (XEXP (x, 0)))
7622 {
7623 output_pic_addr_const (file, XEXP (x, 0), code);
7624 putc ('+', file);
7625 output_pic_addr_const (file, XEXP (x, 1), code);
7626 }
7627 else
7628 {
7629 gcc_assert (CONST_INT_P (XEXP (x, 1)));
7630 output_pic_addr_const (file, XEXP (x, 1), code);
7631 putc ('+', file);
7632 output_pic_addr_const (file, XEXP (x, 0), code);
7633 }
7634 break;
7635
7636 case MINUS:
7637 if (!TARGET_MACHO)
7638 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
7639 output_pic_addr_const (file, XEXP (x, 0), code);
7640 putc ('-', file);
7641 output_pic_addr_const (file, XEXP (x, 1), code);
7642 if (!TARGET_MACHO)
7643 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
7644 break;
7645
7646 case UNSPEC:
7647 gcc_assert (XVECLEN (x, 0) == 1);
7648 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
7649 switch (XINT (x, 1))
7650 {
7651 case UNSPEC_GOT:
7652 fputs ("@GOT", file);
7653 break;
7654 case UNSPEC_GOTOFF:
7655 fputs ("@GOTOFF", file);
7656 break;
7657 case UNSPEC_GOTPCREL:
7658 fputs ("@GOTPCREL(%rip)", file);
7659 break;
7660 case UNSPEC_GOTTPOFF:
7661 /* FIXME: This might be @TPOFF in Sun ld too. */
7662 fputs ("@GOTTPOFF", file);
7663 break;
7664 case UNSPEC_TPOFF:
7665 fputs ("@TPOFF", file);
7666 break;
7667 case UNSPEC_NTPOFF:
7668 if (TARGET_64BIT)
7669 fputs ("@TPOFF", file);
7670 else
7671 fputs ("@NTPOFF", file);
7672 break;
7673 case UNSPEC_DTPOFF:
7674 fputs ("@DTPOFF", file);
7675 break;
7676 case UNSPEC_GOTNTPOFF:
7677 if (TARGET_64BIT)
7678 fputs ("@GOTTPOFF(%rip)", file);
7679 else
7680 fputs ("@GOTNTPOFF", file);
7681 break;
7682 case UNSPEC_INDNTPOFF:
7683 fputs ("@INDNTPOFF", file);
7684 break;
7685 default:
7686 output_operand_lossage ("invalid UNSPEC as operand");
7687 break;
7688 }
7689 break;
7690
7691 default:
7692 output_operand_lossage ("invalid expression as operand");
7693 }
7694 }
7695
7696 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
7697 We need to emit DTP-relative relocations. */
7698
7699 static void
7700 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
7701 {
7702 fputs (ASM_LONG, file);
7703 output_addr_const (file, x);
7704 fputs ("@DTPOFF", file);
7705 switch (size)
7706 {
7707 case 4:
7708 break;
7709 case 8:
7710 fputs (", 0", file);
7711 break;
7712 default:
7713 gcc_unreachable ();
7714 }
7715 }
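/* For a 4-byte entry this emits something like ".long foo@DTPOFF", and for
   an 8-byte entry ".long foo@DTPOFF, 0" (illustrative; ASM_LONG is whatever
   the target headers define, typically "\t.long\t").  */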
7716
7717 /* In the name of slightly smaller debug output, and to cater to
7718 general assembler lossage, recognize PIC+GOTOFF and turn it back
7719 into a direct symbol reference.
7720
7721 On Darwin, this is necessary to avoid a crash, because Darwin
7722 has a different PIC label for each routine but the DWARF debugging
7723 information is not associated with any particular routine, so it's
7724 necessary to remove references to the PIC label from RTL stored by
7725 the DWARF output code. */
7726
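/* For example (illustrative only), an address such as

       (plus (reg:SI %ebx) (const (unspec [(symbol_ref "foo")] UNSPEC_GOTOFF)))

   is turned back into just (symbol_ref "foo"), with any register or
   constant addend re-applied to the result.  */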
7727 static rtx
7728 ix86_delegitimize_address (rtx orig_x)
7729 {
7730 rtx x = orig_x;
7731 /* reg_addend is NULL or a multiple of some register. */
7732 rtx reg_addend = NULL_RTX;
7733 /* const_addend is NULL or a const_int. */
7734 rtx const_addend = NULL_RTX;
7735 /* This is the result, or NULL. */
7736 rtx result = NULL_RTX;
7737
7738 if (MEM_P (x))
7739 x = XEXP (x, 0);
7740
7741 if (TARGET_64BIT)
7742 {
7743 if (GET_CODE (x) != CONST
7744 || GET_CODE (XEXP (x, 0)) != UNSPEC
7745 || XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
7746 || !MEM_P (orig_x))
7747 return orig_x;
7748 return XVECEXP (XEXP (x, 0), 0, 0);
7749 }
7750
7751 if (GET_CODE (x) != PLUS
7752 || GET_CODE (XEXP (x, 1)) != CONST)
7753 return orig_x;
7754
7755 if (REG_P (XEXP (x, 0))
7756 && REGNO (XEXP (x, 0)) == PIC_OFFSET_TABLE_REGNUM)
7757 /* %ebx + GOT/GOTOFF */
7758 ;
7759 else if (GET_CODE (XEXP (x, 0)) == PLUS)
7760 {
7761 /* %ebx + %reg * scale + GOT/GOTOFF */
7762 reg_addend = XEXP (x, 0);
7763 if (REG_P (XEXP (reg_addend, 0))
7764 && REGNO (XEXP (reg_addend, 0)) == PIC_OFFSET_TABLE_REGNUM)
7765 reg_addend = XEXP (reg_addend, 1);
7766 else if (REG_P (XEXP (reg_addend, 1))
7767 && REGNO (XEXP (reg_addend, 1)) == PIC_OFFSET_TABLE_REGNUM)
7768 reg_addend = XEXP (reg_addend, 0);
7769 else
7770 return orig_x;
7771 if (!REG_P (reg_addend)
7772 && GET_CODE (reg_addend) != MULT
7773 && GET_CODE (reg_addend) != ASHIFT)
7774 return orig_x;
7775 }
7776 else
7777 return orig_x;
7778
7779 x = XEXP (XEXP (x, 1), 0);
7780 if (GET_CODE (x) == PLUS
7781 && CONST_INT_P (XEXP (x, 1)))
7782 {
7783 const_addend = XEXP (x, 1);
7784 x = XEXP (x, 0);
7785 }
7786
7787 if (GET_CODE (x) == UNSPEC
7788 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x))
7789 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
7790 result = XVECEXP (x, 0, 0);
7791
7792 if (TARGET_MACHO && darwin_local_data_pic (x)
7793 && !MEM_P (orig_x))
7794 result = XEXP (x, 0);
7795
7796 if (! result)
7797 return orig_x;
7798
7799 if (const_addend)
7800 result = gen_rtx_PLUS (Pmode, result, const_addend);
7801 if (reg_addend)
7802 result = gen_rtx_PLUS (Pmode, reg_addend, result);
7803 return result;
7804 }
7805 \f
7806 static void
7807 put_condition_code (enum rtx_code code, enum machine_mode mode, int reverse,
7808 int fp, FILE *file)
7809 {
7810 const char *suffix;
7811
7812 if (mode == CCFPmode || mode == CCFPUmode)
7813 {
7814 enum rtx_code second_code, bypass_code;
7815 ix86_fp_comparison_codes (code, &bypass_code, &code, &second_code);
7816 gcc_assert (bypass_code == UNKNOWN && second_code == UNKNOWN);
7817 code = ix86_fp_compare_code_to_integer (code);
7818 mode = CCmode;
7819 }
7820 if (reverse)
7821 code = reverse_condition (code);
7822
7823 switch (code)
7824 {
7825 case EQ:
7826 suffix = "e";
7827 break;
7828 case NE:
7829 suffix = "ne";
7830 break;
7831 case GT:
7832 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
7833 suffix = "g";
7834 break;
7835 case GTU:
7836 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
7837 Those same assemblers have the same but opposite lossage on cmov. */
7838 gcc_assert (mode == CCmode);
7839 suffix = fp ? "nbe" : "a";
7840 break;
7841 case LT:
7842 switch (mode)
7843 {
7844 case CCNOmode:
7845 case CCGOCmode:
7846 suffix = "s";
7847 break;
7848
7849 case CCmode:
7850 case CCGCmode:
7851 suffix = "l";
7852 break;
7853
7854 default:
7855 gcc_unreachable ();
7856 }
7857 break;
7858 case LTU:
7859 gcc_assert (mode == CCmode);
7860 suffix = "b";
7861 break;
7862 case GE:
7863 switch (mode)
7864 {
7865 case CCNOmode:
7866 case CCGOCmode:
7867 suffix = "ns";
7868 break;
7869
7870 case CCmode:
7871 case CCGCmode:
7872 suffix = "ge";
7873 break;
7874
7875 default:
7876 gcc_unreachable ();
7877 }
7878 break;
7879 case GEU:
7880 /* ??? As above. */
7881 gcc_assert (mode == CCmode);
7882 suffix = fp ? "nb" : "ae";
7883 break;
7884 case LE:
7885 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
7886 suffix = "le";
7887 break;
7888 case LEU:
7889 gcc_assert (mode == CCmode);
7890 suffix = "be";
7891 break;
7892 case UNORDERED:
7893 suffix = fp ? "u" : "p";
7894 break;
7895 case ORDERED:
7896 suffix = fp ? "nu" : "np";
7897 break;
7898 default:
7899 gcc_unreachable ();
7900 }
7901 fputs (suffix, file);
7902 }
7903
7904 /* Print the name of register X to FILE based on its machine mode and number.
7905 If CODE is 'w', pretend the mode is HImode.
7906 If CODE is 'b', pretend the mode is QImode.
7907 If CODE is 'k', pretend the mode is SImode.
7908 If CODE is 'q', pretend the mode is DImode.
7909 If CODE is 'h', pretend the reg is the 'high' byte register.
7910 If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op. */
7911
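/* For instance (illustrative), for hard register 0 this prints "al" for
   code 'b', "ax" for 'w', "eax" for 'k' and, in 64bit mode, "rax" for 'q',
   preceded by '%' in AT&T dialect; the REX registers instead come out as
   "r8b", "r8w", "r8d" and "r8".  */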
7912 void
7913 print_reg (rtx x, int code, FILE *file)
7914 {
7915 gcc_assert (REGNO (x) != ARG_POINTER_REGNUM
7916 && REGNO (x) != FRAME_POINTER_REGNUM
7917 && REGNO (x) != FLAGS_REG
7918 && REGNO (x) != FPSR_REG
7919 && REGNO (x) != FPCR_REG);
7920
7921 if (ASSEMBLER_DIALECT == ASM_ATT || USER_LABEL_PREFIX[0] == 0)
7922 putc ('%', file);
7923
7924 if (code == 'w' || MMX_REG_P (x))
7925 code = 2;
7926 else if (code == 'b')
7927 code = 1;
7928 else if (code == 'k')
7929 code = 4;
7930 else if (code == 'q')
7931 code = 8;
7932 else if (code == 'y')
7933 code = 3;
7934 else if (code == 'h')
7935 code = 0;
7936 else
7937 code = GET_MODE_SIZE (GET_MODE (x));
7938
7939 /* Irritatingly, AMD extended registers use a different naming convention
7940 from the normal registers. */
7941 if (REX_INT_REG_P (x))
7942 {
7943 gcc_assert (TARGET_64BIT);
7944 switch (code)
7945 {
7946 case 0:
7947 error ("extended registers have no high halves");
7948 break;
7949 case 1:
7950 fprintf (file, "r%ib", REGNO (x) - FIRST_REX_INT_REG + 8);
7951 break;
7952 case 2:
7953 fprintf (file, "r%iw", REGNO (x) - FIRST_REX_INT_REG + 8);
7954 break;
7955 case 4:
7956 fprintf (file, "r%id", REGNO (x) - FIRST_REX_INT_REG + 8);
7957 break;
7958 case 8:
7959 fprintf (file, "r%i", REGNO (x) - FIRST_REX_INT_REG + 8);
7960 break;
7961 default:
7962 error ("unsupported operand size for extended register");
7963 break;
7964 }
7965 return;
7966 }
7967 switch (code)
7968 {
7969 case 3:
7970 if (STACK_TOP_P (x))
7971 {
7972 fputs ("st(0)", file);
7973 break;
7974 }
7975 /* FALLTHRU */
7976 case 8:
7977 case 4:
7978 case 12:
7979 if (! ANY_FP_REG_P (x))
7980 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
7981 /* FALLTHRU */
7982 case 16:
7983 case 2:
7984 normal:
7985 fputs (hi_reg_name[REGNO (x)], file);
7986 break;
7987 case 1:
7988 if (REGNO (x) >= ARRAY_SIZE (qi_reg_name))
7989 goto normal;
7990 fputs (qi_reg_name[REGNO (x)], file);
7991 break;
7992 case 0:
7993 if (REGNO (x) >= ARRAY_SIZE (qi_high_reg_name))
7994 goto normal;
7995 fputs (qi_high_reg_name[REGNO (x)], file);
7996 break;
7997 default:
7998 gcc_unreachable ();
7999 }
8000 }
8001
8002 /* Locate some local-dynamic symbol still in use by this function
8003 so that we can print its name in some tls_local_dynamic_base
8004 pattern. */
8005
8006 static const char *
8007 get_some_local_dynamic_name (void)
8008 {
8009 rtx insn;
8010
8011 if (cfun->machine->some_ld_name)
8012 return cfun->machine->some_ld_name;
8013
8014 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
8015 if (INSN_P (insn)
8016 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
8017 return cfun->machine->some_ld_name;
8018
8019 gcc_unreachable ();
8020 }
8021
8022 static int
8023 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
8024 {
8025 rtx x = *px;
8026
8027 if (GET_CODE (x) == SYMBOL_REF
8028 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
8029 {
8030 cfun->machine->some_ld_name = XSTR (x, 0);
8031 return 1;
8032 }
8033
8034 return 0;
8035 }
8036
8037 /* Meaning of CODE:
8038 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
8039 C -- print opcode suffix for set/cmov insn.
8040 c -- like C, but print reversed condition
8041 F,f -- likewise, but for floating-point.
8042 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
8043 otherwise nothing
8044 R -- print the prefix for register names.
8045 z -- print the opcode suffix for the size of the current operand.
8046 * -- print a star (in certain assembler syntax)
8047 A -- print an absolute memory reference.
8048 w -- print the operand as if it's a "word" (HImode) even if it isn't.
8049 s -- print a shift double count, followed by the assembler's argument
8050 delimiter.
8051 b -- print the QImode name of the register for the indicated operand.
8052 %b0 would print %al if operands[0] is reg 0.
8053 w -- likewise, print the HImode name of the register.
8054 k -- likewise, print the SImode name of the register.
8055 q -- likewise, print the DImode name of the register.
8056 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
8057 y -- print "st(0)" instead of "st" as a register.
8058 D -- print condition for SSE cmp instruction.
8059 P -- if PIC, print an @PLT suffix.
8060 X -- don't print any sort of PIC '@' suffix for a symbol.
8061 & -- print some in-use local-dynamic symbol name.
8062 H -- print a memory address offset by 8; used for sse high-parts
8063 */
8064
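/* As an illustrative example, output_387_binary_op below builds templates
   such as "fadd%z2\t%2", where the 'z' code appends the size suffix of the
   memory operand, and "addsd\t{%2, %0|%0, %2}", where the {att|intel}
   braces select the operand order for the current assembler dialect.  */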
8065 void
8066 print_operand (FILE *file, rtx x, int code)
8067 {
8068 if (code)
8069 {
8070 switch (code)
8071 {
8072 case '*':
8073 if (ASSEMBLER_DIALECT == ASM_ATT)
8074 putc ('*', file);
8075 return;
8076
8077 case '&':
8078 assemble_name (file, get_some_local_dynamic_name ());
8079 return;
8080
8081 case 'A':
8082 switch (ASSEMBLER_DIALECT)
8083 {
8084 case ASM_ATT:
8085 putc ('*', file);
8086 break;
8087
8088 case ASM_INTEL:
8089 /* Intel syntax. For absolute addresses, registers should not
8090 be surrounded by brackets. */
8091 if (!REG_P (x))
8092 {
8093 putc ('[', file);
8094 PRINT_OPERAND (file, x, 0);
8095 putc (']', file);
8096 return;
8097 }
8098 break;
8099
8100 default:
8101 gcc_unreachable ();
8102 }
8103
8104 PRINT_OPERAND (file, x, 0);
8105 return;
8106
8107
8108 case 'L':
8109 if (ASSEMBLER_DIALECT == ASM_ATT)
8110 putc ('l', file);
8111 return;
8112
8113 case 'W':
8114 if (ASSEMBLER_DIALECT == ASM_ATT)
8115 putc ('w', file);
8116 return;
8117
8118 case 'B':
8119 if (ASSEMBLER_DIALECT == ASM_ATT)
8120 putc ('b', file);
8121 return;
8122
8123 case 'Q':
8124 if (ASSEMBLER_DIALECT == ASM_ATT)
8125 putc ('l', file);
8126 return;
8127
8128 case 'S':
8129 if (ASSEMBLER_DIALECT == ASM_ATT)
8130 putc ('s', file);
8131 return;
8132
8133 case 'T':
8134 if (ASSEMBLER_DIALECT == ASM_ATT)
8135 putc ('t', file);
8136 return;
8137
8138 case 'z':
8139 /* 387 opcodes don't get size suffixes if the operands are
8140 registers. */
8141 if (STACK_REG_P (x))
8142 return;
8143
8144 /* Likewise if using Intel opcodes. */
8145 if (ASSEMBLER_DIALECT == ASM_INTEL)
8146 return;
8147
8148 /* Derive the opcode suffix from the size of the operand. */
8149 switch (GET_MODE_SIZE (GET_MODE (x)))
8150 {
8151 case 1:
8152 putc ('b', file);
8153 return;
8154
8155 case 2:
8156 #ifdef HAVE_GAS_FILDS_FISTS
8157 putc ('s', file);
8158 #endif
8159 return;
8160
8161 case 4:
8162 if (GET_MODE (x) == SFmode)
8163 {
8164 putc ('s', file);
8165 return;
8166 }
8167 else
8168 putc ('l', file);
8169 return;
8170
8171 case 12:
8172 case 16:
8173 putc ('t', file);
8174 return;
8175
8176 case 8:
8177 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
8178 {
8179 #ifdef GAS_MNEMONICS
8180 putc ('q', file);
8181 #else
8182 putc ('l', file);
8183 putc ('l', file);
8184 #endif
8185 }
8186 else
8187 putc ('l', file);
8188 return;
8189
8190 default:
8191 gcc_unreachable ();
8192 }
8193
8194 case 'b':
8195 case 'w':
8196 case 'k':
8197 case 'q':
8198 case 'h':
8199 case 'y':
8200 case 'X':
8201 case 'P':
8202 break;
8203
8204 case 's':
8205 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
8206 {
8207 PRINT_OPERAND (file, x, 0);
8208 putc (',', file);
8209 }
8210 return;
8211
8212 case 'D':
8213 /* A little bit of braindamage here. The SSE compare instructions
8214 use completely different names for the comparisons than the
8215 fp conditional moves do. */
8216 switch (GET_CODE (x))
8217 {
8218 case EQ:
8219 case UNEQ:
8220 fputs ("eq", file);
8221 break;
8222 case LT:
8223 case UNLT:
8224 fputs ("lt", file);
8225 break;
8226 case LE:
8227 case UNLE:
8228 fputs ("le", file);
8229 break;
8230 case UNORDERED:
8231 fputs ("unord", file);
8232 break;
8233 case NE:
8234 case LTGT:
8235 fputs ("neq", file);
8236 break;
8237 case UNGE:
8238 case GE:
8239 fputs ("nlt", file);
8240 break;
8241 case UNGT:
8242 case GT:
8243 fputs ("nle", file);
8244 break;
8245 case ORDERED:
8246 fputs ("ord", file);
8247 break;
8248 default:
8249 gcc_unreachable ();
8250 }
8251 return;
8252 case 'O':
8253 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
8254 if (ASSEMBLER_DIALECT == ASM_ATT)
8255 {
8256 switch (GET_MODE (x))
8257 {
8258 case HImode: putc ('w', file); break;
8259 case SImode:
8260 case SFmode: putc ('l', file); break;
8261 case DImode:
8262 case DFmode: putc ('q', file); break;
8263 default: gcc_unreachable ();
8264 }
8265 putc ('.', file);
8266 }
8267 #endif
8268 return;
8269 case 'C':
8270 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 0, file);
8271 return;
8272 case 'F':
8273 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
8274 if (ASSEMBLER_DIALECT == ASM_ATT)
8275 putc ('.', file);
8276 #endif
8277 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 1, file);
8278 return;
8279
8280 /* Like above, but reverse condition */
8281 case 'c':
8282 /* Check to see if argument to %c is really a constant
8283 and not a condition code which needs to be reversed. */
8284 if (!COMPARISON_P (x))
8285 {
8286 output_operand_lossage ("operand is neither a constant nor a condition code, invalid operand code 'c'");
8287 return;
8288 }
8289 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 0, file);
8290 return;
8291 case 'f':
8292 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
8293 if (ASSEMBLER_DIALECT == ASM_ATT)
8294 putc ('.', file);
8295 #endif
8296 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 1, file);
8297 return;
8298
8299 case 'H':
8300 /* It doesn't actually matter what mode we use here, as we're
8301 only going to use this for printing. */
8302 x = adjust_address_nv (x, DImode, 8);
8303 break;
8304
8305 case '+':
8306 {
8307 rtx x;
8308
8309 if (!optimize || optimize_size || !TARGET_BRANCH_PREDICTION_HINTS)
8310 return;
8311
8312 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
8313 if (x)
8314 {
8315 int pred_val = INTVAL (XEXP (x, 0));
8316
8317 if (pred_val < REG_BR_PROB_BASE * 45 / 100
8318 || pred_val > REG_BR_PROB_BASE * 55 / 100)
8319 {
8320 int taken = pred_val > REG_BR_PROB_BASE / 2;
8321 int cputaken = final_forward_branch_p (current_output_insn) == 0;
8322
8323 /* Emit hints only when the default branch prediction
8324 heuristics would fail. */
8325 if (taken != cputaken)
8326 {
8327 /* We use 3e (DS) prefix for taken branches and
8328 2e (CS) prefix for not taken branches. */
8329 if (taken)
8330 fputs ("ds ; ", file);
8331 else
8332 fputs ("cs ; ", file);
8333 }
8334 }
8335 }
8336 return;
8337 }
8338 default:
8339 output_operand_lossage ("invalid operand code '%c'", code);
8340 }
8341 }
8342
8343 if (REG_P (x))
8344 print_reg (x, code, file);
8345
8346 else if (MEM_P (x))
8347 {
8348 /* No `byte ptr' prefix for call instructions. */
8349 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
8350 {
8351 const char * size;
8352 switch (GET_MODE_SIZE (GET_MODE (x)))
8353 {
8354 case 1: size = "BYTE"; break;
8355 case 2: size = "WORD"; break;
8356 case 4: size = "DWORD"; break;
8357 case 8: size = "QWORD"; break;
8358 case 12: size = "XWORD"; break;
8359 case 16: size = "XMMWORD"; break;
8360 default:
8361 gcc_unreachable ();
8362 }
8363
8364 /* Check for explicit size override (codes 'b', 'w' and 'k') */
8365 if (code == 'b')
8366 size = "BYTE";
8367 else if (code == 'w')
8368 size = "WORD";
8369 else if (code == 'k')
8370 size = "DWORD";
8371
8372 fputs (size, file);
8373 fputs (" PTR ", file);
8374 }
8375
8376 x = XEXP (x, 0);
8377 /* Avoid (%rip) for call operands. */
8378 if (CONSTANT_ADDRESS_P (x) && code == 'P'
8379 && !CONST_INT_P (x))
8380 output_addr_const (file, x);
8381 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
8382 output_operand_lossage ("invalid constraints for operand");
8383 else
8384 output_address (x);
8385 }
8386
8387 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
8388 {
8389 REAL_VALUE_TYPE r;
8390 long l;
8391
8392 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8393 REAL_VALUE_TO_TARGET_SINGLE (r, l);
8394
8395 if (ASSEMBLER_DIALECT == ASM_ATT)
8396 putc ('$', file);
8397 fprintf (file, "0x%08lx", l);
8398 }
8399
8400 /* These float cases don't actually occur as immediate operands. */
8401 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
8402 {
8403 char dstr[30];
8404
8405 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
8406 fprintf (file, "%s", dstr);
8407 }
8408
8409 else if (GET_CODE (x) == CONST_DOUBLE
8410 && GET_MODE (x) == XFmode)
8411 {
8412 char dstr[30];
8413
8414 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
8415 fprintf (file, "%s", dstr);
8416 }
8417
8418 else
8419 {
8420 /* We have patterns that allow zero sets of memory, for instance.
8421 In 64-bit mode, we should probably support all 8-byte vectors,
8422 since we can in fact encode that into an immediate. */
8423 if (GET_CODE (x) == CONST_VECTOR)
8424 {
8425 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
8426 x = const0_rtx;
8427 }
8428
8429 if (code != 'P')
8430 {
8431 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
8432 {
8433 if (ASSEMBLER_DIALECT == ASM_ATT)
8434 putc ('$', file);
8435 }
8436 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
8437 || GET_CODE (x) == LABEL_REF)
8438 {
8439 if (ASSEMBLER_DIALECT == ASM_ATT)
8440 putc ('$', file);
8441 else
8442 fputs ("OFFSET FLAT:", file);
8443 }
8444 }
8445 if (CONST_INT_P (x))
8446 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
8447 else if (flag_pic)
8448 output_pic_addr_const (file, x, code);
8449 else
8450 output_addr_const (file, x);
8451 }
8452 }
8453 \f
8454 /* Print a memory operand whose address is ADDR. */
8455
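/* For illustration, the same address prints as "-4(%ebp,%ecx,4)" in AT&T
   dialect and roughly as "[ebp-4+ecx*4]" in Intel dialect (register
   prefixing aside), with any segment override such as "fs:" emitted
   first.  */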
8456 void
8457 print_operand_address (FILE *file, rtx addr)
8458 {
8459 struct ix86_address parts;
8460 rtx base, index, disp;
8461 int scale;
8462 int ok = ix86_decompose_address (addr, &parts);
8463
8464 gcc_assert (ok);
8465
8466 base = parts.base;
8467 index = parts.index;
8468 disp = parts.disp;
8469 scale = parts.scale;
8470
8471 switch (parts.seg)
8472 {
8473 case SEG_DEFAULT:
8474 break;
8475 case SEG_FS:
8476 case SEG_GS:
8477 if (USER_LABEL_PREFIX[0] == 0)
8478 putc ('%', file);
8479 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
8480 break;
8481 default:
8482 gcc_unreachable ();
8483 }
8484
8485 if (!base && !index)
8486 {
8487 /* Displacement-only addresses require special attention. */
8488
8489 if (CONST_INT_P (disp))
8490 {
8491 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
8492 {
8493 if (USER_LABEL_PREFIX[0] == 0)
8494 putc ('%', file);
8495 fputs ("ds:", file);
8496 }
8497 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
8498 }
8499 else if (flag_pic)
8500 output_pic_addr_const (file, disp, 0);
8501 else
8502 output_addr_const (file, disp);
8503
8504 /* Use one byte shorter RIP relative addressing for 64bit mode. */
8505 if (TARGET_64BIT)
8506 {
8507 if (GET_CODE (disp) == CONST
8508 && GET_CODE (XEXP (disp, 0)) == PLUS
8509 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
8510 disp = XEXP (XEXP (disp, 0), 0);
8511 if (GET_CODE (disp) == LABEL_REF
8512 || (GET_CODE (disp) == SYMBOL_REF
8513 && SYMBOL_REF_TLS_MODEL (disp) == 0))
8514 fputs ("(%rip)", file);
8515 }
8516 }
8517 else
8518 {
8519 if (ASSEMBLER_DIALECT == ASM_ATT)
8520 {
8521 if (disp)
8522 {
8523 if (flag_pic)
8524 output_pic_addr_const (file, disp, 0);
8525 else if (GET_CODE (disp) == LABEL_REF)
8526 output_asm_label (disp);
8527 else
8528 output_addr_const (file, disp);
8529 }
8530
8531 putc ('(', file);
8532 if (base)
8533 print_reg (base, 0, file);
8534 if (index)
8535 {
8536 putc (',', file);
8537 print_reg (index, 0, file);
8538 if (scale != 1)
8539 fprintf (file, ",%d", scale);
8540 }
8541 putc (')', file);
8542 }
8543 else
8544 {
8545 rtx offset = NULL_RTX;
8546
8547 if (disp)
8548 {
8549 /* Pull out the offset of a symbol; print any symbol itself. */
8550 if (GET_CODE (disp) == CONST
8551 && GET_CODE (XEXP (disp, 0)) == PLUS
8552 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
8553 {
8554 offset = XEXP (XEXP (disp, 0), 1);
8555 disp = gen_rtx_CONST (VOIDmode,
8556 XEXP (XEXP (disp, 0), 0));
8557 }
8558
8559 if (flag_pic)
8560 output_pic_addr_const (file, disp, 0);
8561 else if (GET_CODE (disp) == LABEL_REF)
8562 output_asm_label (disp);
8563 else if (CONST_INT_P (disp))
8564 offset = disp;
8565 else
8566 output_addr_const (file, disp);
8567 }
8568
8569 putc ('[', file);
8570 if (base)
8571 {
8572 print_reg (base, 0, file);
8573 if (offset)
8574 {
8575 if (INTVAL (offset) >= 0)
8576 putc ('+', file);
8577 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
8578 }
8579 }
8580 else if (offset)
8581 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
8582 else
8583 putc ('0', file);
8584
8585 if (index)
8586 {
8587 putc ('+', file);
8588 print_reg (index, 0, file);
8589 if (scale != 1)
8590 fprintf (file, "*%d", scale);
8591 }
8592 putc (']', file);
8593 }
8594 }
8595 }
8596
8597 bool
8598 output_addr_const_extra (FILE *file, rtx x)
8599 {
8600 rtx op;
8601
8602 if (GET_CODE (x) != UNSPEC)
8603 return false;
8604
8605 op = XVECEXP (x, 0, 0);
8606 switch (XINT (x, 1))
8607 {
8608 case UNSPEC_GOTTPOFF:
8609 output_addr_const (file, op);
8610 /* FIXME: This might be @TPOFF in Sun ld. */
8611 fputs ("@GOTTPOFF", file);
8612 break;
8613 case UNSPEC_TPOFF:
8614 output_addr_const (file, op);
8615 fputs ("@TPOFF", file);
8616 break;
8617 case UNSPEC_NTPOFF:
8618 output_addr_const (file, op);
8619 if (TARGET_64BIT)
8620 fputs ("@TPOFF", file);
8621 else
8622 fputs ("@NTPOFF", file);
8623 break;
8624 case UNSPEC_DTPOFF:
8625 output_addr_const (file, op);
8626 fputs ("@DTPOFF", file);
8627 break;
8628 case UNSPEC_GOTNTPOFF:
8629 output_addr_const (file, op);
8630 if (TARGET_64BIT)
8631 fputs ("@GOTTPOFF(%rip)", file);
8632 else
8633 fputs ("@GOTNTPOFF", file);
8634 break;
8635 case UNSPEC_INDNTPOFF:
8636 output_addr_const (file, op);
8637 fputs ("@INDNTPOFF", file);
8638 break;
8639
8640 default:
8641 return false;
8642 }
8643
8644 return true;
8645 }
8646 \f
8647 /* Split one or more DImode RTL references into pairs of SImode
8648 references. The RTL can be REG, offsettable MEM, integer constant, or
8649 CONST_DOUBLE. "operands" is a pointer to an array of DImode RTL to
8650 split and "num" is its length. lo_half and hi_half are output arrays
8651 that parallel "operands". */
8652
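/* Illustratively, a DImode pseudo splits into two SImode subregs at byte
   offsets 0 and 4, and a DImode memory operand such as 8(%ebp) splits into
   SImode references at 8(%ebp) and 12(%ebp), the low half coming from the
   lower address on this little-endian target.  */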
8653 void
8654 split_di (rtx operands[], int num, rtx lo_half[], rtx hi_half[])
8655 {
8656 while (num--)
8657 {
8658 rtx op = operands[num];
8659
8660 /* simplify_subreg refuses to split volatile memory addresses,
8661 but we still have to handle them. */
8662 if (MEM_P (op))
8663 {
8664 lo_half[num] = adjust_address (op, SImode, 0);
8665 hi_half[num] = adjust_address (op, SImode, 4);
8666 }
8667 else
8668 {
8669 lo_half[num] = simplify_gen_subreg (SImode, op,
8670 GET_MODE (op) == VOIDmode
8671 ? DImode : GET_MODE (op), 0);
8672 hi_half[num] = simplify_gen_subreg (SImode, op,
8673 GET_MODE (op) == VOIDmode
8674 ? DImode : GET_MODE (op), 4);
8675 }
8676 }
8677 }
8678 /* Split one or more TImode RTL references into pairs of DImode
8679 references. The RTL can be REG, offsettable MEM, integer constant, or
8680 CONST_DOUBLE. "operands" is a pointer to an array of TImode RTL to
8681 split and "num" is its length. lo_half and hi_half are output arrays
8682 that parallel "operands". */
8683
8684 void
8685 split_ti (rtx operands[], int num, rtx lo_half[], rtx hi_half[])
8686 {
8687 while (num--)
8688 {
8689 rtx op = operands[num];
8690
8691 /* simplify_subreg refuses to split volatile memory addresses, but we
8692 still have to handle them. */
8693 if (MEM_P (op))
8694 {
8695 lo_half[num] = adjust_address (op, DImode, 0);
8696 hi_half[num] = adjust_address (op, DImode, 8);
8697 }
8698 else
8699 {
8700 lo_half[num] = simplify_gen_subreg (DImode, op, TImode, 0);
8701 hi_half[num] = simplify_gen_subreg (DImode, op, TImode, 8);
8702 }
8703 }
8704 }
8705 \f
8706 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
8707 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
8708 is the expression of the binary operation. The output may either be
8709 emitted here, or returned to the caller, like all output_* functions.
8710
8711 There is no guarantee that the operands are the same mode, as they
8712 might be within FLOAT or FLOAT_EXTEND expressions. */
8713
8714 #ifndef SYSV386_COMPAT
8715 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
8716 wants to fix the assemblers because that causes incompatibility
8717 with gcc. No-one wants to fix gcc because that causes
8718 incompatibility with assemblers... You can use the option of
8719 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
8720 #define SYSV386_COMPAT 1
8721 #endif
8722
8723 const char *
8724 output_387_binary_op (rtx insn, rtx *operands)
8725 {
8726 static char buf[30];
8727 const char *p;
8728 const char *ssep;
8729 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
8730
8731 #ifdef ENABLE_CHECKING
8732 /* Even if we do not want to check the inputs, this documents the input
8733 constraints, which helps in understanding the following code. */
8734 if (STACK_REG_P (operands[0])
8735 && ((REG_P (operands[1])
8736 && REGNO (operands[0]) == REGNO (operands[1])
8737 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
8738 || (REG_P (operands[2])
8739 && REGNO (operands[0]) == REGNO (operands[2])
8740 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
8741 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
8742 ; /* ok */
8743 else
8744 gcc_assert (is_sse);
8745 #endif
8746
8747 switch (GET_CODE (operands[3]))
8748 {
8749 case PLUS:
8750 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
8751 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
8752 p = "fiadd";
8753 else
8754 p = "fadd";
8755 ssep = "add";
8756 break;
8757
8758 case MINUS:
8759 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
8760 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
8761 p = "fisub";
8762 else
8763 p = "fsub";
8764 ssep = "sub";
8765 break;
8766
8767 case MULT:
8768 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
8769 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
8770 p = "fimul";
8771 else
8772 p = "fmul";
8773 ssep = "mul";
8774 break;
8775
8776 case DIV:
8777 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
8778 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
8779 p = "fidiv";
8780 else
8781 p = "fdiv";
8782 ssep = "div";
8783 break;
8784
8785 default:
8786 gcc_unreachable ();
8787 }
8788
8789 if (is_sse)
8790 {
8791 strcpy (buf, ssep);
8792 if (GET_MODE (operands[0]) == SFmode)
8793 strcat (buf, "ss\t{%2, %0|%0, %2}");
8794 else
8795 strcat (buf, "sd\t{%2, %0|%0, %2}");
8796 return buf;
8797 }
8798 strcpy (buf, p);
8799
8800 switch (GET_CODE (operands[3]))
8801 {
8802 case MULT:
8803 case PLUS:
8804 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
8805 {
8806 rtx temp = operands[2];
8807 operands[2] = operands[1];
8808 operands[1] = temp;
8809 }
8810
8811 /* We know operands[0] == operands[1]. */
8812
8813 if (MEM_P (operands[2]))
8814 {
8815 p = "%z2\t%2";
8816 break;
8817 }
8818
8819 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
8820 {
8821 if (STACK_TOP_P (operands[0]))
8822 /* How is it that we are storing to a dead operand[2]?
8823 Well, presumably operands[1] is dead too. We can't
8824 store the result to st(0) as st(0) gets popped on this
8825 instruction. Instead store to operands[2] (which I
8826 think has to be st(1)). st(1) will be popped later.
8827 gcc <= 2.8.1 didn't have this check and generated
8828 assembly code that the Unixware assembler rejected. */
8829 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
8830 else
8831 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
8832 break;
8833 }
8834
8835 if (STACK_TOP_P (operands[0]))
8836 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
8837 else
8838 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
8839 break;
8840
8841 case MINUS:
8842 case DIV:
8843 if (MEM_P (operands[1]))
8844 {
8845 p = "r%z1\t%1";
8846 break;
8847 }
8848
8849 if (MEM_P (operands[2]))
8850 {
8851 p = "%z2\t%2";
8852 break;
8853 }
8854
8855 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
8856 {
8857 #if SYSV386_COMPAT
8858 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
8859 derived assemblers, confusingly reverse the direction of
8860 the operation for fsub{r} and fdiv{r} when the
8861 destination register is not st(0). The Intel assembler
8862 doesn't have this brain damage. Read !SYSV386_COMPAT to
8863 figure out what the hardware really does. */
8864 if (STACK_TOP_P (operands[0]))
8865 p = "{p\t%0, %2|rp\t%2, %0}";
8866 else
8867 p = "{rp\t%2, %0|p\t%0, %2}";
8868 #else
8869 if (STACK_TOP_P (operands[0]))
8870 /* As above for fmul/fadd, we can't store to st(0). */
8871 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
8872 else
8873 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
8874 #endif
8875 break;
8876 }
8877
8878 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
8879 {
8880 #if SYSV386_COMPAT
8881 if (STACK_TOP_P (operands[0]))
8882 p = "{rp\t%0, %1|p\t%1, %0}";
8883 else
8884 p = "{p\t%1, %0|rp\t%0, %1}";
8885 #else
8886 if (STACK_TOP_P (operands[0]))
8887 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
8888 else
8889 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
8890 #endif
8891 break;
8892 }
8893
8894 if (STACK_TOP_P (operands[0]))
8895 {
8896 if (STACK_TOP_P (operands[1]))
8897 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
8898 else
8899 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
8900 break;
8901 }
8902 else if (STACK_TOP_P (operands[1]))
8903 {
8904 #if SYSV386_COMPAT
8905 p = "{\t%1, %0|r\t%0, %1}";
8906 #else
8907 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
8908 #endif
8909 }
8910 else
8911 {
8912 #if SYSV386_COMPAT
8913 p = "{r\t%2, %0|\t%0, %2}";
8914 #else
8915 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
8916 #endif
8917 }
8918 break;
8919
8920 default:
8921 gcc_unreachable ();
8922 }
8923
8924 strcat (buf, p);
8925 return buf;
8926 }
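/* For reference (editorial, not part of GCC): a few of the templates the
   function above produces, assuming the usual {AT&T|Intel} dialect braces:

     SSE, SFmode PLUS:                         "addss\t{%2, %0|%0, %2}"
     x87 PLUS, result in %st(0), register src: "fadd\t{%y2, %0|%0, %y2}"
     x87 PLUS, operands[2] dies and the
       result is not in %st(0):                "faddp\t{%2, %0|%0, %2}"      */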
8927
8928 /* Return needed mode for entity in optimize_mode_switching pass. */
8929
8930 int
8931 ix86_mode_needed (int entity, rtx insn)
8932 {
8933 enum attr_i387_cw mode;
8934
8935 /* The mode UNINITIALIZED is used to store the control word after a
8936 function call or ASM pattern. The mode ANY specifies that the function
8937 has no requirements on the control word and makes no changes in the
8938 bits we are interested in. */
8939
8940 if (CALL_P (insn)
8941 || (NONJUMP_INSN_P (insn)
8942 && (asm_noperands (PATTERN (insn)) >= 0
8943 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
8944 return I387_CW_UNINITIALIZED;
8945
8946 if (recog_memoized (insn) < 0)
8947 return I387_CW_ANY;
8948
8949 mode = get_attr_i387_cw (insn);
8950
8951 switch (entity)
8952 {
8953 case I387_TRUNC:
8954 if (mode == I387_CW_TRUNC)
8955 return mode;
8956 break;
8957
8958 case I387_FLOOR:
8959 if (mode == I387_CW_FLOOR)
8960 return mode;
8961 break;
8962
8963 case I387_CEIL:
8964 if (mode == I387_CW_CEIL)
8965 return mode;
8966 break;
8967
8968 case I387_MASK_PM:
8969 if (mode == I387_CW_MASK_PM)
8970 return mode;
8971 break;
8972
8973 default:
8974 gcc_unreachable ();
8975 }
8976
8977 return I387_CW_ANY;
8978 }
8979
8980 /* Output code to initialize control word copies used by trunc?f?i and
8981 rounding patterns. MODE selects which control word variant to set up;
8982 the unmodified control word is saved in SLOT_CW_STORED and the modified copy in the stack slot for MODE. */
8983
8984 void
8985 emit_i387_cw_initialization (int mode)
8986 {
8987 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
8988 rtx new_mode;
8989
8990 int slot;
8991
8992 rtx reg = gen_reg_rtx (HImode);
8993
8994 emit_insn (gen_x86_fnstcw_1 (stored_mode));
8995 emit_move_insn (reg, copy_rtx (stored_mode));
8996
8997 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL || optimize_size)
8998 {
8999 switch (mode)
9000 {
9001 case I387_CW_TRUNC:
9002 /* round toward zero (truncate) */
9003 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
9004 slot = SLOT_CW_TRUNC;
9005 break;
9006
9007 case I387_CW_FLOOR:
9008 /* round down toward -oo */
9009 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
9010 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
9011 slot = SLOT_CW_FLOOR;
9012 break;
9013
9014 case I387_CW_CEIL:
9015 /* round up toward +oo */
9016 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
9017 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
9018 slot = SLOT_CW_CEIL;
9019 break;
9020
9021 case I387_CW_MASK_PM:
9022 /* mask precision exception for nearbyint() */
9023 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
9024 slot = SLOT_CW_MASK_PM;
9025 break;
9026
9027 default:
9028 gcc_unreachable ();
9029 }
9030 }
9031 else
9032 {
9033 switch (mode)
9034 {
9035 case I387_CW_TRUNC:
9036 /* round toward zero (truncate) */
9037 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
9038 slot = SLOT_CW_TRUNC;
9039 break;
9040
9041 case I387_CW_FLOOR:
9042 /* round down toward -oo */
9043 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
9044 slot = SLOT_CW_FLOOR;
9045 break;
9046
9047 case I387_CW_CEIL:
9048 /* round up toward +oo */
9049 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
9050 slot = SLOT_CW_CEIL;
9051 break;
9052
9053 case I387_CW_MASK_PM:
9054 /* mask precision exception for nearbyint() */
9055 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
9056 slot = SLOT_CW_MASK_PM;
9057 break;
9058
9059 default:
9060 gcc_unreachable ();
9061 }
9062 }
9063
9064 gcc_assert (slot < MAX_386_STACK_LOCALS);
9065
9066 new_mode = assign_386_stack_local (HImode, slot);
9067 emit_move_insn (new_mode, reg);
9068 }
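/* For reference (editorial, not part of GCC): the x87 control word bits
   manipulated above.  Bits 10-11 form the rounding-control (RC) field and
   bit 5 is the precision-exception mask (PM):

     RC = 00 (0x0000)  round to nearest (default)
     RC = 01 (0x0400)  round down toward -inf    (I387_CW_FLOOR)
     RC = 10 (0x0800)  round up toward +inf      (I387_CW_CEIL)
     RC = 11 (0x0c00)  round toward zero         (I387_CW_TRUNC)
     PM =  1 (0x0020)  mask the precision exception (I387_CW_MASK_PM)

   A minimal C sketch of the same bit manipulation, assuming <stdint.h>:

     uint16_t cw_trunc (uint16_t cw) { return cw | 0x0c00; }
     uint16_t cw_floor (uint16_t cw) { return (cw & ~0x0c00) | 0x0400; }
     uint16_t cw_ceil  (uint16_t cw) { return (cw & ~0x0c00) | 0x0800; }       */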
9069
9070 /* Output code for INSN to convert a float to a signed int. OPERANDS
9071 are the insn operands. The output may be [HSD]Imode and the input
9072 operand may be [SDX]Fmode. */
9073
9074 const char *
9075 output_fix_trunc (rtx insn, rtx *operands, int fisttp)
9076 {
9077 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
9078 int dimode_p = GET_MODE (operands[0]) == DImode;
9079 int round_mode = get_attr_i387_cw (insn);
9080
9081 /* Jump through a hoop or two for DImode, since the hardware has no
9082 non-popping instruction. We used to do this a different way, but
9083 that was somewhat fragile and broke with post-reload splitters. */
9084 if ((dimode_p || fisttp) && !stack_top_dies)
9085 output_asm_insn ("fld\t%y1", operands);
9086
9087 gcc_assert (STACK_TOP_P (operands[1]));
9088 gcc_assert (MEM_P (operands[0]));
9089
9090 if (fisttp)
9091 output_asm_insn ("fisttp%z0\t%0", operands);
9092 else
9093 {
9094 if (round_mode != I387_CW_ANY)
9095 output_asm_insn ("fldcw\t%3", operands);
9096 if (stack_top_dies || dimode_p)
9097 output_asm_insn ("fistp%z0\t%0", operands);
9098 else
9099 output_asm_insn ("fist%z0\t%0", operands);
9100 if (round_mode != I387_CW_ANY)
9101 output_asm_insn ("fldcw\t%2", operands);
9102 }
9103
9104 return "";
9105 }
9106
9107 /* Output code for x87 ffreep insn. The OPNO argument, which may only
9108 have the values zero or one, indicates the ffreep insn's operand
9109 from the OPERANDS array. */
9110
9111 static const char *
9112 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
9113 {
9114 if (TARGET_USE_FFREEP)
9115 #if HAVE_AS_IX86_FFREEP
9116 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
9117 #else
9118 {
9119 static char retval[] = ".word\t0xc_df";
9120 int regno = REGNO (operands[opno]);
9121
9122 gcc_assert (FP_REGNO_P (regno));
9123
9124 retval[9] = '0' + (regno - FIRST_STACK_REG);
9125 return retval;
9126 }
9127 #endif
9128
9129 return opno ? "fstp\t%y1" : "fstp\t%y0";
9130 }
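/* Note (editorial, not part of GCC): the ".word 0xc_df" fallback above
   hand-encodes ffreep for assemblers that do not know the mnemonic.
   ffreep %st(i) is the two-byte opcode DF C0+i; as a little-endian 16-bit
   word that is 0xc0df + (i << 8), which is exactly what patching the '_'
   with the digit i produces (e.g. "0xc1df" for %st(1)).  */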
9131
9132
9133 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
9134 should be used. UNORDERED_P is true when fucom should be used. */
9135
9136 const char *
9137 output_fp_compare (rtx insn, rtx *operands, int eflags_p, int unordered_p)
9138 {
9139 int stack_top_dies;
9140 rtx cmp_op0, cmp_op1;
9141 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
9142
9143 if (eflags_p)
9144 {
9145 cmp_op0 = operands[0];
9146 cmp_op1 = operands[1];
9147 }
9148 else
9149 {
9150 cmp_op0 = operands[1];
9151 cmp_op1 = operands[2];
9152 }
9153
9154 if (is_sse)
9155 {
9156 if (GET_MODE (operands[0]) == SFmode)
9157 if (unordered_p)
9158 return "ucomiss\t{%1, %0|%0, %1}";
9159 else
9160 return "comiss\t{%1, %0|%0, %1}";
9161 else
9162 if (unordered_p)
9163 return "ucomisd\t{%1, %0|%0, %1}";
9164 else
9165 return "comisd\t{%1, %0|%0, %1}";
9166 }
9167
9168 gcc_assert (STACK_TOP_P (cmp_op0));
9169
9170 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
9171
9172 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
9173 {
9174 if (stack_top_dies)
9175 {
9176 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
9177 return output_387_ffreep (operands, 1);
9178 }
9179 else
9180 return "ftst\n\tfnstsw\t%0";
9181 }
9182
9183 if (STACK_REG_P (cmp_op1)
9184 && stack_top_dies
9185 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
9186 && REGNO (cmp_op1) != FIRST_STACK_REG)
9187 {
9188 /* If the top of the 387 stack dies, and the other operand
9189 is also a stack register that dies, then this must be an
9190 `fcompp' float compare. */
9191
9192 if (eflags_p)
9193 {
9194 /* There is no double popping fcomi variant. Fortunately,
9195 eflags is immune from the fstp's cc clobbering. */
9196 if (unordered_p)
9197 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
9198 else
9199 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
9200 return output_387_ffreep (operands, 0);
9201 }
9202 else
9203 {
9204 if (unordered_p)
9205 return "fucompp\n\tfnstsw\t%0";
9206 else
9207 return "fcompp\n\tfnstsw\t%0";
9208 }
9209 }
9210 else
9211 {
9212 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
9213
9214 static const char * const alt[16] =
9215 {
9216 "fcom%z2\t%y2\n\tfnstsw\t%0",
9217 "fcomp%z2\t%y2\n\tfnstsw\t%0",
9218 "fucom%z2\t%y2\n\tfnstsw\t%0",
9219 "fucomp%z2\t%y2\n\tfnstsw\t%0",
9220
9221 "ficom%z2\t%y2\n\tfnstsw\t%0",
9222 "ficomp%z2\t%y2\n\tfnstsw\t%0",
9223 NULL,
9224 NULL,
9225
9226 "fcomi\t{%y1, %0|%0, %y1}",
9227 "fcomip\t{%y1, %0|%0, %y1}",
9228 "fucomi\t{%y1, %0|%0, %y1}",
9229 "fucomip\t{%y1, %0|%0, %y1}",
9230
9231 NULL,
9232 NULL,
9233 NULL,
9234 NULL
9235 };
9236
9237 int mask;
9238 const char *ret;
9239
9240 mask = eflags_p << 3;
9241 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
9242 mask |= unordered_p << 1;
9243 mask |= stack_top_dies;
9244
9245 gcc_assert (mask < 16);
9246 ret = alt[mask];
9247 gcc_assert (ret);
9248
9249 return ret;
9250 }
9251 }
9252
9253 void
9254 ix86_output_addr_vec_elt (FILE *file, int value)
9255 {
9256 const char *directive = ASM_LONG;
9257
9258 #ifdef ASM_QUAD
9259 if (TARGET_64BIT)
9260 directive = ASM_QUAD;
9261 #else
9262 gcc_assert (!TARGET_64BIT);
9263 #endif
9264
9265 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
9266 }
9267
9268 void
9269 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
9270 {
9271 if (TARGET_64BIT)
9272 fprintf (file, "%s%s%d-%s%d\n",
9273 ASM_LONG, LPREFIX, value, LPREFIX, rel);
9274 else if (HAVE_AS_GOTOFF_IN_DATA)
9275 fprintf (file, "%s%s%d@GOTOFF\n", ASM_LONG, LPREFIX, value);
9276 #if TARGET_MACHO
9277 else if (TARGET_MACHO)
9278 {
9279 fprintf (file, "%s%s%d-", ASM_LONG, LPREFIX, value);
9280 machopic_output_function_base_name (file);
9281 fprintf(file, "\n");
9282 }
9283 #endif
9284 else
9285 asm_fprintf (file, "%s%U%s+[.-%s%d]\n",
9286 ASM_LONG, GOT_SYMBOL_NAME, LPREFIX, value);
9287 }
9288 \f
9289 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
9290 for the target. */
9291
9292 void
9293 ix86_expand_clear (rtx dest)
9294 {
9295 rtx tmp;
9296
9297 /* We play register width games, which are only valid after reload. */
9298 gcc_assert (reload_completed);
9299
9300 /* Avoid HImode and its attendant prefix byte. */
9301 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
9302 dest = gen_rtx_REG (SImode, REGNO (dest));
9303
9304 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
9305
9306 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
9307 if (reload_completed && (!TARGET_USE_MOV0 || optimize_size))
9308 {
9309 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, 17));
9310 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
9311 }
9312
9313 emit_insn (tmp);
9314 }
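/* For reference (editorial, not part of GCC): the two idioms this expander
   chooses between.

     movl  $0, %eax     # 5 bytes, leaves the flags alone
     xorl  %eax, %eax   # 2 bytes, clobbers the flags -- hence the CLOBBER
                        # of the CC register added above                      */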
9315
9316 /* X is an unchanging MEM. If it is a constant pool reference, return
9317 the constant pool rtx, else NULL. */
9318
9319 rtx
9320 maybe_get_pool_constant (rtx x)
9321 {
9322 x = ix86_delegitimize_address (XEXP (x, 0));
9323
9324 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
9325 return get_pool_constant (x);
9326
9327 return NULL_RTX;
9328 }
9329
9330 void
9331 ix86_expand_move (enum machine_mode mode, rtx operands[])
9332 {
9333 int strict = (reload_in_progress || reload_completed);
9334 rtx op0, op1;
9335 enum tls_model model;
9336
9337 op0 = operands[0];
9338 op1 = operands[1];
9339
9340 if (GET_CODE (op1) == SYMBOL_REF)
9341 {
9342 model = SYMBOL_REF_TLS_MODEL (op1);
9343 if (model)
9344 {
9345 op1 = legitimize_tls_address (op1, model, true);
9346 op1 = force_operand (op1, op0);
9347 if (op1 == op0)
9348 return;
9349 }
9350 }
9351 else if (GET_CODE (op1) == CONST
9352 && GET_CODE (XEXP (op1, 0)) == PLUS
9353 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
9354 {
9355 model = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (op1, 0), 0));
9356 if (model)
9357 {
9358 rtx addend = XEXP (XEXP (op1, 0), 1);
9359 op1 = legitimize_tls_address (XEXP (XEXP (op1, 0), 0), model, true);
9360 op1 = force_operand (op1, NULL);
9361 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
9362 op0, 1, OPTAB_DIRECT);
9363 if (op1 == op0)
9364 return;
9365 }
9366 }
9367
9368 if (flag_pic && mode == Pmode && symbolic_operand (op1, Pmode))
9369 {
9370 if (TARGET_MACHO && !TARGET_64BIT)
9371 {
9372 #if TARGET_MACHO
9373 if (MACHOPIC_PURE)
9374 {
9375 rtx temp = ((reload_in_progress
9376 || ((op0 && REG_P (op0))
9377 && mode == Pmode))
9378 ? op0 : gen_reg_rtx (Pmode));
9379 op1 = machopic_indirect_data_reference (op1, temp);
9380 op1 = machopic_legitimize_pic_address (op1, mode,
9381 temp == op1 ? 0 : temp);
9382 }
9383 else if (MACHOPIC_INDIRECT)
9384 op1 = machopic_indirect_data_reference (op1, 0);
9385 if (op0 == op1)
9386 return;
9387 #endif
9388 }
9389 else
9390 {
9391 if (MEM_P (op0))
9392 op1 = force_reg (Pmode, op1);
9393 else
9394 op1 = legitimize_address (op1, op1, Pmode);
9395 }
9396 }
9397 else
9398 {
9399 if (MEM_P (op0)
9400 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
9401 || !push_operand (op0, mode))
9402 && MEM_P (op1))
9403 op1 = force_reg (mode, op1);
9404
9405 if (push_operand (op0, mode)
9406 && ! general_no_elim_operand (op1, mode))
9407 op1 = copy_to_mode_reg (mode, op1);
9408
9409 /* Force large constants in 64-bit compilation into a register
9410 to get them CSEed. */
9411 if (TARGET_64BIT && mode == DImode
9412 && immediate_operand (op1, mode)
9413 && !x86_64_zext_immediate_operand (op1, VOIDmode)
9414 && !register_operand (op0, mode)
9415 && optimize && !reload_completed && !reload_in_progress)
9416 op1 = copy_to_mode_reg (mode, op1);
9417
9418 if (FLOAT_MODE_P (mode))
9419 {
9420 /* If we are loading a floating point constant to a register,
9421 force the value to memory now, since we'll get better code
9422 out of the back end. */
9423
9424 if (strict)
9425 ;
9426 else if (GET_CODE (op1) == CONST_DOUBLE)
9427 {
9428 op1 = validize_mem (force_const_mem (mode, op1));
9429 if (!register_operand (op0, mode))
9430 {
9431 rtx temp = gen_reg_rtx (mode);
9432 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
9433 emit_move_insn (op0, temp);
9434 return;
9435 }
9436 }
9437 }
9438 }
9439
9440 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
9441 }
9442
9443 void
9444 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
9445 {
9446 rtx op0 = operands[0], op1 = operands[1];
9447
9448 /* Force constants other than zero into memory. We do not know how
9449 the instructions used to build constants modify the upper 64 bits
9450 of the register; once we have that information we may be able
9451 to handle some of them more efficiently. */
9452 if ((reload_in_progress | reload_completed) == 0
9453 && register_operand (op0, mode)
9454 && CONSTANT_P (op1)
9455 && standard_sse_constant_p (op1) <= 0)
9456 op1 = validize_mem (force_const_mem (mode, op1));
9457
9458 /* Make operand1 a register if it isn't already. */
9459 if (!no_new_pseudos
9460 && !register_operand (op0, mode)
9461 && !register_operand (op1, mode))
9462 {
9463 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
9464 return;
9465 }
9466
9467 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
9468 }
9469
9470 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
9471 straight to ix86_expand_vector_move. */
9472
9473 void
9474 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
9475 {
9476 rtx op0, op1, m;
9477
9478 op0 = operands[0];
9479 op1 = operands[1];
9480
9481 if (MEM_P (op1))
9482 {
9483 /* If we're optimizing for size, movups is the smallest. */
9484 if (optimize_size)
9485 {
9486 op0 = gen_lowpart (V4SFmode, op0);
9487 op1 = gen_lowpart (V4SFmode, op1);
9488 emit_insn (gen_sse_movups (op0, op1));
9489 return;
9490 }
9491
9492 /* ??? If we have typed data, then it would appear that using
9493 movdqu is the only way to get unaligned data loaded with
9494 integer type. */
9495 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
9496 {
9497 op0 = gen_lowpart (V16QImode, op0);
9498 op1 = gen_lowpart (V16QImode, op1);
9499 emit_insn (gen_sse2_movdqu (op0, op1));
9500 return;
9501 }
9502
9503 if (TARGET_SSE2 && mode == V2DFmode)
9504 {
9505 rtx zero;
9506
9507 if (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL)
9508 {
9509 op0 = gen_lowpart (V2DFmode, op0);
9510 op1 = gen_lowpart (V2DFmode, op1);
9511 emit_insn (gen_sse2_movupd (op0, op1));
9512 return;
9513 }
9514
9515 /* When SSE registers are split into halves, we can avoid
9516 writing to the top half twice. */
9517 if (TARGET_SSE_SPLIT_REGS)
9518 {
9519 emit_insn (gen_rtx_CLOBBER (VOIDmode, op0));
9520 zero = op0;
9521 }
9522 else
9523 {
9524 /* ??? Not sure about the best option for the Intel chips.
9525 The following would seem to satisfy; the register is
9526 entirely cleared, breaking the dependency chain. We
9527 then store to the upper half, with a dependency depth
9528 of one. A rumor has it that Intel recommends two movsd
9529 followed by an unpacklpd, but this is unconfirmed. And
9530 given that the dependency depth of the unpacklpd would
9531 still be one, I'm not sure why this would be better. */
9532 zero = CONST0_RTX (V2DFmode);
9533 }
9534
9535 m = adjust_address (op1, DFmode, 0);
9536 emit_insn (gen_sse2_loadlpd (op0, zero, m));
9537 m = adjust_address (op1, DFmode, 8);
9538 emit_insn (gen_sse2_loadhpd (op0, op0, m));
9539 }
9540 else
9541 {
9542 if (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL)
9543 {
9544 op0 = gen_lowpart (V4SFmode, op0);
9545 op1 = gen_lowpart (V4SFmode, op1);
9546 emit_insn (gen_sse_movups (op0, op1));
9547 return;
9548 }
9549
9550 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
9551 emit_move_insn (op0, CONST0_RTX (mode));
9552 else
9553 emit_insn (gen_rtx_CLOBBER (VOIDmode, op0));
9554
9555 if (mode != V4SFmode)
9556 op0 = gen_lowpart (V4SFmode, op0);
9557 m = adjust_address (op1, V2SFmode, 0);
9558 emit_insn (gen_sse_loadlps (op0, op0, m));
9559 m = adjust_address (op1, V2SFmode, 8);
9560 emit_insn (gen_sse_loadhps (op0, op0, m));
9561 }
9562 }
9563 else if (MEM_P (op0))
9564 {
9565 /* If we're optimizing for size, movups is the smallest. */
9566 if (optimize_size)
9567 {
9568 op0 = gen_lowpart (V4SFmode, op0);
9569 op1 = gen_lowpart (V4SFmode, op1);
9570 emit_insn (gen_sse_movups (op0, op1));
9571 return;
9572 }
9573
9574 /* ??? Similar to above, only less clear because of
9575 "typeless stores". */
9576 if (TARGET_SSE2 && !TARGET_SSE_TYPELESS_STORES
9577 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
9578 {
9579 op0 = gen_lowpart (V16QImode, op0);
9580 op1 = gen_lowpart (V16QImode, op1);
9581 emit_insn (gen_sse2_movdqu (op0, op1));
9582 return;
9583 }
9584
9585 if (TARGET_SSE2 && mode == V2DFmode)
9586 {
9587 m = adjust_address (op0, DFmode, 0);
9588 emit_insn (gen_sse2_storelpd (m, op1));
9589 m = adjust_address (op0, DFmode, 8);
9590 emit_insn (gen_sse2_storehpd (m, op1));
9591 }
9592 else
9593 {
9594 if (mode != V4SFmode)
9595 op1 = gen_lowpart (V4SFmode, op1);
9596 m = adjust_address (op0, V2SFmode, 0);
9597 emit_insn (gen_sse_storelps (m, op1));
9598 m = adjust_address (op0, V2SFmode, 8);
9599 emit_insn (gen_sse_storehps (m, op1));
9600 }
9601 }
9602 else
9603 gcc_unreachable ();
9604 }
9605
9606 /* Expand a push in MODE. This is some mode for which we do not support
9607 proper push instructions, at least from the registers that we expect
9608 the value to live in. */
9609
9610 void
9611 ix86_expand_push (enum machine_mode mode, rtx x)
9612 {
9613 rtx tmp;
9614
9615 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
9616 GEN_INT (-GET_MODE_SIZE (mode)),
9617 stack_pointer_rtx, 1, OPTAB_DIRECT);
9618 if (tmp != stack_pointer_rtx)
9619 emit_move_insn (stack_pointer_rtx, tmp);
9620
9621 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
9622 emit_move_insn (tmp, x);
9623 }
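/* For reference (editorial, not part of GCC): on 32-bit x86 the expansion
   above amounts to

     subl  $<size>, %esp
     movl  <x>, (%esp)        # or several moves for a multi-word mode

   instead of a single push, for modes that cannot be pushed directly from
   the registers they are expected to live in.  */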
9624
9625 /* Helper function of ix86_fixup_binary_operands to canonicalize
9626 operand order. Returns true if the operands should be swapped. */
9627
9628 static bool
9629 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
9630 rtx operands[])
9631 {
9632 rtx dst = operands[0];
9633 rtx src1 = operands[1];
9634 rtx src2 = operands[2];
9635
9636 /* If the operation is not commutative, we can't do anything. */
9637 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
9638 return false;
9639
9640 /* Highest priority is that src1 should match dst. */
9641 if (rtx_equal_p (dst, src1))
9642 return false;
9643 if (rtx_equal_p (dst, src2))
9644 return true;
9645
9646 /* Next highest priority is that immediate constants come second. */
9647 if (immediate_operand (src2, mode))
9648 return false;
9649 if (immediate_operand (src1, mode))
9650 return true;
9651
9652 /* Lowest priority is that memory references should come second. */
9653 if (MEM_P (src2))
9654 return false;
9655 if (MEM_P (src1))
9656 return true;
9657
9658 return false;
9659 }
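/* Example (editorial, not part of GCC) of the canonicalization above:
   with dst = (reg:SI 60), src1 = (const_int 5), src2 = (reg:SI 60),
   ix86_swap_binary_operands_p returns true -- src2 matches dst, and after
   the swap the immediate also ends up in the second position, which is
   what ix86_binary_operator_ok expects.  */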
9660
9661
9662 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
9663 destination to use for the operation. If different from the true
9664 destination in operands[0], a copy operation will be required. */
9665
9666 rtx
9667 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
9668 rtx operands[])
9669 {
9670 rtx dst = operands[0];
9671 rtx src1 = operands[1];
9672 rtx src2 = operands[2];
9673
9674 /* Canonicalize operand order. */
9675 if (ix86_swap_binary_operands_p (code, mode, operands))
9676 {
9677 rtx temp = src1;
9678 src1 = src2;
9679 src2 = temp;
9680 }
9681
9682 /* Both source operands cannot be in memory. */
9683 if (MEM_P (src1) && MEM_P (src2))
9684 {
9685 /* Optimization: Only read from memory once. */
9686 if (rtx_equal_p (src1, src2))
9687 {
9688 src2 = force_reg (mode, src2);
9689 src1 = src2;
9690 }
9691 else
9692 src2 = force_reg (mode, src2);
9693 }
9694
9695 /* If the destination is memory, and we do not have matching source
9696 operands, do things in registers. */
9697 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
9698 dst = gen_reg_rtx (mode);
9699
9700 /* Source 1 cannot be a constant. */
9701 if (CONSTANT_P (src1))
9702 src1 = force_reg (mode, src1);
9703
9704 /* Source 1 cannot be a non-matching memory. */
9705 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
9706 src1 = force_reg (mode, src1);
9707
9708 operands[1] = src1;
9709 operands[2] = src2;
9710 return dst;
9711 }
9712
9713 /* Similarly, but assume that the destination has already been
9714 set up properly. */
9715
9716 void
9717 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
9718 enum machine_mode mode, rtx operands[])
9719 {
9720 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
9721 gcc_assert (dst == operands[0]);
9722 }
9723
9724 /* Attempt to expand a binary operator. Make the expansion closer to the
9725 actual machine, than just general_operand, which would allow 3 separate
9726 memory references (one output, two input) in a single insn. */
9727
9728 void
9729 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
9730 rtx operands[])
9731 {
9732 rtx src1, src2, dst, op, clob;
9733
9734 dst = ix86_fixup_binary_operands (code, mode, operands);
9735 src1 = operands[1];
9736 src2 = operands[2];
9737
9738 /* Emit the instruction. */
9739
9740 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
9741 if (reload_in_progress)
9742 {
9743 /* Reload doesn't know about the flags register, and doesn't know that
9744 it doesn't want to clobber it. We can only do this with PLUS. */
9745 gcc_assert (code == PLUS);
9746 emit_insn (op);
9747 }
9748 else
9749 {
9750 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
9751 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
9752 }
9753
9754 /* Fix up the destination if needed. */
9755 if (dst != operands[0])
9756 emit_move_insn (operands[0], dst);
9757 }
9758
9759 /* Return TRUE or FALSE depending on whether the binary operator meets the
9760 appropriate constraints. */
9761
9762 int
9763 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
9764 rtx operands[3])
9765 {
9766 rtx dst = operands[0];
9767 rtx src1 = operands[1];
9768 rtx src2 = operands[2];
9769
9770 /* Both source operands cannot be in memory. */
9771 if (MEM_P (src1) && MEM_P (src2))
9772 return 0;
9773
9774 /* Canonicalize operand order for commutative operators. */
9775 if (ix86_swap_binary_operands_p (code, mode, operands))
9776 {
9777 rtx temp = src1;
9778 src1 = src2;
9779 src2 = temp;
9780 }
9781
9782 /* If the destination is memory, we must have a matching source operand. */
9783 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
9784 return 0;
9785
9786 /* Source 1 cannot be a constant. */
9787 if (CONSTANT_P (src1))
9788 return 0;
9789
9790 /* Source 1 cannot be a non-matching memory. */
9791 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
9792 return 0;
9793
9794 return 1;
9795 }
9796
9797 /* Attempt to expand a unary operator. Make the expansion closer to the
9798 actual machine, than just general_operand, which would allow 2 separate
9799 memory references (one output, one input) in a single insn. */
9800
9801 void
9802 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
9803 rtx operands[])
9804 {
9805 int matching_memory;
9806 rtx src, dst, op, clob;
9807
9808 dst = operands[0];
9809 src = operands[1];
9810
9811 /* If the destination is memory, and we do not have matching source
9812 operands, do things in registers. */
9813 matching_memory = 0;
9814 if (MEM_P (dst))
9815 {
9816 if (rtx_equal_p (dst, src))
9817 matching_memory = 1;
9818 else
9819 dst = gen_reg_rtx (mode);
9820 }
9821
9822 /* When source operand is memory, destination must match. */
9823 if (MEM_P (src) && !matching_memory)
9824 src = force_reg (mode, src);
9825
9826 /* Emit the instruction. */
9827
9828 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
9829 if (reload_in_progress || code == NOT)
9830 {
9831 /* Reload doesn't know about the flags register, and doesn't know that
9832 it doesn't want to clobber it. */
9833 gcc_assert (code == NOT);
9834 emit_insn (op);
9835 }
9836 else
9837 {
9838 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
9839 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
9840 }
9841
9842 /* Fix up the destination if needed. */
9843 if (dst != operands[0])
9844 emit_move_insn (operands[0], dst);
9845 }
9846
9847 /* Return TRUE or FALSE depending on whether the unary operator meets the
9848 appropriate constraints. */
9849
9850 int
9851 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
9852 enum machine_mode mode ATTRIBUTE_UNUSED,
9853 rtx operands[2] ATTRIBUTE_UNUSED)
9854 {
9855 /* If one of the operands is memory, source and destination must match. */
9856 if ((MEM_P (operands[0])
9857 || MEM_P (operands[1]))
9858 && ! rtx_equal_p (operands[0], operands[1]))
9859 return FALSE;
9860 return TRUE;
9861 }
9862
9863 /* A subroutine of ix86_expand_fp_absneg_operator and copysign expanders.
9864 Create a mask for the sign bit in MODE for an SSE register. If VECT is
9865 true, then replicate the mask for all elements of the vector register.
9866 If INVERT is true, then create a mask excluding the sign bit. */
9867
9868 rtx
9869 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
9870 {
9871 enum machine_mode vec_mode;
9872 HOST_WIDE_INT hi, lo;
9873 int shift = 63;
9874 rtvec v;
9875 rtx mask;
9876
9877 /* Find the sign bit, sign extended to 2*HWI. */
9878 if (mode == SFmode)
9879 lo = 0x80000000, hi = lo < 0;
9880 else if (HOST_BITS_PER_WIDE_INT >= 64)
9881 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
9882 else
9883 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
9884
9885 if (invert)
9886 lo = ~lo, hi = ~hi;
9887
9888 /* Force this value into the low part of a fp vector constant. */
9889 mask = immed_double_const (lo, hi, mode == SFmode ? SImode : DImode);
9890 mask = gen_lowpart (mode, mask);
9891
9892 if (mode == SFmode)
9893 {
9894 if (vect)
9895 v = gen_rtvec (4, mask, mask, mask, mask);
9896 else
9897 v = gen_rtvec (4, mask, CONST0_RTX (SFmode),
9898 CONST0_RTX (SFmode), CONST0_RTX (SFmode));
9899 vec_mode = V4SFmode;
9900 }
9901 else
9902 {
9903 if (vect)
9904 v = gen_rtvec (2, mask, mask);
9905 else
9906 v = gen_rtvec (2, mask, CONST0_RTX (DFmode));
9907 vec_mode = V2DFmode;
9908 }
9909
9910 return force_reg (vec_mode, gen_rtx_CONST_VECTOR (vec_mode, v));
9911 }
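/* For reference (editorial, not part of GCC): the masks built above and how
   the absneg expansion below uses them.

     SFmode sign-bit mask:  0x80000000
     DFmode sign-bit mask:  0x8000000000000000

     neg(x) = x XOR mask        (flip the sign bit)
     abs(x) = x AND ~mask       (clear the sign bit; INVERT == true)          */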
9912
9913 /* Generate code for floating point ABS or NEG. */
9914
9915 void
9916 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
9917 rtx operands[])
9918 {
9919 rtx mask, set, use, clob, dst, src;
9920 bool matching_memory;
9921 bool use_sse = false;
9922 bool vector_mode = VECTOR_MODE_P (mode);
9923 enum machine_mode elt_mode = mode;
9924
9925 if (vector_mode)
9926 {
9927 elt_mode = GET_MODE_INNER (mode);
9928 use_sse = true;
9929 }
9930 else if (TARGET_SSE_MATH)
9931 use_sse = SSE_FLOAT_MODE_P (mode);
9932
9933 /* NEG and ABS performed with SSE use bitwise mask operations.
9934 Create the appropriate mask now. */
9935 if (use_sse)
9936 mask = ix86_build_signbit_mask (elt_mode, vector_mode, code == ABS);
9937 else
9938 mask = NULL_RTX;
9939
9940 dst = operands[0];
9941 src = operands[1];
9942
9943 /* If the destination is memory, and we don't have matching source
9944 operands or we're using the x87, do things in registers. */
9945 matching_memory = false;
9946 if (MEM_P (dst))
9947 {
9948 if (use_sse && rtx_equal_p (dst, src))
9949 matching_memory = true;
9950 else
9951 dst = gen_reg_rtx (mode);
9952 }
9953 if (MEM_P (src) && !matching_memory)
9954 src = force_reg (mode, src);
9955
9956 if (vector_mode)
9957 {
9958 set = gen_rtx_fmt_ee (code == NEG ? XOR : AND, mode, src, mask);
9959 set = gen_rtx_SET (VOIDmode, dst, set);
9960 emit_insn (set);
9961 }
9962 else
9963 {
9964 set = gen_rtx_fmt_e (code, mode, src);
9965 set = gen_rtx_SET (VOIDmode, dst, set);
9966 if (mask)
9967 {
9968 use = gen_rtx_USE (VOIDmode, mask);
9969 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
9970 emit_insn (gen_rtx_PARALLEL (VOIDmode,
9971 gen_rtvec (3, set, use, clob)));
9972 }
9973 else
9974 emit_insn (set);
9975 }
9976
9977 if (dst != operands[0])
9978 emit_move_insn (operands[0], dst);
9979 }
9980
9981 /* Expand a copysign operation. Special case operand 0 being a constant. */
9982
9983 void
9984 ix86_expand_copysign (rtx operands[])
9985 {
9986 enum machine_mode mode, vmode;
9987 rtx dest, op0, op1, mask, nmask;
9988
9989 dest = operands[0];
9990 op0 = operands[1];
9991 op1 = operands[2];
9992
9993 mode = GET_MODE (dest);
9994 vmode = mode == SFmode ? V4SFmode : V2DFmode;
9995
9996 if (GET_CODE (op0) == CONST_DOUBLE)
9997 {
9998 rtvec v;
9999
10000 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
10001 op0 = simplify_unary_operation (ABS, mode, op0, mode);
10002
10003 if (op0 == CONST0_RTX (mode))
10004 op0 = CONST0_RTX (vmode);
10005 else
10006 {
10007 if (mode == SFmode)
10008 v = gen_rtvec (4, op0, CONST0_RTX (SFmode),
10009 CONST0_RTX (SFmode), CONST0_RTX (SFmode));
10010 else
10011 v = gen_rtvec (2, op0, CONST0_RTX (DFmode));
10012 op0 = force_reg (vmode, gen_rtx_CONST_VECTOR (vmode, v));
10013 }
10014
10015 mask = ix86_build_signbit_mask (mode, 0, 0);
10016
10017 if (mode == SFmode)
10018 emit_insn (gen_copysignsf3_const (dest, op0, op1, mask));
10019 else
10020 emit_insn (gen_copysigndf3_const (dest, op0, op1, mask));
10021 }
10022 else
10023 {
10024 nmask = ix86_build_signbit_mask (mode, 0, 1);
10025 mask = ix86_build_signbit_mask (mode, 0, 0);
10026
10027 if (mode == SFmode)
10028 emit_insn (gen_copysignsf3_var (dest, NULL, op0, op1, nmask, mask));
10029 else
10030 emit_insn (gen_copysigndf3_var (dest, NULL, op0, op1, nmask, mask));
10031 }
10032 }
10033
10034 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
10035 be a constant, and so has already been expanded into a vector constant. */
10036
10037 void
10038 ix86_split_copysign_const (rtx operands[])
10039 {
10040 enum machine_mode mode, vmode;
10041 rtx dest, op0, op1, mask, x;
10042
10043 dest = operands[0];
10044 op0 = operands[1];
10045 op1 = operands[2];
10046 mask = operands[3];
10047
10048 mode = GET_MODE (dest);
10049 vmode = GET_MODE (mask);
10050
10051 dest = simplify_gen_subreg (vmode, dest, mode, 0);
10052 x = gen_rtx_AND (vmode, dest, mask);
10053 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
10054
10055 if (op0 != CONST0_RTX (vmode))
10056 {
10057 x = gen_rtx_IOR (vmode, dest, op0);
10058 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
10059 }
10060 }
10061
10062 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
10063 so we have to do two masks. */
10064
10065 void
10066 ix86_split_copysign_var (rtx operands[])
10067 {
10068 enum machine_mode mode, vmode;
10069 rtx dest, scratch, op0, op1, mask, nmask, x;
10070
10071 dest = operands[0];
10072 scratch = operands[1];
10073 op0 = operands[2];
10074 op1 = operands[3];
10075 nmask = operands[4];
10076 mask = operands[5];
10077
10078 mode = GET_MODE (dest);
10079 vmode = GET_MODE (mask);
10080
10081 if (rtx_equal_p (op0, op1))
10082 {
10083 /* Shouldn't happen often (it's useless, obviously), but when it does
10084 we'd generate incorrect code if we continue below. */
10085 emit_move_insn (dest, op0);
10086 return;
10087 }
10088
10089 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
10090 {
10091 gcc_assert (REGNO (op1) == REGNO (scratch));
10092
10093 x = gen_rtx_AND (vmode, scratch, mask);
10094 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
10095
10096 dest = mask;
10097 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
10098 x = gen_rtx_NOT (vmode, dest);
10099 x = gen_rtx_AND (vmode, x, op0);
10100 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
10101 }
10102 else
10103 {
10104 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
10105 {
10106 x = gen_rtx_AND (vmode, scratch, mask);
10107 }
10108 else /* alternative 2,4 */
10109 {
10110 gcc_assert (REGNO (mask) == REGNO (scratch));
10111 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
10112 x = gen_rtx_AND (vmode, scratch, op1);
10113 }
10114 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
10115
10116 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
10117 {
10118 dest = simplify_gen_subreg (vmode, op0, mode, 0);
10119 x = gen_rtx_AND (vmode, dest, nmask);
10120 }
10121 else /* alternative 3,4 */
10122 {
10123 gcc_assert (REGNO (nmask) == REGNO (dest));
10124 dest = nmask;
10125 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
10126 x = gen_rtx_AND (vmode, dest, op0);
10127 }
10128 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
10129 }
10130
10131 x = gen_rtx_IOR (vmode, dest, scratch);
10132 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
10133 }
10134
10135 /* Return TRUE or FALSE depending on whether the first SET in INSN
10136 has source and destination with matching CC modes, and whether the
10137 CC mode is at least as constrained as REQ_MODE. */
10138
10139 int
10140 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
10141 {
10142 rtx set;
10143 enum machine_mode set_mode;
10144
10145 set = PATTERN (insn);
10146 if (GET_CODE (set) == PARALLEL)
10147 set = XVECEXP (set, 0, 0);
10148 gcc_assert (GET_CODE (set) == SET);
10149 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
10150
10151 set_mode = GET_MODE (SET_DEST (set));
10152 switch (set_mode)
10153 {
10154 case CCNOmode:
10155 if (req_mode != CCNOmode
10156 && (req_mode != CCmode
10157 || XEXP (SET_SRC (set), 1) != const0_rtx))
10158 return 0;
10159 break;
10160 case CCmode:
10161 if (req_mode == CCGCmode)
10162 return 0;
10163 /* FALLTHRU */
10164 case CCGCmode:
10165 if (req_mode == CCGOCmode || req_mode == CCNOmode)
10166 return 0;
10167 /* FALLTHRU */
10168 case CCGOCmode:
10169 if (req_mode == CCZmode)
10170 return 0;
10171 /* FALLTHRU */
10172 case CCZmode:
10173 break;
10174
10175 default:
10176 gcc_unreachable ();
10177 }
10178
10179 return (GET_MODE (SET_SRC (set)) == set_mode);
10180 }
10181
10182 /* Generate insn patterns to do an integer compare of OPERANDS. */
10183
10184 static rtx
10185 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
10186 {
10187 enum machine_mode cmpmode;
10188 rtx tmp, flags;
10189
10190 cmpmode = SELECT_CC_MODE (code, op0, op1);
10191 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
10192
10193 /* This is very simple, but making the interface the same as in the
10194 FP case makes the rest of the code easier. */
10195 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
10196 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
10197
10198 /* Return the test that should be put into the flags user, i.e.
10199 the bcc, scc, or cmov instruction. */
10200 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
10201 }
10202
10203 /* Figure out whether to use ordered or unordered fp comparisons.
10204 Return the appropriate mode to use. */
10205
10206 enum machine_mode
10207 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
10208 {
10209 /* ??? In order to make all comparisons reversible, we do all comparisons
10210 non-trapping when compiling for IEEE. Once gcc is able to distinguish
10211 all trapping and nontrapping forms of comparisons, we can make inequality
10212 comparisons trapping again, since it results in better code when using
10213 FCOM based compares. */
10214 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
10215 }
10216
10217 enum machine_mode
10218 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
10219 {
10220 if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
10221 return ix86_fp_compare_mode (code);
10222 switch (code)
10223 {
10224 /* Only zero flag is needed. */
10225 case EQ: /* ZF=0 */
10226 case NE: /* ZF!=0 */
10227 return CCZmode;
10228 /* Codes needing carry flag. */
10229 case GEU: /* CF=0 */
10230 case GTU: /* CF=0 & ZF=0 */
10231 case LTU: /* CF=1 */
10232 case LEU: /* CF=1 | ZF=1 */
10233 return CCmode;
10234 /* Codes possibly doable only with sign flag when
10235 comparing against zero. */
10236 case GE: /* SF=OF or SF=0 */
10237 case LT: /* SF<>OF or SF=1 */
10238 if (op1 == const0_rtx)
10239 return CCGOCmode;
10240 else
10241 /* For other cases the carry flag is not required. */
10242 return CCGCmode;
10243 /* Codes doable only with the sign flag when comparing
10244 against zero, but for which we lack a jump instruction,
10245 so we need to use relational tests against overflow,
10246 which thus needs to be zero. */
10247 case GT: /* ZF=0 & SF=OF */
10248 case LE: /* ZF=1 | SF<>OF */
10249 if (op1 == const0_rtx)
10250 return CCNOmode;
10251 else
10252 return CCGCmode;
10253 /* The strcmp pattern does (use flags), and combine may ask us for the
10254 proper mode. */
10255 case USE:
10256 return CCmode;
10257 default:
10258 gcc_unreachable ();
10259 }
10260 }
10261
10262 /* Return the fixed registers used for condition codes. */
10263
10264 static bool
10265 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
10266 {
10267 *p1 = FLAGS_REG;
10268 *p2 = FPSR_REG;
10269 return true;
10270 }
10271
10272 /* If two condition code modes are compatible, return a condition code
10273 mode which is compatible with both. Otherwise, return
10274 VOIDmode. */
10275
10276 static enum machine_mode
10277 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
10278 {
10279 if (m1 == m2)
10280 return m1;
10281
10282 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
10283 return VOIDmode;
10284
10285 if ((m1 == CCGCmode && m2 == CCGOCmode)
10286 || (m1 == CCGOCmode && m2 == CCGCmode))
10287 return CCGCmode;
10288
10289 switch (m1)
10290 {
10291 default:
10292 gcc_unreachable ();
10293
10294 case CCmode:
10295 case CCGCmode:
10296 case CCGOCmode:
10297 case CCNOmode:
10298 case CCZmode:
10299 switch (m2)
10300 {
10301 default:
10302 return VOIDmode;
10303
10304 case CCmode:
10305 case CCGCmode:
10306 case CCGOCmode:
10307 case CCNOmode:
10308 case CCZmode:
10309 return CCmode;
10310 }
10311
10312 case CCFPmode:
10313 case CCFPUmode:
10314 /* These are only compatible with themselves, which we already
10315 checked above. */
10316 return VOIDmode;
10317 }
10318 }
10319
10320 /* Return true if we should use an FCOMI instruction for this fp comparison. */
10321
10322 int
10323 ix86_use_fcomi_compare (enum rtx_code code ATTRIBUTE_UNUSED)
10324 {
10325 enum rtx_code swapped_code = swap_condition (code);
10326 return ((ix86_fp_comparison_cost (code) == ix86_fp_comparison_fcomi_cost (code))
10327 || (ix86_fp_comparison_cost (swapped_code)
10328 == ix86_fp_comparison_fcomi_cost (swapped_code)));
10329 }
10330
10331 /* Swap, force into registers, or otherwise massage the two operands
10332 to a fp comparison. The operands are updated in place; the new
10333 comparison code is returned. */
10334
10335 static enum rtx_code
10336 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
10337 {
10338 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
10339 rtx op0 = *pop0, op1 = *pop1;
10340 enum machine_mode op_mode = GET_MODE (op0);
10341 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
10342
10343 /* All of the unordered compare instructions only work on registers.
10344 The same is true of the fcomi compare instructions. The XFmode
10345 compare instructions require registers except when comparing
10346 against zero or when converting operand 1 from fixed point to
10347 floating point. */
10348
10349 if (!is_sse
10350 && (fpcmp_mode == CCFPUmode
10351 || (op_mode == XFmode
10352 && ! (standard_80387_constant_p (op0) == 1
10353 || standard_80387_constant_p (op1) == 1)
10354 && GET_CODE (op1) != FLOAT)
10355 || ix86_use_fcomi_compare (code)))
10356 {
10357 op0 = force_reg (op_mode, op0);
10358 op1 = force_reg (op_mode, op1);
10359 }
10360 else
10361 {
10362 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
10363 things around if they appear profitable, otherwise force op0
10364 into a register. */
10365
10366 if (standard_80387_constant_p (op0) == 0
10367 || (MEM_P (op0)
10368 && ! (standard_80387_constant_p (op1) == 0
10369 || MEM_P (op1))))
10370 {
10371 rtx tmp;
10372 tmp = op0, op0 = op1, op1 = tmp;
10373 code = swap_condition (code);
10374 }
10375
10376 if (!REG_P (op0))
10377 op0 = force_reg (op_mode, op0);
10378
10379 if (CONSTANT_P (op1))
10380 {
10381 int tmp = standard_80387_constant_p (op1);
10382 if (tmp == 0)
10383 op1 = validize_mem (force_const_mem (op_mode, op1));
10384 else if (tmp == 1)
10385 {
10386 if (TARGET_CMOVE)
10387 op1 = force_reg (op_mode, op1);
10388 }
10389 else
10390 op1 = force_reg (op_mode, op1);
10391 }
10392 }
10393
10394 /* Try to rearrange the comparison to make it cheaper. */
10395 if (ix86_fp_comparison_cost (code)
10396 > ix86_fp_comparison_cost (swap_condition (code))
10397 && (REG_P (op1) || !no_new_pseudos))
10398 {
10399 rtx tmp;
10400 tmp = op0, op0 = op1, op1 = tmp;
10401 code = swap_condition (code);
10402 if (!REG_P (op0))
10403 op0 = force_reg (op_mode, op0);
10404 }
10405
10406 *pop0 = op0;
10407 *pop1 = op1;
10408 return code;
10409 }
10410
10411 /* Convert a comparison code we use to represent an FP comparison to the
10412 integer code that will result in a proper branch. Return UNKNOWN if no
10413 such code is available. */
10414
10415 enum rtx_code
10416 ix86_fp_compare_code_to_integer (enum rtx_code code)
10417 {
10418 switch (code)
10419 {
10420 case GT:
10421 return GTU;
10422 case GE:
10423 return GEU;
10424 case ORDERED:
10425 case UNORDERED:
10426 return code;
10427 break;
10428 case UNEQ:
10429 return EQ;
10430 break;
10431 case UNLT:
10432 return LTU;
10433 break;
10434 case UNLE:
10435 return LEU;
10436 break;
10437 case LTGT:
10438 return NE;
10439 break;
10440 default:
10441 return UNKNOWN;
10442 }
10443 }
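/* For reference (editorial, not part of GCC): after fcomi, or fnstsw+sahf,
   the x87 condition bits C0 and C3 land in CF and ZF exactly as an unsigned
   integer comparison would set them (see the flag table in
   ix86_fp_comparison_codes below), which is why GT maps to GTU, GE to GEU,
   UNLT to LTU and so on above.  */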
10444
10445 /* Split comparison code CODE into comparisons we can do using branch
10446 instructions. BYPASS_CODE is the comparison code for a branch that will
10447 branch around FIRST_CODE and SECOND_CODE. If one of the branches
10448 is not required, its value is set to UNKNOWN.
10449 We never require more than two branches. */
10450
10451 void
10452 ix86_fp_comparison_codes (enum rtx_code code, enum rtx_code *bypass_code,
10453 enum rtx_code *first_code,
10454 enum rtx_code *second_code)
10455 {
10456 *first_code = code;
10457 *bypass_code = UNKNOWN;
10458 *second_code = UNKNOWN;
10459
10460 /* The fcomi comparison sets flags as follows:
10461
10462 cmp ZF PF CF
10463 > 0 0 0
10464 < 0 0 1
10465 = 1 0 0
10466 un 1 1 1 */
10467
10468 switch (code)
10469 {
10470 case GT: /* GTU - CF=0 & ZF=0 */
10471 case GE: /* GEU - CF=0 */
10472 case ORDERED: /* PF=0 */
10473 case UNORDERED: /* PF=1 */
10474 case UNEQ: /* EQ - ZF=1 */
10475 case UNLT: /* LTU - CF=1 */
10476 case UNLE: /* LEU - CF=1 | ZF=1 */
10477 case LTGT: /* EQ - ZF=0 */
10478 break;
10479 case LT: /* LTU - CF=1 - fails on unordered */
10480 *first_code = UNLT;
10481 *bypass_code = UNORDERED;
10482 break;
10483 case LE: /* LEU - CF=1 | ZF=1 - fails on unordered */
10484 *first_code = UNLE;
10485 *bypass_code = UNORDERED;
10486 break;
10487 case EQ: /* EQ - ZF=1 - fails on unordered */
10488 *first_code = UNEQ;
10489 *bypass_code = UNORDERED;
10490 break;
10491 case NE: /* NE - ZF=0 - fails on unordered */
10492 *first_code = LTGT;
10493 *second_code = UNORDERED;
10494 break;
10495 case UNGE: /* GEU - CF=0 - fails on unordered */
10496 *first_code = GE;
10497 *second_code = UNORDERED;
10498 break;
10499 case UNGT: /* GTU - CF=0 & ZF=0 - fails on unordered */
10500 *first_code = GT;
10501 *second_code = UNORDERED;
10502 break;
10503 default:
10504 gcc_unreachable ();
10505 }
10506 if (!TARGET_IEEE_FP)
10507 {
10508 *second_code = UNKNOWN;
10509 *bypass_code = UNKNOWN;
10510 }
10511 }
10512
10513 /* Return the cost of a comparison done with fcom + arithmetic operations on AX.
10514 All of the following functions use the number of instructions as a cost metric.
10515 In the future this should be tweaked to compute bytes for optimize_size and
10516 take into account the performance of various instructions on various CPUs. */
10517 static int
10518 ix86_fp_comparison_arithmetics_cost (enum rtx_code code)
10519 {
10520 if (!TARGET_IEEE_FP)
10521 return 4;
10522 /* The cost of code output by ix86_expand_fp_compare. */
10523 switch (code)
10524 {
10525 case UNLE:
10526 case UNLT:
10527 case LTGT:
10528 case GT:
10529 case GE:
10530 case UNORDERED:
10531 case ORDERED:
10532 case UNEQ:
10533 return 4;
10534 break;
10535 case LT:
10536 case NE:
10537 case EQ:
10538 case UNGE:
10539 return 5;
10540 break;
10541 case LE:
10542 case UNGT:
10543 return 6;
10544 break;
10545 default:
10546 gcc_unreachable ();
10547 }
10548 }
10549
10550 /* Return cost of comparison done using fcomi operation.
10551 See ix86_fp_comparison_arithmetics_cost for the metrics. */
10552 static int
10553 ix86_fp_comparison_fcomi_cost (enum rtx_code code)
10554 {
10555 enum rtx_code bypass_code, first_code, second_code;
10556 /* Return an arbitrarily high cost when the instruction is not supported - this
10557 prevents gcc from using it. */
10558 if (!TARGET_CMOVE)
10559 return 1024;
10560 ix86_fp_comparison_codes (code, &bypass_code, &first_code, &second_code);
10561 return (bypass_code != UNKNOWN || second_code != UNKNOWN) + 2;
10562 }
10563
10564 /* Return cost of comparison done using sahf operation.
10565 See ix86_fp_comparison_arithmetics_cost for the metrics. */
10566 static int
10567 ix86_fp_comparison_sahf_cost (enum rtx_code code)
10568 {
10569 enum rtx_code bypass_code, first_code, second_code;
10570 /* Return an arbitrarily high cost when the instruction is not preferred - this
10571 keeps gcc from using it. */
10572 if (!TARGET_USE_SAHF && !optimize_size)
10573 return 1024;
10574 ix86_fp_comparison_codes (code, &bypass_code, &first_code, &second_code);
10575 return (bypass_code != UNKNOWN || second_code != UNKNOWN) + 3;
10576 }
10577
10578 /* Compute cost of the comparison done using any method.
10579 See ix86_fp_comparison_arithmetics_cost for the metrics. */
10580 static int
10581 ix86_fp_comparison_cost (enum rtx_code code)
10582 {
10583 int fcomi_cost, sahf_cost, arithmetics_cost = 1024;
10584 int min;
10585
10586 fcomi_cost = ix86_fp_comparison_fcomi_cost (code);
10587 sahf_cost = ix86_fp_comparison_sahf_cost (code);
10588
10589 min = arithmetics_cost = ix86_fp_comparison_arithmetics_cost (code);
10590 if (min > sahf_cost)
10591 min = sahf_cost;
10592 if (min > fcomi_cost)
10593 min = fcomi_cost;
10594 return min;
10595 }
10596
10597 /* Generate insn patterns to do a floating point compare of OPERANDS. */
10598
10599 static rtx
10600 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch,
10601 rtx *second_test, rtx *bypass_test)
10602 {
10603 enum machine_mode fpcmp_mode, intcmp_mode;
10604 rtx tmp, tmp2;
10605 int cost = ix86_fp_comparison_cost (code);
10606 enum rtx_code bypass_code, first_code, second_code;
10607
10608 fpcmp_mode = ix86_fp_compare_mode (code);
10609 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
10610
10611 if (second_test)
10612 *second_test = NULL_RTX;
10613 if (bypass_test)
10614 *bypass_test = NULL_RTX;
10615
10616 ix86_fp_comparison_codes (code, &bypass_code, &first_code, &second_code);
10617
10618 /* Do fcomi/sahf based test when profitable. */
10619 if ((bypass_code == UNKNOWN || bypass_test)
10620 && (second_code == UNKNOWN || second_test)
10621 && ix86_fp_comparison_arithmetics_cost (code) > cost)
10622 {
10623 if (TARGET_CMOVE)
10624 {
10625 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
10626 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
10627 tmp);
10628 emit_insn (tmp);
10629 }
10630 else
10631 {
10632 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
10633 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
10634 if (!scratch)
10635 scratch = gen_reg_rtx (HImode);
10636 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
10637 emit_insn (gen_x86_sahf_1 (scratch));
10638 }
10639
10640 /* The FP codes work out to act like unsigned. */
10641 intcmp_mode = fpcmp_mode;
10642 code = first_code;
10643 if (bypass_code != UNKNOWN)
10644 *bypass_test = gen_rtx_fmt_ee (bypass_code, VOIDmode,
10645 gen_rtx_REG (intcmp_mode, FLAGS_REG),
10646 const0_rtx);
10647 if (second_code != UNKNOWN)
10648 *second_test = gen_rtx_fmt_ee (second_code, VOIDmode,
10649 gen_rtx_REG (intcmp_mode, FLAGS_REG),
10650 const0_rtx);
10651 }
10652 else
10653 {
10654 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
10655 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
10656 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
10657 if (!scratch)
10658 scratch = gen_reg_rtx (HImode);
10659 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
10660
10661 /* In the unordered case, we have to check C2 for NaN's, which
10662 doesn't happen to work out to anything nice combination-wise.
10663 So do some bit twiddling on the value we've got in AH to come
10664 up with an appropriate set of condition codes. */
10665
10666 intcmp_mode = CCNOmode;
10667 switch (code)
10668 {
10669 case GT:
10670 case UNGT:
10671 if (code == GT || !TARGET_IEEE_FP)
10672 {
10673 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
10674 code = EQ;
10675 }
10676 else
10677 {
10678 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
10679 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
10680 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
10681 intcmp_mode = CCmode;
10682 code = GEU;
10683 }
10684 break;
10685 case LT:
10686 case UNLT:
10687 if (code == LT && TARGET_IEEE_FP)
10688 {
10689 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
10690 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x01)));
10691 intcmp_mode = CCmode;
10692 code = EQ;
10693 }
10694 else
10695 {
10696 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x01)));
10697 code = NE;
10698 }
10699 break;
10700 case GE:
10701 case UNGE:
10702 if (code == GE || !TARGET_IEEE_FP)
10703 {
10704 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
10705 code = EQ;
10706 }
10707 else
10708 {
10709 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
10710 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
10711 GEN_INT (0x01)));
10712 code = NE;
10713 }
10714 break;
10715 case LE:
10716 case UNLE:
10717 if (code == LE && TARGET_IEEE_FP)
10718 {
10719 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
10720 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
10721 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
10722 intcmp_mode = CCmode;
10723 code = LTU;
10724 }
10725 else
10726 {
10727 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
10728 code = NE;
10729 }
10730 break;
10731 case EQ:
10732 case UNEQ:
10733 if (code == EQ && TARGET_IEEE_FP)
10734 {
10735 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
10736 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
10737 intcmp_mode = CCmode;
10738 code = EQ;
10739 }
10740 else
10741 {
10742 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
10743 code = NE;
10745 }
10746 break;
10747 case NE:
10748 case LTGT:
10749 if (code == NE && TARGET_IEEE_FP)
10750 {
10751 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
10752 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
10753 GEN_INT (0x40)));
10754 code = NE;
10755 }
10756 else
10757 {
10758 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
10759 code = EQ;
10760 }
10761 break;
10762
10763 case UNORDERED:
10764 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
10765 code = NE;
10766 break;
10767 case ORDERED:
10768 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
10769 code = EQ;
10770 break;
10771
10772 default:
10773 gcc_unreachable ();
10774 }
10775 }
10776
10777 /* Return the test that should be put into the flags user, i.e.
10778 the bcc, scc, or cmov instruction. */
10779 return gen_rtx_fmt_ee (code, VOIDmode,
10780 gen_rtx_REG (intcmp_mode, FLAGS_REG),
10781 const0_rtx);
10782 }
10783
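/* Expand a comparison of ix86_compare_op0 with ix86_compare_op1 using CODE,
   emitting any compare insns that are needed, and return the test to put
   into the flags user.  For floating point comparisons, *SECOND_TEST and
   *BYPASS_TEST (when non-null) receive auxiliary tests the caller must also
   emit, or NULL_RTX.  */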
10784 rtx
10785 ix86_expand_compare (enum rtx_code code, rtx *second_test, rtx *bypass_test)
10786 {
10787 rtx op0, op1, ret;
10788 op0 = ix86_compare_op0;
10789 op1 = ix86_compare_op1;
10790
10791 if (second_test)
10792 *second_test = NULL_RTX;
10793 if (bypass_test)
10794 *bypass_test = NULL_RTX;
10795
10796 if (ix86_compare_emitted)
10797 {
10798 ret = gen_rtx_fmt_ee (code, VOIDmode, ix86_compare_emitted, const0_rtx);
10799 ix86_compare_emitted = NULL_RTX;
10800 }
10801 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
10802 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX,
10803 second_test, bypass_test);
10804 else
10805 ret = ix86_expand_int_compare (code, op0, op1);
10806
10807 return ret;
10808 }
10809
10810 /* Return true if CODE will result in a nontrivial jump sequence.  */
10811 bool
10812 ix86_fp_jump_nontrivial_p (enum rtx_code code)
10813 {
10814 enum rtx_code bypass_code, first_code, second_code;
10815 if (!TARGET_CMOVE)
10816 return true;
10817 ix86_fp_comparison_codes (code, &bypass_code, &first_code, &second_code);
10818 return bypass_code != UNKNOWN || second_code != UNKNOWN;
10819 }
10820
10821 void
10822 ix86_expand_branch (enum rtx_code code, rtx label)
10823 {
10824 rtx tmp;
10825
10826 /* If we have emitted a compare insn, go straight to simple.
10827 ix86_expand_compare won't emit anything if ix86_compare_emitted
10828 is non-NULL.  */
10829 if (ix86_compare_emitted)
10830 goto simple;
10831
10832 switch (GET_MODE (ix86_compare_op0))
10833 {
10834 case QImode:
10835 case HImode:
10836 case SImode:
10837 simple:
10838 tmp = ix86_expand_compare (code, NULL, NULL);
10839 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
10840 gen_rtx_LABEL_REF (VOIDmode, label),
10841 pc_rtx);
10842 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
10843 return;
10844
10845 case SFmode:
10846 case DFmode:
10847 case XFmode:
10848 {
10849 rtvec vec;
10850 int use_fcomi;
10851 enum rtx_code bypass_code, first_code, second_code;
10852
10853 code = ix86_prepare_fp_compare_args (code, &ix86_compare_op0,
10854 &ix86_compare_op1);
10855
10856 ix86_fp_comparison_codes (code, &bypass_code, &first_code, &second_code);
10857
10858 /* Check whether we will use the natural sequence with one jump.  If
10859 so, we can expand the jump early.  Otherwise, delay expansion by
10860 creating a compound insn so as not to confuse the optimizers.  */
10861 if (bypass_code == UNKNOWN && second_code == UNKNOWN
10862 && TARGET_CMOVE)
10863 {
10864 ix86_split_fp_branch (code, ix86_compare_op0, ix86_compare_op1,
10865 gen_rtx_LABEL_REF (VOIDmode, label),
10866 pc_rtx, NULL_RTX, NULL_RTX);
10867 }
10868 else
10869 {
10870 tmp = gen_rtx_fmt_ee (code, VOIDmode,
10871 ix86_compare_op0, ix86_compare_op1);
10872 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
10873 gen_rtx_LABEL_REF (VOIDmode, label),
10874 pc_rtx);
10875 tmp = gen_rtx_SET (VOIDmode, pc_rtx, tmp);
10876
10877 use_fcomi = ix86_use_fcomi_compare (code);
10878 vec = rtvec_alloc (3 + !use_fcomi);
10879 RTVEC_ELT (vec, 0) = tmp;
10880 RTVEC_ELT (vec, 1)
10881 = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCFPmode, 18));
10882 RTVEC_ELT (vec, 2)
10883 = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCFPmode, 17));
10884 if (! use_fcomi)
10885 RTVEC_ELT (vec, 3)
10886 = gen_rtx_CLOBBER (VOIDmode, gen_rtx_SCRATCH (HImode));
10887
10888 emit_jump_insn (gen_rtx_PARALLEL (VOIDmode, vec));
10889 }
10890 return;
10891 }
10892
10893 case DImode:
10894 if (TARGET_64BIT)
10895 goto simple;
10896 case TImode:
10897 /* Expand DImode/TImode branch into multiple compare+branch.  */
10898 {
10899 rtx lo[2], hi[2], label2;
10900 enum rtx_code code1, code2, code3;
10901 enum machine_mode submode;
10902
10903 if (CONSTANT_P (ix86_compare_op0) && ! CONSTANT_P (ix86_compare_op1))
10904 {
10905 tmp = ix86_compare_op0;
10906 ix86_compare_op0 = ix86_compare_op1;
10907 ix86_compare_op1 = tmp;
10908 code = swap_condition (code);
10909 }
10910 if (GET_MODE (ix86_compare_op0) == DImode)
10911 {
10912 split_di (&ix86_compare_op0, 1, lo+0, hi+0);
10913 split_di (&ix86_compare_op1, 1, lo+1, hi+1);
10914 submode = SImode;
10915 }
10916 else
10917 {
10918 split_ti (&ix86_compare_op0, 1, lo+0, hi+0);
10919 split_ti (&ix86_compare_op1, 1, lo+1, hi+1);
10920 submode = DImode;
10921 }
10922
10923 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
10924 avoid two branches. This costs one extra insn, so disable when
10925 optimizing for size. */
10926
10927 if ((code == EQ || code == NE)
10928 && (!optimize_size
10929 || hi[1] == const0_rtx || lo[1] == const0_rtx))
10930 {
10931 rtx xor0, xor1;
10932
10933 xor1 = hi[0];
10934 if (hi[1] != const0_rtx)
10935 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
10936 NULL_RTX, 0, OPTAB_WIDEN);
10937
10938 xor0 = lo[0];
10939 if (lo[1] != const0_rtx)
10940 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
10941 NULL_RTX, 0, OPTAB_WIDEN);
10942
10943 tmp = expand_binop (submode, ior_optab, xor1, xor0,
10944 NULL_RTX, 0, OPTAB_WIDEN);
10945
10946 ix86_compare_op0 = tmp;
10947 ix86_compare_op1 = const0_rtx;
10948 ix86_expand_branch (code, label);
10949 return;
10950 }
10951
10952 /* Otherwise, if we are doing a less-than or greater-or-equal comparison,
10953 op1 is a constant, and its low word is zero, then we can just
10954 examine the high word.  */
10955
10956 if (CONST_INT_P (hi[1]) && lo[1] == const0_rtx)
10957 switch (code)
10958 {
10959 case LT: case LTU: case GE: case GEU:
10960 ix86_compare_op0 = hi[0];
10961 ix86_compare_op1 = hi[1];
10962 ix86_expand_branch (code, label);
10963 return;
10964 default:
10965 break;
10966 }
10967
10968 /* Otherwise, we need two or three jumps. */
10969
10970 label2 = gen_label_rtx ();
10971
10972 code1 = code;
10973 code2 = swap_condition (code);
10974 code3 = unsigned_condition (code);
10975
10976 switch (code)
10977 {
10978 case LT: case GT: case LTU: case GTU:
10979 break;
10980
10981 case LE: code1 = LT; code2 = GT; break;
10982 case GE: code1 = GT; code2 = LT; break;
10983 case LEU: code1 = LTU; code2 = GTU; break;
10984 case GEU: code1 = GTU; code2 = LTU; break;
10985
10986 case EQ: code1 = UNKNOWN; code2 = NE; break;
10987 case NE: code2 = UNKNOWN; break;
10988
10989 default:
10990 gcc_unreachable ();
10991 }
10992
10993 /*
10994 * a < b =>
10995 * if (hi(a) < hi(b)) goto true;
10996 * if (hi(a) > hi(b)) goto false;
10997 * if (lo(a) < lo(b)) goto true;
10998 * false:
10999 */
11000
11001 ix86_compare_op0 = hi[0];
11002 ix86_compare_op1 = hi[1];
11003
11004 if (code1 != UNKNOWN)
11005 ix86_expand_branch (code1, label);
11006 if (code2 != UNKNOWN)
11007 ix86_expand_branch (code2, label2);
11008
11009 ix86_compare_op0 = lo[0];
11010 ix86_compare_op1 = lo[1];
11011 ix86_expand_branch (code3, label);
11012
11013 if (code2 != UNKNOWN)
11014 emit_label (label2);
11015 return;
11016 }
11017
11018 default:
11019 gcc_unreachable ();
11020 }
11021 }
11022
11023 /* Split branch based on floating point condition. */
11024 void
11025 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
11026 rtx target1, rtx target2, rtx tmp, rtx pushed)
11027 {
11028 rtx second, bypass;
11029 rtx label = NULL_RTX;
11030 rtx condition;
11031 int bypass_probability = -1, second_probability = -1, probability = -1;
11032 rtx i;
11033
11034 if (target2 != pc_rtx)
11035 {
11036 rtx tmp = target2;
11037 code = reverse_condition_maybe_unordered (code);
11038 target2 = target1;
11039 target1 = tmp;
11040 }
11041
11042 condition = ix86_expand_fp_compare (code, op1, op2,
11043 tmp, &second, &bypass);
11044
11045 /* Remove pushed operand from stack. */
11046 if (pushed)
11047 ix86_free_from_memory (GET_MODE (pushed));
11048
11049 if (split_branch_probability >= 0)
11050 {
11051 /* Distribute the probabilities across the jumps.
11052 Assume that BYPASS and SECOND always test
11053 for UNORDERED.  */
11054 probability = split_branch_probability;
11055
11056 /* A value of 1 is low enough that the probability does not need
11057 to be updated.  Later we may run some experiments and see
11058 whether unordered values are more frequent in practice.  */
11059 if (bypass)
11060 bypass_probability = 1;
11061 if (second)
11062 second_probability = 1;
11063 }
11064 if (bypass != NULL_RTX)
11065 {
11066 label = gen_label_rtx ();
11067 i = emit_jump_insn (gen_rtx_SET
11068 (VOIDmode, pc_rtx,
11069 gen_rtx_IF_THEN_ELSE (VOIDmode,
11070 bypass,
11071 gen_rtx_LABEL_REF (VOIDmode,
11072 label),
11073 pc_rtx)));
11074 if (bypass_probability >= 0)
11075 REG_NOTES (i)
11076 = gen_rtx_EXPR_LIST (REG_BR_PROB,
11077 GEN_INT (bypass_probability),
11078 REG_NOTES (i));
11079 }
11080 i = emit_jump_insn (gen_rtx_SET
11081 (VOIDmode, pc_rtx,
11082 gen_rtx_IF_THEN_ELSE (VOIDmode,
11083 condition, target1, target2)));
11084 if (probability >= 0)
11085 REG_NOTES (i)
11086 = gen_rtx_EXPR_LIST (REG_BR_PROB,
11087 GEN_INT (probability),
11088 REG_NOTES (i));
11089 if (second != NULL_RTX)
11090 {
11091 i = emit_jump_insn (gen_rtx_SET
11092 (VOIDmode, pc_rtx,
11093 gen_rtx_IF_THEN_ELSE (VOIDmode, second, target1,
11094 target2)));
11095 if (second_probability >= 0)
11096 REG_NOTES (i)
11097 = gen_rtx_EXPR_LIST (REG_BR_PROB,
11098 GEN_INT (second_probability),
11099 REG_NOTES (i));
11100 }
11101 if (label != NULL_RTX)
11102 emit_label (label);
11103 }
11104
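/* Expand a store-flag operation: set the QImode register DEST to the result
   of comparing ix86_compare_op0 with ix86_compare_op1 using CODE.  Return 1
   on success, or 0 if the caller should FAIL the pattern (double-word
   compares).  */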
11105 int
11106 ix86_expand_setcc (enum rtx_code code, rtx dest)
11107 {
11108 rtx ret, tmp, tmpreg, equiv;
11109 rtx second_test, bypass_test;
11110
11111 if (GET_MODE (ix86_compare_op0) == (TARGET_64BIT ? TImode : DImode))
11112 return 0; /* FAIL */
11113
11114 gcc_assert (GET_MODE (dest) == QImode);
11115
11116 ret = ix86_expand_compare (code, &second_test, &bypass_test);
11117 PUT_MODE (ret, QImode);
11118
11119 tmp = dest;
11120 tmpreg = dest;
11121
11122 emit_insn (gen_rtx_SET (VOIDmode, tmp, ret));
11123 if (bypass_test || second_test)
11124 {
11125 rtx test = second_test;
11126 int bypass = 0;
11127 rtx tmp2 = gen_reg_rtx (QImode);
11128 if (bypass_test)
11129 {
11130 gcc_assert (!second_test);
11131 test = bypass_test;
11132 bypass = 1;
11133 PUT_CODE (test, reverse_condition_maybe_unordered (GET_CODE (test)));
11134 }
11135 PUT_MODE (test, QImode);
11136 emit_insn (gen_rtx_SET (VOIDmode, tmp2, test));
11137
11138 if (bypass)
11139 emit_insn (gen_andqi3 (tmp, tmpreg, tmp2));
11140 else
11141 emit_insn (gen_iorqi3 (tmp, tmpreg, tmp2));
11142 }
11143
11144 /* Attach a REG_EQUAL note describing the comparison result. */
11145 if (ix86_compare_op0 && ix86_compare_op1)
11146 {
11147 equiv = simplify_gen_relational (code, QImode,
11148 GET_MODE (ix86_compare_op0),
11149 ix86_compare_op0, ix86_compare_op1);
11150 set_unique_reg_note (get_last_insn (), REG_EQUAL, equiv);
11151 }
11152
11153 return 1; /* DONE */
11154 }
11155
11156 /* Expand a comparison that sets or clears the carry flag.  Return true
11157 when successful, and set *POP to the resulting comparison operation.  */
11158 static bool
11159 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
11160 {
11161 enum machine_mode mode =
11162 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
11163
11164 /* Do not handle double-word (DImode/TImode) compares, which go through
11165 a special path.  FP compares are handled separately below.  */
11166 if (mode == (TARGET_64BIT ? TImode : DImode))
11167 return false;
11168 if (FLOAT_MODE_P (mode))
11169 {
11170 rtx second_test = NULL, bypass_test = NULL;
11171 rtx compare_op, compare_seq;
11172
11173 /* Shortcut: the following common codes never translate into carry-flag compares.  */
11174 if (code == EQ || code == NE || code == UNEQ || code == LTGT
11175 || code == ORDERED || code == UNORDERED)
11176 return false;
11177
11178 /* These comparisons require zero flag; swap operands so they won't. */
11179 if ((code == GT || code == UNLE || code == LE || code == UNGT)
11180 && !TARGET_IEEE_FP)
11181 {
11182 rtx tmp = op0;
11183 op0 = op1;
11184 op1 = tmp;
11185 code = swap_condition (code);
11186 }
11187
11188 /* Try to expand the comparison and verify that we end up with a carry-flag
11189 based comparison.  This fails to be true only when we decide to expand the
11190 comparison using arithmetic, which is not a common scenario.  */
11191 start_sequence ();
11192 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX,
11193 &second_test, &bypass_test);
11194 compare_seq = get_insns ();
11195 end_sequence ();
11196
11197 if (second_test || bypass_test)
11198 return false;
11199 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
11200 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
11201 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
11202 else
11203 code = GET_CODE (compare_op);
11204 if (code != LTU && code != GEU)
11205 return false;
11206 emit_insn (compare_seq);
11207 *pop = compare_op;
11208 return true;
11209 }
11210 if (!INTEGRAL_MODE_P (mode))
11211 return false;
11212 switch (code)
11213 {
11214 case LTU:
11215 case GEU:
11216 break;
11217
11218 /* Convert a==0 into (unsigned)a<1. */
11219 case EQ:
11220 case NE:
11221 if (op1 != const0_rtx)
11222 return false;
11223 op1 = const1_rtx;
11224 code = (code == EQ ? LTU : GEU);
11225 break;
11226
11227 /* Convert a>b into b<a or a>=b+1.  */
11228 case GTU:
11229 case LEU:
11230 if (CONST_INT_P (op1))
11231 {
11232 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
11233 /* Bail out on overflow. We still can swap operands but that
11234 would force loading of the constant into register. */
11235 if (op1 == const0_rtx
11236 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
11237 return false;
11238 code = (code == GTU ? GEU : LTU);
11239 }
11240 else
11241 {
11242 rtx tmp = op1;
11243 op1 = op0;
11244 op0 = tmp;
11245 code = (code == GTU ? LTU : GEU);
11246 }
11247 break;
11248
11249 /* Convert a>=0 into (unsigned)a<0x80000000. */
11250 case LT:
11251 case GE:
11252 if (mode == DImode || op1 != const0_rtx)
11253 return false;
11254 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
11255 code = (code == LT ? GEU : LTU);
11256 break;
11257 case LE:
11258 case GT:
11259 if (mode == DImode || op1 != constm1_rtx)
11260 return false;
11261 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
11262 code = (code == LE ? GEU : LTU);
11263 break;
11264
11265 default:
11266 return false;
11267 }
11268 /* Swapping operands may cause constant to appear as first operand. */
11269 if (!nonimmediate_operand (op0, VOIDmode))
11270 {
11271 if (no_new_pseudos)
11272 return false;
11273 op0 = force_reg (mode, op0);
11274 }
11275 ix86_compare_op0 = op0;
11276 ix86_compare_op1 = op1;
11277 *pop = ix86_expand_compare (code, NULL, NULL);
11278 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
11279 return true;
11280 }
11281
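/* Expand an integer conditional move, operands[0] = operands[1]
   ? operands[2] : operands[3], where operands[1] is a comparison of
   ix86_compare_op0 with ix86_compare_op1.  Return 1 when an insn sequence
   was emitted, or 0 if the caller should FAIL the pattern.  */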
11282 int
11283 ix86_expand_int_movcc (rtx operands[])
11284 {
11285 enum rtx_code code = GET_CODE (operands[1]), compare_code;
11286 rtx compare_seq, compare_op;
11287 rtx second_test, bypass_test;
11288 enum machine_mode mode = GET_MODE (operands[0]);
11289 bool sign_bit_compare_p = false;
11290
11291 start_sequence ();
11292 compare_op = ix86_expand_compare (code, &second_test, &bypass_test);
11293 compare_seq = get_insns ();
11294 end_sequence ();
11295
11296 compare_code = GET_CODE (compare_op);
11297
11298 if ((ix86_compare_op1 == const0_rtx && (code == GE || code == LT))
11299 || (ix86_compare_op1 == constm1_rtx && (code == GT || code == LE)))
11300 sign_bit_compare_p = true;
11301
11302 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
11303 HImode insns, we'd be swallowed in word prefix ops. */
11304
11305 if ((mode != HImode || TARGET_FAST_PREFIX)
11306 && (mode != (TARGET_64BIT ? TImode : DImode))
11307 && CONST_INT_P (operands[2])
11308 && CONST_INT_P (operands[3]))
11309 {
11310 rtx out = operands[0];
11311 HOST_WIDE_INT ct = INTVAL (operands[2]);
11312 HOST_WIDE_INT cf = INTVAL (operands[3]);
11313 HOST_WIDE_INT diff;
11314
11315 diff = ct - cf;
11316 /* Sign-bit compares are better done using shifts than by using
11317 sbb.  */
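/* The sbb idiom below relies on cmp setting the carry flag exactly when
   op0 <u op1; sbb of a register with itself then yields 0 or -1, and that
   all-zeros/all-ones mask is massaged into the two constants.  */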
11318 if (sign_bit_compare_p
11319 || ix86_expand_carry_flag_compare (code, ix86_compare_op0,
11320 ix86_compare_op1, &compare_op))
11321 {
11322 /* Detect overlap between destination and compare sources. */
11323 rtx tmp = out;
11324
11325 if (!sign_bit_compare_p)
11326 {
11327 bool fpcmp = false;
11328
11329 compare_code = GET_CODE (compare_op);
11330
11331 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
11332 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
11333 {
11334 fpcmp = true;
11335 compare_code = ix86_fp_compare_code_to_integer (compare_code);
11336 }
11337
11338 /* To simplify rest of code, restrict to the GEU case. */
11339 if (compare_code == LTU)
11340 {
11341 HOST_WIDE_INT tmp = ct;
11342 ct = cf;
11343 cf = tmp;
11344 compare_code = reverse_condition (compare_code);
11345 code = reverse_condition (code);
11346 }
11347 else
11348 {
11349 if (fpcmp)
11350 PUT_CODE (compare_op,
11351 reverse_condition_maybe_unordered
11352 (GET_CODE (compare_op)));
11353 else
11354 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
11355 }
11356 diff = ct - cf;
11357
11358 if (reg_overlap_mentioned_p (out, ix86_compare_op0)
11359 || reg_overlap_mentioned_p (out, ix86_compare_op1))
11360 tmp = gen_reg_rtx (mode);
11361
11362 if (mode == DImode)
11363 emit_insn (gen_x86_movdicc_0_m1_rex64 (tmp, compare_op));
11364 else
11365 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp), compare_op));
11366 }
11367 else
11368 {
11369 if (code == GT || code == GE)
11370 code = reverse_condition (code);
11371 else
11372 {
11373 HOST_WIDE_INT tmp = ct;
11374 ct = cf;
11375 cf = tmp;
11376 diff = ct - cf;
11377 }
11378 tmp = emit_store_flag (tmp, code, ix86_compare_op0,
11379 ix86_compare_op1, VOIDmode, 0, -1);
11380 }
11381
11382 if (diff == 1)
11383 {
11384 /*
11385 * cmpl op0,op1
11386 * sbbl dest,dest
11387 * [addl dest, ct]
11388 *
11389 * Size 5 - 8.
11390 */
11391 if (ct)
11392 tmp = expand_simple_binop (mode, PLUS,
11393 tmp, GEN_INT (ct),
11394 copy_rtx (tmp), 1, OPTAB_DIRECT);
11395 }
11396 else if (cf == -1)
11397 {
11398 /*
11399 * cmpl op0,op1
11400 * sbbl dest,dest
11401 * orl $ct, dest
11402 *
11403 * Size 8.
11404 */
11405 tmp = expand_simple_binop (mode, IOR,
11406 tmp, GEN_INT (ct),
11407 copy_rtx (tmp), 1, OPTAB_DIRECT);
11408 }
11409 else if (diff == -1 && ct)
11410 {
11411 /*
11412 * cmpl op0,op1
11413 * sbbl dest,dest
11414 * notl dest
11415 * [addl dest, cf]
11416 *
11417 * Size 8 - 11.
11418 */
11419 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
11420 if (cf)
11421 tmp = expand_simple_binop (mode, PLUS,
11422 copy_rtx (tmp), GEN_INT (cf),
11423 copy_rtx (tmp), 1, OPTAB_DIRECT);
11424 }
11425 else
11426 {
11427 /*
11428 * cmpl op0,op1
11429 * sbbl dest,dest
11430 * [notl dest]
11431 * andl cf - ct, dest
11432 * [addl dest, ct]
11433 *
11434 * Size 8 - 11.
11435 */
11436
11437 if (cf == 0)
11438 {
11439 cf = ct;
11440 ct = 0;
11441 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
11442 }
11443
11444 tmp = expand_simple_binop (mode, AND,
11445 copy_rtx (tmp),
11446 gen_int_mode (cf - ct, mode),
11447 copy_rtx (tmp), 1, OPTAB_DIRECT);
11448 if (ct)
11449 tmp = expand_simple_binop (mode, PLUS,
11450 copy_rtx (tmp), GEN_INT (ct),
11451 copy_rtx (tmp), 1, OPTAB_DIRECT);
11452 }
11453
11454 if (!rtx_equal_p (tmp, out))
11455 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
11456
11457 return 1; /* DONE */
11458 }
11459
11460 if (diff < 0)
11461 {
11462 HOST_WIDE_INT tmp;
11463 tmp = ct, ct = cf, cf = tmp;
11464 diff = -diff;
11465 if (FLOAT_MODE_P (GET_MODE (ix86_compare_op0)))
11466 {
11467 /* We may be reversing an unordered compare to a normal compare, which
11468 is not valid in general (we may convert a non-trapping condition
11469 into a trapping one); however, on i386 we currently emit all
11470 comparisons unordered.  */
11471 compare_code = reverse_condition_maybe_unordered (compare_code);
11472 code = reverse_condition_maybe_unordered (code);
11473 }
11474 else
11475 {
11476 compare_code = reverse_condition (compare_code);
11477 code = reverse_condition (code);
11478 }
11479 }
11480
11481 compare_code = UNKNOWN;
11482 if (GET_MODE_CLASS (GET_MODE (ix86_compare_op0)) == MODE_INT
11483 && CONST_INT_P (ix86_compare_op1))
11484 {
11485 if (ix86_compare_op1 == const0_rtx
11486 && (code == LT || code == GE))
11487 compare_code = code;
11488 else if (ix86_compare_op1 == constm1_rtx)
11489 {
11490 if (code == LE)
11491 compare_code = LT;
11492 else if (code == GT)
11493 compare_code = GE;
11494 }
11495 }
11496
11497 /* Optimize dest = (op0 < 0) ? -1 : cf. */
11498 if (compare_code != UNKNOWN
11499 && GET_MODE (ix86_compare_op0) == GET_MODE (out)
11500 && (cf == -1 || ct == -1))
11501 {
11502 /* If lea code below could be used, only optimize
11503 if it results in a 2 insn sequence. */
11504
11505 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
11506 || diff == 3 || diff == 5 || diff == 9)
11507 || (compare_code == LT && ct == -1)
11508 || (compare_code == GE && cf == -1))
11509 {
11510 /*
11511 * notl op1 (if necessary)
11512 * sarl $31, op1
11513 * orl cf, op1
11514 */
11515 if (ct != -1)
11516 {
11517 cf = ct;
11518 ct = -1;
11519 code = reverse_condition (code);
11520 }
11521
11522 out = emit_store_flag (out, code, ix86_compare_op0,
11523 ix86_compare_op1, VOIDmode, 0, -1);
11524
11525 out = expand_simple_binop (mode, IOR,
11526 out, GEN_INT (cf),
11527 out, 1, OPTAB_DIRECT);
11528 if (out != operands[0])
11529 emit_move_insn (operands[0], out);
11530
11531 return 1; /* DONE */
11532 }
11533 }
11534
11535
11536 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
11537 || diff == 3 || diff == 5 || diff == 9)
11538 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
11539 && (mode != DImode
11540 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
11541 {
11542 /*
11543 * xorl dest,dest
11544 * cmpl op1,op2
11545 * setcc dest
11546 * lea cf(dest*(ct-cf)),dest
11547 *
11548 * Size 14.
11549 *
11550 * This also catches the degenerate setcc-only case.
11551 */
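/* Because diff is restricted to 1, 2, 4, 8 (a pure scale) or 3, 5, 9 (base
   plus scale), dest * diff + cf fits a single lea address computation.  */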
11552
11553 rtx tmp;
11554 int nops;
11555
11556 out = emit_store_flag (out, code, ix86_compare_op0,
11557 ix86_compare_op1, VOIDmode, 0, 1);
11558
11559 nops = 0;
11560 /* On x86_64 the lea instruction operates on Pmode, so we need
11561 the arithmetic done in the proper mode to match.  */
11562 if (diff == 1)
11563 tmp = copy_rtx (out);
11564 else
11565 {
11566 rtx out1;
11567 out1 = copy_rtx (out);
11568 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
11569 nops++;
11570 if (diff & 1)
11571 {
11572 tmp = gen_rtx_PLUS (mode, tmp, out1);
11573 nops++;
11574 }
11575 }
11576 if (cf != 0)
11577 {
11578 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
11579 nops++;
11580 }
11581 if (!rtx_equal_p (tmp, out))
11582 {
11583 if (nops == 1)
11584 out = force_operand (tmp, copy_rtx (out));
11585 else
11586 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
11587 }
11588 if (!rtx_equal_p (out, operands[0]))
11589 emit_move_insn (operands[0], copy_rtx (out));
11590
11591 return 1; /* DONE */
11592 }
11593
11594 /*
11595 * General case: Jumpful:
11596 * xorl dest,dest cmpl op1, op2
11597 * cmpl op1, op2 movl ct, dest
11598 * setcc dest jcc 1f
11599 * decl dest movl cf, dest
11600 * andl (cf-ct),dest 1:
11601 * addl ct,dest
11602 *
11603 * Size 20. Size 14.
11604 *
11605 * This is reasonably steep, but branch mispredict costs are
11606 * high on modern cpus, so consider failing only if optimizing
11607 * for space.
11608 */
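/* The branchless sequence works because setcc leaves 1 (condition true) or 0
   in dest; decl turns that into 0 or -1, andl then keeps 0 or cf - ct, and
   the final addl ct produces ct or cf respectively.  */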
11609
11610 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
11611 && BRANCH_COST >= 2)
11612 {
11613 if (cf == 0)
11614 {
11615 cf = ct;
11616 ct = 0;
11617 if (FLOAT_MODE_P (GET_MODE (ix86_compare_op0)))
11618 /* We may be reversing an unordered compare to a normal compare,
11619 which is not valid in general (we may convert a non-trapping
11620 condition into a trapping one); however, on i386 we currently
11621 emit all comparisons unordered.  */
11622 code = reverse_condition_maybe_unordered (code);
11623 else
11624 {
11625 code = reverse_condition (code);
11626 if (compare_code != UNKNOWN)
11627 compare_code = reverse_condition (compare_code);
11628 }
11629 }
11630
11631 if (compare_code != UNKNOWN)
11632 {
11633 /* notl op1 (if needed)
11634 sarl $31, op1
11635 andl (cf-ct), op1
11636 addl ct, op1
11637
11638 For x < 0 (resp. x <= -1) there will be no notl,
11639 so if possible swap the constants to get rid of the
11640 complement.
11641 True/false will be -1/0 while code below (store flag
11642 followed by decrement) is 0/-1, so the constants need
11643 to be exchanged once more. */
11644
11645 if (compare_code == GE || !cf)
11646 {
11647 code = reverse_condition (code);
11648 compare_code = LT;
11649 }
11650 else
11651 {
11652 HOST_WIDE_INT tmp = cf;
11653 cf = ct;
11654 ct = tmp;
11655 }
11656
11657 out = emit_store_flag (out, code, ix86_compare_op0,
11658 ix86_compare_op1, VOIDmode, 0, -1);
11659 }
11660 else
11661 {
11662 out = emit_store_flag (out, code, ix86_compare_op0,
11663 ix86_compare_op1, VOIDmode, 0, 1);
11664
11665 out = expand_simple_binop (mode, PLUS, copy_rtx (out), constm1_rtx,
11666 copy_rtx (out), 1, OPTAB_DIRECT);
11667 }
11668
11669 out = expand_simple_binop (mode, AND, copy_rtx (out),
11670 gen_int_mode (cf - ct, mode),
11671 copy_rtx (out), 1, OPTAB_DIRECT);
11672 if (ct)
11673 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
11674 copy_rtx (out), 1, OPTAB_DIRECT);
11675 if (!rtx_equal_p (out, operands[0]))
11676 emit_move_insn (operands[0], copy_rtx (out));
11677
11678 return 1; /* DONE */
11679 }
11680 }
11681
11682 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
11683 {
11684 /* Try a few things more with specific constants and a variable. */
11685
11686 optab op;
11687 rtx var, orig_out, out, tmp;
11688
11689 if (BRANCH_COST <= 2)
11690 return 0; /* FAIL */
11691
11692 /* If one of the two operands is an interesting constant, load the other
11693 constant by recursion and mask the variable in with a logical operation.  */
11694
11695 if (CONST_INT_P (operands[2]))
11696 {
11697 var = operands[3];
11698 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
11699 operands[3] = constm1_rtx, op = and_optab;
11700 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
11701 operands[3] = const0_rtx, op = ior_optab;
11702 else
11703 return 0; /* FAIL */
11704 }
11705 else if (CONST_INT_P (operands[3]))
11706 {
11707 var = operands[2];
11708 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
11709 operands[2] = constm1_rtx, op = and_optab;
11710 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
11711 operands[2] = const0_rtx, op = ior_optab;
11712 else
11713 return 0; /* FAIL */
11714 }
11715 else
11716 return 0; /* FAIL */
11717
11718 orig_out = operands[0];
11719 tmp = gen_reg_rtx (mode);
11720 operands[0] = tmp;
11721
11722 /* Recurse to get the constant loaded. */
11723 if (ix86_expand_int_movcc (operands) == 0)
11724 return 0; /* FAIL */
11725
11726 /* Mask in the interesting variable. */
11727 out = expand_binop (mode, op, var, tmp, orig_out, 0,
11728 OPTAB_WIDEN);
11729 if (!rtx_equal_p (out, orig_out))
11730 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
11731
11732 return 1; /* DONE */
11733 }
11734
11735 /*
11736 * For comparison with above,
11737 *
11738 * movl cf,dest
11739 * movl ct,tmp
11740 * cmpl op1,op2
11741 * cmovcc tmp,dest
11742 *
11743 * Size 15.
11744 */
11745
11746 if (! nonimmediate_operand (operands[2], mode))
11747 operands[2] = force_reg (mode, operands[2]);
11748 if (! nonimmediate_operand (operands[3], mode))
11749 operands[3] = force_reg (mode, operands[3]);
11750
11751 if (bypass_test && reg_overlap_mentioned_p (operands[0], operands[3]))
11752 {
11753 rtx tmp = gen_reg_rtx (mode);
11754 emit_move_insn (tmp, operands[3]);
11755 operands[3] = tmp;
11756 }
11757 if (second_test && reg_overlap_mentioned_p (operands[0], operands[2]))
11758 {
11759 rtx tmp = gen_reg_rtx (mode);
11760 emit_move_insn (tmp, operands[2]);
11761 operands[2] = tmp;
11762 }
11763
11764 if (! register_operand (operands[2], VOIDmode)
11765 && (mode == QImode
11766 || ! register_operand (operands[3], VOIDmode)))
11767 operands[2] = force_reg (mode, operands[2]);
11768
11769 if (mode == QImode
11770 && ! register_operand (operands[3], VOIDmode))
11771 operands[3] = force_reg (mode, operands[3]);
11772
11773 emit_insn (compare_seq);
11774 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
11775 gen_rtx_IF_THEN_ELSE (mode,
11776 compare_op, operands[2],
11777 operands[3])));
11778 if (bypass_test)
11779 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (operands[0]),
11780 gen_rtx_IF_THEN_ELSE (mode,
11781 bypass_test,
11782 copy_rtx (operands[3]),
11783 copy_rtx (operands[0]))));
11784 if (second_test)
11785 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (operands[0]),
11786 gen_rtx_IF_THEN_ELSE (mode,
11787 second_test,
11788 copy_rtx (operands[2]),
11789 copy_rtx (operands[0]))));
11790
11791 return 1; /* DONE */
11792 }
11793
11794 /* Swap, force into registers, or otherwise massage the two operands
11795 to an sse comparison with a mask result. Thus we differ a bit from
11796 ix86_prepare_fp_compare_args which expects to produce a flags result.
11797
11798 The DEST operand exists to help determine whether to commute commutative
11799 operators. The POP0/POP1 operands are updated in place. The new
11800 comparison code is returned, or UNKNOWN if not implementable. */
11801
11802 static enum rtx_code
11803 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
11804 rtx *pop0, rtx *pop1)
11805 {
11806 rtx tmp;
11807
11808 switch (code)
11809 {
11810 case LTGT:
11811 case UNEQ:
11812 /* We have no LTGT as an operator. We could implement it with
11813 NE & ORDERED, but this requires an extra temporary. It's
11814 not clear that it's worth it. */
11815 return UNKNOWN;
11816
11817 case LT:
11818 case LE:
11819 case UNGT:
11820 case UNGE:
11821 /* These are supported directly. */
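/* (LT and LE map onto the lt/le predicates of cmpps/cmppd, and UNGT/UNGE
   onto nle/nlt, so no operand swapping is needed.)  */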
11822 break;
11823
11824 case EQ:
11825 case NE:
11826 case UNORDERED:
11827 case ORDERED:
11828 /* For commutative operators, try to canonicalize the destination
11829 operand to be first in the comparison - this helps reload to
11830 avoid extra moves. */
11831 if (!dest || !rtx_equal_p (dest, *pop1))
11832 break;
11833 /* FALLTHRU */
11834
11835 case GE:
11836 case GT:
11837 case UNLE:
11838 case UNLT:
11839 /* These are not supported directly. Swap the comparison operands
11840 to transform into something that is supported. */
11841 tmp = *pop0;
11842 *pop0 = *pop1;
11843 *pop1 = tmp;
11844 code = swap_condition (code);
11845 break;
11846
11847 default:
11848 gcc_unreachable ();
11849 }
11850
11851 return code;
11852 }
11853
11854 /* Detect conditional moves that exactly match min/max operational
11855 semantics. Note that this is IEEE safe, as long as we don't
11856 interchange the operands.
11857
11858 Returns FALSE if this conditional move doesn't match a MIN/MAX,
11859 and TRUE if the operation is successful and instructions are emitted. */
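/* This relies on the SSE min/max semantics: minps (a, b) returns its second
   operand whenever the comparison is unordered, so it computes exactly
   a < b ? a : b.  That is why only the LT form, and its UNGE mirror with the
   arms swapped, is recognized here.  */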
11860
11861 static bool
11862 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
11863 rtx cmp_op1, rtx if_true, rtx if_false)
11864 {
11865 enum machine_mode mode;
11866 bool is_min;
11867 rtx tmp;
11868
11869 if (code == LT)
11870 ;
11871 else if (code == UNGE)
11872 {
11873 tmp = if_true;
11874 if_true = if_false;
11875 if_false = tmp;
11876 }
11877 else
11878 return false;
11879
11880 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
11881 is_min = true;
11882 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
11883 is_min = false;
11884 else
11885 return false;
11886
11887 mode = GET_MODE (dest);
11888
11889 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
11890 but MODE may be a vector mode and thus not appropriate. */
11891 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
11892 {
11893 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
11894 rtvec v;
11895
11896 if_true = force_reg (mode, if_true);
11897 v = gen_rtvec (2, if_true, if_false);
11898 tmp = gen_rtx_UNSPEC (mode, v, u);
11899 }
11900 else
11901 {
11902 code = is_min ? SMIN : SMAX;
11903 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
11904 }
11905
11906 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
11907 return true;
11908 }
11909
11910 /* Expand an sse vector comparison. Return the register with the result. */
11911
11912 static rtx
11913 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
11914 rtx op_true, rtx op_false)
11915 {
11916 enum machine_mode mode = GET_MODE (dest);
11917 rtx x;
11918
11919 cmp_op0 = force_reg (mode, cmp_op0);
11920 if (!nonimmediate_operand (cmp_op1, mode))
11921 cmp_op1 = force_reg (mode, cmp_op1);
11922
11923 if (optimize
11924 || reg_overlap_mentioned_p (dest, op_true)
11925 || reg_overlap_mentioned_p (dest, op_false))
11926 dest = gen_reg_rtx (mode);
11927
11928 x = gen_rtx_fmt_ee (code, mode, cmp_op0, cmp_op1);
11929 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
11930
11931 return dest;
11932 }
11933
11934 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
11935 operations. This is used for both scalar and vector conditional moves. */
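/* CMP is expected to be an element-wise all-ones/all-zeros mask, as produced
   by ix86_expand_sse_cmp, so the general case computes
   dest = (cmp & op_true) | (~cmp & op_false); the special cases below drop
   the half that is known to be zero.  */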
11936
11937 static void
11938 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
11939 {
11940 enum machine_mode mode = GET_MODE (dest);
11941 rtx t2, t3, x;
11942
11943 if (op_false == CONST0_RTX (mode))
11944 {
11945 op_true = force_reg (mode, op_true);
11946 x = gen_rtx_AND (mode, cmp, op_true);
11947 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
11948 }
11949 else if (op_true == CONST0_RTX (mode))
11950 {
11951 op_false = force_reg (mode, op_false);
11952 x = gen_rtx_NOT (mode, cmp);
11953 x = gen_rtx_AND (mode, x, op_false);
11954 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
11955 }
11956 else
11957 {
11958 op_true = force_reg (mode, op_true);
11959 op_false = force_reg (mode, op_false);
11960
11961 t2 = gen_reg_rtx (mode);
11962 if (optimize)
11963 t3 = gen_reg_rtx (mode);
11964 else
11965 t3 = dest;
11966
11967 x = gen_rtx_AND (mode, op_true, cmp);
11968 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
11969
11970 x = gen_rtx_NOT (mode, cmp);
11971 x = gen_rtx_AND (mode, x, op_false);
11972 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
11973
11974 x = gen_rtx_IOR (mode, t3, t2);
11975 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
11976 }
11977 }
11978
11979 /* Expand a floating-point conditional move. Return true if successful. */
11980
11981 int
11982 ix86_expand_fp_movcc (rtx operands[])
11983 {
11984 enum machine_mode mode = GET_MODE (operands[0]);
11985 enum rtx_code code = GET_CODE (operands[1]);
11986 rtx tmp, compare_op, second_test, bypass_test;
11987
11988 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
11989 {
11990 enum machine_mode cmode;
11991
11992 /* Since we've no cmove for sse registers, don't force bad register
11993 allocation just to gain access to it. Deny movcc when the
11994 comparison mode doesn't match the move mode. */
11995 cmode = GET_MODE (ix86_compare_op0);
11996 if (cmode == VOIDmode)
11997 cmode = GET_MODE (ix86_compare_op1);
11998 if (cmode != mode)
11999 return 0;
12000
12001 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
12002 &ix86_compare_op0,
12003 &ix86_compare_op1);
12004 if (code == UNKNOWN)
12005 return 0;
12006
12007 if (ix86_expand_sse_fp_minmax (operands[0], code, ix86_compare_op0,
12008 ix86_compare_op1, operands[2],
12009 operands[3]))
12010 return 1;
12011
12012 tmp = ix86_expand_sse_cmp (operands[0], code, ix86_compare_op0,
12013 ix86_compare_op1, operands[2], operands[3]);
12014 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
12015 return 1;
12016 }
12017
12018 /* The floating point conditional move instructions don't directly
12019 support conditions resulting from a signed integer comparison. */
12020
12021 compare_op = ix86_expand_compare (code, &second_test, &bypass_test);
12022
12026 if (!fcmov_comparison_operator (compare_op, VOIDmode))
12027 {
12028 gcc_assert (!second_test && !bypass_test);
12029 tmp = gen_reg_rtx (QImode);
12030 ix86_expand_setcc (code, tmp);
12031 code = NE;
12032 ix86_compare_op0 = tmp;
12033 ix86_compare_op1 = const0_rtx;
12034 compare_op = ix86_expand_compare (code, &second_test, &bypass_test);
12035 }
12036 if (bypass_test && reg_overlap_mentioned_p (operands[0], operands[3]))
12037 {
12038 tmp = gen_reg_rtx (mode);
12039 emit_move_insn (tmp, operands[3]);
12040 operands[3] = tmp;
12041 }
12042 if (second_test && reg_overlap_mentioned_p (operands[0], operands[2]))
12043 {
12044 tmp = gen_reg_rtx (mode);
12045 emit_move_insn (tmp, operands[2]);
12046 operands[2] = tmp;
12047 }
12048
12049 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
12050 gen_rtx_IF_THEN_ELSE (mode, compare_op,
12051 operands[2], operands[3])));
12052 if (bypass_test)
12053 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
12054 gen_rtx_IF_THEN_ELSE (mode, bypass_test,
12055 operands[3], operands[0])));
12056 if (second_test)
12057 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
12058 gen_rtx_IF_THEN_ELSE (mode, second_test,
12059 operands[2], operands[0])));
12060
12061 return 1;
12062 }
12063
12064 /* Expand a floating-point vector conditional move; a vcond operation
12065 rather than a movcc operation. */
12066
12067 bool
12068 ix86_expand_fp_vcond (rtx operands[])
12069 {
12070 enum rtx_code code = GET_CODE (operands[3]);
12071 rtx cmp;
12072
12073 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
12074 &operands[4], &operands[5]);
12075 if (code == UNKNOWN)
12076 return false;
12077
12078 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
12079 operands[5], operands[1], operands[2]))
12080 return true;
12081
12082 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
12083 operands[1], operands[2]);
12084 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
12085 return true;
12086 }
12087
12088 /* Expand a signed integral vector conditional move. */
12089
12090 bool
12091 ix86_expand_int_vcond (rtx operands[])
12092 {
12093 enum machine_mode mode = GET_MODE (operands[0]);
12094 enum rtx_code code = GET_CODE (operands[3]);
12095 bool negate = false;
12096 rtx x, cop0, cop1;
12097
12098 cop0 = operands[4];
12099 cop1 = operands[5];
12100
12101 /* Canonicalize the comparison to EQ, GT, GTU. */
12102 switch (code)
12103 {
12104 case EQ:
12105 case GT:
12106 case GTU:
12107 break;
12108
12109 case NE:
12110 case LE:
12111 case LEU:
12112 code = reverse_condition (code);
12113 negate = true;
12114 break;
12115
12116 case GE:
12117 case GEU:
12118 code = reverse_condition (code);
12119 negate = true;
12120 /* FALLTHRU */
12121
12122 case LT:
12123 case LTU:
12124 code = swap_condition (code);
12125 x = cop0, cop0 = cop1, cop1 = x;
12126 break;
12127
12128 default:
12129 gcc_unreachable ();
12130 }
12131
12132 /* Unsigned parallel compare is not supported by the hardware. Play some
12133 tricks to turn this into a signed comparison against 0. */
12134 if (code == GTU)
12135 {
12136 cop0 = force_reg (mode, cop0);
12137
12138 switch (mode)
12139 {
12140 case V4SImode:
12141 {
12142 rtx t1, t2, mask;
12143
12144 /* Perform a parallel modulo subtraction. */
12145 t1 = gen_reg_rtx (mode);
12146 emit_insn (gen_subv4si3 (t1, cop0, cop1));
12147
12148 /* Extract the original sign bit of op0. */
12149 mask = GEN_INT (-0x80000000);
12150 mask = gen_rtx_CONST_VECTOR (mode,
12151 gen_rtvec (4, mask, mask, mask, mask));
12152 mask = force_reg (mode, mask);
12153 t2 = gen_reg_rtx (mode);
12154 emit_insn (gen_andv4si3 (t2, cop0, mask));
12155
12156 /* XOR it back into the result of the subtraction. This results
12157 in the sign bit set iff we saw unsigned underflow. */
12158 x = gen_reg_rtx (mode);
12159 emit_insn (gen_xorv4si3 (x, t1, t2));
12160
12161 code = GT;
12162 }
12163 break;
12164
12165 case V16QImode:
12166 case V8HImode:
12167 /* Perform a parallel unsigned saturating subtraction. */
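/* The saturating difference a -us b is nonzero exactly when a >u b, so
   comparing the result for equality with zero (and flipping NEGATE)
   implements the unsigned greater-than.  */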
12168 x = gen_reg_rtx (mode);
12169 emit_insn (gen_rtx_SET (VOIDmode, x,
12170 gen_rtx_US_MINUS (mode, cop0, cop1)));
12171
12172 code = EQ;
12173 negate = !negate;
12174 break;
12175
12176 default:
12177 gcc_unreachable ();
12178 }
12179
12180 cop0 = x;
12181 cop1 = CONST0_RTX (mode);
12182 }
12183
12184 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
12185 operands[1+negate], operands[2-negate]);
12186
12187 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
12188 operands[2-negate]);
12189 return true;
12190 }
12191
12192 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
12193 true if we should do zero extension, else sign extension. HIGH_P is
12194 true if we want the N/2 high elements, else the low elements. */
12195
12196 void
12197 ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p)
12198 {
12199 enum machine_mode imode = GET_MODE (operands[1]);
12200 rtx (*unpack)(rtx, rtx, rtx);
12201 rtx se, dest;
12202
12203 switch (imode)
12204 {
12205 case V16QImode:
12206 if (high_p)
12207 unpack = gen_vec_interleave_highv16qi;
12208 else
12209 unpack = gen_vec_interleave_lowv16qi;
12210 break;
12211 case V8HImode:
12212 if (high_p)
12213 unpack = gen_vec_interleave_highv8hi;
12214 else
12215 unpack = gen_vec_interleave_lowv8hi;
12216 break;
12217 case V4SImode:
12218 if (high_p)
12219 unpack = gen_vec_interleave_highv4si;
12220 else
12221 unpack = gen_vec_interleave_lowv4si;
12222 break;
12223 default:
12224 gcc_unreachable ();
12225 }
12226
12227 dest = gen_lowpart (imode, operands[0]);
12228
12229 if (unsigned_p)
12230 se = force_reg (imode, CONST0_RTX (imode));
12231 else
12232 se = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
12233 operands[1], pc_rtx, pc_rtx);
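/* For sign extension SE is a mask that is all ones for each negative element
   of operands[1] (0 > x), so interleaving with it replicates the sign bits
   into the widened halves; for zero extension the interleaved half is simply
   zero.  */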
12234
12235 emit_insn (unpack (dest, operands[1], se));
12236 }
12237
12238 /* Expand conditional increment or decrement using adc/sbb instructions.
12239 The default case using setcc followed by the conditional move can be
12240 done by generic code. */
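/* The condition is first rewritten by ix86_expand_carry_flag_compare as an
   LTU/GEU test of the carry flag; the +-1 is then folded into an adc or sbb
   against a 0 or -1 immediate, so no setcc or jump is needed.  */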
12241 int
12242 ix86_expand_int_addcc (rtx operands[])
12243 {
12244 enum rtx_code code = GET_CODE (operands[1]);
12245 rtx compare_op;
12246 rtx val = const0_rtx;
12247 bool fpcmp = false;
12248 enum machine_mode mode = GET_MODE (operands[0]);
12249
12250 if (operands[3] != const1_rtx
12251 && operands[3] != constm1_rtx)
12252 return 0;
12253 if (!ix86_expand_carry_flag_compare (code, ix86_compare_op0,
12254 ix86_compare_op1, &compare_op))
12255 return 0;
12256 code = GET_CODE (compare_op);
12257
12258 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
12259 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
12260 {
12261 fpcmp = true;
12262 code = ix86_fp_compare_code_to_integer (code);
12263 }
12264
12265 if (code != LTU)
12266 {
12267 val = constm1_rtx;
12268 if (fpcmp)
12269 PUT_CODE (compare_op,
12270 reverse_condition_maybe_unordered
12271 (GET_CODE (compare_op)));
12272 else
12273 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
12274 }
12275 PUT_MODE (compare_op, mode);
12276
12277 /* Construct either adc or sbb insn. */
12278 if ((code == LTU) == (operands[3] == constm1_rtx))
12279 {
12280 switch (GET_MODE (operands[0]))
12281 {
12282 case QImode:
12283 emit_insn (gen_subqi3_carry (operands[0], operands[2], val, compare_op));
12284 break;
12285 case HImode:
12286 emit_insn (gen_subhi3_carry (operands[0], operands[2], val, compare_op));
12287 break;
12288 case SImode:
12289 emit_insn (gen_subsi3_carry (operands[0], operands[2], val, compare_op));
12290 break;
12291 case DImode:
12292 emit_insn (gen_subdi3_carry_rex64 (operands[0], operands[2], val, compare_op));
12293 break;
12294 default:
12295 gcc_unreachable ();
12296 }
12297 }
12298 else
12299 {
12300 switch (GET_MODE (operands[0]))
12301 {
12302 case QImode:
12303 emit_insn (gen_addqi3_carry (operands[0], operands[2], val, compare_op));
12304 break;
12305 case HImode:
12306 emit_insn (gen_addhi3_carry (operands[0], operands[2], val, compare_op));
12307 break;
12308 case SImode:
12309 emit_insn (gen_addsi3_carry (operands[0], operands[2], val, compare_op));
12310 break;
12311 case DImode:
12312 emit_insn (gen_adddi3_carry_rex64 (operands[0], operands[2], val, compare_op));
12313 break;
12314 default:
12315 gcc_unreachable ();
12316 }
12317 }
12318 return 1; /* DONE */
12319 }
12320
12321
12322 /* Split OPERAND into word-sized parts stored in PARTS.  Similar to split_di,
12323 but works for floating point parameters and non-offsettable memories.
12324 For pushes, it returns just stack offsets; the values will be saved
12325 in the right order.  At most three parts are generated.  */
12326
12327 static int
12328 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
12329 {
12330 int size;
12331
12332 if (!TARGET_64BIT)
12333 size = mode == XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
12334 else
12335 size = (GET_MODE_SIZE (mode) + 4) / 8;
12336
12337 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
12338 gcc_assert (size >= 2 && size <= 3);
12339
12340 /* Optimize constant pool references to immediates.  This is used by fp
12341 moves, which force all constants to memory to allow combining.  */
12342 if (MEM_P (operand) && MEM_READONLY_P (operand))
12343 {
12344 rtx tmp = maybe_get_pool_constant (operand);
12345 if (tmp)
12346 operand = tmp;
12347 }
12348
12349 if (MEM_P (operand) && !offsettable_memref_p (operand))
12350 {
12351 /* The only non-offsettable memories we handle are pushes.  */
12352 int ok = push_operand (operand, VOIDmode);
12353
12354 gcc_assert (ok);
12355
12356 operand = copy_rtx (operand);
12357 PUT_MODE (operand, Pmode);
12358 parts[0] = parts[1] = parts[2] = operand;
12359 return size;
12360 }
12361
12362 if (GET_CODE (operand) == CONST_VECTOR)
12363 {
12364 enum machine_mode imode = int_mode_for_mode (mode);
12365 /* Caution: if we looked through a constant pool memory above,
12366 the operand may actually have a different mode now. That's
12367 ok, since we want to pun this all the way back to an integer. */
12368 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
12369 gcc_assert (operand != NULL);
12370 mode = imode;
12371 }
12372
12373 if (!TARGET_64BIT)
12374 {
12375 if (mode == DImode)
12376 split_di (&operand, 1, &parts[0], &parts[1]);
12377 else
12378 {
12379 if (REG_P (operand))
12380 {
12381 gcc_assert (reload_completed);
12382 parts[0] = gen_rtx_REG (SImode, REGNO (operand) + 0);
12383 parts[1] = gen_rtx_REG (SImode, REGNO (operand) + 1);
12384 if (size == 3)
12385 parts[2] = gen_rtx_REG (SImode, REGNO (operand) + 2);
12386 }
12387 else if (offsettable_memref_p (operand))
12388 {
12389 operand = adjust_address (operand, SImode, 0);
12390 parts[0] = operand;
12391 parts[1] = adjust_address (operand, SImode, 4);
12392 if (size == 3)
12393 parts[2] = adjust_address (operand, SImode, 8);
12394 }
12395 else if (GET_CODE (operand) == CONST_DOUBLE)
12396 {
12397 REAL_VALUE_TYPE r;
12398 long l[4];
12399
12400 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
12401 switch (mode)
12402 {
12403 case XFmode:
12404 REAL_VALUE_TO_TARGET_LONG_DOUBLE (r, l);
12405 parts[2] = gen_int_mode (l[2], SImode);
12406 break;
12407 case DFmode:
12408 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
12409 break;
12410 default:
12411 gcc_unreachable ();
12412 }
12413 parts[1] = gen_int_mode (l[1], SImode);
12414 parts[0] = gen_int_mode (l[0], SImode);
12415 }
12416 else
12417 gcc_unreachable ();
12418 }
12419 }
12420 else
12421 {
12422 if (mode == TImode)
12423 split_ti (&operand, 1, &parts[0], &parts[1]);
12424 if (mode == XFmode || mode == TFmode)
12425 {
12426 enum machine_mode upper_mode = mode == XFmode ? SImode : DImode;
12427 if (REG_P (operand))
12428 {
12429 gcc_assert (reload_completed);
12430 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
12431 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
12432 }
12433 else if (offsettable_memref_p (operand))
12434 {
12435 operand = adjust_address (operand, DImode, 0);
12436 parts[0] = operand;
12437 parts[1] = adjust_address (operand, upper_mode, 8);
12438 }
12439 else if (GET_CODE (operand) == CONST_DOUBLE)
12440 {
12441 REAL_VALUE_TYPE r;
12442 long l[4];
12443
12444 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
12445 real_to_target (l, &r, mode);
12446
12447 /* Do not use shift by 32 to avoid warning on 32bit systems. */
12448 if (HOST_BITS_PER_WIDE_INT >= 64)
12449 parts[0]
12450 = gen_int_mode
12451 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
12452 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
12453 DImode);
12454 else
12455 parts[0] = immed_double_const (l[0], l[1], DImode);
12456
12457 if (upper_mode == SImode)
12458 parts[1] = gen_int_mode (l[2], SImode);
12459 else if (HOST_BITS_PER_WIDE_INT >= 64)
12460 parts[1]
12461 = gen_int_mode
12462 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
12463 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
12464 DImode);
12465 else
12466 parts[1] = immed_double_const (l[2], l[3], DImode);
12467 }
12468 else
12469 gcc_unreachable ();
12470 }
12471 }
12472
12473 return size;
12474 }
12475
12476 /* Emit insns to perform a move or push of DI, DF, and XF values.
12477 All the required insns are emitted here.  Operands 2-4 receive the
12478 destination parts in the correct order; operands 5-7 receive the
12479 corresponding source parts.  */
12480
12481 void
12482 ix86_split_long_move (rtx operands[])
12483 {
12484 rtx part[2][3];
12485 int nparts;
12486 int push = 0;
12487 int collisions = 0;
12488 enum machine_mode mode = GET_MODE (operands[0]);
12489
12490 /* The DFmode expanders may ask us to move double.
12491 For 64bit target this is single move. By hiding the fact
12492 here we simplify i386.md splitters. */
12493 if (GET_MODE_SIZE (GET_MODE (operands[0])) == 8 && TARGET_64BIT)
12494 {
12495 /* Optimize constant pool references to immediates.  This is used by
12496 fp moves, which force all constants to memory to allow combining.  */
12497
12498 if (MEM_P (operands[1])
12499 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
12500 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
12501 operands[1] = get_pool_constant (XEXP (operands[1], 0));
12502 if (push_operand (operands[0], VOIDmode))
12503 {
12504 operands[0] = copy_rtx (operands[0]);
12505 PUT_MODE (operands[0], Pmode);
12506 }
12507 else
12508 operands[0] = gen_lowpart (DImode, operands[0]);
12509 operands[1] = gen_lowpart (DImode, operands[1]);
12510 emit_move_insn (operands[0], operands[1]);
12511 return;
12512 }
12513
12514 /* The only non-offsettable memory we handle is push. */
12515 if (push_operand (operands[0], VOIDmode))
12516 push = 1;
12517 else
12518 gcc_assert (!MEM_P (operands[0])
12519 || offsettable_memref_p (operands[0]));
12520
12521 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
12522 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
12523
12524 /* When emitting push, take care for source operands on the stack. */
12525 if (push && MEM_P (operands[1])
12526 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
12527 {
12528 if (nparts == 3)
12529 part[1][1] = change_address (part[1][1], GET_MODE (part[1][1]),
12530 XEXP (part[1][2], 0));
12531 part[1][0] = change_address (part[1][0], GET_MODE (part[1][0]),
12532 XEXP (part[1][1], 0));
12533 }
12534
12535 /* We need to do the copy in the right order in case an address register
12536 of the source overlaps the destination.  */
12537 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
12538 {
12539 if (reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0)))
12540 collisions++;
12541 if (reg_overlap_mentioned_p (part[0][1], XEXP (part[1][0], 0)))
12542 collisions++;
12543 if (nparts == 3
12544 && reg_overlap_mentioned_p (part[0][2], XEXP (part[1][0], 0)))
12545 collisions++;
12546
12547 /* Collision in the middle part can be handled by reordering. */
12548 if (collisions == 1 && nparts == 3
12549 && reg_overlap_mentioned_p (part[0][1], XEXP (part[1][0], 0)))
12550 {
12551 rtx tmp;
12552 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
12553 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
12554 }
12555
12556 /* If there are more collisions, we can't handle it by reordering.
12557 Do an lea to the last part and use only one colliding move. */
12558 else if (collisions > 1)
12559 {
12560 rtx base;
12561
12562 collisions = 1;
12563
12564 base = part[0][nparts - 1];
12565
12566 /* Handle the case when the last part isn't valid for lea.
12567 Happens in 64-bit mode storing the 12-byte XFmode. */
12568 if (GET_MODE (base) != Pmode)
12569 base = gen_rtx_REG (Pmode, REGNO (base));
12570
12571 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
12572 part[1][0] = replace_equiv_address (part[1][0], base);
12573 part[1][1] = replace_equiv_address (part[1][1],
12574 plus_constant (base, UNITS_PER_WORD));
12575 if (nparts == 3)
12576 part[1][2] = replace_equiv_address (part[1][2],
12577 plus_constant (base, 8));
12578 }
12579 }
12580
12581 if (push)
12582 {
12583 if (!TARGET_64BIT)
12584 {
12585 if (nparts == 3)
12586 {
12587 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
12588 emit_insn (gen_addsi3 (stack_pointer_rtx, stack_pointer_rtx, GEN_INT (-4)));
12589 emit_move_insn (part[0][2], part[1][2]);
12590 }
12591 }
12592 else
12593 {
12594 /* In 64-bit mode we don't have a 32-bit push available.  If this is a
12595 register, that is OK -- we will just use the larger counterpart.  We also
12596 retype memory -- this comes from an attempt to avoid a REX prefix on
12597 moving the second half of a TFmode value.  */
12598 if (GET_MODE (part[1][1]) == SImode)
12599 {
12600 switch (GET_CODE (part[1][1]))
12601 {
12602 case MEM:
12603 part[1][1] = adjust_address (part[1][1], DImode, 0);
12604 break;
12605
12606 case REG:
12607 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
12608 break;
12609
12610 default:
12611 gcc_unreachable ();
12612 }
12613
12614 if (GET_MODE (part[1][0]) == SImode)
12615 part[1][0] = part[1][1];
12616 }
12617 }
12618 emit_move_insn (part[0][1], part[1][1]);
12619 emit_move_insn (part[0][0], part[1][0]);
12620 return;
12621 }
12622
12623 /* Choose correct order to not overwrite the source before it is copied. */
12624 if ((REG_P (part[0][0])
12625 && REG_P (part[1][1])
12626 && (REGNO (part[0][0]) == REGNO (part[1][1])
12627 || (nparts == 3
12628 && REGNO (part[0][0]) == REGNO (part[1][2]))))
12629 || (collisions > 0
12630 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
12631 {
12632 if (nparts == 3)
12633 {
12634 operands[2] = part[0][2];
12635 operands[3] = part[0][1];
12636 operands[4] = part[0][0];
12637 operands[5] = part[1][2];
12638 operands[6] = part[1][1];
12639 operands[7] = part[1][0];
12640 }
12641 else
12642 {
12643 operands[2] = part[0][1];
12644 operands[3] = part[0][0];
12645 operands[5] = part[1][1];
12646 operands[6] = part[1][0];
12647 }
12648 }
12649 else
12650 {
12651 if (nparts == 3)
12652 {
12653 operands[2] = part[0][0];
12654 operands[3] = part[0][1];
12655 operands[4] = part[0][2];
12656 operands[5] = part[1][0];
12657 operands[6] = part[1][1];
12658 operands[7] = part[1][2];
12659 }
12660 else
12661 {
12662 operands[2] = part[0][0];
12663 operands[3] = part[0][1];
12664 operands[5] = part[1][0];
12665 operands[6] = part[1][1];
12666 }
12667 }
12668
12669 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
12670 if (optimize_size)
12671 {
12672 if (CONST_INT_P (operands[5])
12673 && operands[5] != const0_rtx
12674 && REG_P (operands[2]))
12675 {
12676 if (CONST_INT_P (operands[6])
12677 && INTVAL (operands[6]) == INTVAL (operands[5]))
12678 operands[6] = operands[2];
12679
12680 if (nparts == 3
12681 && CONST_INT_P (operands[7])
12682 && INTVAL (operands[7]) == INTVAL (operands[5]))
12683 operands[7] = operands[2];
12684 }
12685
12686 if (nparts == 3
12687 && CONST_INT_P (operands[6])
12688 && operands[6] != const0_rtx
12689 && REG_P (operands[3])
12690 && CONST_INT_P (operands[7])
12691 && INTVAL (operands[7]) == INTVAL (operands[6]))
12692 operands[7] = operands[3];
12693 }
12694
12695 emit_move_insn (operands[2], operands[5]);
12696 emit_move_insn (operands[3], operands[6]);
12697 if (nparts == 3)
12698 emit_move_insn (operands[4], operands[7]);
12699
12700 return;
12701 }
12702
12703 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
12704 left shift by a constant, either using a single shift or
12705 a sequence of add instructions. */
12706
12707 static void
12708 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
12709 {
12710 if (count == 1)
12711 {
12712 emit_insn ((mode == DImode
12713 ? gen_addsi3
12714 : gen_adddi3) (operand, operand, operand));
12715 }
12716 else if (!optimize_size
12717 && count * ix86_cost->add <= ix86_cost->shift_const)
12718 {
12719 int i;
12720 for (i = 0; i < count; i++)
12721 {
12722 emit_insn ((mode == DImode
12723 ? gen_addsi3
12724 : gen_adddi3) (operand, operand, operand));
12725 }
12726 }
12727 else
12728 emit_insn ((mode == DImode
12729 ? gen_ashlsi3
12730 : gen_ashldi3) (operand, operand, GEN_INT (count)));
12731 }
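
/* Illustrative sketch only (not emitted verbatim): a left shift of an SImode
   part by 2, on a target where two adds cost no more than one constant shift,
   comes out of the helper above roughly as

       addl %eax, %eax        ; operand <<= 1
       addl %eax, %eax        ; operand <<= 1  (total: << 2)

   whereas with -Os or costly adds it falls back to a single

       sall $2, %eax

   The register %eax here is just a placeholder for OPERAND.  */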
12732
12733 void
12734 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
12735 {
12736 rtx low[2], high[2];
12737 int count;
12738 const int single_width = mode == DImode ? 32 : 64;
12739
12740 if (CONST_INT_P (operands[2]))
12741 {
12742 (mode == DImode ? split_di : split_ti) (operands, 2, low, high);
12743 count = INTVAL (operands[2]) & (single_width * 2 - 1);
12744
12745 if (count >= single_width)
12746 {
12747 emit_move_insn (high[0], low[1]);
12748 emit_move_insn (low[0], const0_rtx);
12749
12750 if (count > single_width)
12751 ix86_expand_ashl_const (high[0], count - single_width, mode);
12752 }
12753 else
12754 {
12755 if (!rtx_equal_p (operands[0], operands[1]))
12756 emit_move_insn (operands[0], operands[1]);
12757 emit_insn ((mode == DImode
12758 ? gen_x86_shld_1
12759 : gen_x86_64_shld) (high[0], low[0], GEN_INT (count)));
12760 ix86_expand_ashl_const (low[0], count, mode);
12761 }
12762 return;
12763 }
12764
12765 (mode == DImode ? split_di : split_ti) (operands, 1, low, high);
12766
12767 if (operands[1] == const1_rtx)
12768 {
12769 /* Assuming we've chosen QImode-capable registers, then 1 << N
12770 can be done with two 32/64-bit shifts, no branches, no cmoves. */
12771 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
12772 {
12773 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
12774
12775 ix86_expand_clear (low[0]);
12776 ix86_expand_clear (high[0]);
12777 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (single_width)));
12778
12779 d = gen_lowpart (QImode, low[0]);
12780 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
12781 s = gen_rtx_EQ (QImode, flags, const0_rtx);
12782 emit_insn (gen_rtx_SET (VOIDmode, d, s));
12783
12784 d = gen_lowpart (QImode, high[0]);
12785 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
12786 s = gen_rtx_NE (QImode, flags, const0_rtx);
12787 emit_insn (gen_rtx_SET (VOIDmode, d, s));
12788 }
12789
12790 /* Otherwise, we can get the same results by manually performing
12791 a bit extract operation on bit 5/6, and then performing the two
12792 shifts. The two methods of getting 0/1 into low/high are exactly
12793 the same size. Avoiding the shift in the bit extract case helps
12794 pentium4 a bit; no one else seems to care much either way. */
12795 else
12796 {
12797 rtx x;
12798
12799 if (TARGET_PARTIAL_REG_STALL && !optimize_size)
12800 x = gen_rtx_ZERO_EXTEND (mode == DImode ? SImode : DImode, operands[2]);
12801 else
12802 x = gen_lowpart (mode == DImode ? SImode : DImode, operands[2]);
12803 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
12804
12805 emit_insn ((mode == DImode
12806 ? gen_lshrsi3
12807 : gen_lshrdi3) (high[0], high[0], GEN_INT (mode == DImode ? 5 : 6)));
12808 emit_insn ((mode == DImode
12809 ? gen_andsi3
12810 : gen_anddi3) (high[0], high[0], GEN_INT (1)));
12811 emit_move_insn (low[0], high[0]);
12812 emit_insn ((mode == DImode
12813 ? gen_xorsi3
12814 : gen_xordi3) (low[0], low[0], GEN_INT (1)));
12815 }
12816
12817 emit_insn ((mode == DImode
12818 ? gen_ashlsi3
12819 : gen_ashldi3) (low[0], low[0], operands[2]));
12820 emit_insn ((mode == DImode
12821 ? gen_ashlsi3
12822 : gen_ashldi3) (high[0], high[0], operands[2]));
12823 return;
12824 }
12825
12826 if (operands[1] == constm1_rtx)
12827 {
12828 /* For -1 << N, we can avoid the shld instruction, because we
12829 know that we're shifting 0...31/63 ones into a -1. */
12830 emit_move_insn (low[0], constm1_rtx);
12831 if (optimize_size)
12832 emit_move_insn (high[0], low[0]);
12833 else
12834 emit_move_insn (high[0], constm1_rtx);
12835 }
12836 else
12837 {
12838 if (!rtx_equal_p (operands[0], operands[1]))
12839 emit_move_insn (operands[0], operands[1]);
12840
12841 (mode == DImode ? split_di : split_ti) (operands, 1, low, high);
12842 emit_insn ((mode == DImode
12843 ? gen_x86_shld_1
12844 : gen_x86_64_shld) (high[0], low[0], operands[2]));
12845 }
12846
12847 emit_insn ((mode == DImode ? gen_ashlsi3 : gen_ashldi3) (low[0], low[0], operands[2]));
12848
12849 if (TARGET_CMOVE && scratch)
12850 {
12851 ix86_expand_clear (scratch);
12852 emit_insn ((mode == DImode
12853 ? gen_x86_shift_adj_1
12854 : gen_x86_64_shift_adj) (high[0], low[0], operands[2], scratch));
12855 }
12856 else
12857 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
12858 }
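
/* Worked example (illustrative only): splitting the DImode constant shift
   x << 40 takes the count >= single_width (32) path above and emits roughly

       high = low_input;      ; move the low input word into the high word
       low  = 0;
       high <<= 8;            ; the remaining 40 - 32 bits

   For counts below 32 it instead uses shld on the high word plus a plain
   shift of the low word.  */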
12859
12860 void
12861 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
12862 {
12863 rtx low[2], high[2];
12864 int count;
12865 const int single_width = mode == DImode ? 32 : 64;
12866
12867 if (CONST_INT_P (operands[2]))
12868 {
12869 (mode == DImode ? split_di : split_ti) (operands, 2, low, high);
12870 count = INTVAL (operands[2]) & (single_width * 2 - 1);
12871
12872 if (count == single_width * 2 - 1)
12873 {
12874 emit_move_insn (high[0], high[1]);
12875 emit_insn ((mode == DImode
12876 ? gen_ashrsi3
12877 : gen_ashrdi3) (high[0], high[0],
12878 GEN_INT (single_width - 1)));
12879 emit_move_insn (low[0], high[0]);
12880
12881 }
12882 else if (count >= single_width)
12883 {
12884 emit_move_insn (low[0], high[1]);
12885 emit_move_insn (high[0], low[0]);
12886 emit_insn ((mode == DImode
12887 ? gen_ashrsi3
12888 : gen_ashrdi3) (high[0], high[0],
12889 GEN_INT (single_width - 1)));
12890 if (count > single_width)
12891 emit_insn ((mode == DImode
12892 ? gen_ashrsi3
12893 : gen_ashrdi3) (low[0], low[0],
12894 GEN_INT (count - single_width)));
12895 }
12896 else
12897 {
12898 if (!rtx_equal_p (operands[0], operands[1]))
12899 emit_move_insn (operands[0], operands[1]);
12900 emit_insn ((mode == DImode
12901 ? gen_x86_shrd_1
12902 : gen_x86_64_shrd) (low[0], high[0], GEN_INT (count)));
12903 emit_insn ((mode == DImode
12904 ? gen_ashrsi3
12905 : gen_ashrdi3) (high[0], high[0], GEN_INT (count)));
12906 }
12907 }
12908 else
12909 {
12910 if (!rtx_equal_p (operands[0], operands[1]))
12911 emit_move_insn (operands[0], operands[1]);
12912
12913 (mode == DImode ? split_di : split_ti) (operands, 1, low, high);
12914
12915 emit_insn ((mode == DImode
12916 ? gen_x86_shrd_1
12917 : gen_x86_64_shrd) (low[0], high[0], operands[2]));
12918 emit_insn ((mode == DImode
12919 ? gen_ashrsi3
12920 : gen_ashrdi3) (high[0], high[0], operands[2]));
12921
12922 if (TARGET_CMOVE && scratch)
12923 {
12924 emit_move_insn (scratch, high[0]);
12925 emit_insn ((mode == DImode
12926 ? gen_ashrsi3
12927 : gen_ashrdi3) (scratch, scratch,
12928 GEN_INT (single_width - 1)));
12929 emit_insn ((mode == DImode
12930 ? gen_x86_shift_adj_1
12931 : gen_x86_64_shift_adj) (low[0], high[0], operands[2],
12932 scratch));
12933 }
12934 else
12935 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
12936 }
12937 }
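
/* Worked example (illustrative only): a DImode arithmetic right shift by the
   constant 63 takes the count == 2 * 32 - 1 path above and emits roughly

       high = high_input;
       high >>= 31;           ; arithmetic, leaves 0 or -1 (the sign)
       low  = high;

   i.e. both output words end up holding the replicated sign bit.  */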
12938
12939 void
12940 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
12941 {
12942 rtx low[2], high[2];
12943 int count;
12944 const int single_width = mode == DImode ? 32 : 64;
12945
12946 if (CONST_INT_P (operands[2]))
12947 {
12948 (mode == DImode ? split_di : split_ti) (operands, 2, low, high);
12949 count = INTVAL (operands[2]) & (single_width * 2 - 1);
12950
12951 if (count >= single_width)
12952 {
12953 emit_move_insn (low[0], high[1]);
12954 ix86_expand_clear (high[0]);
12955
12956 if (count > single_width)
12957 emit_insn ((mode == DImode
12958 ? gen_lshrsi3
12959 : gen_lshrdi3) (low[0], low[0],
12960 GEN_INT (count - single_width)));
12961 }
12962 else
12963 {
12964 if (!rtx_equal_p (operands[0], operands[1]))
12965 emit_move_insn (operands[0], operands[1]);
12966 emit_insn ((mode == DImode
12967 ? gen_x86_shrd_1
12968 : gen_x86_64_shrd) (low[0], high[0], GEN_INT (count)));
12969 emit_insn ((mode == DImode
12970 ? gen_lshrsi3
12971 : gen_lshrdi3) (high[0], high[0], GEN_INT (count)));
12972 }
12973 }
12974 else
12975 {
12976 if (!rtx_equal_p (operands[0], operands[1]))
12977 emit_move_insn (operands[0], operands[1]);
12978
12979 (mode == DImode ? split_di : split_ti) (operands, 1, low, high);
12980
12981 emit_insn ((mode == DImode
12982 ? gen_x86_shrd_1
12983 : gen_x86_64_shrd) (low[0], high[0], operands[2]));
12984 emit_insn ((mode == DImode
12985 ? gen_lshrsi3
12986 : gen_lshrdi3) (high[0], high[0], operands[2]));
12987
12988 /* Heh. By reversing the arguments, we can reuse this pattern. */
12989 if (TARGET_CMOVE && scratch)
12990 {
12991 ix86_expand_clear (scratch);
12992 emit_insn ((mode == DImode
12993 ? gen_x86_shift_adj_1
12994 : gen_x86_64_shift_adj) (low[0], high[0], operands[2],
12995 scratch));
12996 }
12997 else
12998 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
12999 }
13000 }
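
/* For a variable count the logical-shift splitter above emits, roughly,

       shrd %cl, high, low    ; low = low 32 bits of (high:low) >> cl
       shr  %cl, high         ; high >>= cl
       ; then, if cl >= 32, fix up: low = high, high = 0

   with the fixup done either via cmov against a cleared scratch register or
   via a short branch (gen_x86_shift_adj_2).  Illustrative sketch only; the
   actual patterns live in i386.md.  */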
13001
13002 /* Predict just emitted jump instruction to be taken with probability PROB. */
13003 static void
13004 predict_jump (int prob)
13005 {
13006 rtx insn = get_last_insn ();
13007 gcc_assert (JUMP_P (insn));
13008 REG_NOTES (insn)
13009 = gen_rtx_EXPR_LIST (REG_BR_PROB,
13010 GEN_INT (prob),
13011 REG_NOTES (insn));
13012 }
13013
13014 /* Helper function for the string operations below. Test whether the VALUE
13015 bit of VARIABLE is clear; if it is, jump to the label. */
13016 static rtx
13017 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
13018 {
13019 rtx label = gen_label_rtx ();
13020 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
13021 if (GET_MODE (variable) == DImode)
13022 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
13023 else
13024 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
13025 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
13026 1, label);
13027 if (epilogue)
13028 predict_jump (REG_BR_PROB_BASE * 50 / 100);
13029 else
13030 predict_jump (REG_BR_PROB_BASE * 90 / 100);
13031 return label;
13032 }
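
/* For example, ix86_expand_aligntest (count, 4, true) boils down to roughly
   "test $4, count; jz label", so the caller's 4-byte chunk code is skipped
   when bit 2 of COUNT is clear.  (Sketch only; the actual insns come from
   gen_andsi3/gen_anddi3 and emit_cmp_and_jump_insns above.)  */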
13033
13034 /* Adjust COUNTER by the VALUE. */
13035 static void
13036 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
13037 {
13038 if (GET_MODE (countreg) == DImode)
13039 emit_insn (gen_adddi3 (countreg, countreg, GEN_INT (-value)));
13040 else
13041 emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-value)));
13042 }
13043
13044 /* Zero-extend the possibly-SImode EXP to a Pmode register. */
13045 rtx
13046 ix86_zero_extend_to_Pmode (rtx exp)
13047 {
13048 rtx r;
13049 if (GET_MODE (exp) == VOIDmode)
13050 return force_reg (Pmode, exp);
13051 if (GET_MODE (exp) == Pmode)
13052 return copy_to_mode_reg (Pmode, exp);
13053 r = gen_reg_rtx (Pmode);
13054 emit_insn (gen_zero_extendsidi2 (r, exp));
13055 return r;
13056 }
13057
13058 /* Divide COUNTREG by SCALE. */
13059 static rtx
13060 scale_counter (rtx countreg, int scale)
13061 {
13062 rtx sc;
13063 rtx piece_size_mask;
13064
13065 if (scale == 1)
13066 return countreg;
13067 if (CONST_INT_P (countreg))
13068 return GEN_INT (INTVAL (countreg) / scale);
13069 gcc_assert (REG_P (countreg));
13070
13071 piece_size_mask = GEN_INT (scale - 1);
13072 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
13073 GEN_INT (exact_log2 (scale)),
13074 NULL, 1, OPTAB_DIRECT);
13075 return sc;
13076 }
13077
13078 /* When SRCPTR is non-NULL, output a simple loop that moves the memory
13079 pointed to by SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times;
13080 the overall size is COUNT, specified in bytes. When SRCPTR is NULL, output
13081 the equivalent loop that sets memory to VALUE (expected to be in MODE).
13082 
13083 The size is rounded down to a whole number of chunks moved at once.
13084 SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info. */
13085
13086
13087 static void
13088 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
13089 rtx destptr, rtx srcptr, rtx value,
13090 rtx count, enum machine_mode mode, int unroll,
13091 int expected_size)
13092 {
13093 rtx out_label, top_label, iter, tmp;
13094 enum machine_mode iter_mode;
13095 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
13096 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
13097 rtx size;
13098 rtx x_addr;
13099 rtx y_addr;
13100 int i;
13101
13102 iter_mode = GET_MODE (count);
13103 if (iter_mode == VOIDmode)
13104 iter_mode = word_mode;
13105
13106 top_label = gen_label_rtx ();
13107 out_label = gen_label_rtx ();
13108 iter = gen_reg_rtx (iter_mode);
13109
13110 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
13111 NULL, 1, OPTAB_DIRECT);
13112 /* Those two should combine. */
13113 if (piece_size == const1_rtx)
13114 {
13115 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
13116 true, out_label);
13117 predict_jump (REG_BR_PROB_BASE * 10 / 100);
13118 }
13119 emit_move_insn (iter, const0_rtx);
13120
13121 emit_label (top_label);
13122
13123 tmp = convert_modes (Pmode, iter_mode, iter, true);
13124 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
13125 destmem = change_address (destmem, mode, x_addr);
13126
13127 if (srcmem)
13128 {
13129 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
13130 srcmem = change_address (srcmem, mode, y_addr);
13131
13132 /* When unrolling for chips that reorder memory reads and writes,
13133 we can save registers by using a single temporary.
13134 Also, using 4 temporaries is overkill in 32-bit mode. */
13135 if (!TARGET_64BIT && 0)
13136 {
13137 for (i = 0; i < unroll; i++)
13138 {
13139 if (i)
13140 {
13141 destmem =
13142 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
13143 srcmem =
13144 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
13145 }
13146 emit_move_insn (destmem, srcmem);
13147 }
13148 }
13149 else
13150 {
13151 rtx tmpreg[4];
13152 gcc_assert (unroll <= 4);
13153 for (i = 0; i < unroll; i++)
13154 {
13155 tmpreg[i] = gen_reg_rtx (mode);
13156 if (i)
13157 {
13158 srcmem =
13159 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
13160 }
13161 emit_move_insn (tmpreg[i], srcmem);
13162 }
13163 for (i = 0; i < unroll; i++)
13164 {
13165 if (i)
13166 {
13167 destmem =
13168 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
13169 }
13170 emit_move_insn (destmem, tmpreg[i]);
13171 }
13172 }
13173 }
13174 else
13175 for (i = 0; i < unroll; i++)
13176 {
13177 if (i)
13178 destmem =
13179 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
13180 emit_move_insn (destmem, value);
13181 }
13182
13183 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
13184 true, OPTAB_LIB_WIDEN);
13185 if (tmp != iter)
13186 emit_move_insn (iter, tmp);
13187
13188 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
13189 true, top_label);
13190 if (expected_size != -1)
13191 {
13192 expected_size /= GET_MODE_SIZE (mode) * unroll;
13193 if (expected_size == 0)
13194 predict_jump (0);
13195 else if (expected_size > REG_BR_PROB_BASE)
13196 predict_jump (REG_BR_PROB_BASE - 1);
13197 else
13198 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
13199 }
13200 else
13201 predict_jump (REG_BR_PROB_BASE * 80 / 100);
13202 iter = ix86_zero_extend_to_Pmode (iter);
13203 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
13204 true, OPTAB_LIB_WIDEN);
13205 if (tmp != destptr)
13206 emit_move_insn (destptr, tmp);
13207 if (srcptr)
13208 {
13209 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
13210 true, OPTAB_LIB_WIDEN);
13211 if (tmp != srcptr)
13212 emit_move_insn (srcptr, tmp);
13213 }
13214 emit_label (out_label);
13215 }
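
/* The generated code has roughly this shape (illustrative pseudo C, for the
   copy case with UNROLL == 1):

       size = count & ~(chunk - 1);
       iter = 0;
     top:
       *(mode *) (dest + iter) = *(mode *) (src + iter);
       iter += chunk;
       if (iter < size) goto top;
       dest += iter;  src += iter;
     out:

   where chunk == GET_MODE_SIZE (mode) * unroll.  */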
13216
13217 /* Output "rep; mov" instruction.
13218 Arguments have the same meaning as for the previous function. */
13219 static void
13220 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
13221 rtx destptr, rtx srcptr,
13222 rtx count,
13223 enum machine_mode mode)
13224 {
13225 rtx destexp;
13226 rtx srcexp;
13227 rtx countreg;
13228
13229 /* If the size is known to be a multiple of 4, use rep movsl rather than rep movsb. */
13230 if (mode == QImode && CONST_INT_P (count)
13231 && !(INTVAL (count) & 3))
13232 mode = SImode;
13233
13234 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
13235 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
13236 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
13237 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
13238 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
13239 if (mode != QImode)
13240 {
13241 destexp = gen_rtx_ASHIFT (Pmode, countreg,
13242 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
13243 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
13244 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
13245 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
13246 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
13247 }
13248 else
13249 {
13250 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
13251 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
13252 }
13253 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
13254 destexp, srcexp));
13255 }
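
/* For example, with MODE == SImode and a byte count already in the count
   register this amounts to roughly "shrl $2, %ecx; rep movsl" (plus the rtx
   expressions describing the final pointer values for the rep_mov pattern).
   Sketch only; the real insn is generated by gen_rep_mov above.  */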
13256
13257 /* Output "rep; stos" instruction.
13258 Arguments have the same meaning as for the previous function. */
13259 static void
13260 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
13261 rtx count,
13262 enum machine_mode mode)
13263 {
13264 rtx destexp;
13265 rtx countreg;
13266
13267 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
13268 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
13269 value = force_reg (mode, gen_lowpart (mode, value));
13270 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
13271 if (mode != QImode)
13272 {
13273 destexp = gen_rtx_ASHIFT (Pmode, countreg,
13274 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
13275 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
13276 }
13277 else
13278 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
13279 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
13280 }
13281
13282 static void
13283 emit_strmov (rtx destmem, rtx srcmem,
13284 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
13285 {
13286 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
13287 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
13288 emit_insn (gen_strmov (destptr, dest, srcptr, src));
13289 }
13290
13291 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
13292 static void
13293 expand_movmem_epilogue (rtx destmem, rtx srcmem,
13294 rtx destptr, rtx srcptr, rtx count, int max_size)
13295 {
13296 rtx src, dest;
13297 if (CONST_INT_P (count))
13298 {
13299 HOST_WIDE_INT countval = INTVAL (count);
13300 int offset = 0;
13301
13302 if ((countval & 0x10) && max_size > 16)
13303 {
13304 if (TARGET_64BIT)
13305 {
13306 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
13307 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
13308 }
13309 else
13310 gcc_unreachable ();
13311 offset += 16;
13312 }
13313 if ((countval & 0x08) && max_size > 8)
13314 {
13315 if (TARGET_64BIT)
13316 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
13317 else
13318 {
13319 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
13320 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 4);
13321 }
13322 offset += 8;
13323 }
13324 if ((countval & 0x04) && max_size > 4)
13325 {
13326 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
13327 offset += 4;
13328 }
13329 if ((countval & 0x02) && max_size > 2)
13330 {
13331 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
13332 offset += 2;
13333 }
13334 if ((countval & 0x01) && max_size > 1)
13335 {
13336 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
13337 offset += 1;
13338 }
13339 return;
13340 }
13341 if (max_size > 8)
13342 {
13343 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
13344 count, 1, OPTAB_DIRECT);
13345 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
13346 count, QImode, 1, 4);
13347 return;
13348 }
13349
13350 /* When single stringops are available, we can cheaply advance the dest
13351 and src pointers. Otherwise we save code size by maintaining an offset
13352 (zero is readily available from the preceding rep operation) and using
13353 x86 addressing modes. */
13354 if (TARGET_SINGLE_STRINGOP)
13355 {
13356 if (max_size > 4)
13357 {
13358 rtx label = ix86_expand_aligntest (count, 4, true);
13359 src = change_address (srcmem, SImode, srcptr);
13360 dest = change_address (destmem, SImode, destptr);
13361 emit_insn (gen_strmov (destptr, dest, srcptr, src));
13362 emit_label (label);
13363 LABEL_NUSES (label) = 1;
13364 }
13365 if (max_size > 2)
13366 {
13367 rtx label = ix86_expand_aligntest (count, 2, true);
13368 src = change_address (srcmem, HImode, srcptr);
13369 dest = change_address (destmem, HImode, destptr);
13370 emit_insn (gen_strmov (destptr, dest, srcptr, src));
13371 emit_label (label);
13372 LABEL_NUSES (label) = 1;
13373 }
13374 if (max_size > 1)
13375 {
13376 rtx label = ix86_expand_aligntest (count, 1, true);
13377 src = change_address (srcmem, QImode, srcptr);
13378 dest = change_address (destmem, QImode, destptr);
13379 emit_insn (gen_strmov (destptr, dest, srcptr, src));
13380 emit_label (label);
13381 LABEL_NUSES (label) = 1;
13382 }
13383 }
13384 else
13385 {
13386 rtx offset = force_reg (Pmode, const0_rtx);
13387 rtx tmp;
13388
13389 if (max_size > 4)
13390 {
13391 rtx label = ix86_expand_aligntest (count, 4, true);
13392 src = change_address (srcmem, SImode, srcptr);
13393 dest = change_address (destmem, SImode, destptr);
13394 emit_move_insn (dest, src);
13395 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
13396 true, OPTAB_LIB_WIDEN);
13397 if (tmp != offset)
13398 emit_move_insn (offset, tmp);
13399 emit_label (label);
13400 LABEL_NUSES (label) = 1;
13401 }
13402 if (max_size > 2)
13403 {
13404 rtx label = ix86_expand_aligntest (count, 2, true);
13405 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
13406 src = change_address (srcmem, HImode, tmp);
13407 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
13408 dest = change_address (destmem, HImode, tmp);
13409 emit_move_insn (dest, src);
13410 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
13411 true, OPTAB_LIB_WIDEN);
13412 if (tmp != offset)
13413 emit_move_insn (offset, tmp);
13414 emit_label (label);
13415 LABEL_NUSES (label) = 1;
13416 }
13417 if (max_size > 1)
13418 {
13419 rtx label = ix86_expand_aligntest (count, 1, true);
13420 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
13421 src = change_address (srcmem, QImode, tmp);
13422 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
13423 dest = change_address (destmem, QImode, tmp);
13424 emit_move_insn (dest, src);
13425 emit_label (label);
13426 LABEL_NUSES (label) = 1;
13427 }
13428 }
13429 }
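
/* Worked example (illustrative): with a known COUNT whose low bits are 7 and
   max_size == 8, the constant path above emits an SImode move at offset 0,
   an HImode move at offset 4 and a QImode move at offset 6 - exactly the 7
   remaining tail bytes, with no loop and no branches.  */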
13430
13431 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
13432 static void
13433 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
13434 rtx count, int max_size)
13435 {
13436 count =
13437 expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
13438 count, 1, OPTAB_DIRECT);
13439 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
13440 gen_lowpart (QImode, value), count, QImode,
13441 1, max_size / 2);
13442 }
13443
13444 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
13445 static void
13446 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
13447 {
13448 rtx dest;
13449
13450 if (CONST_INT_P (count))
13451 {
13452 HOST_WIDE_INT countval = INTVAL (count);
13453 int offset = 0;
13454
13455 if ((countval & 0x10) && max_size > 16)
13456 {
13457 if (TARGET_64BIT)
13458 {
13459 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
13460 emit_insn (gen_strset (destptr, dest, value));
13461 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
13462 emit_insn (gen_strset (destptr, dest, value));
13463 }
13464 else
13465 gcc_unreachable ();
13466 offset += 16;
13467 }
13468 if ((countval & 0x08) && max_size > 8)
13469 {
13470 if (TARGET_64BIT)
13471 {
13472 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
13473 emit_insn (gen_strset (destptr, dest, value));
13474 }
13475 else
13476 {
13477 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
13478 emit_insn (gen_strset (destptr, dest, value));
13479 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
13480 emit_insn (gen_strset (destptr, dest, value));
13481 }
13482 offset += 8;
13483 }
13484 if ((countval & 0x04) && max_size > 4)
13485 {
13486 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
13487 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
13488 offset += 4;
13489 }
13490 if ((countval & 0x02) && max_size > 2)
13491 {
13492 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
13493 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
13494 offset += 2;
13495 }
13496 if ((countval & 0x01) && max_size > 1)
13497 {
13498 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
13499 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
13500 offset += 1;
13501 }
13502 return;
13503 }
13504 if (max_size > 32)
13505 {
13506 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
13507 return;
13508 }
13509 if (max_size > 16)
13510 {
13511 rtx label = ix86_expand_aligntest (count, 16, true);
13512 if (TARGET_64BIT)
13513 {
13514 dest = change_address (destmem, DImode, destptr);
13515 emit_insn (gen_strset (destptr, dest, value));
13516 emit_insn (gen_strset (destptr, dest, value));
13517 }
13518 else
13519 {
13520 dest = change_address (destmem, SImode, destptr);
13521 emit_insn (gen_strset (destptr, dest, value));
13522 emit_insn (gen_strset (destptr, dest, value));
13523 emit_insn (gen_strset (destptr, dest, value));
13524 emit_insn (gen_strset (destptr, dest, value));
13525 }
13526 emit_label (label);
13527 LABEL_NUSES (label) = 1;
13528 }
13529 if (max_size > 8)
13530 {
13531 rtx label = ix86_expand_aligntest (count, 8, true);
13532 if (TARGET_64BIT)
13533 {
13534 dest = change_address (destmem, DImode, destptr);
13535 emit_insn (gen_strset (destptr, dest, value));
13536 }
13537 else
13538 {
13539 dest = change_address (destmem, SImode, destptr);
13540 emit_insn (gen_strset (destptr, dest, value));
13541 emit_insn (gen_strset (destptr, dest, value));
13542 }
13543 emit_label (label);
13544 LABEL_NUSES (label) = 1;
13545 }
13546 if (max_size > 4)
13547 {
13548 rtx label = ix86_expand_aligntest (count, 4, true);
13549 dest = change_address (destmem, SImode, destptr);
13550 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
13551 emit_label (label);
13552 LABEL_NUSES (label) = 1;
13553 }
13554 if (max_size > 2)
13555 {
13556 rtx label = ix86_expand_aligntest (count, 2, true);
13557 dest = change_address (destmem, HImode, destptr);
13558 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
13559 emit_label (label);
13560 LABEL_NUSES (label) = 1;
13561 }
13562 if (max_size > 1)
13563 {
13564 rtx label = ix86_expand_aligntest (count, 1, true);
13565 dest = change_address (destmem, QImode, destptr);
13566 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
13567 emit_label (label);
13568 LABEL_NUSES (label) = 1;
13569 }
13570 }
13571
13572 /* Copy enough bytes from SRC to DEST to align DEST, known to be aligned
13573 to ALIGN, up to DESIRED_ALIGNMENT. */
13574 static void
13575 expand_movmem_prologue (rtx destmem, rtx srcmem,
13576 rtx destptr, rtx srcptr, rtx count,
13577 int align, int desired_alignment)
13578 {
13579 if (align <= 1 && desired_alignment > 1)
13580 {
13581 rtx label = ix86_expand_aligntest (destptr, 1, false);
13582 srcmem = change_address (srcmem, QImode, srcptr);
13583 destmem = change_address (destmem, QImode, destptr);
13584 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
13585 ix86_adjust_counter (count, 1);
13586 emit_label (label);
13587 LABEL_NUSES (label) = 1;
13588 }
13589 if (align <= 2 && desired_alignment > 2)
13590 {
13591 rtx label = ix86_expand_aligntest (destptr, 2, false);
13592 srcmem = change_address (srcmem, HImode, srcptr);
13593 destmem = change_address (destmem, HImode, destptr);
13594 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
13595 ix86_adjust_counter (count, 2);
13596 emit_label (label);
13597 LABEL_NUSES (label) = 1;
13598 }
13599 if (align <= 4 && desired_alignment > 4)
13600 {
13601 rtx label = ix86_expand_aligntest (destptr, 4, false);
13602 srcmem = change_address (srcmem, SImode, srcptr);
13603 destmem = change_address (destmem, SImode, destptr);
13604 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
13605 ix86_adjust_counter (count, 4);
13606 emit_label (label);
13607 LABEL_NUSES (label) = 1;
13608 }
13609 gcc_assert (desired_alignment <= 8);
13610 }
13611
13612 /* Store enough bytes at DEST to align DEST, known to be aligned
13613 to ALIGN, up to DESIRED_ALIGNMENT. */
13614 static void
13615 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
13616 int align, int desired_alignment)
13617 {
13618 if (align <= 1 && desired_alignment > 1)
13619 {
13620 rtx label = ix86_expand_aligntest (destptr, 1, false);
13621 destmem = change_address (destmem, QImode, destptr);
13622 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
13623 ix86_adjust_counter (count, 1);
13624 emit_label (label);
13625 LABEL_NUSES (label) = 1;
13626 }
13627 if (align <= 2 && desired_alignment > 2)
13628 {
13629 rtx label = ix86_expand_aligntest (destptr, 2, false);
13630 destmem = change_address (destmem, HImode, destptr);
13631 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
13632 ix86_adjust_counter (count, 2);
13633 emit_label (label);
13634 LABEL_NUSES (label) = 1;
13635 }
13636 if (align <= 4 && desired_alignment > 4)
13637 {
13638 rtx label = ix86_expand_aligntest (destptr, 4, false);
13639 destmem = change_address (destmem, SImode, destptr);
13640 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
13641 ix86_adjust_counter (count, 4);
13642 emit_label (label);
13643 LABEL_NUSES (label) = 1;
13644 }
13645 gcc_assert (desired_alignment <= 8);
13646 }
13647
13648 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
13649 static enum stringop_alg
13650 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
13651 int *dynamic_check)
13652 {
13653 const struct stringop_algs * algs;
13654
13655 *dynamic_check = -1;
13656 if (memset)
13657 algs = &ix86_cost->memset[TARGET_64BIT != 0];
13658 else
13659 algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
13660 if (stringop_alg != no_stringop)
13661 return stringop_alg;
13662 /* rep; movq or rep; movl is the smallest variant. */
13663 else if (optimize_size)
13664 {
13665 if (!count || (count & 3))
13666 return rep_prefix_1_byte;
13667 else
13668 return rep_prefix_4_byte;
13669 }
13670 /* Very tiny blocks are best handled via the loop; REP is expensive to set
13671 up. */
13672 else if (expected_size != -1 && expected_size < 4)
13673 return loop_1_byte;
13674 else if (expected_size != -1)
13675 {
13676 unsigned int i;
13677 enum stringop_alg alg = libcall;
13678 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
13679 {
13680 gcc_assert (algs->size[i].max);
13681 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
13682 {
13683 if (algs->size[i].alg != libcall)
13684 alg = algs->size[i].alg;
13685 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
13686 last non-libcall inline algorithm. */
13687 if (TARGET_INLINE_ALL_STRINGOPS)
13688 {
13689 /* When the current size is best copied by a libcall,
13690 but we are still forced to inline, run the heuristic below
13691 that will pick code for medium-sized blocks. */
13692 if (alg != libcall)
13693 return alg;
13694 break;
13695 }
13696 else
13697 return algs->size[i].alg;
13698 }
13699 }
13700 gcc_assert (TARGET_INLINE_ALL_STRINGOPS);
13701 }
13702 /* When asked to inline the call anyway, try to pick a meaningful choice.
13703 We look for the maximal size of block that is faster to copy by hand and
13704 take blocks of at most that size, guessing that the average size will
13705 be roughly half of the block.
13706
13707 If this turns out to be bad, we might simply specify the preferred
13708 choice in ix86_costs. */
13709 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
13710 && algs->unknown_size == libcall)
13711 {
13712 int max = -1;
13713 enum stringop_alg alg;
13714 int i;
13715
13716 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
13717 if (algs->size[i].alg != libcall && algs->size[i].alg)
13718 max = algs->size[i].max;
13719 if (max == -1)
13720 max = 4096;
13721 alg = decide_alg (count, max / 2, memset, dynamic_check);
13722 gcc_assert (*dynamic_check == -1);
13723 gcc_assert (alg != libcall);
13724 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
13725 *dynamic_check = max;
13726 return alg;
13727 }
13728 return algs->unknown_size;
13729 }
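
/* As a concrete illustration of the -Os path above: a copy whose length is
   unknown, or not a multiple of 4, is expanded as "rep movsb"
   (rep_prefix_1_byte), while a length known to be a multiple of 4 gets
   "rep movsl" (rep_prefix_4_byte), those being the shortest encodings.  */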
13730
13731 /* Decide on alignment. We know that the operand is already aligned to ALIGN
13732 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
13733 static int
13734 decide_alignment (int align,
13735 enum stringop_alg alg,
13736 int expected_size)
13737 {
13738 int desired_align = 0;
13739 switch (alg)
13740 {
13741 case no_stringop:
13742 gcc_unreachable ();
13743 case loop:
13744 case unrolled_loop:
13745 desired_align = GET_MODE_SIZE (Pmode);
13746 break;
13747 case rep_prefix_8_byte:
13748 desired_align = 8;
13749 break;
13750 case rep_prefix_4_byte:
13751 /* PentiumPro has special logic triggering for 8-byte aligned blocks,
13752 copying a whole cache line at once. */
13753 if (TARGET_PENTIUMPRO)
13754 desired_align = 8;
13755 else
13756 desired_align = 4;
13757 break;
13758 case rep_prefix_1_byte:
13759 /* PentiumPro has special logic triggering for 8-byte aligned blocks,
13760 copying a whole cache line at once. */
13761 if (TARGET_PENTIUMPRO)
13762 desired_align = 8;
13763 else
13764 desired_align = 1;
13765 break;
13766 case loop_1_byte:
13767 desired_align = 1;
13768 break;
13769 case libcall:
13770 return 0;
13771 }
13772
13773 if (optimize_size)
13774 desired_align = 1;
13775 if (desired_align < align)
13776 desired_align = align;
13777 if (expected_size != -1 && expected_size < 4)
13778 desired_align = align;
13779 return desired_align;
13780 }
13781
13782 /* Return the smallest power of 2 greater than VAL. */
13783 static int
13784 smallest_pow2_greater_than (int val)
13785 {
13786 int ret = 1;
13787 while (ret <= val)
13788 ret <<= 1;
13789 return ret;
13790 }
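
/* Note that the result is strictly greater than VAL: e.g. 7 -> 8, but also
   8 -> 16 (a power of two is rounded up to the next one).  */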
13791
13792 /* Expand string move (memcpy) operation. Use i386 string operations when
13793 profitable. ix86_expand_setmem contains similar code. The code depends upon
13794 architecture, block size and alignment, but always has the same
13795 overall structure:
13796
13797 1) Prologue guard: Conditional that jumps up to epilogues for small
13798 blocks that can be handled by epilogue alone. This is faster but
13799 also needed for correctness, since the prologue assumes the block is larger
13800 than the desired alignment.
13801
13802 Optional dynamic check for size and libcall for large
13803 blocks is emitted here too, with -minline-stringops-dynamically.
13804
13805 2) Prologue: copy first few bytes in order to get destination aligned
13806 to DESIRED_ALIGN. It is emitted only when ALIGN is less than
13807 DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be copied.
13808 We emit either a jump tree on power of two sized blocks, or a byte loop.
13809
13810 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
13811 with specified algorithm.
13812
13813 4) Epilogue: code copying tail of the block that is too small to be
13814 handled by main body (or up to size guarded by prologue guard). */
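
/* As an illustrative sketch (not literal output), a copy expanded with a loop
   algorithm and DESIRED_ALIGN > ALIGN has roughly this shape:

       if (count < epilogue_size_needed) goto epilogue;     ; step 1
       copy 1/2/4 bytes until dest reaches the desired align; step 2
       main loop / rep prefix copying size_needed chunks    ; step 3
     epilogue:
       copy the remaining tail bytes                        ; step 4
*/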
13815
13816 int
13817 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
13818 rtx expected_align_exp, rtx expected_size_exp)
13819 {
13820 rtx destreg;
13821 rtx srcreg;
13822 rtx label = NULL;
13823 rtx tmp;
13824 rtx jump_around_label = NULL;
13825 HOST_WIDE_INT align = 1;
13826 unsigned HOST_WIDE_INT count = 0;
13827 HOST_WIDE_INT expected_size = -1;
13828 int size_needed = 0, epilogue_size_needed;
13829 int desired_align = 0;
13830 enum stringop_alg alg;
13831 int dynamic_check;
13832
13833 if (CONST_INT_P (align_exp))
13834 align = INTVAL (align_exp);
13835 /* i386 can do misaligned access at reasonably increased cost. */
13836 if (CONST_INT_P (expected_align_exp)
13837 && INTVAL (expected_align_exp) > align)
13838 align = INTVAL (expected_align_exp);
13839 if (CONST_INT_P (count_exp))
13840 count = expected_size = INTVAL (count_exp);
13841 if (CONST_INT_P (expected_size_exp) && count == 0)
13842 expected_size = INTVAL (expected_size_exp);
13843
13844 /* Step 0: Decide on preferred algorithm, desired alignment and
13845 size of chunks to be copied by main loop. */
13846
13847 alg = decide_alg (count, expected_size, false, &dynamic_check);
13848 desired_align = decide_alignment (align, alg, expected_size);
13849
13850 if (!TARGET_ALIGN_STRINGOPS)
13851 align = desired_align;
13852
13853 if (alg == libcall)
13854 return 0;
13855 gcc_assert (alg != no_stringop);
13856 if (!count)
13857 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
13858 destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13859 srcreg = copy_to_mode_reg (Pmode, XEXP (src, 0));
13860 switch (alg)
13861 {
13862 case libcall:
13863 case no_stringop:
13864 gcc_unreachable ();
13865 case loop:
13866 size_needed = GET_MODE_SIZE (Pmode);
13867 break;
13868 case unrolled_loop:
13869 size_needed = GET_MODE_SIZE (Pmode) * (TARGET_64BIT ? 4 : 2);
13870 break;
13871 case rep_prefix_8_byte:
13872 size_needed = 8;
13873 break;
13874 case rep_prefix_4_byte:
13875 size_needed = 4;
13876 break;
13877 case rep_prefix_1_byte:
13878 case loop_1_byte:
13879 size_needed = 1;
13880 break;
13881 }
13882
13883 epilogue_size_needed = size_needed;
13884
13885 /* Step 1: Prologue guard. */
13886
13887 /* Alignment code needs count to be in register. */
13888 if (CONST_INT_P (count_exp) && desired_align > align)
13889 {
13890 enum machine_mode mode = SImode;
13891 if (TARGET_64BIT && (count & ~0xffffffff))
13892 mode = DImode;
13893 count_exp = force_reg (mode, count_exp);
13894 }
13895 gcc_assert (desired_align >= 1 && align >= 1);
13896
13897 /* Ensure that alignment prologue won't copy past end of block. */
13898 if ((size_needed > 1 || (desired_align > 1 && desired_align > align))
13899 && !count)
13900 {
13901 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
13902
13903 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
13904 Make sure it is a power of 2. */
13905 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
13906
13907 label = gen_label_rtx ();
13908 emit_cmp_and_jump_insns (count_exp,
13909 GEN_INT (epilogue_size_needed),
13910 LTU, 0, GET_MODE (count_exp), 1, label);
13911 if (expected_size == -1 || expected_size < epilogue_size_needed)
13912 predict_jump (REG_BR_PROB_BASE * 60 / 100);
13913 else
13914 predict_jump (REG_BR_PROB_BASE * 20 / 100);
13915 }
13916 /* Emit code to decide at runtime whether a library call or inline code
13917 should be used. */
13918 if (dynamic_check != -1)
13919 {
13920 rtx hot_label = gen_label_rtx ();
13921 jump_around_label = gen_label_rtx ();
13922 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
13923 LEU, 0, GET_MODE (count_exp), 1, hot_label);
13924 predict_jump (REG_BR_PROB_BASE * 90 / 100);
13925 emit_block_move_via_libcall (dst, src, count_exp, false);
13926 emit_jump (jump_around_label);
13927 emit_label (hot_label);
13928 }
13929
13930 /* Step 2: Alignment prologue. */
13931
13932 if (desired_align > align)
13933 {
13934 /* Except for the first move in the epilogue, we no longer know
13935 the constant offset in the aliasing info. It doesn't seem worth
13936 the pain to maintain it for the first move, so throw away
13937 the info early. */
13938 src = change_address (src, BLKmode, srcreg);
13939 dst = change_address (dst, BLKmode, destreg);
13940 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
13941 desired_align);
13942 }
13943 if (label && size_needed == 1)
13944 {
13945 emit_label (label);
13946 LABEL_NUSES (label) = 1;
13947 label = NULL;
13948 }
13949
13950 /* Step 3: Main loop. */
13951
13952 switch (alg)
13953 {
13954 case libcall:
13955 case no_stringop:
13956 gcc_unreachable ();
13957 case loop_1_byte:
13958 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
13959 count_exp, QImode, 1, expected_size);
13960 break;
13961 case loop:
13962 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
13963 count_exp, Pmode, 1, expected_size);
13964 break;
13965 case unrolled_loop:
13966 /* Unroll only by a factor of 2 in 32-bit mode, since we don't have enough
13967 registers for 4 temporaries anyway. */
13968 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
13969 count_exp, Pmode, TARGET_64BIT ? 4 : 2,
13970 expected_size);
13971 break;
13972 case rep_prefix_8_byte:
13973 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
13974 DImode);
13975 break;
13976 case rep_prefix_4_byte:
13977 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
13978 SImode);
13979 break;
13980 case rep_prefix_1_byte:
13981 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
13982 QImode);
13983 break;
13984 }
13985 /* Properly adjust the offsets of the src and dest memory for aliasing. */
13986 if (CONST_INT_P (count_exp))
13987 {
13988 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
13989 (count / size_needed) * size_needed);
13990 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
13991 (count / size_needed) * size_needed);
13992 }
13993 else
13994 {
13995 src = change_address (src, BLKmode, srcreg);
13996 dst = change_address (dst, BLKmode, destreg);
13997 }
13998
13999 /* Step 4: Epilogue to copy the remaining bytes. */
14000
14001 if (label)
14002 {
14003 /* When the main loop is done, COUNT_EXP might hold the original count,
14004 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
14005 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
14006 bytes. Compensate if needed. */
14007
14008 if (size_needed < epilogue_size_needed)
14009 {
14010 tmp =
14011 expand_simple_binop (GET_MODE (count_exp), AND, count_exp,
14012 GEN_INT (size_needed - 1), count_exp, 1,
14013 OPTAB_DIRECT);
14014 if (tmp != count_exp)
14015 emit_move_insn (count_exp, tmp);
14016 }
14017 emit_label (label);
14018 LABEL_NUSES (label) = 1;
14019 }
14020
14021 if (count_exp != const0_rtx && epilogue_size_needed > 1)
14022 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
14023 epilogue_size_needed);
14024 if (jump_around_label)
14025 emit_label (jump_around_label);
14026 return 1;
14027 }
14028
14029 /* Helper function for memset. For the QImode value 0xXY produce
14030 0xXYXYXYXY of the width specified by MODE. This is essentially
14031 a * 0x01010101, but we can do slightly better than
14032 synth_mult by unwinding the sequence by hand on CPUs with
14033 slow multiply. */
14034 static rtx
14035 promote_duplicated_reg (enum machine_mode mode, rtx val)
14036 {
14037 enum machine_mode valmode = GET_MODE (val);
14038 rtx tmp;
14039 int nops = mode == DImode ? 3 : 2;
14040
14041 gcc_assert (mode == SImode || mode == DImode);
14042 if (val == const0_rtx)
14043 return copy_to_mode_reg (mode, const0_rtx);
14044 if (CONST_INT_P (val))
14045 {
14046 HOST_WIDE_INT v = INTVAL (val) & 255;
14047
14048 v |= v << 8;
14049 v |= v << 16;
14050 if (mode == DImode)
14051 v |= (v << 16) << 16;
14052 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
14053 }
14054
14055 if (valmode == VOIDmode)
14056 valmode = QImode;
14057 if (valmode != QImode)
14058 val = gen_lowpart (QImode, val);
14059 if (mode == QImode)
14060 return val;
14061 if (!TARGET_PARTIAL_REG_STALL)
14062 nops--;
14063 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
14064 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
14065 <= (ix86_cost->shift_const + ix86_cost->add) * nops
14066 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
14067 {
14068 rtx reg = convert_modes (mode, QImode, val, true);
14069 tmp = promote_duplicated_reg (mode, const1_rtx);
14070 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
14071 OPTAB_DIRECT);
14072 }
14073 else
14074 {
14075 rtx reg = convert_modes (mode, QImode, val, true);
14076
14077 if (!TARGET_PARTIAL_REG_STALL)
14078 if (mode == SImode)
14079 emit_insn (gen_movsi_insv_1 (reg, reg));
14080 else
14081 emit_insn (gen_movdi_insv_1_rex64 (reg, reg));
14082 else
14083 {
14084 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
14085 NULL, 1, OPTAB_DIRECT);
14086 reg =
14087 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
14088 }
14089 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
14090 NULL, 1, OPTAB_DIRECT);
14091 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
14092 if (mode == SImode)
14093 return reg;
14094 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
14095 NULL, 1, OPTAB_DIRECT);
14096 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
14097 return reg;
14098 }
14099 }
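
/* Worked example (illustrative): promoting the QImode value 0x5A to SImode on
   a CPU with slow multiply uses the shift/or unwinding above:

       reg  = 0x0000005A
       reg |= reg << 8;      ; 0x00005A5A   (or an insv insertion on CPUs
       reg |= reg << 16;     ; 0x5A5A5A5A    without partial register stalls)

   On CPUs with a fast multiplier it instead multiplies by the promoted
   constant 0x01010101.  */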
14100
14101 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
14102 will be needed by the main loop copying SIZE_NEEDED chunks and by the
14103 prologue getting alignment from ALIGN to DESIRED_ALIGN. */
14104 static rtx
14105 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
14106 {
14107 rtx promoted_val;
14108
14109 if (TARGET_64BIT
14110 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
14111 promoted_val = promote_duplicated_reg (DImode, val);
14112 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
14113 promoted_val = promote_duplicated_reg (SImode, val);
14114 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
14115 promoted_val = promote_duplicated_reg (HImode, val);
14116 else
14117 promoted_val = val;
14118
14119 return promoted_val;
14120 }
14121
14122 /* Expand string set operation (memset). Use i386 string operations when
14123 profitable. See the ix86_expand_movmem comment for an explanation of the
14124 individual steps performed. */
14125 int
14126 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
14127 rtx expected_align_exp, rtx expected_size_exp)
14128 {
14129 rtx destreg;
14130 rtx label = NULL;
14131 rtx tmp;
14132 rtx jump_around_label = NULL;
14133 HOST_WIDE_INT align = 1;
14134 unsigned HOST_WIDE_INT count = 0;
14135 HOST_WIDE_INT expected_size = -1;
14136 int size_needed = 0, epilogue_size_needed;
14137 int desired_align = 0;
14138 enum stringop_alg alg;
14139 rtx promoted_val = NULL;
14140 bool force_loopy_epilogue = false;
14141 int dynamic_check;
14142
14143 if (CONST_INT_P (align_exp))
14144 align = INTVAL (align_exp);
14145 /* i386 can do misaligned access at reasonably increased cost. */
14146 if (CONST_INT_P (expected_align_exp)
14147 && INTVAL (expected_align_exp) > align)
14148 align = INTVAL (expected_align_exp);
14149 if (CONST_INT_P (count_exp))
14150 count = expected_size = INTVAL (count_exp);
14151 if (CONST_INT_P (expected_size_exp) && count == 0)
14152 expected_size = INTVAL (expected_size_exp);
14153
14154 /* Step 0: Decide on preferred algorithm, desired alignment and
14155 size of chunks to be copied by main loop. */
14156
14157 alg = decide_alg (count, expected_size, true, &dynamic_check);
14158 desired_align = decide_alignment (align, alg, expected_size);
14159
14160 if (!TARGET_ALIGN_STRINGOPS)
14161 align = desired_align;
14162
14163 if (alg == libcall)
14164 return 0;
14165 gcc_assert (alg != no_stringop);
14166 if (!count)
14167 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
14168 destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
14169 switch (alg)
14170 {
14171 case libcall:
14172 case no_stringop:
14173 gcc_unreachable ();
14174 case loop:
14175 size_needed = GET_MODE_SIZE (Pmode);
14176 break;
14177 case unrolled_loop:
14178 size_needed = GET_MODE_SIZE (Pmode) * 4;
14179 break;
14180 case rep_prefix_8_byte:
14181 size_needed = 8;
14182 break;
14183 case rep_prefix_4_byte:
14184 size_needed = 4;
14185 break;
14186 case rep_prefix_1_byte:
14187 case loop_1_byte:
14188 size_needed = 1;
14189 break;
14190 }
14191 epilogue_size_needed = size_needed;
14192
14193 /* Step 1: Prologue guard. */
14194
14195 /* Alignment code needs count to be in register. */
14196 if (CONST_INT_P (count_exp) && desired_align > align)
14197 {
14198 enum machine_mode mode = SImode;
14199 if (TARGET_64BIT && (count & ~0xffffffff))
14200 mode = DImode;
14201 count_exp = force_reg (mode, count_exp);
14202 }
14203 /* Do the cheap promotion to allow better CSE across the
14204 main loop and epilogue (i.e. one load of the big constant in
14205 front of all the code). */
14206 if (CONST_INT_P (val_exp))
14207 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
14208 desired_align, align);
14209 /* Ensure that alignment prologue won't copy past end of block. */
14210 if ((size_needed > 1 || (desired_align > 1 && desired_align > align))
14211 && !count)
14212 {
14213 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
14214
14215 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
14216 Make sure it is a power of 2. */
14217 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
14218
14219 /* To improve performance of small blocks, we jump around the VAL
14220 promoting code. This means that if the promoted VAL is not constant,
14221 we might not use it in the epilogue and have to use the byte
14222 loop variant. */
14223 if (epilogue_size_needed > 2 && !promoted_val)
14224 force_loopy_epilogue = true;
14225 label = gen_label_rtx ();
14226 emit_cmp_and_jump_insns (count_exp,
14227 GEN_INT (epilogue_size_needed),
14228 LTU, 0, GET_MODE (count_exp), 1, label);
14229 if (expected_size == -1 || expected_size <= epilogue_size_needed)
14230 predict_jump (REG_BR_PROB_BASE * 60 / 100);
14231 else
14232 predict_jump (REG_BR_PROB_BASE * 20 / 100);
14233 }
14234 if (dynamic_check != -1)
14235 {
14236 rtx hot_label = gen_label_rtx ();
14237 jump_around_label = gen_label_rtx ();
14238 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
14239 LEU, 0, GET_MODE (count_exp), 1, hot_label);
14240 predict_jump (REG_BR_PROB_BASE * 90 / 100);
14241 set_storage_via_libcall (dst, count_exp, val_exp, false);
14242 emit_jump (jump_around_label);
14243 emit_label (hot_label);
14244 }
14245
14246 /* Step 2: Alignment prologue. */
14247
14248 /* Do the expensive promotion once we branched off the small blocks. */
14249 if (!promoted_val)
14250 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
14251 desired_align, align);
14252 gcc_assert (desired_align >= 1 && align >= 1);
14253
14254 if (desired_align > align)
14255 {
14256 /* Except for the first move in the epilogue, we no longer know
14257 the constant offset in the aliasing info. It doesn't seem worth
14258 the pain to maintain it for the first move, so throw away
14259 the info early. */
14260 dst = change_address (dst, BLKmode, destreg);
14261 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
14262 desired_align);
14263 }
14264 if (label && size_needed == 1)
14265 {
14266 emit_label (label);
14267 LABEL_NUSES (label) = 1;
14268 label = NULL;
14269 }
14270
14271 /* Step 3: Main loop. */
14272
14273 switch (alg)
14274 {
14275 case libcall:
14276 case no_stringop:
14277 gcc_unreachable ();
14278 case loop_1_byte:
14279 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
14280 count_exp, QImode, 1, expected_size);
14281 break;
14282 case loop:
14283 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
14284 count_exp, Pmode, 1, expected_size);
14285 break;
14286 case unrolled_loop:
14287 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
14288 count_exp, Pmode, 4, expected_size);
14289 break;
14290 case rep_prefix_8_byte:
14291 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
14292 DImode);
14293 break;
14294 case rep_prefix_4_byte:
14295 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
14296 SImode);
14297 break;
14298 case rep_prefix_1_byte:
14299 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
14300 QImode);
14301 break;
14302 }
14303 /* Properly adjust the offset of the dest memory for aliasing. */
14304 if (CONST_INT_P (count_exp))
14305 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
14306 (count / size_needed) * size_needed);
14307 else
14308 dst = change_address (dst, BLKmode, destreg);
14309
14310 /* Step 4: Epilogue to copy the remaining bytes. */
14311
14312 if (label)
14313 {
14314 /* When the main loop is done, COUNT_EXP might hold the original count,
14315 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
14316 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
14317 bytes. Compensate if needed. */
14318
14319 if (size_needed < desired_align - align)
14320 {
14321 tmp =
14322 expand_simple_binop (GET_MODE (count_exp), AND, count_exp,
14323 GEN_INT (size_needed - 1), count_exp, 1,
14324 OPTAB_DIRECT);
14325 size_needed = desired_align - align + 1;
14326 if (tmp != count_exp)
14327 emit_move_insn (count_exp, tmp);
14328 }
14329 emit_label (label);
14330 LABEL_NUSES (label) = 1;
14331 }
14332 if (count_exp != const0_rtx && epilogue_size_needed > 1)
14333 {
14334 if (force_loopy_epilogue)
14335 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
14336 size_needed);
14337 else
14338 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
14339 size_needed);
14340 }
14341 if (jump_around_label)
14342 emit_label (jump_around_label);
14343 return 1;
14344 }
14345
14346 /* Expand strlen. */
14347 int
14348 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
14349 {
14350 rtx addr, scratch1, scratch2, scratch3, scratch4;
14351
14352 /* The generic case of the strlen expander is long. Avoid expanding it
14353 unless TARGET_INLINE_ALL_STRINGOPS. */
14354
14355 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
14356 && !TARGET_INLINE_ALL_STRINGOPS
14357 && !optimize_size
14358 && (!CONST_INT_P (align) || INTVAL (align) < 4))
14359 return 0;
14360
14361 addr = force_reg (Pmode, XEXP (src, 0));
14362 scratch1 = gen_reg_rtx (Pmode);
14363
14364 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
14365 && !optimize_size)
14366 {
14367 /* Well it seems that some optimizer does not combine a call like
14368 foo(strlen(bar), strlen(bar));
14369 when the move and the subtraction are done here. It does calculate
14370 the length just once when these instructions are done inside of
14371 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
14372 often used and I use one fewer register for the lifetime of
14373 output_strlen_unroll() this is better. */
14374
14375 emit_move_insn (out, addr);
14376
14377 ix86_expand_strlensi_unroll_1 (out, src, align);
14378
14379 /* strlensi_unroll_1 returns the address of the zero at the end of
14380 the string, like memchr(), so compute the length by subtracting
14381 the start address. */
14382 if (TARGET_64BIT)
14383 emit_insn (gen_subdi3 (out, out, addr));
14384 else
14385 emit_insn (gen_subsi3 (out, out, addr));
14386 }
14387 else
14388 {
14389 rtx unspec;
14390 scratch2 = gen_reg_rtx (Pmode);
14391 scratch3 = gen_reg_rtx (Pmode);
14392 scratch4 = force_reg (Pmode, constm1_rtx);
14393
14394 emit_move_insn (scratch3, addr);
14395 eoschar = force_reg (QImode, eoschar);
14396
14397 src = replace_equiv_address_nv (src, scratch3);
14398
14399 /* If .md starts supporting :P, this can be done in .md. */
14400 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
14401 scratch4), UNSPEC_SCAS);
14402 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
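/* Note on the arithmetic below: the count register starts at -1 and
   "repnz scasb" decrements it once per byte scanned, including the
   terminating zero, so SCRATCH1 ends up holding -(len + 2).  Thus
   len = ~SCRATCH1 - 1, which the one's-complement and add-of-minus-one
   insns compute.  */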
14403 if (TARGET_64BIT)
14404 {
14405 emit_insn (gen_one_cmpldi2 (scratch2, scratch1));
14406 emit_insn (gen_adddi3 (out, scratch2, constm1_rtx));
14407 }
14408 else
14409 {
14410 emit_insn (gen_one_cmplsi2 (scratch2, scratch1));
14411 emit_insn (gen_addsi3 (out, scratch2, constm1_rtx));
14412 }
14413 }
14414 return 1;
14415 }
14416
14417 /* Expand the appropriate insns for doing strlen if not just doing
14418 repnz; scasb
14419
14420 out = result, initialized with the start address
14421 align_rtx = alignment of the address.
14422 scratch = scratch register, initialized with the start address when
14423 not aligned, otherwise undefined
14424
14425 This is just the body. It needs the initializations mentioned above and
14426 some address computing at the end. These things are done in i386.md. */
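/* As a rough C-level sketch (not the exact generated sequence), the body
   emitted below behaves like:

       p = out;                            .. OUT holds the start address
       while ((uintptr_t) p & 3)           .. test at most 3 bytes to align
         if (*p == 0) goto done; else p++;
       do                                  .. then scan one word per pass
         {
           w = *(unsigned int *) p;
           p += 4;
         }
       while (((w - 0x01010101) & ~w & 0x80808080) == 0);
       finally step back to the exact zero byte (branchless via cmov when
       TARGET_CMOVE, otherwise with one extra branch).  */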
14427
14428 static void
14429 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
14430 {
14431 int align;
14432 rtx tmp;
14433 rtx align_2_label = NULL_RTX;
14434 rtx align_3_label = NULL_RTX;
14435 rtx align_4_label = gen_label_rtx ();
14436 rtx end_0_label = gen_label_rtx ();
14437 rtx mem;
14438 rtx tmpreg = gen_reg_rtx (SImode);
14439 rtx scratch = gen_reg_rtx (SImode);
14440 rtx cmp;
14441
14442 align = 0;
14443 if (CONST_INT_P (align_rtx))
14444 align = INTVAL (align_rtx);
14445
14446 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
14447
14448 /* Is there a known alignment and is it less than 4? */
14449 if (align < 4)
14450 {
14451 rtx scratch1 = gen_reg_rtx (Pmode);
14452 emit_move_insn (scratch1, out);
14453 /* Is there a known alignment and is it not 2? */
14454 if (align != 2)
14455 {
14456 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
14457 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
14458
14459 /* Leave just the 3 lower bits. */
14460 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
14461 NULL_RTX, 0, OPTAB_WIDEN);
14462
14463 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
14464 Pmode, 1, align_4_label);
14465 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
14466 Pmode, 1, align_2_label);
14467 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
14468 Pmode, 1, align_3_label);
14469 }
14470 else
14471 {
14472 /* Since the alignment is 2, we have to check 2 or 0 bytes;
14473 check whether it is aligned to a 4-byte boundary. */
14474
14475 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
14476 NULL_RTX, 0, OPTAB_WIDEN);
14477
14478 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
14479 Pmode, 1, align_4_label);
14480 }
14481
14482 mem = change_address (src, QImode, out);
14483
14484 /* Now compare the bytes. */
14485
14486 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
14487 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
14488 QImode, 1, end_0_label);
14489
14490 /* Increment the address. */
14491 if (TARGET_64BIT)
14492 emit_insn (gen_adddi3 (out, out, const1_rtx));
14493 else
14494 emit_insn (gen_addsi3 (out, out, const1_rtx));
14495
14496 /* Not needed with an alignment of 2 */
14497 if (align != 2)
14498 {
14499 emit_label (align_2_label);
14500
14501 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
14502 end_0_label);
14503
14504 if (TARGET_64BIT)
14505 emit_insn (gen_adddi3 (out, out, const1_rtx));
14506 else
14507 emit_insn (gen_addsi3 (out, out, const1_rtx));
14508
14509 emit_label (align_3_label);
14510 }
14511
14512 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
14513 end_0_label);
14514
14515 if (TARGET_64BIT)
14516 emit_insn (gen_adddi3 (out, out, const1_rtx));
14517 else
14518 emit_insn (gen_addsi3 (out, out, const1_rtx));
14519 }
14520
14521 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
14522 align this loop; doing so only makes the program larger and does not
14523 help speed. */
14524 emit_label (align_4_label);
14525
14526 mem = change_address (src, SImode, out);
14527 emit_move_insn (scratch, mem);
14528 if (TARGET_64BIT)
14529 emit_insn (gen_adddi3 (out, out, GEN_INT (4)));
14530 else
14531 emit_insn (gen_addsi3 (out, out, GEN_INT (4)));
14532
14533 /* This formula yields a nonzero result iff one of the bytes is zero.
14534 This saves three branches inside the loop and many cycles. */
14535
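/* For example, with scratch = 0x12003456 (whose third byte is zero):
     scratch - 0x01010101  = 0x10ff3355
     ~scratch              = 0xedffcba9
     AND of the two        = 0x00ff0301
     ... & 0x80808080      = 0x00800000  (nonzero; the 0x80 marks the zero byte)
   while a word with no zero byte always yields 0 and the loop continues.  */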
14536 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
14537 emit_insn (gen_one_cmplsi2 (scratch, scratch));
14538 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
14539 emit_insn (gen_andsi3 (tmpreg, tmpreg,
14540 gen_int_mode (0x80808080, SImode)));
14541 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
14542 align_4_label);
14543
14544 if (TARGET_CMOVE)
14545 {
14546 rtx reg = gen_reg_rtx (SImode);
14547 rtx reg2 = gen_reg_rtx (Pmode);
14548 emit_move_insn (reg, tmpreg);
14549 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
14550
14551 /* If zero is not in the first two bytes, move two bytes forward. */
14552 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
14553 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
14554 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
14555 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
14556 gen_rtx_IF_THEN_ELSE (SImode, tmp,
14557 reg,
14558 tmpreg)));
14559 /* Emit lea manually to avoid clobbering of flags. */
14560 emit_insn (gen_rtx_SET (SImode, reg2,
14561 gen_rtx_PLUS (Pmode, out, const2_rtx)));
14562
14563 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
14564 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
14565 emit_insn (gen_rtx_SET (VOIDmode, out,
14566 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
14567 reg2,
14568 out)));
14569
14570 }
14571 else
14572 {
14573 rtx end_2_label = gen_label_rtx ();
14574 /* Is zero in the first two bytes? */
14575
14576 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
14577 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
14578 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
14579 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
14580 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
14581 pc_rtx);
14582 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
14583 JUMP_LABEL (tmp) = end_2_label;
14584
14585 /* Not in the first two. Move two bytes forward. */
14586 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
14587 if (TARGET_64BIT)
14588 emit_insn (gen_adddi3 (out, out, const2_rtx));
14589 else
14590 emit_insn (gen_addsi3 (out, out, const2_rtx));
14591
14592 emit_label (end_2_label);
14593
14594 }
14595
14596 /* Avoid branch in fixing the byte. */
14597 tmpreg = gen_lowpart (QImode, tmpreg);
14598 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
14599 cmp = gen_rtx_LTU (Pmode, gen_rtx_REG (CCmode, 17), const0_rtx);
14600 if (TARGET_64BIT)
14601 emit_insn (gen_subdi3_carry_rex64 (out, out, GEN_INT (3), cmp));
14602 else
14603 emit_insn (gen_subsi3_carry (out, out, GEN_INT (3), cmp));
14604
14605 emit_label (end_0_label);
14606 }
14607
14608 void
14609 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
14610 rtx callarg2 ATTRIBUTE_UNUSED,
14611 rtx pop, int sibcall)
14612 {
14613 rtx use = NULL, call;
14614
14615 if (pop == const0_rtx)
14616 pop = NULL;
14617 gcc_assert (!TARGET_64BIT || !pop);
14618
14619 if (TARGET_MACHO && !TARGET_64BIT)
14620 {
14621 #if TARGET_MACHO
14622 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
14623 fnaddr = machopic_indirect_call_target (fnaddr);
14624 #endif
14625 }
14626 else
14627 {
14628 /* Static functions and indirect calls don't need the pic register. */
14629 if (! TARGET_64BIT && flag_pic
14630 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
14631 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
14632 use_reg (&use, pic_offset_table_rtx);
14633 }
14634
14635 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
14636 {
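/* In the x86-64 SysV ABI, %al tells a varargs callee how many SSE
   registers were used for the arguments; CALLARG2 carries that count.  */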
14637 rtx al = gen_rtx_REG (QImode, 0);
14638 emit_move_insn (al, callarg2);
14639 use_reg (&use, al);
14640 }
14641
14642 if (! call_insn_operand (XEXP (fnaddr, 0), Pmode))
14643 {
14644 fnaddr = copy_to_mode_reg (Pmode, XEXP (fnaddr, 0));
14645 fnaddr = gen_rtx_MEM (QImode, fnaddr);
14646 }
14647 if (sibcall && TARGET_64BIT
14648 && !constant_call_address_operand (XEXP (fnaddr, 0), Pmode))
14649 {
14650 rtx addr;
14651 addr = copy_to_mode_reg (Pmode, XEXP (fnaddr, 0));
14652 fnaddr = gen_rtx_REG (Pmode, R11_REG);
14653 emit_move_insn (fnaddr, addr);
14654 fnaddr = gen_rtx_MEM (QImode, fnaddr);
14655 }
14656
14657 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
14658 if (retval)
14659 call = gen_rtx_SET (VOIDmode, retval, call);
14660 if (pop)
14661 {
14662 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
14663 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
14664 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, call, pop));
14665 }
14666
14667 call = emit_call_insn (call);
14668 if (use)
14669 CALL_INSN_FUNCTION_USAGE (call) = use;
14670 }
14671
14672 \f
14673 /* Clear stack slot assignments remembered from previous functions.
14674 This is called from INIT_EXPANDERS once before RTL is emitted for each
14675 function. */
14676
14677 static struct machine_function *
14678 ix86_init_machine_status (void)
14679 {
14680 struct machine_function *f;
14681
14682 f = ggc_alloc_cleared (sizeof (struct machine_function));
14683 f->use_fast_prologue_epilogue_nregs = -1;
14684 f->tls_descriptor_call_expanded_p = 0;
14685
14686 return f;
14687 }
14688
14689 /* Return a MEM corresponding to a stack slot with mode MODE.
14690 Allocate a new slot if necessary.
14691
14692 The RTL for a function can have several slots available: N is
14693 which slot to use. */
14694
14695 rtx
14696 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
14697 {
14698 struct stack_local_entry *s;
14699
14700 gcc_assert (n < MAX_386_STACK_LOCALS);
14701
14702 for (s = ix86_stack_locals; s; s = s->next)
14703 if (s->mode == mode && s->n == n)
14704 return copy_rtx (s->rtl);
14705
14706 s = (struct stack_local_entry *)
14707 ggc_alloc (sizeof (struct stack_local_entry));
14708 s->n = n;
14709 s->mode = mode;
14710 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
14711
14712 s->next = ix86_stack_locals;
14713 ix86_stack_locals = s;
14714 return s->rtl;
14715 }
14716
14717 /* Construct the SYMBOL_REF for the tls_get_addr function. */
14718
14719 static GTY(()) rtx ix86_tls_symbol;
14720 rtx
14721 ix86_tls_get_addr (void)
14722 {
14723
14724 if (!ix86_tls_symbol)
14725 {
14726 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode,
14727 (TARGET_ANY_GNU_TLS
14728 && !TARGET_64BIT)
14729 ? "___tls_get_addr"
14730 : "__tls_get_addr");
14731 }
14732
14733 return ix86_tls_symbol;
14734 }
14735
14736 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
14737
14738 static GTY(()) rtx ix86_tls_module_base_symbol;
14739 rtx
14740 ix86_tls_module_base (void)
14741 {
14742
14743 if (!ix86_tls_module_base_symbol)
14744 {
14745 ix86_tls_module_base_symbol = gen_rtx_SYMBOL_REF (Pmode,
14746 "_TLS_MODULE_BASE_");
14747 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
14748 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
14749 }
14750
14751 return ix86_tls_module_base_symbol;
14752 }
14753 \f
14754 /* Calculate the length of the memory address in the instruction
14755 encoding. Does not include the one-byte modrm, opcode, or prefix. */
14756
14757 int
14758 memory_address_length (rtx addr)
14759 {
14760 struct ix86_address parts;
14761 rtx base, index, disp;
14762 int len;
14763 int ok;
14764
14765 if (GET_CODE (addr) == PRE_DEC
14766 || GET_CODE (addr) == POST_INC
14767 || GET_CODE (addr) == PRE_MODIFY
14768 || GET_CODE (addr) == POST_MODIFY)
14769 return 0;
14770
14771 ok = ix86_decompose_address (addr, &parts);
14772 gcc_assert (ok);
14773
14774 if (parts.base && GET_CODE (parts.base) == SUBREG)
14775 parts.base = SUBREG_REG (parts.base);
14776 if (parts.index && GET_CODE (parts.index) == SUBREG)
14777 parts.index = SUBREG_REG (parts.index);
14778
14779 base = parts.base;
14780 index = parts.index;
14781 disp = parts.disp;
14782 len = 0;
14783
14784 /* Rule of thumb:
14785 - esp as the base always wants an index,
14786 - ebp as the base always wants a displacement. */
14787
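/* A few illustrative lengths as computed below (SIB/displacement bytes
   only, excluding the opcode and modrm byte):
     (%eax)          -> 0
     (%esp)          -> 1   (SIB byte)
     (%ebp)          -> 1   (zero disp8)
     8(%eax)         -> 1   (disp8)
     0x1000(%eax)    -> 4   (disp32)
     8(%eax,%ebx,4)  -> 2   (SIB + disp8)
     symbol          -> 4   (disp32, absolute)  */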
14788 /* Register Indirect. */
14789 if (base && !index && !disp)
14790 {
14791 /* esp (for its index) and ebp (for its displacement) need
14792 the two-byte modrm form. */
14793 if (addr == stack_pointer_rtx
14794 || addr == arg_pointer_rtx
14795 || addr == frame_pointer_rtx
14796 || addr == hard_frame_pointer_rtx)
14797 len = 1;
14798 }
14799
14800 /* Direct Addressing. */
14801 else if (disp && !base && !index)
14802 len = 4;
14803
14804 else
14805 {
14806 /* Find the length of the displacement constant. */
14807 if (disp)
14808 {
14809 if (base && satisfies_constraint_K (disp))
14810 len = 1;
14811 else
14812 len = 4;
14813 }
14814 /* ebp always wants a displacement. */
14815 else if (base == hard_frame_pointer_rtx)
14816 len = 1;
14817
14818 /* An index requires the two-byte modrm form.... */
14819 if (index
14820 /* ...like esp, which always wants an index. */
14821 || base == stack_pointer_rtx
14822 || base == arg_pointer_rtx
14823 || base == frame_pointer_rtx)
14824 len += 1;
14825 }
14826
14827 return len;
14828 }
14829
14830 /* Compute default value for "length_immediate" attribute. When SHORTFORM
14831 is set, expect that the insn has an 8-bit immediate alternative. */
14832 int
14833 ix86_attr_length_immediate_default (rtx insn, int shortform)
14834 {
14835 int len = 0;
14836 int i;
14837 extract_insn_cached (insn);
14838 for (i = recog_data.n_operands - 1; i >= 0; --i)
14839 if (CONSTANT_P (recog_data.operand[i]))
14840 {
14841 gcc_assert (!len);
14842 if (shortform && satisfies_constraint_K (recog_data.operand[i]))
14843 len = 1;
14844 else
14845 {
14846 switch (get_attr_mode (insn))
14847 {
14848 case MODE_QI:
14849 len+=1;
14850 break;
14851 case MODE_HI:
14852 len+=2;
14853 break;
14854 case MODE_SI:
14855 len+=4;
14856 break;
14857 /* Immediates for DImode instructions are encoded as 32-bit sign-extended values. */
14858 case MODE_DI:
14859 len+=4;
14860 break;
14861 default:
14862 fatal_insn ("unknown insn mode", insn);
14863 }
14864 }
14865 }
14866 return len;
14867 }
14868 /* Compute default value for "length_address" attribute. */
14869 int
14870 ix86_attr_length_address_default (rtx insn)
14871 {
14872 int i;
14873
14874 if (get_attr_type (insn) == TYPE_LEA)
14875 {
14876 rtx set = PATTERN (insn);
14877
14878 if (GET_CODE (set) == PARALLEL)
14879 set = XVECEXP (set, 0, 0);
14880
14881 gcc_assert (GET_CODE (set) == SET);
14882
14883 return memory_address_length (SET_SRC (set));
14884 }
14885
14886 extract_insn_cached (insn);
14887 for (i = recog_data.n_operands - 1; i >= 0; --i)
14888 if (MEM_P (recog_data.operand[i]))
14889 {
14890 return memory_address_length (XEXP (recog_data.operand[i], 0));
14891 break;
14892 }
14893 return 0;
14894 }
14895 \f
14896 /* Return the maximum number of instructions a cpu can issue. */
14897
14898 static int
14899 ix86_issue_rate (void)
14900 {
14901 switch (ix86_tune)
14902 {
14903 case PROCESSOR_PENTIUM:
14904 case PROCESSOR_K6:
14905 return 2;
14906
14907 case PROCESSOR_PENTIUMPRO:
14908 case PROCESSOR_PENTIUM4:
14909 case PROCESSOR_ATHLON:
14910 case PROCESSOR_K8:
14911 case PROCESSOR_AMDFAM10:
14912 case PROCESSOR_NOCONA:
14913 case PROCESSOR_GENERIC32:
14914 case PROCESSOR_GENERIC64:
14915 return 3;
14916
14917 case PROCESSOR_CORE2:
14918 return 4;
14919
14920 default:
14921 return 1;
14922 }
14923 }
14924
14925 /* A subroutine of ix86_adjust_cost -- return true iff INSN reads flags set
14926 by DEP_INSN and nothing else set by DEP_INSN. */
14927
14928 static int
14929 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
14930 {
14931 rtx set, set2;
14932
14933 /* Simplify the test for uninteresting insns. */
14934 if (insn_type != TYPE_SETCC
14935 && insn_type != TYPE_ICMOV
14936 && insn_type != TYPE_FCMOV
14937 && insn_type != TYPE_IBR)
14938 return 0;
14939
14940 if ((set = single_set (dep_insn)) != 0)
14941 {
14942 set = SET_DEST (set);
14943 set2 = NULL_RTX;
14944 }
14945 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
14946 && XVECLEN (PATTERN (dep_insn), 0) == 2
14947 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
14948 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
14949 {
14950 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
14951 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
14952 }
14953 else
14954 return 0;
14955
14956 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
14957 return 0;
14958
14959 /* This test is true if the dependent insn reads the flags but
14960 not any other potentially set register. */
14961 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
14962 return 0;
14963
14964 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
14965 return 0;
14966
14967 return 1;
14968 }
14969
14970 /* A subroutine of ix86_adjust_cost -- return true iff INSN has a memory
14971 address with operands set by DEP_INSN. */
14972
14973 static int
14974 ix86_agi_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
14975 {
14976 rtx addr;
14977
14978 if (insn_type == TYPE_LEA
14979 && TARGET_PENTIUM)
14980 {
14981 addr = PATTERN (insn);
14982
14983 if (GET_CODE (addr) == PARALLEL)
14984 addr = XVECEXP (addr, 0, 0);
14985
14986 gcc_assert (GET_CODE (addr) == SET);
14987
14988 addr = SET_SRC (addr);
14989 }
14990 else
14991 {
14992 int i;
14993 extract_insn_cached (insn);
14994 for (i = recog_data.n_operands - 1; i >= 0; --i)
14995 if (MEM_P (recog_data.operand[i]))
14996 {
14997 addr = XEXP (recog_data.operand[i], 0);
14998 goto found;
14999 }
15000 return 0;
15001 found:;
15002 }
15003
15004 return modified_in_p (addr, dep_insn);
15005 }
15006
15007 static int
15008 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
15009 {
15010 enum attr_type insn_type, dep_insn_type;
15011 enum attr_memory memory;
15012 rtx set, set2;
15013 int dep_insn_code_number;
15014
15015 /* Anti and output dependencies have zero cost on all CPUs. */
15016 if (REG_NOTE_KIND (link) != 0)
15017 return 0;
15018
15019 dep_insn_code_number = recog_memoized (dep_insn);
15020
15021 /* If we can't recognize the insns, we can't really do anything. */
15022 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
15023 return cost;
15024
15025 insn_type = get_attr_type (insn);
15026 dep_insn_type = get_attr_type (dep_insn);
15027
15028 switch (ix86_tune)
15029 {
15030 case PROCESSOR_PENTIUM:
15031 /* Address Generation Interlock adds a cycle of latency. */
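/* E.g. on the Pentium, "addl $4, %ebx" immediately followed by
   "movl (%ebx), %eax" pays an extra cycle because the load's address
   depends on the register written by the previous insn.  */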
15032 if (ix86_agi_dependent (insn, dep_insn, insn_type))
15033 cost += 1;
15034
15035 /* ??? Compares pair with jump/setcc. */
15036 if (ix86_flags_dependent (insn, dep_insn, insn_type))
15037 cost = 0;
15038
15039 /* Floating point stores require value to be ready one cycle earlier. */
15040 if (insn_type == TYPE_FMOV
15041 && get_attr_memory (insn) == MEMORY_STORE
15042 && !ix86_agi_dependent (insn, dep_insn, insn_type))
15043 cost += 1;
15044 break;
15045
15046 case PROCESSOR_PENTIUMPRO:
15047 memory = get_attr_memory (insn);
15048
15049 /* INT->FP conversion is expensive. */
15050 if (get_attr_fp_int_src (dep_insn))
15051 cost += 5;
15052
15053 /* There is one cycle extra latency between an FP op and a store. */
15054 if (insn_type == TYPE_FMOV
15055 && (set = single_set (dep_insn)) != NULL_RTX
15056 && (set2 = single_set (insn)) != NULL_RTX
15057 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
15058 && MEM_P (SET_DEST (set2)))
15059 cost += 1;
15060
15061 /* Model the ability of the reorder buffer to hide the latency of a load
15062 by executing it in parallel with the previous instruction, when the
15063 previous instruction is not needed to compute the address. */
15064 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
15065 && !ix86_agi_dependent (insn, dep_insn, insn_type))
15066 {
15067 /* Claim moves take one cycle, as the core can issue one load
15068 at a time and the next load can start a cycle later. */
15069 if (dep_insn_type == TYPE_IMOV
15070 || dep_insn_type == TYPE_FMOV)
15071 cost = 1;
15072 else if (cost > 1)
15073 cost--;
15074 }
15075 break;
15076
15077 case PROCESSOR_K6:
15078 memory = get_attr_memory (insn);
15079
15080 /* The esp dependency is resolved before the instruction is really
15081 finished. */
15082 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
15083 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
15084 return 1;
15085
15086 /* INT->FP conversion is expensive. */
15087 if (get_attr_fp_int_src (dep_insn))
15088 cost += 5;
15089
15090 /* Model the ability of the reorder buffer to hide the latency of a load
15091 by executing it in parallel with the previous instruction, when the
15092 previous instruction is not needed to compute the address. */
15093 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
15094 && !ix86_agi_dependent (insn, dep_insn, insn_type))
15095 {
15096 /* Claim moves take one cycle, as the core can issue one load
15097 at a time and the next load can start a cycle later. */
15098 if (dep_insn_type == TYPE_IMOV
15099 || dep_insn_type == TYPE_FMOV)
15100 cost = 1;
15101 else if (cost > 2)
15102 cost -= 2;
15103 else
15104 cost = 1;
15105 }
15106 break;
15107
15108 case PROCESSOR_ATHLON:
15109 case PROCESSOR_K8:
15110 case PROCESSOR_AMDFAM10:
15111 case PROCESSOR_GENERIC32:
15112 case PROCESSOR_GENERIC64:
15113 memory = get_attr_memory (insn);
15114
15115 /* Model the ability of the reorder buffer to hide the latency of a load
15116 by executing it in parallel with the previous instruction, when the
15117 previous instruction is not needed to compute the address. */
15118 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
15119 && !ix86_agi_dependent (insn, dep_insn, insn_type))
15120 {
15121 enum attr_unit unit = get_attr_unit (insn);
15122 int loadcost = 3;
15123
15124 /* Because of the difference between the length of integer and
15125 floating unit pipeline preparation stages, the memory operands
15126 for floating point are cheaper.
15127
15128 ??? For Athlon the difference is most probably 2. */
15129 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
15130 loadcost = 3;
15131 else
15132 loadcost = TARGET_ATHLON ? 2 : 0;
15133
15134 if (cost >= loadcost)
15135 cost -= loadcost;
15136 else
15137 cost = 0;
15138 }
15139
15140 default:
15141 break;
15142 }
15143
15144 return cost;
15145 }
15146
15147 /* How many alternative schedules to try. This should be as wide as the
15148 scheduling freedom in the DFA, but no wider. Making this value too
15149 large results in extra work for the scheduler. */
15150
15151 static int
15152 ia32_multipass_dfa_lookahead (void)
15153 {
15154 if (ix86_tune == PROCESSOR_PENTIUM)
15155 return 2;
15156
15157 if (ix86_tune == PROCESSOR_PENTIUMPRO
15158 || ix86_tune == PROCESSOR_K6)
15159 return 1;
15160
15161 else
15162 return 0;
15163 }
15164
15165 \f
15166 /* Compute the alignment given to a constant that is being placed in memory.
15167 EXP is the constant and ALIGN is the alignment that the object would
15168 ordinarily have.
15169 The value of this function is used instead of that alignment to align
15170 the object. */
15171
15172 int
15173 ix86_constant_alignment (tree exp, int align)
15174 {
15175 if (TREE_CODE (exp) == REAL_CST)
15176 {
15177 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
15178 return 64;
15179 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
15180 return 128;
15181 }
15182 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
15183 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
15184 return BITS_PER_WORD;
15185
15186 return align;
15187 }
15188
15189 /* Compute the alignment for a static variable.
15190 TYPE is the data type, and ALIGN is the alignment that
15191 the object would ordinarily have. The value of this function is used
15192 instead of that alignment to align the object. */
15193
15194 int
15195 ix86_data_alignment (tree type, int align)
15196 {
15197 int max_align = optimize_size ? BITS_PER_WORD : 256;
15198
15199 if (AGGREGATE_TYPE_P (type)
15200 && TYPE_SIZE (type)
15201 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
15202 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
15203 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
15204 && align < max_align)
15205 align = max_align;
15206
15207 /* x86-64 ABI requires arrays greater than 16 bytes to be aligned
15208 to a 16-byte boundary. */
15209 if (TARGET_64BIT)
15210 {
15211 if (AGGREGATE_TYPE_P (type)
15212 && TYPE_SIZE (type)
15213 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
15214 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
15215 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
15216 return 128;
15217 }
15218
15219 if (TREE_CODE (type) == ARRAY_TYPE)
15220 {
15221 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
15222 return 64;
15223 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
15224 return 128;
15225 }
15226 else if (TREE_CODE (type) == COMPLEX_TYPE)
15227 {
15228
15229 if (TYPE_MODE (type) == DCmode && align < 64)
15230 return 64;
15231 if (TYPE_MODE (type) == XCmode && align < 128)
15232 return 128;
15233 }
15234 else if ((TREE_CODE (type) == RECORD_TYPE
15235 || TREE_CODE (type) == UNION_TYPE
15236 || TREE_CODE (type) == QUAL_UNION_TYPE)
15237 && TYPE_FIELDS (type))
15238 {
15239 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
15240 return 64;
15241 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
15242 return 128;
15243 }
15244 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
15245 || TREE_CODE (type) == INTEGER_TYPE)
15246 {
15247 if (TYPE_MODE (type) == DFmode && align < 64)
15248 return 64;
15249 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
15250 return 128;
15251 }
15252
15253 return align;
15254 }
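/* For instance, when not optimizing for size, a file-scope
   "static double table[64]" (an aggregate of 512 bytes) is bumped to
   256-bit alignment above, while a lone "double d" keeps its natural
   64-bit alignment via the REAL_TYPE/DFmode case.  */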
15255
15256 /* Compute the alignment for a local variable.
15257 TYPE is the data type, and ALIGN is the alignment that
15258 the object would ordinarily have. The value of this macro is used
15259 instead of that alignment to align the object. */
15260
15261 int
15262 ix86_local_alignment (tree type, int align)
15263 {
15264 /* x86-64 ABI requires arrays greater than 16 bytes to be aligned
15265 to a 16-byte boundary. */
15266 if (TARGET_64BIT)
15267 {
15268 if (AGGREGATE_TYPE_P (type)
15269 && TYPE_SIZE (type)
15270 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
15271 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
15272 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
15273 return 128;
15274 }
15275 if (TREE_CODE (type) == ARRAY_TYPE)
15276 {
15277 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
15278 return 64;
15279 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
15280 return 128;
15281 }
15282 else if (TREE_CODE (type) == COMPLEX_TYPE)
15283 {
15284 if (TYPE_MODE (type) == DCmode && align < 64)
15285 return 64;
15286 if (TYPE_MODE (type) == XCmode && align < 128)
15287 return 128;
15288 }
15289 else if ((TREE_CODE (type) == RECORD_TYPE
15290 || TREE_CODE (type) == UNION_TYPE
15291 || TREE_CODE (type) == QUAL_UNION_TYPE)
15292 && TYPE_FIELDS (type))
15293 {
15294 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
15295 return 64;
15296 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
15297 return 128;
15298 }
15299 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
15300 || TREE_CODE (type) == INTEGER_TYPE)
15301 {
15302
15303 if (TYPE_MODE (type) == DFmode && align < 64)
15304 return 64;
15305 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
15306 return 128;
15307 }
15308 return align;
15309 }
15310 \f
15311 /* Emit RTL insns to initialize the variable parts of a trampoline.
15312 FNADDR is an RTX for the address of the function's pure code.
15313 CXT is an RTX for the static chain value for the function. */
15314 void
15315 x86_initialize_trampoline (rtx tramp, rtx fnaddr, rtx cxt)
15316 {
15317 if (!TARGET_64BIT)
15318 {
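/* The 32-bit trampoline written below is 10 bytes:
     offset 0:  b9 <cxt,  4 bytes>    movl  $CXT, %ecx
     offset 5:  e9 <disp, 4 bytes>    jmp   FNADDR   (rel32)
   where DISP = FNADDR - (TRAMP + 10), i.e. the jump is relative to the
   end of the trampoline.  */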
15319 /* Compute offset from the end of the jmp to the target function. */
15320 rtx disp = expand_binop (SImode, sub_optab, fnaddr,
15321 plus_constant (tramp, 10),
15322 NULL_RTX, 1, OPTAB_DIRECT);
15323 emit_move_insn (gen_rtx_MEM (QImode, tramp),
15324 gen_int_mode (0xb9, QImode));
15325 emit_move_insn (gen_rtx_MEM (SImode, plus_constant (tramp, 1)), cxt);
15326 emit_move_insn (gen_rtx_MEM (QImode, plus_constant (tramp, 5)),
15327 gen_int_mode (0xe9, QImode));
15328 emit_move_insn (gen_rtx_MEM (SImode, plus_constant (tramp, 6)), disp);
15329 }
15330 else
15331 {
15332 int offset = 0;
15333 /* Try to load the address using the shorter movl instead of movabs.
15334 We may want to support movq for kernel mode, but the kernel does not
15335 use trampolines at the moment. */
15336 if (x86_64_zext_immediate_operand (fnaddr, VOIDmode))
15337 {
15338 fnaddr = copy_to_mode_reg (DImode, fnaddr);
15339 emit_move_insn (gen_rtx_MEM (HImode, plus_constant (tramp, offset)),
15340 gen_int_mode (0xbb41, HImode));
15341 emit_move_insn (gen_rtx_MEM (SImode, plus_constant (tramp, offset + 2)),
15342 gen_lowpart (SImode, fnaddr));
15343 offset += 6;
15344 }
15345 else
15346 {
15347 emit_move_insn (gen_rtx_MEM (HImode, plus_constant (tramp, offset)),
15348 gen_int_mode (0xbb49, HImode));
15349 emit_move_insn (gen_rtx_MEM (DImode, plus_constant (tramp, offset + 2)),
15350 fnaddr);
15351 offset += 10;
15352 }
15353 /* Load static chain using movabs to r10. */
15354 emit_move_insn (gen_rtx_MEM (HImode, plus_constant (tramp, offset)),
15355 gen_int_mode (0xba49, HImode));
15356 emit_move_insn (gen_rtx_MEM (DImode, plus_constant (tramp, offset + 2)),
15357 cxt);
15358 offset += 10;
15359 /* Jump to r11. */
15360 emit_move_insn (gen_rtx_MEM (HImode, plus_constant (tramp, offset)),
15361 gen_int_mode (0xff49, HImode));
15362 emit_move_insn (gen_rtx_MEM (QImode, plus_constant (tramp, offset+2)),
15363 gen_int_mode (0xe3, QImode));
15364 offset += 3;
15365 gcc_assert (offset <= TRAMPOLINE_SIZE);
15366 }
15367
15368 #ifdef ENABLE_EXECUTE_STACK
15369 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
15370 LCT_NORMAL, VOIDmode, 1, tramp, Pmode);
15371 #endif
15372 }
15373 \f
15374 /* Codes for all the SSE/MMX builtins. */
15375 enum ix86_builtins
15376 {
15377 IX86_BUILTIN_ADDPS,
15378 IX86_BUILTIN_ADDSS,
15379 IX86_BUILTIN_DIVPS,
15380 IX86_BUILTIN_DIVSS,
15381 IX86_BUILTIN_MULPS,
15382 IX86_BUILTIN_MULSS,
15383 IX86_BUILTIN_SUBPS,
15384 IX86_BUILTIN_SUBSS,
15385
15386 IX86_BUILTIN_CMPEQPS,
15387 IX86_BUILTIN_CMPLTPS,
15388 IX86_BUILTIN_CMPLEPS,
15389 IX86_BUILTIN_CMPGTPS,
15390 IX86_BUILTIN_CMPGEPS,
15391 IX86_BUILTIN_CMPNEQPS,
15392 IX86_BUILTIN_CMPNLTPS,
15393 IX86_BUILTIN_CMPNLEPS,
15394 IX86_BUILTIN_CMPNGTPS,
15395 IX86_BUILTIN_CMPNGEPS,
15396 IX86_BUILTIN_CMPORDPS,
15397 IX86_BUILTIN_CMPUNORDPS,
15398 IX86_BUILTIN_CMPEQSS,
15399 IX86_BUILTIN_CMPLTSS,
15400 IX86_BUILTIN_CMPLESS,
15401 IX86_BUILTIN_CMPNEQSS,
15402 IX86_BUILTIN_CMPNLTSS,
15403 IX86_BUILTIN_CMPNLESS,
15404 IX86_BUILTIN_CMPNGTSS,
15405 IX86_BUILTIN_CMPNGESS,
15406 IX86_BUILTIN_CMPORDSS,
15407 IX86_BUILTIN_CMPUNORDSS,
15408
15409 IX86_BUILTIN_COMIEQSS,
15410 IX86_BUILTIN_COMILTSS,
15411 IX86_BUILTIN_COMILESS,
15412 IX86_BUILTIN_COMIGTSS,
15413 IX86_BUILTIN_COMIGESS,
15414 IX86_BUILTIN_COMINEQSS,
15415 IX86_BUILTIN_UCOMIEQSS,
15416 IX86_BUILTIN_UCOMILTSS,
15417 IX86_BUILTIN_UCOMILESS,
15418 IX86_BUILTIN_UCOMIGTSS,
15419 IX86_BUILTIN_UCOMIGESS,
15420 IX86_BUILTIN_UCOMINEQSS,
15421
15422 IX86_BUILTIN_CVTPI2PS,
15423 IX86_BUILTIN_CVTPS2PI,
15424 IX86_BUILTIN_CVTSI2SS,
15425 IX86_BUILTIN_CVTSI642SS,
15426 IX86_BUILTIN_CVTSS2SI,
15427 IX86_BUILTIN_CVTSS2SI64,
15428 IX86_BUILTIN_CVTTPS2PI,
15429 IX86_BUILTIN_CVTTSS2SI,
15430 IX86_BUILTIN_CVTTSS2SI64,
15431
15432 IX86_BUILTIN_MAXPS,
15433 IX86_BUILTIN_MAXSS,
15434 IX86_BUILTIN_MINPS,
15435 IX86_BUILTIN_MINSS,
15436
15437 IX86_BUILTIN_LOADUPS,
15438 IX86_BUILTIN_STOREUPS,
15439 IX86_BUILTIN_MOVSS,
15440
15441 IX86_BUILTIN_MOVHLPS,
15442 IX86_BUILTIN_MOVLHPS,
15443 IX86_BUILTIN_LOADHPS,
15444 IX86_BUILTIN_LOADLPS,
15445 IX86_BUILTIN_STOREHPS,
15446 IX86_BUILTIN_STORELPS,
15447
15448 IX86_BUILTIN_MASKMOVQ,
15449 IX86_BUILTIN_MOVMSKPS,
15450 IX86_BUILTIN_PMOVMSKB,
15451
15452 IX86_BUILTIN_MOVNTPS,
15453 IX86_BUILTIN_MOVNTQ,
15454
15455 IX86_BUILTIN_LOADDQU,
15456 IX86_BUILTIN_STOREDQU,
15457
15458 IX86_BUILTIN_PACKSSWB,
15459 IX86_BUILTIN_PACKSSDW,
15460 IX86_BUILTIN_PACKUSWB,
15461
15462 IX86_BUILTIN_PADDB,
15463 IX86_BUILTIN_PADDW,
15464 IX86_BUILTIN_PADDD,
15465 IX86_BUILTIN_PADDQ,
15466 IX86_BUILTIN_PADDSB,
15467 IX86_BUILTIN_PADDSW,
15468 IX86_BUILTIN_PADDUSB,
15469 IX86_BUILTIN_PADDUSW,
15470 IX86_BUILTIN_PSUBB,
15471 IX86_BUILTIN_PSUBW,
15472 IX86_BUILTIN_PSUBD,
15473 IX86_BUILTIN_PSUBQ,
15474 IX86_BUILTIN_PSUBSB,
15475 IX86_BUILTIN_PSUBSW,
15476 IX86_BUILTIN_PSUBUSB,
15477 IX86_BUILTIN_PSUBUSW,
15478
15479 IX86_BUILTIN_PAND,
15480 IX86_BUILTIN_PANDN,
15481 IX86_BUILTIN_POR,
15482 IX86_BUILTIN_PXOR,
15483
15484 IX86_BUILTIN_PAVGB,
15485 IX86_BUILTIN_PAVGW,
15486
15487 IX86_BUILTIN_PCMPEQB,
15488 IX86_BUILTIN_PCMPEQW,
15489 IX86_BUILTIN_PCMPEQD,
15490 IX86_BUILTIN_PCMPGTB,
15491 IX86_BUILTIN_PCMPGTW,
15492 IX86_BUILTIN_PCMPGTD,
15493
15494 IX86_BUILTIN_PMADDWD,
15495
15496 IX86_BUILTIN_PMAXSW,
15497 IX86_BUILTIN_PMAXUB,
15498 IX86_BUILTIN_PMINSW,
15499 IX86_BUILTIN_PMINUB,
15500
15501 IX86_BUILTIN_PMULHUW,
15502 IX86_BUILTIN_PMULHW,
15503 IX86_BUILTIN_PMULLW,
15504
15505 IX86_BUILTIN_PSADBW,
15506 IX86_BUILTIN_PSHUFW,
15507
15508 IX86_BUILTIN_PSLLW,
15509 IX86_BUILTIN_PSLLD,
15510 IX86_BUILTIN_PSLLQ,
15511 IX86_BUILTIN_PSRAW,
15512 IX86_BUILTIN_PSRAD,
15513 IX86_BUILTIN_PSRLW,
15514 IX86_BUILTIN_PSRLD,
15515 IX86_BUILTIN_PSRLQ,
15516 IX86_BUILTIN_PSLLWI,
15517 IX86_BUILTIN_PSLLDI,
15518 IX86_BUILTIN_PSLLQI,
15519 IX86_BUILTIN_PSRAWI,
15520 IX86_BUILTIN_PSRADI,
15521 IX86_BUILTIN_PSRLWI,
15522 IX86_BUILTIN_PSRLDI,
15523 IX86_BUILTIN_PSRLQI,
15524
15525 IX86_BUILTIN_PUNPCKHBW,
15526 IX86_BUILTIN_PUNPCKHWD,
15527 IX86_BUILTIN_PUNPCKHDQ,
15528 IX86_BUILTIN_PUNPCKLBW,
15529 IX86_BUILTIN_PUNPCKLWD,
15530 IX86_BUILTIN_PUNPCKLDQ,
15531
15532 IX86_BUILTIN_SHUFPS,
15533
15534 IX86_BUILTIN_RCPPS,
15535 IX86_BUILTIN_RCPSS,
15536 IX86_BUILTIN_RSQRTPS,
15537 IX86_BUILTIN_RSQRTSS,
15538 IX86_BUILTIN_SQRTPS,
15539 IX86_BUILTIN_SQRTSS,
15540
15541 IX86_BUILTIN_UNPCKHPS,
15542 IX86_BUILTIN_UNPCKLPS,
15543
15544 IX86_BUILTIN_ANDPS,
15545 IX86_BUILTIN_ANDNPS,
15546 IX86_BUILTIN_ORPS,
15547 IX86_BUILTIN_XORPS,
15548
15549 IX86_BUILTIN_EMMS,
15550 IX86_BUILTIN_LDMXCSR,
15551 IX86_BUILTIN_STMXCSR,
15552 IX86_BUILTIN_SFENCE,
15553
15554 /* 3DNow! Original */
15555 IX86_BUILTIN_FEMMS,
15556 IX86_BUILTIN_PAVGUSB,
15557 IX86_BUILTIN_PF2ID,
15558 IX86_BUILTIN_PFACC,
15559 IX86_BUILTIN_PFADD,
15560 IX86_BUILTIN_PFCMPEQ,
15561 IX86_BUILTIN_PFCMPGE,
15562 IX86_BUILTIN_PFCMPGT,
15563 IX86_BUILTIN_PFMAX,
15564 IX86_BUILTIN_PFMIN,
15565 IX86_BUILTIN_PFMUL,
15566 IX86_BUILTIN_PFRCP,
15567 IX86_BUILTIN_PFRCPIT1,
15568 IX86_BUILTIN_PFRCPIT2,
15569 IX86_BUILTIN_PFRSQIT1,
15570 IX86_BUILTIN_PFRSQRT,
15571 IX86_BUILTIN_PFSUB,
15572 IX86_BUILTIN_PFSUBR,
15573 IX86_BUILTIN_PI2FD,
15574 IX86_BUILTIN_PMULHRW,
15575
15576 /* 3DNow! Athlon Extensions */
15577 IX86_BUILTIN_PF2IW,
15578 IX86_BUILTIN_PFNACC,
15579 IX86_BUILTIN_PFPNACC,
15580 IX86_BUILTIN_PI2FW,
15581 IX86_BUILTIN_PSWAPDSI,
15582 IX86_BUILTIN_PSWAPDSF,
15583
15584 /* SSE2 */
15585 IX86_BUILTIN_ADDPD,
15586 IX86_BUILTIN_ADDSD,
15587 IX86_BUILTIN_DIVPD,
15588 IX86_BUILTIN_DIVSD,
15589 IX86_BUILTIN_MULPD,
15590 IX86_BUILTIN_MULSD,
15591 IX86_BUILTIN_SUBPD,
15592 IX86_BUILTIN_SUBSD,
15593
15594 IX86_BUILTIN_CMPEQPD,
15595 IX86_BUILTIN_CMPLTPD,
15596 IX86_BUILTIN_CMPLEPD,
15597 IX86_BUILTIN_CMPGTPD,
15598 IX86_BUILTIN_CMPGEPD,
15599 IX86_BUILTIN_CMPNEQPD,
15600 IX86_BUILTIN_CMPNLTPD,
15601 IX86_BUILTIN_CMPNLEPD,
15602 IX86_BUILTIN_CMPNGTPD,
15603 IX86_BUILTIN_CMPNGEPD,
15604 IX86_BUILTIN_CMPORDPD,
15605 IX86_BUILTIN_CMPUNORDPD,
15606 IX86_BUILTIN_CMPNEPD,
15607 IX86_BUILTIN_CMPEQSD,
15608 IX86_BUILTIN_CMPLTSD,
15609 IX86_BUILTIN_CMPLESD,
15610 IX86_BUILTIN_CMPNEQSD,
15611 IX86_BUILTIN_CMPNLTSD,
15612 IX86_BUILTIN_CMPNLESD,
15613 IX86_BUILTIN_CMPORDSD,
15614 IX86_BUILTIN_CMPUNORDSD,
15615 IX86_BUILTIN_CMPNESD,
15616
15617 IX86_BUILTIN_COMIEQSD,
15618 IX86_BUILTIN_COMILTSD,
15619 IX86_BUILTIN_COMILESD,
15620 IX86_BUILTIN_COMIGTSD,
15621 IX86_BUILTIN_COMIGESD,
15622 IX86_BUILTIN_COMINEQSD,
15623 IX86_BUILTIN_UCOMIEQSD,
15624 IX86_BUILTIN_UCOMILTSD,
15625 IX86_BUILTIN_UCOMILESD,
15626 IX86_BUILTIN_UCOMIGTSD,
15627 IX86_BUILTIN_UCOMIGESD,
15628 IX86_BUILTIN_UCOMINEQSD,
15629
15630 IX86_BUILTIN_MAXPD,
15631 IX86_BUILTIN_MAXSD,
15632 IX86_BUILTIN_MINPD,
15633 IX86_BUILTIN_MINSD,
15634
15635 IX86_BUILTIN_ANDPD,
15636 IX86_BUILTIN_ANDNPD,
15637 IX86_BUILTIN_ORPD,
15638 IX86_BUILTIN_XORPD,
15639
15640 IX86_BUILTIN_SQRTPD,
15641 IX86_BUILTIN_SQRTSD,
15642
15643 IX86_BUILTIN_UNPCKHPD,
15644 IX86_BUILTIN_UNPCKLPD,
15645
15646 IX86_BUILTIN_SHUFPD,
15647
15648 IX86_BUILTIN_LOADUPD,
15649 IX86_BUILTIN_STOREUPD,
15650 IX86_BUILTIN_MOVSD,
15651
15652 IX86_BUILTIN_LOADHPD,
15653 IX86_BUILTIN_LOADLPD,
15654
15655 IX86_BUILTIN_CVTDQ2PD,
15656 IX86_BUILTIN_CVTDQ2PS,
15657
15658 IX86_BUILTIN_CVTPD2DQ,
15659 IX86_BUILTIN_CVTPD2PI,
15660 IX86_BUILTIN_CVTPD2PS,
15661 IX86_BUILTIN_CVTTPD2DQ,
15662 IX86_BUILTIN_CVTTPD2PI,
15663
15664 IX86_BUILTIN_CVTPI2PD,
15665 IX86_BUILTIN_CVTSI2SD,
15666 IX86_BUILTIN_CVTSI642SD,
15667
15668 IX86_BUILTIN_CVTSD2SI,
15669 IX86_BUILTIN_CVTSD2SI64,
15670 IX86_BUILTIN_CVTSD2SS,
15671 IX86_BUILTIN_CVTSS2SD,
15672 IX86_BUILTIN_CVTTSD2SI,
15673 IX86_BUILTIN_CVTTSD2SI64,
15674
15675 IX86_BUILTIN_CVTPS2DQ,
15676 IX86_BUILTIN_CVTPS2PD,
15677 IX86_BUILTIN_CVTTPS2DQ,
15678
15679 IX86_BUILTIN_MOVNTI,
15680 IX86_BUILTIN_MOVNTPD,
15681 IX86_BUILTIN_MOVNTDQ,
15682
15683 /* SSE2 MMX */
15684 IX86_BUILTIN_MASKMOVDQU,
15685 IX86_BUILTIN_MOVMSKPD,
15686 IX86_BUILTIN_PMOVMSKB128,
15687
15688 IX86_BUILTIN_PACKSSWB128,
15689 IX86_BUILTIN_PACKSSDW128,
15690 IX86_BUILTIN_PACKUSWB128,
15691
15692 IX86_BUILTIN_PADDB128,
15693 IX86_BUILTIN_PADDW128,
15694 IX86_BUILTIN_PADDD128,
15695 IX86_BUILTIN_PADDQ128,
15696 IX86_BUILTIN_PADDSB128,
15697 IX86_BUILTIN_PADDSW128,
15698 IX86_BUILTIN_PADDUSB128,
15699 IX86_BUILTIN_PADDUSW128,
15700 IX86_BUILTIN_PSUBB128,
15701 IX86_BUILTIN_PSUBW128,
15702 IX86_BUILTIN_PSUBD128,
15703 IX86_BUILTIN_PSUBQ128,
15704 IX86_BUILTIN_PSUBSB128,
15705 IX86_BUILTIN_PSUBSW128,
15706 IX86_BUILTIN_PSUBUSB128,
15707 IX86_BUILTIN_PSUBUSW128,
15708
15709 IX86_BUILTIN_PAND128,
15710 IX86_BUILTIN_PANDN128,
15711 IX86_BUILTIN_POR128,
15712 IX86_BUILTIN_PXOR128,
15713
15714 IX86_BUILTIN_PAVGB128,
15715 IX86_BUILTIN_PAVGW128,
15716
15717 IX86_BUILTIN_PCMPEQB128,
15718 IX86_BUILTIN_PCMPEQW128,
15719 IX86_BUILTIN_PCMPEQD128,
15720 IX86_BUILTIN_PCMPGTB128,
15721 IX86_BUILTIN_PCMPGTW128,
15722 IX86_BUILTIN_PCMPGTD128,
15723
15724 IX86_BUILTIN_PMADDWD128,
15725
15726 IX86_BUILTIN_PMAXSW128,
15727 IX86_BUILTIN_PMAXUB128,
15728 IX86_BUILTIN_PMINSW128,
15729 IX86_BUILTIN_PMINUB128,
15730
15731 IX86_BUILTIN_PMULUDQ,
15732 IX86_BUILTIN_PMULUDQ128,
15733 IX86_BUILTIN_PMULHUW128,
15734 IX86_BUILTIN_PMULHW128,
15735 IX86_BUILTIN_PMULLW128,
15736
15737 IX86_BUILTIN_PSADBW128,
15738 IX86_BUILTIN_PSHUFHW,
15739 IX86_BUILTIN_PSHUFLW,
15740 IX86_BUILTIN_PSHUFD,
15741
15742 IX86_BUILTIN_PSLLW128,
15743 IX86_BUILTIN_PSLLD128,
15744 IX86_BUILTIN_PSLLQ128,
15745 IX86_BUILTIN_PSRAW128,
15746 IX86_BUILTIN_PSRAD128,
15747 IX86_BUILTIN_PSRLW128,
15748 IX86_BUILTIN_PSRLD128,
15749 IX86_BUILTIN_PSRLQ128,
15750 IX86_BUILTIN_PSLLDQI128,
15751 IX86_BUILTIN_PSLLWI128,
15752 IX86_BUILTIN_PSLLDI128,
15753 IX86_BUILTIN_PSLLQI128,
15754 IX86_BUILTIN_PSRAWI128,
15755 IX86_BUILTIN_PSRADI128,
15756 IX86_BUILTIN_PSRLDQI128,
15757 IX86_BUILTIN_PSRLWI128,
15758 IX86_BUILTIN_PSRLDI128,
15759 IX86_BUILTIN_PSRLQI128,
15760
15761 IX86_BUILTIN_PUNPCKHBW128,
15762 IX86_BUILTIN_PUNPCKHWD128,
15763 IX86_BUILTIN_PUNPCKHDQ128,
15764 IX86_BUILTIN_PUNPCKHQDQ128,
15765 IX86_BUILTIN_PUNPCKLBW128,
15766 IX86_BUILTIN_PUNPCKLWD128,
15767 IX86_BUILTIN_PUNPCKLDQ128,
15768 IX86_BUILTIN_PUNPCKLQDQ128,
15769
15770 IX86_BUILTIN_CLFLUSH,
15771 IX86_BUILTIN_MFENCE,
15772 IX86_BUILTIN_LFENCE,
15773
15774 /* Prescott New Instructions. */
15775 IX86_BUILTIN_ADDSUBPS,
15776 IX86_BUILTIN_HADDPS,
15777 IX86_BUILTIN_HSUBPS,
15778 IX86_BUILTIN_MOVSHDUP,
15779 IX86_BUILTIN_MOVSLDUP,
15780 IX86_BUILTIN_ADDSUBPD,
15781 IX86_BUILTIN_HADDPD,
15782 IX86_BUILTIN_HSUBPD,
15783 IX86_BUILTIN_LDDQU,
15784
15785 IX86_BUILTIN_MONITOR,
15786 IX86_BUILTIN_MWAIT,
15787
15788 /* SSSE3. */
15789 IX86_BUILTIN_PHADDW,
15790 IX86_BUILTIN_PHADDD,
15791 IX86_BUILTIN_PHADDSW,
15792 IX86_BUILTIN_PHSUBW,
15793 IX86_BUILTIN_PHSUBD,
15794 IX86_BUILTIN_PHSUBSW,
15795 IX86_BUILTIN_PMADDUBSW,
15796 IX86_BUILTIN_PMULHRSW,
15797 IX86_BUILTIN_PSHUFB,
15798 IX86_BUILTIN_PSIGNB,
15799 IX86_BUILTIN_PSIGNW,
15800 IX86_BUILTIN_PSIGND,
15801 IX86_BUILTIN_PALIGNR,
15802 IX86_BUILTIN_PABSB,
15803 IX86_BUILTIN_PABSW,
15804 IX86_BUILTIN_PABSD,
15805
15806 IX86_BUILTIN_PHADDW128,
15807 IX86_BUILTIN_PHADDD128,
15808 IX86_BUILTIN_PHADDSW128,
15809 IX86_BUILTIN_PHSUBW128,
15810 IX86_BUILTIN_PHSUBD128,
15811 IX86_BUILTIN_PHSUBSW128,
15812 IX86_BUILTIN_PMADDUBSW128,
15813 IX86_BUILTIN_PMULHRSW128,
15814 IX86_BUILTIN_PSHUFB128,
15815 IX86_BUILTIN_PSIGNB128,
15816 IX86_BUILTIN_PSIGNW128,
15817 IX86_BUILTIN_PSIGND128,
15818 IX86_BUILTIN_PALIGNR128,
15819 IX86_BUILTIN_PABSB128,
15820 IX86_BUILTIN_PABSW128,
15821 IX86_BUILTIN_PABSD128,
15822
15823 /* AMDFAM10 - SSE4A New Instructions. */
15824 IX86_BUILTIN_MOVNTSD,
15825 IX86_BUILTIN_MOVNTSS,
15826 IX86_BUILTIN_EXTRQI,
15827 IX86_BUILTIN_EXTRQ,
15828 IX86_BUILTIN_INSERTQI,
15829 IX86_BUILTIN_INSERTQ,
15830
15831 IX86_BUILTIN_VEC_INIT_V2SI,
15832 IX86_BUILTIN_VEC_INIT_V4HI,
15833 IX86_BUILTIN_VEC_INIT_V8QI,
15834 IX86_BUILTIN_VEC_EXT_V2DF,
15835 IX86_BUILTIN_VEC_EXT_V2DI,
15836 IX86_BUILTIN_VEC_EXT_V4SF,
15837 IX86_BUILTIN_VEC_EXT_V4SI,
15838 IX86_BUILTIN_VEC_EXT_V8HI,
15839 IX86_BUILTIN_VEC_EXT_V2SI,
15840 IX86_BUILTIN_VEC_EXT_V4HI,
15841 IX86_BUILTIN_VEC_SET_V8HI,
15842 IX86_BUILTIN_VEC_SET_V4HI,
15843
15844 IX86_BUILTIN_MAX
15845 };
15846
15847 /* Table for the ix86 builtin decls. */
15848 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
15849
15850 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Do so
15851 * only if target_flags includes one of the bits in MASK. Stores the
15852 * function decl in the ix86_builtins array.
15853 * Returns the function decl, or NULL_TREE if the builtin was not added. */
15854
15855 static inline tree
15856 def_builtin (int mask, const char *name, tree type, enum ix86_builtins code)
15857 {
15858 tree decl = NULL_TREE;
15859
15860 if (mask & target_flags
15861 && (!(mask & MASK_64BIT) || TARGET_64BIT))
15862 {
15863 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
15864 NULL, NULL_TREE);
15865 ix86_builtins[(int) code] = decl;
15866 }
15867
15868 return decl;
15869 }
15870
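/* A typical registration (illustrative; the type node is one of the
   *_ftype_* trees built later in this file) looks like

     def_builtin (MASK_SSE, "__builtin_ia32_ldmxcsr",
                  void_ftype_unsigned, IX86_BUILTIN_LDMXCSR);

   and records the decl in ix86_builtins[] only when the MASK_SSE bit is
   set in target_flags.  */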
15871 /* Like def_builtin, but also marks the function decl "const". */
15872
15873 static inline tree
15874 def_builtin_const (int mask, const char *name, tree type,
15875 enum ix86_builtins code)
15876 {
15877 tree decl = def_builtin (mask, name, type, code);
15878 if (decl)
15879 TREE_READONLY (decl) = 1;
15880 return decl;
15881 }
15882
15883 /* Bits for builtin_description.flag. */
15884
15885 /* Set when we don't support the comparison natively, and should
15886 swap_comparison in order to support it. */
15887 #define BUILTIN_DESC_SWAP_OPERANDS 1
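/* For example, there is no native "compare greater" pattern for cmpgtps;
   the tables below express __builtin_ia32_cmpgtps as LT with
   BUILTIN_DESC_SWAP_OPERANDS, i.e. a > b is computed as b < a by
   exchanging the operands.  */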
15888
15889 struct builtin_description
15890 {
15891 const unsigned int mask;
15892 const enum insn_code icode;
15893 const char *const name;
15894 const enum ix86_builtins code;
15895 const enum rtx_code comparison;
15896 const unsigned int flag;
15897 };
15898
15899 static const struct builtin_description bdesc_comi[] =
15900 {
15901 { MASK_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
15902 { MASK_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
15903 { MASK_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
15904 { MASK_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
15905 { MASK_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
15906 { MASK_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
15907 { MASK_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
15908 { MASK_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
15909 { MASK_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
15910 { MASK_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
15911 { MASK_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
15912 { MASK_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
15913 { MASK_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
15914 { MASK_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
15915 { MASK_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
15916 { MASK_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
15917 { MASK_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
15918 { MASK_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
15919 { MASK_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
15920 { MASK_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
15921 { MASK_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
15922 { MASK_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
15923 { MASK_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
15924 { MASK_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
15925 };
15926
15927 static const struct builtin_description bdesc_2arg[] =
15928 {
15929 /* SSE */
15930 { MASK_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, 0, 0 },
15931 { MASK_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, 0, 0 },
15932 { MASK_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, 0, 0 },
15933 { MASK_SSE, CODE_FOR_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, 0, 0 },
15934 { MASK_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, 0, 0 },
15935 { MASK_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, 0, 0 },
15936 { MASK_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, 0, 0 },
15937 { MASK_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, 0, 0 },
15938
15939 { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, 0 },
15940 { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, 0 },
15941 { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, 0 },
15942 { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT,
15943 BUILTIN_DESC_SWAP_OPERANDS },
15944 { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE,
15945 BUILTIN_DESC_SWAP_OPERANDS },
15946 { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, 0 },
15947 { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, 0 },
15948 { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, 0 },
15949 { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, 0 },
15950 { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE,
15951 BUILTIN_DESC_SWAP_OPERANDS },
15952 { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT,
15953 BUILTIN_DESC_SWAP_OPERANDS },
15954 { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, 0 },
15955 { MASK_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, 0 },
15956 { MASK_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, 0 },
15957 { MASK_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, 0 },
15958 { MASK_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, 0 },
15959 { MASK_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, 0 },
15960 { MASK_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, 0 },
15961 { MASK_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, 0 },
15962 { MASK_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE,
15963 BUILTIN_DESC_SWAP_OPERANDS },
15964 { MASK_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT,
15965 BUILTIN_DESC_SWAP_OPERANDS },
15966 { MASK_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, UNORDERED, 0 },
15967
15968 { MASK_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, 0, 0 },
15969 { MASK_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, 0, 0 },
15970 { MASK_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, 0, 0 },
15971 { MASK_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, 0, 0 },
15972
15973 { MASK_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, 0, 0 },
15974 { MASK_SSE, CODE_FOR_sse_nandv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, 0, 0 },
15975 { MASK_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, 0, 0 },
15976 { MASK_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, 0, 0 },
15977
15978 { MASK_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, 0, 0 },
15979 { MASK_SSE, CODE_FOR_sse_movhlps, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, 0, 0 },
15980 { MASK_SSE, CODE_FOR_sse_movlhps, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, 0, 0 },
15981 { MASK_SSE, CODE_FOR_sse_unpckhps, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, 0, 0 },
15982 { MASK_SSE, CODE_FOR_sse_unpcklps, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, 0, 0 },
15983
15984 /* MMX */
15985 { MASK_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, 0, 0 },
15986 { MASK_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, 0, 0 },
15987 { MASK_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, 0, 0 },
15988 { MASK_SSE2, CODE_FOR_mmx_adddi3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, 0, 0 },
15989 { MASK_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, 0, 0 },
15990 { MASK_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, 0, 0 },
15991 { MASK_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, 0, 0 },
15992 { MASK_SSE2, CODE_FOR_mmx_subdi3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, 0, 0 },
15993
15994 { MASK_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, 0, 0 },
15995 { MASK_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, 0, 0 },
15996 { MASK_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, 0, 0 },
15997 { MASK_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, 0, 0 },
15998 { MASK_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, 0, 0 },
15999 { MASK_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, 0, 0 },
16000 { MASK_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, 0, 0 },
16001 { MASK_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, 0, 0 },
16002
16003 { MASK_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, 0, 0 },
16004 { MASK_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, 0, 0 },
16005 { MASK_SSE | MASK_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, 0, 0 },
16006
16007 { MASK_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, 0, 0 },
16008 { MASK_MMX, CODE_FOR_mmx_nandv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, 0, 0 },
16009 { MASK_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, 0, 0 },
16010 { MASK_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, 0, 0 },
16011
16012 { MASK_SSE | MASK_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, 0, 0 },
16013 { MASK_SSE | MASK_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, 0, 0 },
16014
16015 { MASK_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, 0, 0 },
16016 { MASK_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, 0, 0 },
16017 { MASK_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, 0, 0 },
16018 { MASK_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, 0, 0 },
16019 { MASK_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, 0, 0 },
16020 { MASK_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, 0, 0 },
16021
16022 { MASK_SSE | MASK_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, 0, 0 },
16023 { MASK_SSE | MASK_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, 0, 0 },
16024 { MASK_SSE | MASK_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, 0, 0 },
16025 { MASK_SSE | MASK_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, 0, 0 },
16026
16027 { MASK_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, 0, 0 },
16028 { MASK_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, 0, 0 },
16029 { MASK_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, 0, 0 },
16030 { MASK_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, 0, 0 },
16031 { MASK_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, 0, 0 },
16032 { MASK_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, 0, 0 },
16033
16034 /* Special. */
16035 { MASK_MMX, CODE_FOR_mmx_packsswb, 0, IX86_BUILTIN_PACKSSWB, 0, 0 },
16036 { MASK_MMX, CODE_FOR_mmx_packssdw, 0, IX86_BUILTIN_PACKSSDW, 0, 0 },
16037 { MASK_MMX, CODE_FOR_mmx_packuswb, 0, IX86_BUILTIN_PACKUSWB, 0, 0 },
16038
16039 { MASK_SSE, CODE_FOR_sse_cvtpi2ps, 0, IX86_BUILTIN_CVTPI2PS, 0, 0 },
16040 { MASK_SSE, CODE_FOR_sse_cvtsi2ss, 0, IX86_BUILTIN_CVTSI2SS, 0, 0 },
16041 { MASK_SSE | MASK_64BIT, CODE_FOR_sse_cvtsi2ssq, 0, IX86_BUILTIN_CVTSI642SS, 0, 0 },
16042
16043 { MASK_MMX, CODE_FOR_mmx_ashlv4hi3, 0, IX86_BUILTIN_PSLLW, 0, 0 },
16044 { MASK_MMX, CODE_FOR_mmx_ashlv4hi3, 0, IX86_BUILTIN_PSLLWI, 0, 0 },
16045 { MASK_MMX, CODE_FOR_mmx_ashlv2si3, 0, IX86_BUILTIN_PSLLD, 0, 0 },
16046 { MASK_MMX, CODE_FOR_mmx_ashlv2si3, 0, IX86_BUILTIN_PSLLDI, 0, 0 },
16047 { MASK_MMX, CODE_FOR_mmx_ashldi3, 0, IX86_BUILTIN_PSLLQ, 0, 0 },
16048 { MASK_MMX, CODE_FOR_mmx_ashldi3, 0, IX86_BUILTIN_PSLLQI, 0, 0 },
16049
16050 { MASK_MMX, CODE_FOR_mmx_lshrv4hi3, 0, IX86_BUILTIN_PSRLW, 0, 0 },
16051 { MASK_MMX, CODE_FOR_mmx_lshrv4hi3, 0, IX86_BUILTIN_PSRLWI, 0, 0 },
16052 { MASK_MMX, CODE_FOR_mmx_lshrv2si3, 0, IX86_BUILTIN_PSRLD, 0, 0 },
16053 { MASK_MMX, CODE_FOR_mmx_lshrv2si3, 0, IX86_BUILTIN_PSRLDI, 0, 0 },
16054 { MASK_MMX, CODE_FOR_mmx_lshrdi3, 0, IX86_BUILTIN_PSRLQ, 0, 0 },
16055 { MASK_MMX, CODE_FOR_mmx_lshrdi3, 0, IX86_BUILTIN_PSRLQI, 0, 0 },
16056
16057 { MASK_MMX, CODE_FOR_mmx_ashrv4hi3, 0, IX86_BUILTIN_PSRAW, 0, 0 },
16058 { MASK_MMX, CODE_FOR_mmx_ashrv4hi3, 0, IX86_BUILTIN_PSRAWI, 0, 0 },
16059 { MASK_MMX, CODE_FOR_mmx_ashrv2si3, 0, IX86_BUILTIN_PSRAD, 0, 0 },
16060 { MASK_MMX, CODE_FOR_mmx_ashrv2si3, 0, IX86_BUILTIN_PSRADI, 0, 0 },
16061
16062 { MASK_SSE | MASK_3DNOW_A, CODE_FOR_mmx_psadbw, 0, IX86_BUILTIN_PSADBW, 0, 0 },
16063 { MASK_MMX, CODE_FOR_mmx_pmaddwd, 0, IX86_BUILTIN_PMADDWD, 0, 0 },
16064
16065 /* SSE2 */
16066 { MASK_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, 0, 0 },
16067 { MASK_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, 0, 0 },
16068 { MASK_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, 0, 0 },
16069 { MASK_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, 0, 0 },
16070 { MASK_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, 0, 0 },
16071 { MASK_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, 0, 0 },
16072 { MASK_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, 0, 0 },
16073 { MASK_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, 0, 0 },
16074
16075 { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, 0 },
16076 { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, 0 },
16077 { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, 0 },
16078 { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT,
16079 BUILTIN_DESC_SWAP_OPERANDS },
16080 { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE,
16081 BUILTIN_DESC_SWAP_OPERANDS },
16082 { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, 0 },
16083 { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, 0 },
16084 { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, 0 },
16085 { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, 0 },
16086 { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE,
16087 BUILTIN_DESC_SWAP_OPERANDS },
16088 { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT,
16089 BUILTIN_DESC_SWAP_OPERANDS },
16090 { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, 0 },
16091 { MASK_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, 0 },
16092 { MASK_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, 0 },
16093 { MASK_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, 0 },
16094 { MASK_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, 0 },
16095 { MASK_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, 0 },
16096 { MASK_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, 0 },
16097 { MASK_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, 0 },
16098 { MASK_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, 0 },
16099
16100 { MASK_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, 0, 0 },
16101 { MASK_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, 0, 0 },
16102 { MASK_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, 0, 0 },
16103 { MASK_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, 0, 0 },
16104
16105 { MASK_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, 0, 0 },
16106 { MASK_SSE2, CODE_FOR_sse2_nandv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, 0, 0 },
16107 { MASK_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, 0, 0 },
16108 { MASK_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, 0, 0 },
16109
16110 { MASK_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, 0, 0 },
16111 { MASK_SSE2, CODE_FOR_sse2_unpckhpd, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, 0, 0 },
16112 { MASK_SSE2, CODE_FOR_sse2_unpcklpd, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, 0, 0 },
16113
16114 /* SSE2 MMX */
16115 { MASK_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, 0, 0 },
16116 { MASK_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, 0, 0 },
16117 { MASK_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, 0, 0 },
16118 { MASK_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, 0, 0 },
16119 { MASK_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, 0, 0 },
16120 { MASK_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, 0, 0 },
16121 { MASK_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, 0, 0 },
16122 { MASK_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, 0, 0 },
16123
16124 { MASK_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, 0, 0 },
16125 { MASK_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, 0, 0 },
16126 { MASK_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, 0, 0 },
16127 { MASK_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, 0, 0 },
16128 { MASK_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, 0, 0 },
16129 { MASK_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, 0, 0 },
16130 { MASK_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, 0, 0 },
16131 { MASK_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, 0, 0 },
16132
16133 { MASK_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, 0, 0 },
16134 { MASK_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, 0, 0 },
16135
16136 { MASK_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, 0, 0 },
16137 { MASK_SSE2, CODE_FOR_sse2_nandv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, 0, 0 },
16138 { MASK_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, 0, 0 },
16139 { MASK_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, 0, 0 },
16140
16141 { MASK_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, 0, 0 },
16142 { MASK_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, 0, 0 },
16143
16144 { MASK_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, 0, 0 },
16145 { MASK_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, 0, 0 },
16146 { MASK_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, 0, 0 },
16147 { MASK_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, 0, 0 },
16148 { MASK_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, 0, 0 },
16149 { MASK_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, 0, 0 },
16150
16151 { MASK_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, 0, 0 },
16152 { MASK_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, 0, 0 },
16153 { MASK_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, 0, 0 },
16154 { MASK_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, 0, 0 },
16155
16156 { MASK_SSE2, CODE_FOR_sse2_punpckhbw, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, 0, 0 },
16157 { MASK_SSE2, CODE_FOR_sse2_punpckhwd, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, 0, 0 },
16158 { MASK_SSE2, CODE_FOR_sse2_punpckhdq, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, 0, 0 },
16159 { MASK_SSE2, CODE_FOR_sse2_punpckhqdq, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, 0, 0 },
16160 { MASK_SSE2, CODE_FOR_sse2_punpcklbw, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, 0, 0 },
16161 { MASK_SSE2, CODE_FOR_sse2_punpcklwd, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, 0, 0 },
16162 { MASK_SSE2, CODE_FOR_sse2_punpckldq, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, 0, 0 },
16163 { MASK_SSE2, CODE_FOR_sse2_punpcklqdq, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, 0, 0 },
16164
16165 { MASK_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, 0, 0 },
16166 { MASK_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, 0, 0 },
16167 { MASK_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, 0, 0 },
16168
16169 { MASK_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, 0, 0 },
16170 { MASK_SSE2, CODE_FOR_sse2_psadbw, 0, IX86_BUILTIN_PSADBW128, 0, 0 },
16171
16172 { MASK_SSE2, CODE_FOR_sse2_umulsidi3, 0, IX86_BUILTIN_PMULUDQ, 0, 0 },
16173 { MASK_SSE2, CODE_FOR_sse2_umulv2siv2di3, 0, IX86_BUILTIN_PMULUDQ128, 0, 0 },
16174
16175 { MASK_SSE2, CODE_FOR_ashlv8hi3, 0, IX86_BUILTIN_PSLLWI128, 0, 0 },
16176 { MASK_SSE2, CODE_FOR_ashlv4si3, 0, IX86_BUILTIN_PSLLDI128, 0, 0 },
16177 { MASK_SSE2, CODE_FOR_ashlv2di3, 0, IX86_BUILTIN_PSLLQI128, 0, 0 },
16178
16179 { MASK_SSE2, CODE_FOR_lshrv8hi3, 0, IX86_BUILTIN_PSRLWI128, 0, 0 },
16180 { MASK_SSE2, CODE_FOR_lshrv4si3, 0, IX86_BUILTIN_PSRLDI128, 0, 0 },
16181 { MASK_SSE2, CODE_FOR_lshrv2di3, 0, IX86_BUILTIN_PSRLQI128, 0, 0 },
16182
16183 { MASK_SSE2, CODE_FOR_ashrv8hi3, 0, IX86_BUILTIN_PSRAWI128, 0, 0 },
16184 { MASK_SSE2, CODE_FOR_ashrv4si3, 0, IX86_BUILTIN_PSRADI128, 0, 0 },
16185
16186 { MASK_SSE2, CODE_FOR_sse2_pmaddwd, 0, IX86_BUILTIN_PMADDWD128, 0, 0 },
16187
16188 { MASK_SSE2, CODE_FOR_sse2_cvtsi2sd, 0, IX86_BUILTIN_CVTSI2SD, 0, 0 },
16189 { MASK_SSE2 | MASK_64BIT, CODE_FOR_sse2_cvtsi2sdq, 0, IX86_BUILTIN_CVTSI642SD, 0, 0 },
16190 { MASK_SSE2, CODE_FOR_sse2_cvtsd2ss, 0, IX86_BUILTIN_CVTSD2SS, 0, 0 },
16191 { MASK_SSE2, CODE_FOR_sse2_cvtss2sd, 0, IX86_BUILTIN_CVTSS2SD, 0, 0 },
16192
16193 /* SSE3 */
16194 { MASK_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, 0, 0 },
16195 { MASK_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, 0, 0 },
16196 { MASK_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, 0, 0 },
16197 { MASK_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, 0, 0 },
16198 { MASK_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, 0, 0 },
16199 { MASK_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, 0, 0 },
16200
16201 /* SSSE3 */
16202 { MASK_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, 0, 0 },
16203 { MASK_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, 0, 0 },
16204 { MASK_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, 0, 0 },
16205 { MASK_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, 0, 0 },
16206 { MASK_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, 0, 0 },
16207 { MASK_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, 0, 0 },
16208 { MASK_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, 0, 0 },
16209 { MASK_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, 0, 0 },
16210 { MASK_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, 0, 0 },
16211 { MASK_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, 0, 0 },
16212 { MASK_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, 0, 0 },
16213 { MASK_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, 0, 0 },
16214 { MASK_SSSE3, CODE_FOR_ssse3_pmaddubswv8hi3, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, 0, 0 },
16215 { MASK_SSSE3, CODE_FOR_ssse3_pmaddubswv4hi3, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, 0, 0 },
16216 { MASK_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, 0, 0 },
16217 { MASK_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, 0, 0 },
16218 { MASK_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, 0, 0 },
16219 { MASK_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, 0, 0 },
16220 { MASK_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, 0, 0 },
16221 { MASK_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, 0, 0 },
16222 { MASK_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, 0, 0 },
16223 { MASK_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, 0, 0 },
16224 { MASK_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, 0, 0 },
16225 { MASK_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, 0, 0 }
16226 };
16227
16228 static const struct builtin_description bdesc_1arg[] =
16229 {
16230 { MASK_SSE | MASK_3DNOW_A, CODE_FOR_mmx_pmovmskb, 0, IX86_BUILTIN_PMOVMSKB, 0, 0 },
16231 { MASK_SSE, CODE_FOR_sse_movmskps, 0, IX86_BUILTIN_MOVMSKPS, 0, 0 },
16232
16233 { MASK_SSE, CODE_FOR_sqrtv4sf2, 0, IX86_BUILTIN_SQRTPS, 0, 0 },
16234 { MASK_SSE, CODE_FOR_sse_rsqrtv4sf2, 0, IX86_BUILTIN_RSQRTPS, 0, 0 },
16235 { MASK_SSE, CODE_FOR_sse_rcpv4sf2, 0, IX86_BUILTIN_RCPPS, 0, 0 },
16236
16237 { MASK_SSE, CODE_FOR_sse_cvtps2pi, 0, IX86_BUILTIN_CVTPS2PI, 0, 0 },
16238 { MASK_SSE, CODE_FOR_sse_cvtss2si, 0, IX86_BUILTIN_CVTSS2SI, 0, 0 },
16239 { MASK_SSE | MASK_64BIT, CODE_FOR_sse_cvtss2siq, 0, IX86_BUILTIN_CVTSS2SI64, 0, 0 },
16240 { MASK_SSE, CODE_FOR_sse_cvttps2pi, 0, IX86_BUILTIN_CVTTPS2PI, 0, 0 },
16241 { MASK_SSE, CODE_FOR_sse_cvttss2si, 0, IX86_BUILTIN_CVTTSS2SI, 0, 0 },
16242 { MASK_SSE | MASK_64BIT, CODE_FOR_sse_cvttss2siq, 0, IX86_BUILTIN_CVTTSS2SI64, 0, 0 },
16243
16244 { MASK_SSE2, CODE_FOR_sse2_pmovmskb, 0, IX86_BUILTIN_PMOVMSKB128, 0, 0 },
16245 { MASK_SSE2, CODE_FOR_sse2_movmskpd, 0, IX86_BUILTIN_MOVMSKPD, 0, 0 },
16246
16247 { MASK_SSE2, CODE_FOR_sqrtv2df2, 0, IX86_BUILTIN_SQRTPD, 0, 0 },
16248
16249 { MASK_SSE2, CODE_FOR_sse2_cvtdq2pd, 0, IX86_BUILTIN_CVTDQ2PD, 0, 0 },
16250 { MASK_SSE2, CODE_FOR_sse2_cvtdq2ps, 0, IX86_BUILTIN_CVTDQ2PS, 0, 0 },
16251
16252 { MASK_SSE2, CODE_FOR_sse2_cvtpd2dq, 0, IX86_BUILTIN_CVTPD2DQ, 0, 0 },
16253 { MASK_SSE2, CODE_FOR_sse2_cvtpd2pi, 0, IX86_BUILTIN_CVTPD2PI, 0, 0 },
16254 { MASK_SSE2, CODE_FOR_sse2_cvtpd2ps, 0, IX86_BUILTIN_CVTPD2PS, 0, 0 },
16255 { MASK_SSE2, CODE_FOR_sse2_cvttpd2dq, 0, IX86_BUILTIN_CVTTPD2DQ, 0, 0 },
16256 { MASK_SSE2, CODE_FOR_sse2_cvttpd2pi, 0, IX86_BUILTIN_CVTTPD2PI, 0, 0 },
16257
16258 { MASK_SSE2, CODE_FOR_sse2_cvtpi2pd, 0, IX86_BUILTIN_CVTPI2PD, 0, 0 },
16259
16260 { MASK_SSE2, CODE_FOR_sse2_cvtsd2si, 0, IX86_BUILTIN_CVTSD2SI, 0, 0 },
16261 { MASK_SSE2, CODE_FOR_sse2_cvttsd2si, 0, IX86_BUILTIN_CVTTSD2SI, 0, 0 },
16262 { MASK_SSE2 | MASK_64BIT, CODE_FOR_sse2_cvtsd2siq, 0, IX86_BUILTIN_CVTSD2SI64, 0, 0 },
16263 { MASK_SSE2 | MASK_64BIT, CODE_FOR_sse2_cvttsd2siq, 0, IX86_BUILTIN_CVTTSD2SI64, 0, 0 },
16264
16265 { MASK_SSE2, CODE_FOR_sse2_cvtps2dq, 0, IX86_BUILTIN_CVTPS2DQ, 0, 0 },
16266 { MASK_SSE2, CODE_FOR_sse2_cvtps2pd, 0, IX86_BUILTIN_CVTPS2PD, 0, 0 },
16267 { MASK_SSE2, CODE_FOR_sse2_cvttps2dq, 0, IX86_BUILTIN_CVTTPS2DQ, 0, 0 },
16268
16269 /* SSE3 */
16270 { MASK_SSE3, CODE_FOR_sse3_movshdup, 0, IX86_BUILTIN_MOVSHDUP, 0, 0 },
16271 { MASK_SSE3, CODE_FOR_sse3_movsldup, 0, IX86_BUILTIN_MOVSLDUP, 0, 0 },
16272
16273 /* SSSE3 */
16274 { MASK_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, 0, 0 },
16275 { MASK_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, 0, 0 },
16276 { MASK_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, 0, 0 },
16277 { MASK_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, 0, 0 },
16278 { MASK_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, 0, 0 },
16279 { MASK_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, 0, 0 },
16280 };
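/* Editor's note (illustrative, not original source): each bdesc_* entry above
   pairs a target mask and insn code with an optional builtin name, an
   IX86_BUILTIN_* code, a comparison code and a flag; ix86_init_mmx_sse_builtins
   below walks these tables and registers one builtin per named entry.  A
   hypothetical new one-operand builtin would be added as, e.g.

     { MASK_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_my_pabsd128",
       IX86_BUILTIN_MY_PABSD128, 0, 0 },

   where the builtin name and IX86_BUILTIN_MY_PABSD128 are made-up examples
   (the enum value would also need to be added to the IX86_BUILTIN_* list).  */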
16281
16282 static void
16283 ix86_init_builtins (void)
16284 {
16285 if (TARGET_MMX)
16286 ix86_init_mmx_sse_builtins ();
16287 }
16288
16289 /* Set up all the MMX/SSE builtins. This is not called if TARGET_MMX
16290 is zero. Otherwise, if TARGET_SSE is not set, only the MMX builtins
16291 are defined. */
16292 static void
16293 ix86_init_mmx_sse_builtins (void)
16294 {
16295 const struct builtin_description * d;
16296 size_t i;
16297
16298 tree V16QI_type_node = build_vector_type_for_mode (char_type_node, V16QImode);
16299 tree V2SI_type_node = build_vector_type_for_mode (intSI_type_node, V2SImode);
16300 tree V2SF_type_node = build_vector_type_for_mode (float_type_node, V2SFmode);
16301 tree V2DI_type_node
16302 = build_vector_type_for_mode (long_long_integer_type_node, V2DImode);
16303 tree V2DF_type_node = build_vector_type_for_mode (double_type_node, V2DFmode);
16304 tree V4SF_type_node = build_vector_type_for_mode (float_type_node, V4SFmode);
16305 tree V4SI_type_node = build_vector_type_for_mode (intSI_type_node, V4SImode);
16306 tree V4HI_type_node = build_vector_type_for_mode (intHI_type_node, V4HImode);
16307 tree V8QI_type_node = build_vector_type_for_mode (char_type_node, V8QImode);
16308 tree V8HI_type_node = build_vector_type_for_mode (intHI_type_node, V8HImode);
16309
16310 tree pchar_type_node = build_pointer_type (char_type_node);
16311 tree pcchar_type_node = build_pointer_type (
16312 build_type_variant (char_type_node, 1, 0));
16313 tree pfloat_type_node = build_pointer_type (float_type_node);
16314 tree pcfloat_type_node = build_pointer_type (
16315 build_type_variant (float_type_node, 1, 0));
16316 tree pv2si_type_node = build_pointer_type (V2SI_type_node);
16317 tree pv2di_type_node = build_pointer_type (V2DI_type_node);
16318 tree pdi_type_node = build_pointer_type (long_long_unsigned_type_node);
16319
16320 /* Comparisons. */
16321 tree int_ftype_v4sf_v4sf
16322 = build_function_type_list (integer_type_node,
16323 V4SF_type_node, V4SF_type_node, NULL_TREE);
16324 tree v4si_ftype_v4sf_v4sf
16325 = build_function_type_list (V4SI_type_node,
16326 V4SF_type_node, V4SF_type_node, NULL_TREE);
16327 /* MMX/SSE/integer conversions. */
16328 tree int_ftype_v4sf
16329 = build_function_type_list (integer_type_node,
16330 V4SF_type_node, NULL_TREE);
16331 tree int64_ftype_v4sf
16332 = build_function_type_list (long_long_integer_type_node,
16333 V4SF_type_node, NULL_TREE);
16334 tree int_ftype_v8qi
16335 = build_function_type_list (integer_type_node, V8QI_type_node, NULL_TREE);
16336 tree v4sf_ftype_v4sf_int
16337 = build_function_type_list (V4SF_type_node,
16338 V4SF_type_node, integer_type_node, NULL_TREE);
16339 tree v4sf_ftype_v4sf_int64
16340 = build_function_type_list (V4SF_type_node,
16341 V4SF_type_node, long_long_integer_type_node,
16342 NULL_TREE);
16343 tree v4sf_ftype_v4sf_v2si
16344 = build_function_type_list (V4SF_type_node,
16345 V4SF_type_node, V2SI_type_node, NULL_TREE);
16346
16347 /* Miscellaneous. */
16348 tree v8qi_ftype_v4hi_v4hi
16349 = build_function_type_list (V8QI_type_node,
16350 V4HI_type_node, V4HI_type_node, NULL_TREE);
16351 tree v4hi_ftype_v2si_v2si
16352 = build_function_type_list (V4HI_type_node,
16353 V2SI_type_node, V2SI_type_node, NULL_TREE);
16354 tree v4sf_ftype_v4sf_v4sf_int
16355 = build_function_type_list (V4SF_type_node,
16356 V4SF_type_node, V4SF_type_node,
16357 integer_type_node, NULL_TREE);
16358 tree v2si_ftype_v4hi_v4hi
16359 = build_function_type_list (V2SI_type_node,
16360 V4HI_type_node, V4HI_type_node, NULL_TREE);
16361 tree v4hi_ftype_v4hi_int
16362 = build_function_type_list (V4HI_type_node,
16363 V4HI_type_node, integer_type_node, NULL_TREE);
16364 tree v4hi_ftype_v4hi_di
16365 = build_function_type_list (V4HI_type_node,
16366 V4HI_type_node, long_long_unsigned_type_node,
16367 NULL_TREE);
16368 tree v2si_ftype_v2si_di
16369 = build_function_type_list (V2SI_type_node,
16370 V2SI_type_node, long_long_unsigned_type_node,
16371 NULL_TREE);
16372 tree void_ftype_void
16373 = build_function_type (void_type_node, void_list_node);
16374 tree void_ftype_unsigned
16375 = build_function_type_list (void_type_node, unsigned_type_node, NULL_TREE);
16376 tree void_ftype_unsigned_unsigned
16377 = build_function_type_list (void_type_node, unsigned_type_node,
16378 unsigned_type_node, NULL_TREE);
16379 tree void_ftype_pcvoid_unsigned_unsigned
16380 = build_function_type_list (void_type_node, const_ptr_type_node,
16381 unsigned_type_node, unsigned_type_node,
16382 NULL_TREE);
16383 tree unsigned_ftype_void
16384 = build_function_type (unsigned_type_node, void_list_node);
16385 tree v2si_ftype_v4sf
16386 = build_function_type_list (V2SI_type_node, V4SF_type_node, NULL_TREE);
16387 /* Loads/stores. */
16388 tree void_ftype_v8qi_v8qi_pchar
16389 = build_function_type_list (void_type_node,
16390 V8QI_type_node, V8QI_type_node,
16391 pchar_type_node, NULL_TREE);
16392 tree v4sf_ftype_pcfloat
16393 = build_function_type_list (V4SF_type_node, pcfloat_type_node, NULL_TREE);
16394 /* @@@ the type is bogus */
16395 tree v4sf_ftype_v4sf_pv2si
16396 = build_function_type_list (V4SF_type_node,
16397 V4SF_type_node, pv2si_type_node, NULL_TREE);
16398 tree void_ftype_pv2si_v4sf
16399 = build_function_type_list (void_type_node,
16400 pv2si_type_node, V4SF_type_node, NULL_TREE);
16401 tree void_ftype_pfloat_v4sf
16402 = build_function_type_list (void_type_node,
16403 pfloat_type_node, V4SF_type_node, NULL_TREE);
16404 tree void_ftype_pdi_di
16405 = build_function_type_list (void_type_node,
16406 pdi_type_node, long_long_unsigned_type_node,
16407 NULL_TREE);
16408 tree void_ftype_pv2di_v2di
16409 = build_function_type_list (void_type_node,
16410 pv2di_type_node, V2DI_type_node, NULL_TREE);
16411 /* Normal vector unops. */
16412 tree v4sf_ftype_v4sf
16413 = build_function_type_list (V4SF_type_node, V4SF_type_node, NULL_TREE);
16414 tree v16qi_ftype_v16qi
16415 = build_function_type_list (V16QI_type_node, V16QI_type_node, NULL_TREE);
16416 tree v8hi_ftype_v8hi
16417 = build_function_type_list (V8HI_type_node, V8HI_type_node, NULL_TREE);
16418 tree v4si_ftype_v4si
16419 = build_function_type_list (V4SI_type_node, V4SI_type_node, NULL_TREE);
16420 tree v8qi_ftype_v8qi
16421 = build_function_type_list (V8QI_type_node, V8QI_type_node, NULL_TREE);
16422 tree v4hi_ftype_v4hi
16423 = build_function_type_list (V4HI_type_node, V4HI_type_node, NULL_TREE);
16424
16425 /* Normal vector binops. */
16426 tree v4sf_ftype_v4sf_v4sf
16427 = build_function_type_list (V4SF_type_node,
16428 V4SF_type_node, V4SF_type_node, NULL_TREE);
16429 tree v8qi_ftype_v8qi_v8qi
16430 = build_function_type_list (V8QI_type_node,
16431 V8QI_type_node, V8QI_type_node, NULL_TREE);
16432 tree v4hi_ftype_v4hi_v4hi
16433 = build_function_type_list (V4HI_type_node,
16434 V4HI_type_node, V4HI_type_node, NULL_TREE);
16435 tree v2si_ftype_v2si_v2si
16436 = build_function_type_list (V2SI_type_node,
16437 V2SI_type_node, V2SI_type_node, NULL_TREE);
16438 tree di_ftype_di_di
16439 = build_function_type_list (long_long_unsigned_type_node,
16440 long_long_unsigned_type_node,
16441 long_long_unsigned_type_node, NULL_TREE);
16442
16443 tree di_ftype_di_di_int
16444 = build_function_type_list (long_long_unsigned_type_node,
16445 long_long_unsigned_type_node,
16446 long_long_unsigned_type_node,
16447 integer_type_node, NULL_TREE);
16448
16449 tree v2si_ftype_v2sf
16450 = build_function_type_list (V2SI_type_node, V2SF_type_node, NULL_TREE);
16451 tree v2sf_ftype_v2si
16452 = build_function_type_list (V2SF_type_node, V2SI_type_node, NULL_TREE);
16453 tree v2si_ftype_v2si
16454 = build_function_type_list (V2SI_type_node, V2SI_type_node, NULL_TREE);
16455 tree v2sf_ftype_v2sf
16456 = build_function_type_list (V2SF_type_node, V2SF_type_node, NULL_TREE);
16457 tree v2sf_ftype_v2sf_v2sf
16458 = build_function_type_list (V2SF_type_node,
16459 V2SF_type_node, V2SF_type_node, NULL_TREE);
16460 tree v2si_ftype_v2sf_v2sf
16461 = build_function_type_list (V2SI_type_node,
16462 V2SF_type_node, V2SF_type_node, NULL_TREE);
16463 tree pint_type_node = build_pointer_type (integer_type_node);
16464 tree pdouble_type_node = build_pointer_type (double_type_node);
16465 tree pcdouble_type_node = build_pointer_type (
16466 build_type_variant (double_type_node, 1, 0));
16467 tree int_ftype_v2df_v2df
16468 = build_function_type_list (integer_type_node,
16469 V2DF_type_node, V2DF_type_node, NULL_TREE);
16470
16471 tree void_ftype_pcvoid
16472 = build_function_type_list (void_type_node, const_ptr_type_node, NULL_TREE);
16473 tree v4sf_ftype_v4si
16474 = build_function_type_list (V4SF_type_node, V4SI_type_node, NULL_TREE);
16475 tree v4si_ftype_v4sf
16476 = build_function_type_list (V4SI_type_node, V4SF_type_node, NULL_TREE);
16477 tree v2df_ftype_v4si
16478 = build_function_type_list (V2DF_type_node, V4SI_type_node, NULL_TREE);
16479 tree v4si_ftype_v2df
16480 = build_function_type_list (V4SI_type_node, V2DF_type_node, NULL_TREE);
16481 tree v2si_ftype_v2df
16482 = build_function_type_list (V2SI_type_node, V2DF_type_node, NULL_TREE);
16483 tree v4sf_ftype_v2df
16484 = build_function_type_list (V4SF_type_node, V2DF_type_node, NULL_TREE);
16485 tree v2df_ftype_v2si
16486 = build_function_type_list (V2DF_type_node, V2SI_type_node, NULL_TREE);
16487 tree v2df_ftype_v4sf
16488 = build_function_type_list (V2DF_type_node, V4SF_type_node, NULL_TREE);
16489 tree int_ftype_v2df
16490 = build_function_type_list (integer_type_node, V2DF_type_node, NULL_TREE);
16491 tree int64_ftype_v2df
16492 = build_function_type_list (long_long_integer_type_node,
16493 V2DF_type_node, NULL_TREE);
16494 tree v2df_ftype_v2df_int
16495 = build_function_type_list (V2DF_type_node,
16496 V2DF_type_node, integer_type_node, NULL_TREE);
16497 tree v2df_ftype_v2df_int64
16498 = build_function_type_list (V2DF_type_node,
16499 V2DF_type_node, long_long_integer_type_node,
16500 NULL_TREE);
16501 tree v4sf_ftype_v4sf_v2df
16502 = build_function_type_list (V4SF_type_node,
16503 V4SF_type_node, V2DF_type_node, NULL_TREE);
16504 tree v2df_ftype_v2df_v4sf
16505 = build_function_type_list (V2DF_type_node,
16506 V2DF_type_node, V4SF_type_node, NULL_TREE);
16507 tree v2df_ftype_v2df_v2df_int
16508 = build_function_type_list (V2DF_type_node,
16509 V2DF_type_node, V2DF_type_node,
16510 integer_type_node,
16511 NULL_TREE);
16512 tree v2df_ftype_v2df_pcdouble
16513 = build_function_type_list (V2DF_type_node,
16514 V2DF_type_node, pcdouble_type_node, NULL_TREE);
16515 tree void_ftype_pdouble_v2df
16516 = build_function_type_list (void_type_node,
16517 pdouble_type_node, V2DF_type_node, NULL_TREE);
16518 tree void_ftype_pint_int
16519 = build_function_type_list (void_type_node,
16520 pint_type_node, integer_type_node, NULL_TREE);
16521 tree void_ftype_v16qi_v16qi_pchar
16522 = build_function_type_list (void_type_node,
16523 V16QI_type_node, V16QI_type_node,
16524 pchar_type_node, NULL_TREE);
16525 tree v2df_ftype_pcdouble
16526 = build_function_type_list (V2DF_type_node, pcdouble_type_node, NULL_TREE);
16527 tree v2df_ftype_v2df_v2df
16528 = build_function_type_list (V2DF_type_node,
16529 V2DF_type_node, V2DF_type_node, NULL_TREE);
16530 tree v16qi_ftype_v16qi_v16qi
16531 = build_function_type_list (V16QI_type_node,
16532 V16QI_type_node, V16QI_type_node, NULL_TREE);
16533 tree v8hi_ftype_v8hi_v8hi
16534 = build_function_type_list (V8HI_type_node,
16535 V8HI_type_node, V8HI_type_node, NULL_TREE);
16536 tree v4si_ftype_v4si_v4si
16537 = build_function_type_list (V4SI_type_node,
16538 V4SI_type_node, V4SI_type_node, NULL_TREE);
16539 tree v2di_ftype_v2di_v2di
16540 = build_function_type_list (V2DI_type_node,
16541 V2DI_type_node, V2DI_type_node, NULL_TREE);
16542 tree v2di_ftype_v2df_v2df
16543 = build_function_type_list (V2DI_type_node,
16544 V2DF_type_node, V2DF_type_node, NULL_TREE);
16545 tree v2df_ftype_v2df
16546 = build_function_type_list (V2DF_type_node, V2DF_type_node, NULL_TREE);
16547 tree v2di_ftype_v2di_int
16548 = build_function_type_list (V2DI_type_node,
16549 V2DI_type_node, integer_type_node, NULL_TREE);
16550 tree v2di_ftype_v2di_v2di_int
16551 = build_function_type_list (V2DI_type_node, V2DI_type_node,
16552 V2DI_type_node, integer_type_node, NULL_TREE);
16553 tree v4si_ftype_v4si_int
16554 = build_function_type_list (V4SI_type_node,
16555 V4SI_type_node, integer_type_node, NULL_TREE);
16556 tree v8hi_ftype_v8hi_int
16557 = build_function_type_list (V8HI_type_node,
16558 V8HI_type_node, integer_type_node, NULL_TREE);
16559 tree v8hi_ftype_v8hi_v2di
16560 = build_function_type_list (V8HI_type_node,
16561 V8HI_type_node, V2DI_type_node, NULL_TREE);
16562 tree v4si_ftype_v4si_v2di
16563 = build_function_type_list (V4SI_type_node,
16564 V4SI_type_node, V2DI_type_node, NULL_TREE);
16565 tree v4si_ftype_v8hi_v8hi
16566 = build_function_type_list (V4SI_type_node,
16567 V8HI_type_node, V8HI_type_node, NULL_TREE);
16568 tree di_ftype_v8qi_v8qi
16569 = build_function_type_list (long_long_unsigned_type_node,
16570 V8QI_type_node, V8QI_type_node, NULL_TREE);
16571 tree di_ftype_v2si_v2si
16572 = build_function_type_list (long_long_unsigned_type_node,
16573 V2SI_type_node, V2SI_type_node, NULL_TREE);
16574 tree v2di_ftype_v16qi_v16qi
16575 = build_function_type_list (V2DI_type_node,
16576 V16QI_type_node, V16QI_type_node, NULL_TREE);
16577 tree v2di_ftype_v4si_v4si
16578 = build_function_type_list (V2DI_type_node,
16579 V4SI_type_node, V4SI_type_node, NULL_TREE);
16580 tree int_ftype_v16qi
16581 = build_function_type_list (integer_type_node, V16QI_type_node, NULL_TREE);
16582 tree v16qi_ftype_pcchar
16583 = build_function_type_list (V16QI_type_node, pcchar_type_node, NULL_TREE);
16584 tree void_ftype_pchar_v16qi
16585 = build_function_type_list (void_type_node,
16586 pchar_type_node, V16QI_type_node, NULL_TREE);
16587
16588 tree v2di_ftype_v2di_unsigned_unsigned
16589 = build_function_type_list (V2DI_type_node, V2DI_type_node,
16590 unsigned_type_node, unsigned_type_node,
16591 NULL_TREE);
16592 tree v2di_ftype_v2di_v2di_unsigned_unsigned
16593 = build_function_type_list (V2DI_type_node, V2DI_type_node, V2DI_type_node,
16594 unsigned_type_node, unsigned_type_node,
16595 NULL_TREE);
16596 tree v2di_ftype_v2di_v16qi
16597 = build_function_type_list (V2DI_type_node, V2DI_type_node, V16QI_type_node,
16598 NULL_TREE);
16599
16600 tree float80_type;
16601 tree float128_type;
16602 tree ftype;
16603
16604 /* The __float80 type. */
16605 if (TYPE_MODE (long_double_type_node) == XFmode)
16606 (*lang_hooks.types.register_builtin_type) (long_double_type_node,
16607 "__float80");
16608 else
16609 {
16610 /* long double is not the 80-bit type here; create __float80 separately. */
16611 float80_type = make_node (REAL_TYPE);
16612 TYPE_PRECISION (float80_type) = 80;
16613 layout_type (float80_type);
16614 (*lang_hooks.types.register_builtin_type) (float80_type, "__float80");
16615 }
16616
16617 if (TARGET_64BIT)
16618 {
16619 float128_type = make_node (REAL_TYPE);
16620 TYPE_PRECISION (float128_type) = 128;
16621 layout_type (float128_type);
16622 (*lang_hooks.types.register_builtin_type) (float128_type, "__float128");
16623 }
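/* Editor's illustration (hypothetical user code, not from this file): once
   registered, the types above can be named directly when compiling for x86,
   under the same conditions as the code above, e.g.

     __float80 w = 1.0L;          // 80-bit extended precision
     #ifdef __x86_64__
     __float128 q;                // registered only for 64-bit targets here
     #endif
   */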
16624
16625 /* Add all builtins that are more or less simple operations on two
16626 operands. */
16627 for (i = 0, d = bdesc_2arg; i < ARRAY_SIZE (bdesc_2arg); i++, d++)
16628 {
16629 /* Use one of the operands; the target can have a different mode for
16630 mask-generating compares. */
16631 enum machine_mode mode;
16632 tree type;
16633
16634 if (d->name == 0)
16635 continue;
16636 mode = insn_data[d->icode].operand[1].mode;
16637
16638 switch (mode)
16639 {
16640 case V16QImode:
16641 type = v16qi_ftype_v16qi_v16qi;
16642 break;
16643 case V8HImode:
16644 type = v8hi_ftype_v8hi_v8hi;
16645 break;
16646 case V4SImode:
16647 type = v4si_ftype_v4si_v4si;
16648 break;
16649 case V2DImode:
16650 type = v2di_ftype_v2di_v2di;
16651 break;
16652 case V2DFmode:
16653 type = v2df_ftype_v2df_v2df;
16654 break;
16655 case V4SFmode:
16656 type = v4sf_ftype_v4sf_v4sf;
16657 break;
16658 case V8QImode:
16659 type = v8qi_ftype_v8qi_v8qi;
16660 break;
16661 case V4HImode:
16662 type = v4hi_ftype_v4hi_v4hi;
16663 break;
16664 case V2SImode:
16665 type = v2si_ftype_v2si_v2si;
16666 break;
16667 case DImode:
16668 type = di_ftype_di_di;
16669 break;
16670
16671 default:
16672 gcc_unreachable ();
16673 }
16674
16675 /* Override for comparisons. */
16676 if (d->icode == CODE_FOR_sse_maskcmpv4sf3
16677 || d->icode == CODE_FOR_sse_vmmaskcmpv4sf3)
16678 type = v4si_ftype_v4sf_v4sf;
16679
16680 if (d->icode == CODE_FOR_sse2_maskcmpv2df3
16681 || d->icode == CODE_FOR_sse2_vmmaskcmpv2df3)
16682 type = v2di_ftype_v2df_v2df;
16683
16684 def_builtin (d->mask, d->name, type, d->code);
16685 }
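/* Editor's note, as a worked instance of the loop above: the
   "__builtin_ia32_paddb128" entry uses CODE_FOR_addv16qi3, whose operand 1
   has mode V16QImode, so it is registered with v16qi_ftype_v16qi_v16qi;
   the packed-double compares instead match the CODE_FOR_sse2_maskcmpv2df3
   override and are registered with v2di_ftype_v2df_v2df.  */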
16686
16687 /* Add all builtins that are more or less simple operations on one operand. */
16688 for (i = 0, d = bdesc_1arg; i < ARRAY_SIZE (bdesc_1arg); i++, d++)
16689 {
16690 enum machine_mode mode;
16691 tree type;
16692
16693 if (d->name == 0)
16694 continue;
16695 mode = insn_data[d->icode].operand[1].mode;
16696
16697 switch (mode)
16698 {
16699 case V16QImode:
16700 type = v16qi_ftype_v16qi;
16701 break;
16702 case V8HImode:
16703 type = v8hi_ftype_v8hi;
16704 break;
16705 case V4SImode:
16706 type = v4si_ftype_v4si;
16707 break;
16708 case V2DFmode:
16709 type = v2df_ftype_v2df;
16710 break;
16711 case V4SFmode:
16712 type = v4sf_ftype_v4sf;
16713 break;
16714 case V8QImode:
16715 type = v8qi_ftype_v8qi;
16716 break;
16717 case V4HImode:
16718 type = v4hi_ftype_v4hi;
16719 break;
16720 case V2SImode:
16721 type = v2si_ftype_v2si;
16722 break;
16723
16724 default:
16725 gcc_unreachable ();
16726 }
16727
16728 def_builtin (d->mask, d->name, type, d->code);
16729 }
16730
16731 /* Add the remaining MMX insns with somewhat more complicated types. */
16732 def_builtin (MASK_MMX, "__builtin_ia32_emms", void_ftype_void, IX86_BUILTIN_EMMS);
16733 def_builtin (MASK_MMX, "__builtin_ia32_psllw", v4hi_ftype_v4hi_di, IX86_BUILTIN_PSLLW);
16734 def_builtin (MASK_MMX, "__builtin_ia32_pslld", v2si_ftype_v2si_di, IX86_BUILTIN_PSLLD);
16735 def_builtin (MASK_MMX, "__builtin_ia32_psllq", di_ftype_di_di, IX86_BUILTIN_PSLLQ);
16736
16737 def_builtin (MASK_MMX, "__builtin_ia32_psrlw", v4hi_ftype_v4hi_di, IX86_BUILTIN_PSRLW);
16738 def_builtin (MASK_MMX, "__builtin_ia32_psrld", v2si_ftype_v2si_di, IX86_BUILTIN_PSRLD);
16739 def_builtin (MASK_MMX, "__builtin_ia32_psrlq", di_ftype_di_di, IX86_BUILTIN_PSRLQ);
16740
16741 def_builtin (MASK_MMX, "__builtin_ia32_psraw", v4hi_ftype_v4hi_di, IX86_BUILTIN_PSRAW);
16742 def_builtin (MASK_MMX, "__builtin_ia32_psrad", v2si_ftype_v2si_di, IX86_BUILTIN_PSRAD);
16743
16744 def_builtin (MASK_SSE | MASK_3DNOW_A, "__builtin_ia32_pshufw", v4hi_ftype_v4hi_int, IX86_BUILTIN_PSHUFW);
16745 def_builtin (MASK_MMX, "__builtin_ia32_pmaddwd", v2si_ftype_v4hi_v4hi, IX86_BUILTIN_PMADDWD);
16746
16747 /* comi/ucomi insns. */
16748 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
16749 if (d->mask == MASK_SSE2)
16750 def_builtin (d->mask, d->name, int_ftype_v2df_v2df, d->code);
16751 else
16752 def_builtin (d->mask, d->name, int_ftype_v4sf_v4sf, d->code);
16753
16754 def_builtin (MASK_MMX, "__builtin_ia32_packsswb", v8qi_ftype_v4hi_v4hi, IX86_BUILTIN_PACKSSWB);
16755 def_builtin (MASK_MMX, "__builtin_ia32_packssdw", v4hi_ftype_v2si_v2si, IX86_BUILTIN_PACKSSDW);
16756 def_builtin (MASK_MMX, "__builtin_ia32_packuswb", v8qi_ftype_v4hi_v4hi, IX86_BUILTIN_PACKUSWB);
16757
16758 def_builtin (MASK_SSE, "__builtin_ia32_ldmxcsr", void_ftype_unsigned, IX86_BUILTIN_LDMXCSR);
16759 def_builtin (MASK_SSE, "__builtin_ia32_stmxcsr", unsigned_ftype_void, IX86_BUILTIN_STMXCSR);
16760 def_builtin_const (MASK_SSE, "__builtin_ia32_cvtpi2ps", v4sf_ftype_v4sf_v2si, IX86_BUILTIN_CVTPI2PS);
16761 def_builtin_const (MASK_SSE, "__builtin_ia32_cvtps2pi", v2si_ftype_v4sf, IX86_BUILTIN_CVTPS2PI);
16762 def_builtin_const (MASK_SSE, "__builtin_ia32_cvtsi2ss", v4sf_ftype_v4sf_int, IX86_BUILTIN_CVTSI2SS);
16763 def_builtin_const (MASK_SSE | MASK_64BIT, "__builtin_ia32_cvtsi642ss", v4sf_ftype_v4sf_int64, IX86_BUILTIN_CVTSI642SS);
16764 def_builtin_const (MASK_SSE, "__builtin_ia32_cvtss2si", int_ftype_v4sf, IX86_BUILTIN_CVTSS2SI);
16765 def_builtin_const (MASK_SSE | MASK_64BIT, "__builtin_ia32_cvtss2si64", int64_ftype_v4sf, IX86_BUILTIN_CVTSS2SI64);
16766 def_builtin_const (MASK_SSE, "__builtin_ia32_cvttps2pi", v2si_ftype_v4sf, IX86_BUILTIN_CVTTPS2PI);
16767 def_builtin_const (MASK_SSE, "__builtin_ia32_cvttss2si", int_ftype_v4sf, IX86_BUILTIN_CVTTSS2SI);
16768 def_builtin_const (MASK_SSE | MASK_64BIT, "__builtin_ia32_cvttss2si64", int64_ftype_v4sf, IX86_BUILTIN_CVTTSS2SI64);
16769
16770 def_builtin (MASK_SSE | MASK_3DNOW_A, "__builtin_ia32_maskmovq", void_ftype_v8qi_v8qi_pchar, IX86_BUILTIN_MASKMOVQ);
16771
16772 def_builtin (MASK_SSE, "__builtin_ia32_loadups", v4sf_ftype_pcfloat, IX86_BUILTIN_LOADUPS);
16773 def_builtin (MASK_SSE, "__builtin_ia32_storeups", void_ftype_pfloat_v4sf, IX86_BUILTIN_STOREUPS);
16774
16775 def_builtin (MASK_SSE, "__builtin_ia32_loadhps", v4sf_ftype_v4sf_pv2si, IX86_BUILTIN_LOADHPS);
16776 def_builtin (MASK_SSE, "__builtin_ia32_loadlps", v4sf_ftype_v4sf_pv2si, IX86_BUILTIN_LOADLPS);
16777 def_builtin (MASK_SSE, "__builtin_ia32_storehps", void_ftype_pv2si_v4sf, IX86_BUILTIN_STOREHPS);
16778 def_builtin (MASK_SSE, "__builtin_ia32_storelps", void_ftype_pv2si_v4sf, IX86_BUILTIN_STORELPS);
16779
16780 def_builtin (MASK_SSE, "__builtin_ia32_movmskps", int_ftype_v4sf, IX86_BUILTIN_MOVMSKPS);
16781 def_builtin (MASK_SSE | MASK_3DNOW_A, "__builtin_ia32_pmovmskb", int_ftype_v8qi, IX86_BUILTIN_PMOVMSKB);
16782 def_builtin (MASK_SSE, "__builtin_ia32_movntps", void_ftype_pfloat_v4sf, IX86_BUILTIN_MOVNTPS);
16783 def_builtin (MASK_SSE | MASK_3DNOW_A, "__builtin_ia32_movntq", void_ftype_pdi_di, IX86_BUILTIN_MOVNTQ);
16784
16785 def_builtin (MASK_SSE | MASK_3DNOW_A, "__builtin_ia32_sfence", void_ftype_void, IX86_BUILTIN_SFENCE);
16786
16787 def_builtin (MASK_SSE | MASK_3DNOW_A, "__builtin_ia32_psadbw", di_ftype_v8qi_v8qi, IX86_BUILTIN_PSADBW);
16788
16789 def_builtin (MASK_SSE, "__builtin_ia32_rcpps", v4sf_ftype_v4sf, IX86_BUILTIN_RCPPS);
16790 def_builtin (MASK_SSE, "__builtin_ia32_rcpss", v4sf_ftype_v4sf, IX86_BUILTIN_RCPSS);
16791 def_builtin (MASK_SSE, "__builtin_ia32_rsqrtps", v4sf_ftype_v4sf, IX86_BUILTIN_RSQRTPS);
16792 def_builtin (MASK_SSE, "__builtin_ia32_rsqrtss", v4sf_ftype_v4sf, IX86_BUILTIN_RSQRTSS);
16793 def_builtin_const (MASK_SSE, "__builtin_ia32_sqrtps", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTPS);
16794 def_builtin_const (MASK_SSE, "__builtin_ia32_sqrtss", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTSS);
16795
16796 def_builtin (MASK_SSE, "__builtin_ia32_shufps", v4sf_ftype_v4sf_v4sf_int, IX86_BUILTIN_SHUFPS);
16797
16798 /* Original 3DNow! */
16799 def_builtin (MASK_3DNOW, "__builtin_ia32_femms", void_ftype_void, IX86_BUILTIN_FEMMS);
16800 def_builtin (MASK_3DNOW, "__builtin_ia32_pavgusb", v8qi_ftype_v8qi_v8qi, IX86_BUILTIN_PAVGUSB);
16801 def_builtin (MASK_3DNOW, "__builtin_ia32_pf2id", v2si_ftype_v2sf, IX86_BUILTIN_PF2ID);
16802 def_builtin (MASK_3DNOW, "__builtin_ia32_pfacc", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFACC);
16803 def_builtin (MASK_3DNOW, "__builtin_ia32_pfadd", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFADD);
16804 def_builtin (MASK_3DNOW, "__builtin_ia32_pfcmpeq", v2si_ftype_v2sf_v2sf, IX86_BUILTIN_PFCMPEQ);
16805 def_builtin (MASK_3DNOW, "__builtin_ia32_pfcmpge", v2si_ftype_v2sf_v2sf, IX86_BUILTIN_PFCMPGE);
16806 def_builtin (MASK_3DNOW, "__builtin_ia32_pfcmpgt", v2si_ftype_v2sf_v2sf, IX86_BUILTIN_PFCMPGT);
16807 def_builtin (MASK_3DNOW, "__builtin_ia32_pfmax", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFMAX);
16808 def_builtin (MASK_3DNOW, "__builtin_ia32_pfmin", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFMIN);
16809 def_builtin (MASK_3DNOW, "__builtin_ia32_pfmul", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFMUL);
16810 def_builtin (MASK_3DNOW, "__builtin_ia32_pfrcp", v2sf_ftype_v2sf, IX86_BUILTIN_PFRCP);
16811 def_builtin (MASK_3DNOW, "__builtin_ia32_pfrcpit1", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFRCPIT1);
16812 def_builtin (MASK_3DNOW, "__builtin_ia32_pfrcpit2", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFRCPIT2);
16813 def_builtin (MASK_3DNOW, "__builtin_ia32_pfrsqrt", v2sf_ftype_v2sf, IX86_BUILTIN_PFRSQRT);
16814 def_builtin (MASK_3DNOW, "__builtin_ia32_pfrsqit1", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFRSQIT1);
16815 def_builtin (MASK_3DNOW, "__builtin_ia32_pfsub", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFSUB);
16816 def_builtin (MASK_3DNOW, "__builtin_ia32_pfsubr", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFSUBR);
16817 def_builtin (MASK_3DNOW, "__builtin_ia32_pi2fd", v2sf_ftype_v2si, IX86_BUILTIN_PI2FD);
16818 def_builtin (MASK_3DNOW, "__builtin_ia32_pmulhrw", v4hi_ftype_v4hi_v4hi, IX86_BUILTIN_PMULHRW);
16819
16820 /* 3DNow! extension as used in the Athlon CPU. */
16821 def_builtin (MASK_3DNOW_A, "__builtin_ia32_pf2iw", v2si_ftype_v2sf, IX86_BUILTIN_PF2IW);
16822 def_builtin (MASK_3DNOW_A, "__builtin_ia32_pfnacc", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFNACC);
16823 def_builtin (MASK_3DNOW_A, "__builtin_ia32_pfpnacc", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFPNACC);
16824 def_builtin (MASK_3DNOW_A, "__builtin_ia32_pi2fw", v2sf_ftype_v2si, IX86_BUILTIN_PI2FW);
16825 def_builtin (MASK_3DNOW_A, "__builtin_ia32_pswapdsf", v2sf_ftype_v2sf, IX86_BUILTIN_PSWAPDSF);
16826 def_builtin (MASK_3DNOW_A, "__builtin_ia32_pswapdsi", v2si_ftype_v2si, IX86_BUILTIN_PSWAPDSI);
16827
16828 /* SSE2 */
16829 def_builtin (MASK_SSE2, "__builtin_ia32_maskmovdqu", void_ftype_v16qi_v16qi_pchar, IX86_BUILTIN_MASKMOVDQU);
16830
16831 def_builtin (MASK_SSE2, "__builtin_ia32_loadupd", v2df_ftype_pcdouble, IX86_BUILTIN_LOADUPD);
16832 def_builtin (MASK_SSE2, "__builtin_ia32_storeupd", void_ftype_pdouble_v2df, IX86_BUILTIN_STOREUPD);
16833
16834 def_builtin (MASK_SSE2, "__builtin_ia32_loadhpd", v2df_ftype_v2df_pcdouble, IX86_BUILTIN_LOADHPD);
16835 def_builtin (MASK_SSE2, "__builtin_ia32_loadlpd", v2df_ftype_v2df_pcdouble, IX86_BUILTIN_LOADLPD);
16836
16837 def_builtin (MASK_SSE2, "__builtin_ia32_movmskpd", int_ftype_v2df, IX86_BUILTIN_MOVMSKPD);
16838 def_builtin (MASK_SSE2, "__builtin_ia32_pmovmskb128", int_ftype_v16qi, IX86_BUILTIN_PMOVMSKB128);
16839 def_builtin (MASK_SSE2, "__builtin_ia32_movnti", void_ftype_pint_int, IX86_BUILTIN_MOVNTI);
16840 def_builtin (MASK_SSE2, "__builtin_ia32_movntpd", void_ftype_pdouble_v2df, IX86_BUILTIN_MOVNTPD);
16841 def_builtin (MASK_SSE2, "__builtin_ia32_movntdq", void_ftype_pv2di_v2di, IX86_BUILTIN_MOVNTDQ);
16842
16843 def_builtin (MASK_SSE2, "__builtin_ia32_pshufd", v4si_ftype_v4si_int, IX86_BUILTIN_PSHUFD);
16844 def_builtin (MASK_SSE2, "__builtin_ia32_pshuflw", v8hi_ftype_v8hi_int, IX86_BUILTIN_PSHUFLW);
16845 def_builtin (MASK_SSE2, "__builtin_ia32_pshufhw", v8hi_ftype_v8hi_int, IX86_BUILTIN_PSHUFHW);
16846 def_builtin (MASK_SSE2, "__builtin_ia32_psadbw128", v2di_ftype_v16qi_v16qi, IX86_BUILTIN_PSADBW128);
16847
16848 def_builtin_const (MASK_SSE2, "__builtin_ia32_sqrtpd", v2df_ftype_v2df, IX86_BUILTIN_SQRTPD);
16849 def_builtin_const (MASK_SSE2, "__builtin_ia32_sqrtsd", v2df_ftype_v2df, IX86_BUILTIN_SQRTSD);
16850
16851 def_builtin (MASK_SSE2, "__builtin_ia32_shufpd", v2df_ftype_v2df_v2df_int, IX86_BUILTIN_SHUFPD);
16852
16853 def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtdq2pd", v2df_ftype_v4si, IX86_BUILTIN_CVTDQ2PD);
16854 def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtdq2ps", v4sf_ftype_v4si, IX86_BUILTIN_CVTDQ2PS);
16855
16856 def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtpd2dq", v4si_ftype_v2df, IX86_BUILTIN_CVTPD2DQ);
16857 def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtpd2pi", v2si_ftype_v2df, IX86_BUILTIN_CVTPD2PI);
16858 def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtpd2ps", v4sf_ftype_v2df, IX86_BUILTIN_CVTPD2PS);
16859 def_builtin_const (MASK_SSE2, "__builtin_ia32_cvttpd2dq", v4si_ftype_v2df, IX86_BUILTIN_CVTTPD2DQ);
16860 def_builtin_const (MASK_SSE2, "__builtin_ia32_cvttpd2pi", v2si_ftype_v2df, IX86_BUILTIN_CVTTPD2PI);
16861
16862 def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtpi2pd", v2df_ftype_v2si, IX86_BUILTIN_CVTPI2PD);
16863
16864 def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtsd2si", int_ftype_v2df, IX86_BUILTIN_CVTSD2SI);
16865 def_builtin_const (MASK_SSE2, "__builtin_ia32_cvttsd2si", int_ftype_v2df, IX86_BUILTIN_CVTTSD2SI);
16866 def_builtin_const (MASK_SSE2 | MASK_64BIT, "__builtin_ia32_cvtsd2si64", int64_ftype_v2df, IX86_BUILTIN_CVTSD2SI64);
16867 def_builtin_const (MASK_SSE2 | MASK_64BIT, "__builtin_ia32_cvttsd2si64", int64_ftype_v2df, IX86_BUILTIN_CVTTSD2SI64);
16868
16869 def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtps2dq", v4si_ftype_v4sf, IX86_BUILTIN_CVTPS2DQ);
16870 def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtps2pd", v2df_ftype_v4sf, IX86_BUILTIN_CVTPS2PD);
16871 def_builtin_const (MASK_SSE2, "__builtin_ia32_cvttps2dq", v4si_ftype_v4sf, IX86_BUILTIN_CVTTPS2DQ);
16872
16873 def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtsi2sd", v2df_ftype_v2df_int, IX86_BUILTIN_CVTSI2SD);
16874 def_builtin_const (MASK_SSE2 | MASK_64BIT, "__builtin_ia32_cvtsi642sd", v2df_ftype_v2df_int64, IX86_BUILTIN_CVTSI642SD);
16875 def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtsd2ss", v4sf_ftype_v4sf_v2df, IX86_BUILTIN_CVTSD2SS);
16876 def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtss2sd", v2df_ftype_v2df_v4sf, IX86_BUILTIN_CVTSS2SD);
16877
16878 def_builtin (MASK_SSE2, "__builtin_ia32_clflush", void_ftype_pcvoid, IX86_BUILTIN_CLFLUSH);
16879 def_builtin (MASK_SSE2, "__builtin_ia32_lfence", void_ftype_void, IX86_BUILTIN_LFENCE);
16880 def_builtin (MASK_SSE2, "__builtin_ia32_mfence", void_ftype_void, IX86_BUILTIN_MFENCE);
16881
16882 def_builtin (MASK_SSE2, "__builtin_ia32_loaddqu", v16qi_ftype_pcchar, IX86_BUILTIN_LOADDQU);
16883 def_builtin (MASK_SSE2, "__builtin_ia32_storedqu", void_ftype_pchar_v16qi, IX86_BUILTIN_STOREDQU);
16884
16885 def_builtin (MASK_SSE2, "__builtin_ia32_pmuludq", di_ftype_v2si_v2si, IX86_BUILTIN_PMULUDQ);
16886 def_builtin (MASK_SSE2, "__builtin_ia32_pmuludq128", v2di_ftype_v4si_v4si, IX86_BUILTIN_PMULUDQ128);
16887
16888 def_builtin (MASK_SSE2, "__builtin_ia32_psllw128", v8hi_ftype_v8hi_v2di, IX86_BUILTIN_PSLLW128);
16889 def_builtin (MASK_SSE2, "__builtin_ia32_pslld128", v4si_ftype_v4si_v2di, IX86_BUILTIN_PSLLD128);
16890 def_builtin (MASK_SSE2, "__builtin_ia32_psllq128", v2di_ftype_v2di_v2di, IX86_BUILTIN_PSLLQ128);
16891
16892 def_builtin (MASK_SSE2, "__builtin_ia32_psrlw128", v8hi_ftype_v8hi_v2di, IX86_BUILTIN_PSRLW128);
16893 def_builtin (MASK_SSE2, "__builtin_ia32_psrld128", v4si_ftype_v4si_v2di, IX86_BUILTIN_PSRLD128);
16894 def_builtin (MASK_SSE2, "__builtin_ia32_psrlq128", v2di_ftype_v2di_v2di, IX86_BUILTIN_PSRLQ128);
16895
16896 def_builtin (MASK_SSE2, "__builtin_ia32_psraw128", v8hi_ftype_v8hi_v2di, IX86_BUILTIN_PSRAW128);
16897 def_builtin (MASK_SSE2, "__builtin_ia32_psrad128", v4si_ftype_v4si_v2di, IX86_BUILTIN_PSRAD128);
16898
16899 def_builtin (MASK_SSE2, "__builtin_ia32_pslldqi128", v2di_ftype_v2di_int, IX86_BUILTIN_PSLLDQI128);
16900 def_builtin (MASK_SSE2, "__builtin_ia32_psllwi128", v8hi_ftype_v8hi_int, IX86_BUILTIN_PSLLWI128);
16901 def_builtin (MASK_SSE2, "__builtin_ia32_pslldi128", v4si_ftype_v4si_int, IX86_BUILTIN_PSLLDI128);
16902 def_builtin (MASK_SSE2, "__builtin_ia32_psllqi128", v2di_ftype_v2di_int, IX86_BUILTIN_PSLLQI128);
16903
16904 def_builtin (MASK_SSE2, "__builtin_ia32_psrldqi128", v2di_ftype_v2di_int, IX86_BUILTIN_PSRLDQI128);
16905 def_builtin (MASK_SSE2, "__builtin_ia32_psrlwi128", v8hi_ftype_v8hi_int, IX86_BUILTIN_PSRLWI128);
16906 def_builtin (MASK_SSE2, "__builtin_ia32_psrldi128", v4si_ftype_v4si_int, IX86_BUILTIN_PSRLDI128);
16907 def_builtin (MASK_SSE2, "__builtin_ia32_psrlqi128", v2di_ftype_v2di_int, IX86_BUILTIN_PSRLQI128);
16908
16909 def_builtin (MASK_SSE2, "__builtin_ia32_psrawi128", v8hi_ftype_v8hi_int, IX86_BUILTIN_PSRAWI128);
16910 def_builtin (MASK_SSE2, "__builtin_ia32_psradi128", v4si_ftype_v4si_int, IX86_BUILTIN_PSRADI128);
16911
16912 def_builtin (MASK_SSE2, "__builtin_ia32_pmaddwd128", v4si_ftype_v8hi_v8hi, IX86_BUILTIN_PMADDWD128);
16913
16914 /* Prescott New Instructions. */
16915 def_builtin (MASK_SSE3, "__builtin_ia32_monitor",
16916 void_ftype_pcvoid_unsigned_unsigned,
16917 IX86_BUILTIN_MONITOR);
16918 def_builtin (MASK_SSE3, "__builtin_ia32_mwait",
16919 void_ftype_unsigned_unsigned,
16920 IX86_BUILTIN_MWAIT);
16921 def_builtin (MASK_SSE3, "__builtin_ia32_movshdup",
16922 v4sf_ftype_v4sf,
16923 IX86_BUILTIN_MOVSHDUP);
16924 def_builtin (MASK_SSE3, "__builtin_ia32_movsldup",
16925 v4sf_ftype_v4sf,
16926 IX86_BUILTIN_MOVSLDUP);
16927 def_builtin (MASK_SSE3, "__builtin_ia32_lddqu",
16928 v16qi_ftype_pcchar, IX86_BUILTIN_LDDQU);
16929
16930 /* SSSE3. */
16931 def_builtin (MASK_SSSE3, "__builtin_ia32_palignr128",
16932 v2di_ftype_v2di_v2di_int, IX86_BUILTIN_PALIGNR128);
16933 def_builtin (MASK_SSSE3, "__builtin_ia32_palignr", di_ftype_di_di_int,
16934 IX86_BUILTIN_PALIGNR);
16935
16936 /* AMDFAM10 SSE4A new built-ins. */
16937 def_builtin (MASK_SSE4A, "__builtin_ia32_movntsd",
16938 void_ftype_pdouble_v2df, IX86_BUILTIN_MOVNTSD);
16939 def_builtin (MASK_SSE4A, "__builtin_ia32_movntss",
16940 void_ftype_pfloat_v4sf, IX86_BUILTIN_MOVNTSS);
16941 def_builtin (MASK_SSE4A, "__builtin_ia32_extrqi",
16942 v2di_ftype_v2di_unsigned_unsigned, IX86_BUILTIN_EXTRQI);
16943 def_builtin (MASK_SSE4A, "__builtin_ia32_extrq",
16944 v2di_ftype_v2di_v16qi, IX86_BUILTIN_EXTRQ);
16945 def_builtin (MASK_SSE4A, "__builtin_ia32_insertqi",
16946 v2di_ftype_v2di_v2di_unsigned_unsigned, IX86_BUILTIN_INSERTQI);
16947 def_builtin (MASK_SSE4A, "__builtin_ia32_insertq",
16948 v2di_ftype_v2di_v2di, IX86_BUILTIN_INSERTQ);
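/* Editor's usage sketch (hypothetical user code, not part of GCC): with
   -msse4a the builtins above are callable directly, e.g.

     typedef double v2df_ex __attribute__ ((vector_size (16)));

     void
     stream_low (double *p, v2df_ex x)
     {
       __builtin_ia32_movntsd (p, x);   // matches void_ftype_pdouble_v2df above
     }

   The vector typedef and the function name are made up for this sketch.  */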
16949
16950 /* Access to the vec_init patterns. */
16951 ftype = build_function_type_list (V2SI_type_node, integer_type_node,
16952 integer_type_node, NULL_TREE);
16953 def_builtin (MASK_MMX, "__builtin_ia32_vec_init_v2si",
16954 ftype, IX86_BUILTIN_VEC_INIT_V2SI);
16955
16956 ftype = build_function_type_list (V4HI_type_node, short_integer_type_node,
16957 short_integer_type_node,
16958 short_integer_type_node,
16959 short_integer_type_node, NULL_TREE);
16960 def_builtin (MASK_MMX, "__builtin_ia32_vec_init_v4hi",
16961 ftype, IX86_BUILTIN_VEC_INIT_V4HI);
16962
16963 ftype = build_function_type_list (V8QI_type_node, char_type_node,
16964 char_type_node, char_type_node,
16965 char_type_node, char_type_node,
16966 char_type_node, char_type_node,
16967 char_type_node, NULL_TREE);
16968 def_builtin (MASK_MMX, "__builtin_ia32_vec_init_v8qi",
16969 ftype, IX86_BUILTIN_VEC_INIT_V8QI);
16970
16971 /* Access to the vec_extract patterns. */
16972 ftype = build_function_type_list (double_type_node, V2DF_type_node,
16973 integer_type_node, NULL_TREE);
16974 def_builtin (MASK_SSE, "__builtin_ia32_vec_ext_v2df",
16975 ftype, IX86_BUILTIN_VEC_EXT_V2DF);
16976
16977 ftype = build_function_type_list (long_long_integer_type_node,
16978 V2DI_type_node, integer_type_node,
16979 NULL_TREE);
16980 def_builtin (MASK_SSE, "__builtin_ia32_vec_ext_v2di",
16981 ftype, IX86_BUILTIN_VEC_EXT_V2DI);
16982
16983 ftype = build_function_type_list (float_type_node, V4SF_type_node,
16984 integer_type_node, NULL_TREE);
16985 def_builtin (MASK_SSE, "__builtin_ia32_vec_ext_v4sf",
16986 ftype, IX86_BUILTIN_VEC_EXT_V4SF);
16987
16988 ftype = build_function_type_list (intSI_type_node, V4SI_type_node,
16989 integer_type_node, NULL_TREE);
16990 def_builtin (MASK_SSE, "__builtin_ia32_vec_ext_v4si",
16991 ftype, IX86_BUILTIN_VEC_EXT_V4SI);
16992
16993 ftype = build_function_type_list (intHI_type_node, V8HI_type_node,
16994 integer_type_node, NULL_TREE);
16995 def_builtin (MASK_SSE, "__builtin_ia32_vec_ext_v8hi",
16996 ftype, IX86_BUILTIN_VEC_EXT_V8HI);
16997
16998 ftype = build_function_type_list (intHI_type_node, V4HI_type_node,
16999 integer_type_node, NULL_TREE);
17000 def_builtin (MASK_SSE | MASK_3DNOW_A, "__builtin_ia32_vec_ext_v4hi",
17001 ftype, IX86_BUILTIN_VEC_EXT_V4HI);
17002
17003 ftype = build_function_type_list (intSI_type_node, V2SI_type_node,
17004 integer_type_node, NULL_TREE);
17005 def_builtin (MASK_MMX, "__builtin_ia32_vec_ext_v2si",
17006 ftype, IX86_BUILTIN_VEC_EXT_V2SI);
17007
17008 /* Access to the vec_set patterns. */
17009 ftype = build_function_type_list (V8HI_type_node, V8HI_type_node,
17010 intHI_type_node,
17011 integer_type_node, NULL_TREE);
17012 def_builtin (MASK_SSE, "__builtin_ia32_vec_set_v8hi",
17013 ftype, IX86_BUILTIN_VEC_SET_V8HI);
17014
17015 ftype = build_function_type_list (V4HI_type_node, V4HI_type_node,
17016 intHI_type_node,
17017 integer_type_node, NULL_TREE);
17018 def_builtin (MASK_SSE | MASK_3DNOW_A, "__builtin_ia32_vec_set_v4hi",
17019 ftype, IX86_BUILTIN_VEC_SET_V4HI);
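/* Editor's illustration (hypothetical user code): the vec_init/vec_ext/vec_set
   builtins registered above take the vector plus an element index, e.g.

     typedef float v4sf_ex __attribute__ ((vector_size (16)));

     float
     first_elt (v4sf_ex v)
     {
       return __builtin_ia32_vec_ext_v4sf (v, 0);   // float (V4SF, int) per the ftype above
     }

   The typedef and the function name are invented for this sketch.  */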
17020 }
17021
17022 /* Errors in the source file can cause expand_expr to return const0_rtx
17023 where we expect a vector. To avoid crashing, use one of the vector
17024 clear instructions. */
17025 static rtx
17026 safe_vector_operand (rtx x, enum machine_mode mode)
17027 {
17028 if (x == const0_rtx)
17029 x = CONST0_RTX (mode);
17030 return x;
17031 }
17032
17033 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
17034
17035 static rtx
17036 ix86_expand_binop_builtin (enum insn_code icode, tree arglist, rtx target)
17037 {
17038 rtx pat, xops[3];
17039 tree arg0 = TREE_VALUE (arglist);
17040 tree arg1 = TREE_VALUE (TREE_CHAIN (arglist));
17041 rtx op0 = expand_normal (arg0);
17042 rtx op1 = expand_normal (arg1);
17043 enum machine_mode tmode = insn_data[icode].operand[0].mode;
17044 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
17045 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
17046
17047 if (VECTOR_MODE_P (mode0))
17048 op0 = safe_vector_operand (op0, mode0);
17049 if (VECTOR_MODE_P (mode1))
17050 op1 = safe_vector_operand (op1, mode1);
17051
17052 if (optimize || !target
17053 || GET_MODE (target) != tmode
17054 || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
17055 target = gen_reg_rtx (tmode);
17056
17057 if (GET_MODE (op1) == SImode && mode1 == TImode)
17058 {
17059 rtx x = gen_reg_rtx (V4SImode);
17060 emit_insn (gen_sse2_loadd (x, op1));
17061 op1 = gen_lowpart (TImode, x);
17062 }
17063
17064 /* The insn must want input operands in the same modes as the
17065 result. */
17066 gcc_assert ((GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
17067 && (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode));
17068
17069 if (!(*insn_data[icode].operand[1].predicate) (op0, mode0))
17070 op0 = copy_to_mode_reg (mode0, op0);
17071 if (!(*insn_data[icode].operand[2].predicate) (op1, mode1))
17072 op1 = copy_to_mode_reg (mode1, op1);
17073
17074 /* ??? Using ix86_fixup_binary_operands is problematic when
17075 we've got mismatched modes. Fake it. */
17076
17077 xops[0] = target;
17078 xops[1] = op0;
17079 xops[2] = op1;
17080
17081 if (tmode == mode0 && tmode == mode1)
17082 {
17083 target = ix86_fixup_binary_operands (UNKNOWN, tmode, xops);
17084 op0 = xops[1];
17085 op1 = xops[2];
17086 }
17087 else if (optimize || !ix86_binary_operator_ok (UNKNOWN, tmode, xops))
17088 {
17089 op0 = force_reg (mode0, op0);
17090 op1 = force_reg (mode1, op1);
17091 target = gen_reg_rtx (tmode);
17092 }
17093
17094 pat = GEN_FCN (icode) (target, op0, op1);
17095 if (! pat)
17096 return 0;
17097 emit_insn (pat);
17098 return target;
17099 }
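/* Editor's note: for a typical two-operand builtin such as
   __builtin_ia32_paddw128 (icode CODE_FOR_addv8hi3 in the table above), the
   target mode and both operand modes agree (V8HImode), so the routine above
   simply forces the arguments into registers as needed and emits one add
   insn; the SImode/TImode load and the mismatched-mode fallback only matter
   for the few builtins whose pattern modes differ from their argument
   modes.  */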
17100
17101 /* Subroutine of ix86_expand_builtin to take care of stores. */
17102
17103 static rtx
17104 ix86_expand_store_builtin (enum insn_code icode, tree arglist)
17105 {
17106 rtx pat;
17107 tree arg0 = TREE_VALUE (arglist);
17108 tree arg1 = TREE_VALUE (TREE_CHAIN (arglist));
17109 rtx op0 = expand_normal (arg0);
17110 rtx op1 = expand_normal (arg1);
17111 enum machine_mode mode0 = insn_data[icode].operand[0].mode;
17112 enum machine_mode mode1 = insn_data[icode].operand[1].mode;
17113
17114 if (VECTOR_MODE_P (mode1))
17115 op1 = safe_vector_operand (op1, mode1);
17116
17117 op0 = gen_rtx_MEM (mode0, copy_to_mode_reg (Pmode, op0));
17118 op1 = copy_to_mode_reg (mode1, op1);
17119
17120 pat = GEN_FCN (icode) (op0, op1);
17121 if (pat)
17122 emit_insn (pat);
17123 return 0;
17124 }
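/* Editor's note: a store builtin such as __builtin_ia32_movntps hands this
   routine a pointer and a vector; op0 is wrapped in a MEM of the pattern's
   mode, op1 is copied into a register, and the routine deliberately returns
   0 because a store produces no value for the caller to use.  */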
17125
17126 /* Subroutine of ix86_expand_builtin to take care of unop insns. */
17127
17128 static rtx
17129 ix86_expand_unop_builtin (enum insn_code icode, tree arglist,
17130 rtx target, int do_load)
17131 {
17132 rtx pat;
17133 tree arg0 = TREE_VALUE (arglist);
17134 rtx op0 = expand_normal (arg0);
17135 enum machine_mode tmode = insn_data[icode].operand[0].mode;
17136 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
17137
17138 if (optimize || !target
17139 || GET_MODE (target) != tmode
17140 || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
17141 target = gen_reg_rtx (tmode);
17142 if (do_load)
17143 op0 = gen_rtx_MEM (mode0, copy_to_mode_reg (Pmode, op0));
17144 else
17145 {
17146 if (VECTOR_MODE_P (mode0))
17147 op0 = safe_vector_operand (op0, mode0);
17148
17149 if ((optimize && !register_operand (op0, mode0))
17150 || ! (*insn_data[icode].operand[1].predicate) (op0, mode0))
17151 op0 = copy_to_mode_reg (mode0, op0);
17152 }
17153
17154 pat = GEN_FCN (icode) (target, op0);
17155 if (! pat)
17156 return 0;
17157 emit_insn (pat);
17158 return target;
17159 }
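
/* Editorial illustration (not part of the original source): the DO_LOAD
   form of this expander handles the unaligned load builtins, e.g.
   (assuming xmmintrin.h maps _mm_loadu_ps to __builtin_ia32_loadups):

     #include <xmmintrin.h>
     __m128 get4 (const float *p) { return _mm_loadu_ps (p); }

   IX86_BUILTIN_LOADUPS dispatches here with CODE_FOR_sse_movups and
   do_load == 1, so the pointer argument is wrapped in a MEM.  */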
17160
17161 /* Subroutine of ix86_expand_builtin to take care of three special unop insns:
17162 sqrtss, rsqrtss, rcpss. */
17163
17164 static rtx
17165 ix86_expand_unop1_builtin (enum insn_code icode, tree arglist, rtx target)
17166 {
17167 rtx pat;
17168 tree arg0 = TREE_VALUE (arglist);
17169 rtx op1, op0 = expand_normal (arg0);
17170 enum machine_mode tmode = insn_data[icode].operand[0].mode;
17171 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
17172
17173 if (optimize || !target
17174 || GET_MODE (target) != tmode
17175 || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
17176 target = gen_reg_rtx (tmode);
17177
17178 if (VECTOR_MODE_P (mode0))
17179 op0 = safe_vector_operand (op0, mode0);
17180
17181 if ((optimize && !register_operand (op0, mode0))
17182 || ! (*insn_data[icode].operand[1].predicate) (op0, mode0))
17183 op0 = copy_to_mode_reg (mode0, op0);
17184
17185 op1 = op0;
17186 if (! (*insn_data[icode].operand[2].predicate) (op1, mode0))
17187 op1 = copy_to_mode_reg (mode0, op1);
17188
17189 pat = GEN_FCN (icode) (target, op0, op1);
17190 if (! pat)
17191 return 0;
17192 emit_insn (pat);
17193 return target;
17194 }
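
/* Editorial illustration (not part of the original source): the scalar
   square-root intrinsic is a typical caller, e.g. (assuming xmmintrin.h
   maps _mm_sqrt_ss to __builtin_ia32_sqrtss):

     #include <xmmintrin.h>
     __m128 root (__m128 a) { return _mm_sqrt_ss (a); }

   op0 is copied into op1 because the vm* patterns take a second operand
   that supplies the untouched upper elements of the result.  */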
17195
17196 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
17197
17198 static rtx
17199 ix86_expand_sse_compare (const struct builtin_description *d, tree arglist,
17200 rtx target)
17201 {
17202 rtx pat;
17203 tree arg0 = TREE_VALUE (arglist);
17204 tree arg1 = TREE_VALUE (TREE_CHAIN (arglist));
17205 rtx op0 = expand_normal (arg0);
17206 rtx op1 = expand_normal (arg1);
17207 rtx op2;
17208 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
17209 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
17210 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
17211 enum rtx_code comparison = d->comparison;
17212
17213 if (VECTOR_MODE_P (mode0))
17214 op0 = safe_vector_operand (op0, mode0);
17215 if (VECTOR_MODE_P (mode1))
17216 op1 = safe_vector_operand (op1, mode1);
17217
17218 /* Swap operands if we have a comparison that isn't available in
17219 hardware. */
17220 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
17221 {
17222 rtx tmp = gen_reg_rtx (mode1);
17223 emit_move_insn (tmp, op1);
17224 op1 = op0;
17225 op0 = tmp;
17226 }
17227
17228 if (optimize || !target
17229 || GET_MODE (target) != tmode
17230 || ! (*insn_data[d->icode].operand[0].predicate) (target, tmode))
17231 target = gen_reg_rtx (tmode);
17232
17233 if ((optimize && !register_operand (op0, mode0))
17234 || ! (*insn_data[d->icode].operand[1].predicate) (op0, mode0))
17235 op0 = copy_to_mode_reg (mode0, op0);
17236 if ((optimize && !register_operand (op1, mode1))
17237 || ! (*insn_data[d->icode].operand[2].predicate) (op1, mode1))
17238 op1 = copy_to_mode_reg (mode1, op1);
17239
17240 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
17241 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
17242 if (! pat)
17243 return 0;
17244 emit_insn (pat);
17245 return target;
17246 }
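
/* Editorial illustration (not part of the original source): the packed
   SSE comparison intrinsics reach this expander through bdesc_2arg, e.g.
   (assuming xmmintrin.h maps _mm_cmplt_ps to __builtin_ia32_cmpltps):

     #include <xmmintrin.h>
     __m128 lt4 (__m128 a, __m128 b) { return _mm_cmplt_ps (a, b); }

   A builtin like __builtin_ia32_cmpgtps has no direct hardware pattern;
   its descriptor is marked BUILTIN_DESC_SWAP_OPERANDS, so it is emitted
   as LT with the operands exchanged, as done above.  */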
17247
17248 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
17249
17250 static rtx
17251 ix86_expand_sse_comi (const struct builtin_description *d, tree arglist,
17252 rtx target)
17253 {
17254 rtx pat;
17255 tree arg0 = TREE_VALUE (arglist);
17256 tree arg1 = TREE_VALUE (TREE_CHAIN (arglist));
17257 rtx op0 = expand_normal (arg0);
17258 rtx op1 = expand_normal (arg1);
17259 rtx op2;
17260 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
17261 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
17262 enum rtx_code comparison = d->comparison;
17263
17264 if (VECTOR_MODE_P (mode0))
17265 op0 = safe_vector_operand (op0, mode0);
17266 if (VECTOR_MODE_P (mode1))
17267 op1 = safe_vector_operand (op1, mode1);
17268
17269 /* Swap operands if we have a comparison that isn't available in
17270 hardware. */
17271 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
17272 {
17273 rtx tmp = op1;
17274 op1 = op0;
17275 op0 = tmp;
17276 }
17277
17278 target = gen_reg_rtx (SImode);
17279 emit_move_insn (target, const0_rtx);
17280 target = gen_rtx_SUBREG (QImode, target, 0);
17281
17282 if ((optimize && !register_operand (op0, mode0))
17283 || !(*insn_data[d->icode].operand[0].predicate) (op0, mode0))
17284 op0 = copy_to_mode_reg (mode0, op0);
17285 if ((optimize && !register_operand (op1, mode1))
17286 || !(*insn_data[d->icode].operand[1].predicate) (op1, mode1))
17287 op1 = copy_to_mode_reg (mode1, op1);
17288
17289 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
17290 pat = GEN_FCN (d->icode) (op0, op1);
17291 if (! pat)
17292 return 0;
17293 emit_insn (pat);
17294 emit_insn (gen_rtx_SET (VOIDmode,
17295 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
17296 gen_rtx_fmt_ee (comparison, QImode,
17297 SET_DEST (pat),
17298 const0_rtx)));
17299
17300 return SUBREG_REG (target);
17301 }
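
/* Editorial illustration (not part of the original source): the scalar
   ordered-compare intrinsics return an int and go through bdesc_comi,
   e.g. (assuming xmmintrin.h maps _mm_comilt_ss to __builtin_ia32_comilt):

     #include <xmmintrin.h>
     int lt (__m128 a, __m128 b) { return _mm_comilt_ss (a, b); }

   The comi pattern only sets the flags; the code above materializes the
   0/1 result with a setcc into the QImode low part of TARGET.  */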
17302
17303 /* Return the integer constant in ARG. Constrain it to be in the range
17304 of the subparts of VEC_TYPE; issue an error if not. */
17305
17306 static int
17307 get_element_number (tree vec_type, tree arg)
17308 {
17309 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
17310
17311 if (!host_integerp (arg, 1)
17312 || (elt = tree_low_cst (arg, 1), elt > max))
17313 {
17314 error ("selector must be an integer constant in the range 0..%wi", max);
17315 return 0;
17316 }
17317
17318 return elt;
17319 }
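
/* Editorial illustration (not part of the original source): for a V4HI
   vector the valid selectors are 0..3, so (assuming the usual mapping of
   _mm_extract_pi16 to __builtin_ia32_vec_ext_v4hi)

     #include <xmmintrin.h>
     int bad (__m64 v, int i) { return _mm_extract_pi16 (v, i); }

   triggers the "selector must be an integer constant in the range 0..3"
   error, because the selector is not a compile-time constant.  */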
17320
17321 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
17322 ix86_expand_vector_init. We DO have language-level syntax for this, in
17323 the form of (type){ init-list }. Except that since we can't place emms
17324 instructions from inside the compiler, we can't allow the use of MMX
17325 registers unless the user explicitly asks for it. So we do *not* define
17326 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
17327 we have builtins invoked by mmintrin.h that give us license to emit
17328 these sorts of instructions. */
17329
17330 static rtx
17331 ix86_expand_vec_init_builtin (tree type, tree arglist, rtx target)
17332 {
17333 enum machine_mode tmode = TYPE_MODE (type);
17334 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
17335 int i, n_elt = GET_MODE_NUNITS (tmode);
17336 rtvec v = rtvec_alloc (n_elt);
17337
17338 gcc_assert (VECTOR_MODE_P (tmode));
17339
17340 for (i = 0; i < n_elt; ++i, arglist = TREE_CHAIN (arglist))
17341 {
17342 rtx x = expand_normal (TREE_VALUE (arglist));
17343 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
17344 }
17345
17346 gcc_assert (arglist == NULL);
17347
17348 if (!target || !register_operand (target, tmode))
17349 target = gen_reg_rtx (tmode);
17350
17351 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
17352 return target;
17353 }
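
/* Editorial illustration (not part of the original source): the MMX "set"
   intrinsics funnel through these vec_init builtins, e.g. (assuming
   mmintrin.h maps _mm_set_pi32 to __builtin_ia32_vec_init_v2si):

     #include <mmintrin.h>
     __m64 pair (int hi, int lo) { return _mm_set_pi32 (hi, lo); }

   Each argument becomes one element of the PARALLEL handed to
   ix86_expand_vector_init above.  */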
17354
17355 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
17356 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
17357 had a language-level syntax for referencing vector elements. */
17358
17359 static rtx
17360 ix86_expand_vec_ext_builtin (tree arglist, rtx target)
17361 {
17362 enum machine_mode tmode, mode0;
17363 tree arg0, arg1;
17364 int elt;
17365 rtx op0;
17366
17367 arg0 = TREE_VALUE (arglist);
17368 arg1 = TREE_VALUE (TREE_CHAIN (arglist));
17369
17370 op0 = expand_normal (arg0);
17371 elt = get_element_number (TREE_TYPE (arg0), arg1);
17372
17373 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
17374 mode0 = TYPE_MODE (TREE_TYPE (arg0));
17375 gcc_assert (VECTOR_MODE_P (mode0));
17376
17377 op0 = force_reg (mode0, op0);
17378
17379 if (optimize || !target || !register_operand (target, tmode))
17380 target = gen_reg_rtx (tmode);
17381
17382 ix86_expand_vector_extract (true, target, op0, elt);
17383
17384 return target;
17385 }
17386
17387 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
17388 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
17389 a language-level syntax for referencing vector elements. */
17390
17391 static rtx
17392 ix86_expand_vec_set_builtin (tree arglist)
17393 {
17394 enum machine_mode tmode, mode1;
17395 tree arg0, arg1, arg2;
17396 int elt;
17397 rtx op0, op1;
17398
17399 arg0 = TREE_VALUE (arglist);
17400 arg1 = TREE_VALUE (TREE_CHAIN (arglist));
17401 arg2 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist)));
17402
17403 tmode = TYPE_MODE (TREE_TYPE (arg0));
17404 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
17405 gcc_assert (VECTOR_MODE_P (tmode));
17406
17407 op0 = expand_expr (arg0, NULL_RTX, tmode, 0);
17408 op1 = expand_expr (arg1, NULL_RTX, mode1, 0);
17409 elt = get_element_number (TREE_TYPE (arg0), arg2);
17410
17411 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
17412 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
17413
17414 op0 = force_reg (tmode, op0);
17415 op1 = force_reg (mode1, op1);
17416
17417 ix86_expand_vector_set (true, op0, op1, elt);
17418
17419 return op0;
17420 }
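
/* Editorial illustration (not part of the original source): the element
   extract/insert intrinsics use the two wrappers above, e.g. (assuming
   the usual mappings to __builtin_ia32_vec_ext_v4hi and
   __builtin_ia32_vec_set_v4hi):

     #include <xmmintrin.h>
     int   get1 (__m64 v)        { return _mm_extract_pi16 (v, 1); }
     __m64 put1 (__m64 v, int x) { return _mm_insert_pi16 (v, x, 1); }

   The trailing selector is validated by get_element_number and must be a
   constant in range.  */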
17421
17422 /* Expand an expression EXP that calls a built-in function,
17423 with result going to TARGET if that's convenient
17424 (and in mode MODE if that's convenient).
17425 SUBTARGET may be used as the target for computing one of EXP's operands.
17426 IGNORE is nonzero if the value is to be ignored. */
17427
17428 static rtx
17429 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
17430 enum machine_mode mode ATTRIBUTE_UNUSED,
17431 int ignore ATTRIBUTE_UNUSED)
17432 {
17433 const struct builtin_description *d;
17434 size_t i;
17435 enum insn_code icode;
17436 tree fndecl = TREE_OPERAND (TREE_OPERAND (exp, 0), 0);
17437 tree arglist = TREE_OPERAND (exp, 1);
17438 tree arg0, arg1, arg2, arg3;
17439 rtx op0, op1, op2, op3, pat;
17440 enum machine_mode tmode, mode0, mode1, mode2, mode3, mode4;
17441 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
17442
17443 switch (fcode)
17444 {
17445 case IX86_BUILTIN_EMMS:
17446 emit_insn (gen_mmx_emms ());
17447 return 0;
17448
17449 case IX86_BUILTIN_SFENCE:
17450 emit_insn (gen_sse_sfence ());
17451 return 0;
17452
17453 case IX86_BUILTIN_MASKMOVQ:
17454 case IX86_BUILTIN_MASKMOVDQU:
17455 icode = (fcode == IX86_BUILTIN_MASKMOVQ
17456 ? CODE_FOR_mmx_maskmovq
17457 : CODE_FOR_sse2_maskmovdqu);
17458 /* Note the arg order is different from the operand order. */
17459 arg1 = TREE_VALUE (arglist);
17460 arg2 = TREE_VALUE (TREE_CHAIN (arglist));
17461 arg0 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist)));
17462 op0 = expand_normal (arg0);
17463 op1 = expand_normal (arg1);
17464 op2 = expand_normal (arg2);
17465 mode0 = insn_data[icode].operand[0].mode;
17466 mode1 = insn_data[icode].operand[1].mode;
17467 mode2 = insn_data[icode].operand[2].mode;
17468
17469 op0 = force_reg (Pmode, op0);
17470 op0 = gen_rtx_MEM (mode1, op0);
17471
17472 if (! (*insn_data[icode].operand[0].predicate) (op0, mode0))
17473 op0 = copy_to_mode_reg (mode0, op0);
17474 if (! (*insn_data[icode].operand[1].predicate) (op1, mode1))
17475 op1 = copy_to_mode_reg (mode1, op1);
17476 if (! (*insn_data[icode].operand[2].predicate) (op2, mode2))
17477 op2 = copy_to_mode_reg (mode2, op2);
17478 pat = GEN_FCN (icode) (op0, op1, op2);
17479 if (! pat)
17480 return 0;
17481 emit_insn (pat);
17482 return 0;
17483
17484 case IX86_BUILTIN_SQRTSS:
17485 return ix86_expand_unop1_builtin (CODE_FOR_sse_vmsqrtv4sf2, arglist, target);
17486 case IX86_BUILTIN_RSQRTSS:
17487 return ix86_expand_unop1_builtin (CODE_FOR_sse_vmrsqrtv4sf2, arglist, target);
17488 case IX86_BUILTIN_RCPSS:
17489 return ix86_expand_unop1_builtin (CODE_FOR_sse_vmrcpv4sf2, arglist, target);
17490
17491 case IX86_BUILTIN_LOADUPS:
17492 return ix86_expand_unop_builtin (CODE_FOR_sse_movups, arglist, target, 1);
17493
17494 case IX86_BUILTIN_STOREUPS:
17495 return ix86_expand_store_builtin (CODE_FOR_sse_movups, arglist);
17496
17497 case IX86_BUILTIN_LOADHPS:
17498 case IX86_BUILTIN_LOADLPS:
17499 case IX86_BUILTIN_LOADHPD:
17500 case IX86_BUILTIN_LOADLPD:
17501 icode = (fcode == IX86_BUILTIN_LOADHPS ? CODE_FOR_sse_loadhps
17502 : fcode == IX86_BUILTIN_LOADLPS ? CODE_FOR_sse_loadlps
17503 : fcode == IX86_BUILTIN_LOADHPD ? CODE_FOR_sse2_loadhpd
17504 : CODE_FOR_sse2_loadlpd);
17505 arg0 = TREE_VALUE (arglist);
17506 arg1 = TREE_VALUE (TREE_CHAIN (arglist));
17507 op0 = expand_normal (arg0);
17508 op1 = expand_normal (arg1);
17509 tmode = insn_data[icode].operand[0].mode;
17510 mode0 = insn_data[icode].operand[1].mode;
17511 mode1 = insn_data[icode].operand[2].mode;
17512
17513 op0 = force_reg (mode0, op0);
17514 op1 = gen_rtx_MEM (mode1, copy_to_mode_reg (Pmode, op1));
17515 if (optimize || target == 0
17516 || GET_MODE (target) != tmode
17517 || !register_operand (target, tmode))
17518 target = gen_reg_rtx (tmode);
17519 pat = GEN_FCN (icode) (target, op0, op1);
17520 if (! pat)
17521 return 0;
17522 emit_insn (pat);
17523 return target;
17524
17525 case IX86_BUILTIN_STOREHPS:
17526 case IX86_BUILTIN_STORELPS:
17527 icode = (fcode == IX86_BUILTIN_STOREHPS ? CODE_FOR_sse_storehps
17528 : CODE_FOR_sse_storelps);
17529 arg0 = TREE_VALUE (arglist);
17530 arg1 = TREE_VALUE (TREE_CHAIN (arglist));
17531 op0 = expand_normal (arg0);
17532 op1 = expand_normal (arg1);
17533 mode0 = insn_data[icode].operand[0].mode;
17534 mode1 = insn_data[icode].operand[1].mode;
17535
17536 op0 = gen_rtx_MEM (mode0, copy_to_mode_reg (Pmode, op0));
17537 op1 = force_reg (mode1, op1);
17538
17539 pat = GEN_FCN (icode) (op0, op1);
17540 if (! pat)
17541 return 0;
17542 emit_insn (pat);
17543 return const0_rtx;
17544
17545 case IX86_BUILTIN_MOVNTPS:
17546 return ix86_expand_store_builtin (CODE_FOR_sse_movntv4sf, arglist);
17547 case IX86_BUILTIN_MOVNTQ:
17548 return ix86_expand_store_builtin (CODE_FOR_sse_movntdi, arglist);
17549
17550 case IX86_BUILTIN_LDMXCSR:
17551 op0 = expand_normal (TREE_VALUE (arglist));
17552 target = assign_386_stack_local (SImode, SLOT_TEMP);
17553 emit_move_insn (target, op0);
17554 emit_insn (gen_sse_ldmxcsr (target));
17555 return 0;
17556
17557 case IX86_BUILTIN_STMXCSR:
17558 target = assign_386_stack_local (SImode, SLOT_TEMP);
17559 emit_insn (gen_sse_stmxcsr (target));
17560 return copy_to_mode_reg (SImode, target);
17561
17562 case IX86_BUILTIN_SHUFPS:
17563 case IX86_BUILTIN_SHUFPD:
17564 icode = (fcode == IX86_BUILTIN_SHUFPS
17565 ? CODE_FOR_sse_shufps
17566 : CODE_FOR_sse2_shufpd);
17567 arg0 = TREE_VALUE (arglist);
17568 arg1 = TREE_VALUE (TREE_CHAIN (arglist));
17569 arg2 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist)));
17570 op0 = expand_normal (arg0);
17571 op1 = expand_normal (arg1);
17572 op2 = expand_normal (arg2);
17573 tmode = insn_data[icode].operand[0].mode;
17574 mode0 = insn_data[icode].operand[1].mode;
17575 mode1 = insn_data[icode].operand[2].mode;
17576 mode2 = insn_data[icode].operand[3].mode;
17577
17578 if (! (*insn_data[icode].operand[1].predicate) (op0, mode0))
17579 op0 = copy_to_mode_reg (mode0, op0);
17580 if ((optimize && !register_operand (op1, mode1))
17581 || !(*insn_data[icode].operand[2].predicate) (op1, mode1))
17582 op1 = copy_to_mode_reg (mode1, op1);
17583 if (! (*insn_data[icode].operand[3].predicate) (op2, mode2))
17584 {
17585 /* @@@ better error message */
17586 error ("mask must be an immediate");
17587 return gen_reg_rtx (tmode);
17588 }
17589 if (optimize || target == 0
17590 || GET_MODE (target) != tmode
17591 || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
17592 target = gen_reg_rtx (tmode);
17593 pat = GEN_FCN (icode) (target, op0, op1, op2);
17594 if (! pat)
17595 return 0;
17596 emit_insn (pat);
17597 return target;
17598
17599 case IX86_BUILTIN_PSHUFW:
17600 case IX86_BUILTIN_PSHUFD:
17601 case IX86_BUILTIN_PSHUFHW:
17602 case IX86_BUILTIN_PSHUFLW:
17603 icode = ( fcode == IX86_BUILTIN_PSHUFHW ? CODE_FOR_sse2_pshufhw
17604 : fcode == IX86_BUILTIN_PSHUFLW ? CODE_FOR_sse2_pshuflw
17605 : fcode == IX86_BUILTIN_PSHUFD ? CODE_FOR_sse2_pshufd
17606 : CODE_FOR_mmx_pshufw);
17607 arg0 = TREE_VALUE (arglist);
17608 arg1 = TREE_VALUE (TREE_CHAIN (arglist));
17609 op0 = expand_normal (arg0);
17610 op1 = expand_normal (arg1);
17611 tmode = insn_data[icode].operand[0].mode;
17612 mode1 = insn_data[icode].operand[1].mode;
17613 mode2 = insn_data[icode].operand[2].mode;
17614
17615 if (! (*insn_data[icode].operand[1].predicate) (op0, mode1))
17616 op0 = copy_to_mode_reg (mode1, op0);
17617 if (! (*insn_data[icode].operand[2].predicate) (op1, mode2))
17618 {
17619 /* @@@ better error message */
17620 error ("mask must be an immediate");
17621 return const0_rtx;
17622 }
17623 if (target == 0
17624 || GET_MODE (target) != tmode
17625 || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
17626 target = gen_reg_rtx (tmode);
17627 pat = GEN_FCN (icode) (target, op0, op1);
17628 if (! pat)
17629 return 0;
17630 emit_insn (pat);
17631 return target;
17632
17633 case IX86_BUILTIN_PSLLDQI128:
17634 case IX86_BUILTIN_PSRLDQI128:
17635 icode = ( fcode == IX86_BUILTIN_PSLLDQI128 ? CODE_FOR_sse2_ashlti3
17636 : CODE_FOR_sse2_lshrti3);
17637 arg0 = TREE_VALUE (arglist);
17638 arg1 = TREE_VALUE (TREE_CHAIN (arglist));
17639 op0 = expand_normal (arg0);
17640 op1 = expand_normal (arg1);
17641 tmode = insn_data[icode].operand[0].mode;
17642 mode1 = insn_data[icode].operand[1].mode;
17643 mode2 = insn_data[icode].operand[2].mode;
17644
17645 if (! (*insn_data[icode].operand[1].predicate) (op0, mode1))
17646 {
17647 op0 = copy_to_reg (op0);
17648 op0 = simplify_gen_subreg (mode1, op0, GET_MODE (op0), 0);
17649 }
17650 if (! (*insn_data[icode].operand[2].predicate) (op1, mode2))
17651 {
17652 error ("shift must be an immediate");
17653 return const0_rtx;
17654 }
17655 target = gen_reg_rtx (V2DImode);
17656 pat = GEN_FCN (icode) (simplify_gen_subreg (tmode, target, V2DImode, 0), op0, op1);
17657 if (! pat)
17658 return 0;
17659 emit_insn (pat);
17660 return target;
17661
17662 case IX86_BUILTIN_FEMMS:
17663 emit_insn (gen_mmx_femms ());
17664 return NULL_RTX;
17665
17666 case IX86_BUILTIN_PAVGUSB:
17667 return ix86_expand_binop_builtin (CODE_FOR_mmx_uavgv8qi3, arglist, target);
17668
17669 case IX86_BUILTIN_PF2ID:
17670 return ix86_expand_unop_builtin (CODE_FOR_mmx_pf2id, arglist, target, 0);
17671
17672 case IX86_BUILTIN_PFACC:
17673 return ix86_expand_binop_builtin (CODE_FOR_mmx_haddv2sf3, arglist, target);
17674
17675 case IX86_BUILTIN_PFADD:
17676 return ix86_expand_binop_builtin (CODE_FOR_mmx_addv2sf3, arglist, target);
17677
17678 case IX86_BUILTIN_PFCMPEQ:
17679 return ix86_expand_binop_builtin (CODE_FOR_mmx_eqv2sf3, arglist, target);
17680
17681 case IX86_BUILTIN_PFCMPGE:
17682 return ix86_expand_binop_builtin (CODE_FOR_mmx_gev2sf3, arglist, target);
17683
17684 case IX86_BUILTIN_PFCMPGT:
17685 return ix86_expand_binop_builtin (CODE_FOR_mmx_gtv2sf3, arglist, target);
17686
17687 case IX86_BUILTIN_PFMAX:
17688 return ix86_expand_binop_builtin (CODE_FOR_mmx_smaxv2sf3, arglist, target);
17689
17690 case IX86_BUILTIN_PFMIN:
17691 return ix86_expand_binop_builtin (CODE_FOR_mmx_sminv2sf3, arglist, target);
17692
17693 case IX86_BUILTIN_PFMUL:
17694 return ix86_expand_binop_builtin (CODE_FOR_mmx_mulv2sf3, arglist, target);
17695
17696 case IX86_BUILTIN_PFRCP:
17697 return ix86_expand_unop_builtin (CODE_FOR_mmx_rcpv2sf2, arglist, target, 0);
17698
17699 case IX86_BUILTIN_PFRCPIT1:
17700 return ix86_expand_binop_builtin (CODE_FOR_mmx_rcpit1v2sf3, arglist, target);
17701
17702 case IX86_BUILTIN_PFRCPIT2:
17703 return ix86_expand_binop_builtin (CODE_FOR_mmx_rcpit2v2sf3, arglist, target);
17704
17705 case IX86_BUILTIN_PFRSQIT1:
17706 return ix86_expand_binop_builtin (CODE_FOR_mmx_rsqit1v2sf3, arglist, target);
17707
17708 case IX86_BUILTIN_PFRSQRT:
17709 return ix86_expand_unop_builtin (CODE_FOR_mmx_rsqrtv2sf2, arglist, target, 0);
17710
17711 case IX86_BUILTIN_PFSUB:
17712 return ix86_expand_binop_builtin (CODE_FOR_mmx_subv2sf3, arglist, target);
17713
17714 case IX86_BUILTIN_PFSUBR:
17715 return ix86_expand_binop_builtin (CODE_FOR_mmx_subrv2sf3, arglist, target);
17716
17717 case IX86_BUILTIN_PI2FD:
17718 return ix86_expand_unop_builtin (CODE_FOR_mmx_floatv2si2, arglist, target, 0);
17719
17720 case IX86_BUILTIN_PMULHRW:
17721 return ix86_expand_binop_builtin (CODE_FOR_mmx_pmulhrwv4hi3, arglist, target);
17722
17723 case IX86_BUILTIN_PF2IW:
17724 return ix86_expand_unop_builtin (CODE_FOR_mmx_pf2iw, arglist, target, 0);
17725
17726 case IX86_BUILTIN_PFNACC:
17727 return ix86_expand_binop_builtin (CODE_FOR_mmx_hsubv2sf3, arglist, target);
17728
17729 case IX86_BUILTIN_PFPNACC:
17730 return ix86_expand_binop_builtin (CODE_FOR_mmx_addsubv2sf3, arglist, target);
17731
17732 case IX86_BUILTIN_PI2FW:
17733 return ix86_expand_unop_builtin (CODE_FOR_mmx_pi2fw, arglist, target, 0);
17734
17735 case IX86_BUILTIN_PSWAPDSI:
17736 return ix86_expand_unop_builtin (CODE_FOR_mmx_pswapdv2si2, arglist, target, 0);
17737
17738 case IX86_BUILTIN_PSWAPDSF:
17739 return ix86_expand_unop_builtin (CODE_FOR_mmx_pswapdv2sf2, arglist, target, 0);
17740
17741 case IX86_BUILTIN_SQRTSD:
17742 return ix86_expand_unop1_builtin (CODE_FOR_sse2_vmsqrtv2df2, arglist, target);
17743 case IX86_BUILTIN_LOADUPD:
17744 return ix86_expand_unop_builtin (CODE_FOR_sse2_movupd, arglist, target, 1);
17745 case IX86_BUILTIN_STOREUPD:
17746 return ix86_expand_store_builtin (CODE_FOR_sse2_movupd, arglist);
17747
17748 case IX86_BUILTIN_MFENCE:
17749 emit_insn (gen_sse2_mfence ());
17750 return 0;
17751 case IX86_BUILTIN_LFENCE:
17752 emit_insn (gen_sse2_lfence ());
17753 return 0;
17754
17755 case IX86_BUILTIN_CLFLUSH:
17756 arg0 = TREE_VALUE (arglist);
17757 op0 = expand_normal (arg0);
17758 icode = CODE_FOR_sse2_clflush;
17759 if (! (*insn_data[icode].operand[0].predicate) (op0, Pmode))
17760 op0 = copy_to_mode_reg (Pmode, op0);
17761
17762 emit_insn (gen_sse2_clflush (op0));
17763 return 0;
17764
17765 case IX86_BUILTIN_MOVNTPD:
17766 return ix86_expand_store_builtin (CODE_FOR_sse2_movntv2df, arglist);
17767 case IX86_BUILTIN_MOVNTDQ:
17768 return ix86_expand_store_builtin (CODE_FOR_sse2_movntv2di, arglist);
17769 case IX86_BUILTIN_MOVNTI:
17770 return ix86_expand_store_builtin (CODE_FOR_sse2_movntsi, arglist);
17771
17772 case IX86_BUILTIN_LOADDQU:
17773 return ix86_expand_unop_builtin (CODE_FOR_sse2_movdqu, arglist, target, 1);
17774 case IX86_BUILTIN_STOREDQU:
17775 return ix86_expand_store_builtin (CODE_FOR_sse2_movdqu, arglist);
17776
17777 case IX86_BUILTIN_MONITOR:
17778 arg0 = TREE_VALUE (arglist);
17779 arg1 = TREE_VALUE (TREE_CHAIN (arglist));
17780 arg2 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist)));
17781 op0 = expand_normal (arg0);
17782 op1 = expand_normal (arg1);
17783 op2 = expand_normal (arg2);
17784 if (!REG_P (op0))
17785 op0 = copy_to_mode_reg (Pmode, op0);
17786 if (!REG_P (op1))
17787 op1 = copy_to_mode_reg (SImode, op1);
17788 if (!REG_P (op2))
17789 op2 = copy_to_mode_reg (SImode, op2);
17790 if (!TARGET_64BIT)
17791 emit_insn (gen_sse3_monitor (op0, op1, op2));
17792 else
17793 emit_insn (gen_sse3_monitor64 (op0, op1, op2));
17794 return 0;
17795
17796 case IX86_BUILTIN_MWAIT:
17797 arg0 = TREE_VALUE (arglist);
17798 arg1 = TREE_VALUE (TREE_CHAIN (arglist));
17799 op0 = expand_normal (arg0);
17800 op1 = expand_normal (arg1);
17801 if (!REG_P (op0))
17802 op0 = copy_to_mode_reg (SImode, op0);
17803 if (!REG_P (op1))
17804 op1 = copy_to_mode_reg (SImode, op1);
17805 emit_insn (gen_sse3_mwait (op0, op1));
17806 return 0;
17807
17808 case IX86_BUILTIN_LDDQU:
17809 return ix86_expand_unop_builtin (CODE_FOR_sse3_lddqu, arglist,
17810 target, 1);
17811
17812 case IX86_BUILTIN_PALIGNR:
17813 case IX86_BUILTIN_PALIGNR128:
17814 if (fcode == IX86_BUILTIN_PALIGNR)
17815 {
17816 icode = CODE_FOR_ssse3_palignrdi;
17817 mode = DImode;
17818 }
17819 else
17820 {
17821 icode = CODE_FOR_ssse3_palignrti;
17822 mode = V2DImode;
17823 }
17824 arg0 = TREE_VALUE (arglist);
17825 arg1 = TREE_VALUE (TREE_CHAIN (arglist));
17826 arg2 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist)));
17827 op0 = expand_expr (arg0, NULL_RTX, VOIDmode, 0);
17828 op1 = expand_expr (arg1, NULL_RTX, VOIDmode, 0);
17829 op2 = expand_expr (arg2, NULL_RTX, VOIDmode, 0);
17830 tmode = insn_data[icode].operand[0].mode;
17831 mode1 = insn_data[icode].operand[1].mode;
17832 mode2 = insn_data[icode].operand[2].mode;
17833 mode3 = insn_data[icode].operand[3].mode;
17834
17835 if (! (*insn_data[icode].operand[1].predicate) (op0, mode1))
17836 {
17837 op0 = copy_to_reg (op0);
17838 op0 = simplify_gen_subreg (mode1, op0, GET_MODE (op0), 0);
17839 }
17840 if (! (*insn_data[icode].operand[2].predicate) (op1, mode2))
17841 {
17842 op1 = copy_to_reg (op1);
17843 op1 = simplify_gen_subreg (mode2, op1, GET_MODE (op1), 0);
17844 }
17845 if (! (*insn_data[icode].operand[3].predicate) (op2, mode3))
17846 {
17847 error ("shift must be an immediate");
17848 return const0_rtx;
17849 }
17850 target = gen_reg_rtx (mode);
17851 pat = GEN_FCN (icode) (simplify_gen_subreg (tmode, target, mode, 0),
17852 op0, op1, op2);
17853 if (! pat)
17854 return 0;
17855 emit_insn (pat);
17856 return target;
17857
17858 case IX86_BUILTIN_MOVNTSD:
17859 return ix86_expand_store_builtin (CODE_FOR_sse4a_vmmovntv2df, arglist);
17860
17861 case IX86_BUILTIN_MOVNTSS:
17862 return ix86_expand_store_builtin (CODE_FOR_sse4a_vmmovntv4sf, arglist);
17863
17864 case IX86_BUILTIN_INSERTQ:
17865 case IX86_BUILTIN_EXTRQ:
17866 icode = (fcode == IX86_BUILTIN_EXTRQ
17867 ? CODE_FOR_sse4a_extrq
17868 : CODE_FOR_sse4a_insertq);
17869 arg0 = TREE_VALUE (arglist);
17870 arg1 = TREE_VALUE (TREE_CHAIN (arglist));
17871 op0 = expand_normal (arg0);
17872 op1 = expand_normal (arg1);
17873 tmode = insn_data[icode].operand[0].mode;
17874 mode1 = insn_data[icode].operand[1].mode;
17875 mode2 = insn_data[icode].operand[2].mode;
17876 if (! (*insn_data[icode].operand[1].predicate) (op0, mode1))
17877 op0 = copy_to_mode_reg (mode1, op0);
17878 if (! (*insn_data[icode].operand[2].predicate) (op1, mode2))
17879 op1 = copy_to_mode_reg (mode2, op1);
17880 if (optimize || target == 0
17881 || GET_MODE (target) != tmode
17882 || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
17883 target = gen_reg_rtx (tmode);
17884 pat = GEN_FCN (icode) (target, op0, op1);
17885 if (! pat)
17886 return NULL_RTX;
17887 emit_insn (pat);
17888 return target;
17889
17890 case IX86_BUILTIN_EXTRQI:
17891 icode = CODE_FOR_sse4a_extrqi;
17892 arg0 = TREE_VALUE (arglist);
17893 arg1 = TREE_VALUE (TREE_CHAIN (arglist));
17894 arg2 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist)));
17895 op0 = expand_normal (arg0);
17896 op1 = expand_normal (arg1);
17897 op2 = expand_normal (arg2);
17898 tmode = insn_data[icode].operand[0].mode;
17899 mode1 = insn_data[icode].operand[1].mode;
17900 mode2 = insn_data[icode].operand[2].mode;
17901 mode3 = insn_data[icode].operand[3].mode;
17902 if (! (*insn_data[icode].operand[1].predicate) (op0, mode1))
17903 op0 = copy_to_mode_reg (mode1, op0);
17904 if (! (*insn_data[icode].operand[2].predicate) (op1, mode2))
17905 {
17906 error ("index mask must be an immediate");
17907 return gen_reg_rtx (tmode);
17908 }
17909 if (! (*insn_data[icode].operand[3].predicate) (op2, mode3))
17910 {
17911 error ("length mask must be an immediate");
17912 return gen_reg_rtx (tmode);
17913 }
17914 if (optimize || target == 0
17915 || GET_MODE (target) != tmode
17916 || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
17917 target = gen_reg_rtx (tmode);
17918 pat = GEN_FCN (icode) (target, op0, op1, op2);
17919 if (! pat)
17920 return NULL_RTX;
17921 emit_insn (pat);
17922 return target;
17923
17924 case IX86_BUILTIN_INSERTQI:
17925 icode = CODE_FOR_sse4a_insertqi;
17926 arg0 = TREE_VALUE (arglist);
17927 arg1 = TREE_VALUE (TREE_CHAIN (arglist));
17928 arg2 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist)));
17929 arg3 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (TREE_CHAIN (arglist))));
17930 op0 = expand_normal (arg0);
17931 op1 = expand_normal (arg1);
17932 op2 = expand_normal (arg2);
17933 op3 = expand_normal (arg3);
17934 tmode = insn_data[icode].operand[0].mode;
17935 mode1 = insn_data[icode].operand[1].mode;
17936 mode2 = insn_data[icode].operand[2].mode;
17937 mode3 = insn_data[icode].operand[3].mode;
17938 mode4 = insn_data[icode].operand[4].mode;
17939
17940 if (! (*insn_data[icode].operand[1].predicate) (op0, mode1))
17941 op0 = copy_to_mode_reg (mode1, op0);
17942
17943 if (! (*insn_data[icode].operand[2].predicate) (op1, mode2))
17944 op1 = copy_to_mode_reg (mode2, op1);
17945
17946 if (! (*insn_data[icode].operand[3].predicate) (op2, mode3))
17947 {
17948 error ("index mask must be an immediate");
17949 return gen_reg_rtx (tmode);
17950 }
17951 if (! (*insn_data[icode].operand[4].predicate) (op3, mode4))
17952 {
17953 error ("length mask must be an immediate");
17954 return gen_reg_rtx (tmode);
17955 }
17956 if (optimize || target == 0
17957 || GET_MODE (target) != tmode
17958 || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
17959 target = gen_reg_rtx (tmode);
17960 pat = GEN_FCN (icode) (target, op0, op1, op2, op3);
17961 if (! pat)
17962 return NULL_RTX;
17963 emit_insn (pat);
17964 return target;
17965
17966 case IX86_BUILTIN_VEC_INIT_V2SI:
17967 case IX86_BUILTIN_VEC_INIT_V4HI:
17968 case IX86_BUILTIN_VEC_INIT_V8QI:
17969 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), arglist, target);
17970
17971 case IX86_BUILTIN_VEC_EXT_V2DF:
17972 case IX86_BUILTIN_VEC_EXT_V2DI:
17973 case IX86_BUILTIN_VEC_EXT_V4SF:
17974 case IX86_BUILTIN_VEC_EXT_V4SI:
17975 case IX86_BUILTIN_VEC_EXT_V8HI:
17976 case IX86_BUILTIN_VEC_EXT_V2SI:
17977 case IX86_BUILTIN_VEC_EXT_V4HI:
17978 return ix86_expand_vec_ext_builtin (arglist, target);
17979
17980 case IX86_BUILTIN_VEC_SET_V8HI:
17981 case IX86_BUILTIN_VEC_SET_V4HI:
17982 return ix86_expand_vec_set_builtin (arglist);
17983
17984 default:
17985 break;
17986 }
17987
17988 for (i = 0, d = bdesc_2arg; i < ARRAY_SIZE (bdesc_2arg); i++, d++)
17989 if (d->code == fcode)
17990 {
17991 /* Compares are treated specially. */
17992 if (d->icode == CODE_FOR_sse_maskcmpv4sf3
17993 || d->icode == CODE_FOR_sse_vmmaskcmpv4sf3
17994 || d->icode == CODE_FOR_sse2_maskcmpv2df3
17995 || d->icode == CODE_FOR_sse2_vmmaskcmpv2df3)
17996 return ix86_expand_sse_compare (d, arglist, target);
17997
17998 return ix86_expand_binop_builtin (d->icode, arglist, target);
17999 }
18000
18001 for (i = 0, d = bdesc_1arg; i < ARRAY_SIZE (bdesc_1arg); i++, d++)
18002 if (d->code == fcode)
18003 return ix86_expand_unop_builtin (d->icode, arglist, target, 0);
18004
18005 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
18006 if (d->code == fcode)
18007 return ix86_expand_sse_comi (d, arglist, target);
18008
18009 gcc_unreachable ();
18010 }
18011
18012 /* Returns a function decl for a vectorized version of the builtin function
18013 with builtin function code FN and result vector type TYPE_OUT, or NULL_TREE
18014 if it is not available. */
18015
18016 static tree
18017 ix86_builtin_vectorized_function (enum built_in_function fn, tree type_out,
18018 tree type_in)
18019 {
18020 enum machine_mode in_mode, out_mode;
18021 int in_n, out_n;
18022
18023 if (TREE_CODE (type_out) != VECTOR_TYPE
18024 || TREE_CODE (type_in) != VECTOR_TYPE)
18025 return NULL_TREE;
18026
18027 out_mode = TYPE_MODE (TREE_TYPE (type_out));
18028 out_n = TYPE_VECTOR_SUBPARTS (type_out);
18029 in_mode = TYPE_MODE (TREE_TYPE (type_in));
18030 in_n = TYPE_VECTOR_SUBPARTS (type_in);
18031
18032 switch (fn)
18033 {
18034 case BUILT_IN_SQRT:
18035 if (out_mode == DFmode && out_n == 2
18036 && in_mode == DFmode && in_n == 2)
18037 return ix86_builtins[IX86_BUILTIN_SQRTPD];
18038 return NULL_TREE;
18039
18040 case BUILT_IN_SQRTF:
18041 if (out_mode == SFmode && out_n == 4
18042 && in_mode == SFmode && in_n == 4)
18043 return ix86_builtins[IX86_BUILTIN_SQRTPS];
18044 return NULL_TREE;
18045
18046 case BUILT_IN_LRINTF:
18047 if (out_mode == SImode && out_n == 4
18048 && in_mode == SFmode && in_n == 4)
18049 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
18050 return NULL_TREE;
18051
18052 default:
18053 ;
18054 }
18055
18056 return NULL_TREE;
18057 }
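
/* Editorial illustration (not part of the original source): with SSE2
   vectorization enabled (e.g. -O2 -ftree-vectorize -msse2 -fno-math-errno),
   a loop such as

     void f (double *a, const double *b, int n)
     {
       int i;
       for (i = 0; i < n; i++)
         a[i] = __builtin_sqrt (b[i]);
     }

   queries this hook for BUILT_IN_SQRT with V2DF input and output types
   and receives the IX86_BUILTIN_SQRTPD decl, so sqrtpd can be used in
   the vectorized loop.  */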
18058
18059 /* Store OPERAND to memory after reload is completed. This means
18060 that we can't easily use assign_stack_local. */
18061 rtx
18062 ix86_force_to_memory (enum machine_mode mode, rtx operand)
18063 {
18064 rtx result;
18065
18066 gcc_assert (reload_completed);
18067 if (TARGET_RED_ZONE)
18068 {
18069 result = gen_rtx_MEM (mode,
18070 gen_rtx_PLUS (Pmode,
18071 stack_pointer_rtx,
18072 GEN_INT (-RED_ZONE_SIZE)));
18073 emit_move_insn (result, operand);
18074 }
18075 else if (!TARGET_RED_ZONE && TARGET_64BIT)
18076 {
18077 switch (mode)
18078 {
18079 case HImode:
18080 case SImode:
18081 operand = gen_lowpart (DImode, operand);
18082 /* FALLTHRU */
18083 case DImode:
18084 emit_insn (
18085 gen_rtx_SET (VOIDmode,
18086 gen_rtx_MEM (DImode,
18087 gen_rtx_PRE_DEC (DImode,
18088 stack_pointer_rtx)),
18089 operand));
18090 break;
18091 default:
18092 gcc_unreachable ();
18093 }
18094 result = gen_rtx_MEM (mode, stack_pointer_rtx);
18095 }
18096 else
18097 {
18098 switch (mode)
18099 {
18100 case DImode:
18101 {
18102 rtx operands[2];
18103 split_di (&operand, 1, operands, operands + 1);
18104 emit_insn (
18105 gen_rtx_SET (VOIDmode,
18106 gen_rtx_MEM (SImode,
18107 gen_rtx_PRE_DEC (Pmode,
18108 stack_pointer_rtx)),
18109 operands[1]));
18110 emit_insn (
18111 gen_rtx_SET (VOIDmode,
18112 gen_rtx_MEM (SImode,
18113 gen_rtx_PRE_DEC (Pmode,
18114 stack_pointer_rtx)),
18115 operands[0]));
18116 }
18117 break;
18118 case HImode:
18119 /* Store HImodes as SImodes. */
18120 operand = gen_lowpart (SImode, operand);
18121 /* FALLTHRU */
18122 case SImode:
18123 emit_insn (
18124 gen_rtx_SET (VOIDmode,
18125 gen_rtx_MEM (GET_MODE (operand),
18126 gen_rtx_PRE_DEC (SImode,
18127 stack_pointer_rtx)),
18128 operand));
18129 break;
18130 default:
18131 gcc_unreachable ();
18132 }
18133 result = gen_rtx_MEM (mode, stack_pointer_rtx);
18134 }
18135 return result;
18136 }
18137
18138 /* Free the operand from memory. */
18139 void
18140 ix86_free_from_memory (enum machine_mode mode)
18141 {
18142 if (!TARGET_RED_ZONE)
18143 {
18144 int size;
18145
18146 if (mode == DImode || TARGET_64BIT)
18147 size = 8;
18148 else
18149 size = 4;
18150 /* Use LEA to deallocate stack space. In peephole2 it will be converted
18151 to a pop or add instruction if registers are available. */
18152 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
18153 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
18154 GEN_INT (size))));
18155 }
18156 }
18157
18158 /* Put float CONST_DOUBLE in the constant pool instead of fp regs.
18159 QImode must go into class Q_REGS.
18160 Narrow ALL_REGS to GENERAL_REGS. This allows movsf and
18161 movdf to do mem-to-mem moves through integer regs. */
18162 enum reg_class
18163 ix86_preferred_reload_class (rtx x, enum reg_class class)
18164 {
18165 enum machine_mode mode = GET_MODE (x);
18166
18167 /* We're only allowed to return a subclass of CLASS. Many of the
18168 following checks fail for NO_REGS, so eliminate that early. */
18169 if (class == NO_REGS)
18170 return NO_REGS;
18171
18172 /* All classes can load zeros. */
18173 if (x == CONST0_RTX (mode))
18174 return class;
18175
18176 /* Force constants into memory if we are loading a (nonzero) constant into
18177 an MMX or SSE register. This is because there are no MMX/SSE instructions
18178 to load from a constant. */
18179 if (CONSTANT_P (x)
18180 && (MAYBE_MMX_CLASS_P (class) || MAYBE_SSE_CLASS_P (class)))
18181 return NO_REGS;
18182
18183 /* Prefer SSE regs only, if we can use them for math. */
18184 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
18185 return SSE_CLASS_P (class) ? class : NO_REGS;
18186
18187 /* Floating-point constants need more complex checks. */
18188 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
18189 {
18190 /* General regs can load everything. */
18191 if (reg_class_subset_p (class, GENERAL_REGS))
18192 return class;
18193
18194 /* Floats can load 0 and 1 plus some others. Note that we eliminated
18195 zero above. We only want to wind up preferring 80387 registers if
18196 we plan on doing computation with them. */
18197 if (TARGET_80387
18198 && standard_80387_constant_p (x))
18199 {
18200 /* Limit class to non-sse. */
18201 if (class == FLOAT_SSE_REGS)
18202 return FLOAT_REGS;
18203 if (class == FP_TOP_SSE_REGS)
18204 return FP_TOP_REG;
18205 if (class == FP_SECOND_SSE_REGS)
18206 return FP_SECOND_REG;
18207 if (class == FLOAT_INT_REGS || class == FLOAT_REGS)
18208 return class;
18209 }
18210
18211 return NO_REGS;
18212 }
18213
18214 /* Generally when we see PLUS here, it's the function invariant
18215 (plus soft-fp const_int), which can only be computed into general
18216 regs. */
18217 if (GET_CODE (x) == PLUS)
18218 return reg_class_subset_p (class, GENERAL_REGS) ? class : NO_REGS;
18219
18220 /* QImode constants are easy to load, but non-constant QImode data
18221 must go into Q_REGS. */
18222 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
18223 {
18224 if (reg_class_subset_p (class, Q_REGS))
18225 return class;
18226 if (reg_class_subset_p (Q_REGS, class))
18227 return Q_REGS;
18228 return NO_REGS;
18229 }
18230
18231 return class;
18232 }
18233
18234 /* Discourage putting floating-point values in SSE registers unless
18235 SSE math is being used, and likewise for the 387 registers. */
18236 enum reg_class
18237 ix86_preferred_output_reload_class (rtx x, enum reg_class class)
18238 {
18239 enum machine_mode mode = GET_MODE (x);
18240
18241 /* Restrict the output reload class to the register bank that we are doing
18242 math on. If we would like not to return a subset of CLASS, reject this
18243 alternative: if reload cannot do this, it will still use its choice. */
18244 mode = GET_MODE (x);
18245 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
18246 return MAYBE_SSE_CLASS_P (class) ? SSE_REGS : NO_REGS;
18247
18248 if (TARGET_80387 && SCALAR_FLOAT_MODE_P (mode))
18249 {
18250 if (class == FP_TOP_SSE_REGS)
18251 return FP_TOP_REG;
18252 else if (class == FP_SECOND_SSE_REGS)
18253 return FP_SECOND_REG;
18254 else
18255 return FLOAT_CLASS_P (class) ? class : NO_REGS;
18256 }
18257
18258 return class;
18259 }
18260
18261 /* If we are copying between general and FP registers, we need a memory
18262 location. The same is true for SSE and MMX registers.
18263
18264 The macro can't work reliably when one of the CLASSES is a class containing
18265 registers from multiple units (SSE, MMX, integer). We avoid this by never
18266 combining those units in a single alternative in the machine description.
18267 Ensure that this constraint holds to avoid surprises.
18268
18269 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
18270 enforce these sanity checks. */
18271
18272 int
18273 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
18274 enum machine_mode mode, int strict)
18275 {
18276 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
18277 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
18278 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
18279 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
18280 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
18281 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
18282 {
18283 gcc_assert (!strict);
18284 return true;
18285 }
18286
18287 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
18288 return true;
18289
18290 /* ??? This is a lie. We do have moves between mmx/general and between
18291 mmx/sse2. But by saying we need secondary memory we discourage the
18292 register allocator from using the mmx registers unless needed. */
18293 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
18294 return true;
18295
18296 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
18297 {
18298 /* SSE1 doesn't have any direct moves from other classes. */
18299 if (!TARGET_SSE2)
18300 return true;
18301
18302 /* If the target says that inter-unit moves are more expensive
18303 than moving through memory, then don't generate them. */
18304 if (!TARGET_INTER_UNIT_MOVES && !optimize_size)
18305 return true;
18306
18307 /* Between SSE and general, we have moves no larger than word size. */
18308 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
18309 return true;
18310
18311 /* ??? For the cost of one register reformat penalty, we could use
18312 the same instructions to move SFmode and DFmode data, but the
18313 relevant move patterns don't support those alternatives. */
18314 if (mode == SFmode || mode == DFmode)
18315 return true;
18316 }
18317
18318 return false;
18319 }
18320
18321 /* Return true if the registers in CLASS cannot represent the change from
18322 modes FROM to TO. */
18323
18324 bool
18325 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
18326 enum reg_class class)
18327 {
18328 if (from == to)
18329 return false;
18330
18331 /* x87 registers can't do subreg at all, as all values are reformatted
18332 to extended precision. */
18333 if (MAYBE_FLOAT_CLASS_P (class))
18334 return true;
18335
18336 if (MAYBE_SSE_CLASS_P (class) || MAYBE_MMX_CLASS_P (class))
18337 {
18338 /* Vector registers do not support QI or HImode loads. If we don't
18339 disallow a change to these modes, reload will assume it's ok to
18340 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
18341 the vec_dupv4hi pattern. */
18342 if (GET_MODE_SIZE (from) < 4)
18343 return true;
18344
18345 /* Vector registers do not support subreg with nonzero offsets, which
18346 are otherwise valid for integer registers. Since we can't see
18347 whether we have a nonzero offset from here, prohibit all
18348 nonparadoxical subregs changing size. */
18349 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
18350 return true;
18351 }
18352
18353 return false;
18354 }
18355
18356 /* Return the cost of moving data from a register in class CLASS1 to
18357 one in class CLASS2.
18358
18359 It is not required that the cost always equal 2 when FROM is the same as TO;
18360 on some machines it is expensive to move between registers if they are not
18361 general registers. */
18362
18363 int
18364 ix86_register_move_cost (enum machine_mode mode, enum reg_class class1,
18365 enum reg_class class2)
18366 {
18367 /* In case we require secondary memory, compute the cost of the store
18368 followed by the load. To avoid bad register allocation choices, we need
18369 this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
18370
18371 if (ix86_secondary_memory_needed (class1, class2, mode, 0))
18372 {
18373 int cost = 1;
18374
18375 cost += MAX (MEMORY_MOVE_COST (mode, class1, 0),
18376 MEMORY_MOVE_COST (mode, class1, 1));
18377 cost += MAX (MEMORY_MOVE_COST (mode, class2, 0),
18378 MEMORY_MOVE_COST (mode, class2, 1));
18379
18380 /* When copying from a general-purpose register we may emit multiple
18381 stores followed by a single load, causing a memory size mismatch stall.
18382 Count this as an arbitrarily high cost of 20. */
18383 if (CLASS_MAX_NREGS (class1, mode) > CLASS_MAX_NREGS (class2, mode))
18384 cost += 20;
18385
18386 /* In the case of FP/MMX moves, the registers actually overlap, and we
18387 have to switch modes in order to treat them differently. */
18388 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
18389 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
18390 cost += 20;
18391
18392 return cost;
18393 }
18394
18395 /* Moves between SSE/MMX and integer unit are expensive. */
18396 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
18397 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
18398 return ix86_cost->mmxsse_to_integer;
18399 if (MAYBE_FLOAT_CLASS_P (class1))
18400 return ix86_cost->fp_move;
18401 if (MAYBE_SSE_CLASS_P (class1))
18402 return ix86_cost->sse_move;
18403 if (MAYBE_MMX_CLASS_P (class1))
18404 return ix86_cost->mmx_move;
18405 return 2;
18406 }
18407
18408 /* Return 1 if hard register REGNO can hold a value of machine-mode MODE. */
18409
18410 bool
18411 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
18412 {
18413 /* Flags, and only flags, can hold CCmode values. */
18414 if (CC_REGNO_P (regno))
18415 return GET_MODE_CLASS (mode) == MODE_CC;
18416 if (GET_MODE_CLASS (mode) == MODE_CC
18417 || GET_MODE_CLASS (mode) == MODE_RANDOM
18418 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
18419 return 0;
18420 if (FP_REGNO_P (regno))
18421 return VALID_FP_MODE_P (mode);
18422 if (SSE_REGNO_P (regno))
18423 {
18424 /* We implement the move patterns for all vector modes into and
18425 out of SSE registers, even when no operation instructions
18426 are available. */
18427 return (VALID_SSE_REG_MODE (mode)
18428 || VALID_SSE2_REG_MODE (mode)
18429 || VALID_MMX_REG_MODE (mode)
18430 || VALID_MMX_REG_MODE_3DNOW (mode));
18431 }
18432 if (MMX_REGNO_P (regno))
18433 {
18434 /* We implement the move patterns for 3DNOW modes even in MMX mode,
18435 so if the register is available at all, then we can move data of
18436 the given mode into or out of it. */
18437 return (VALID_MMX_REG_MODE (mode)
18438 || VALID_MMX_REG_MODE_3DNOW (mode));
18439 }
18440
18441 if (mode == QImode)
18442 {
18443 /* Take care of QImode values - they can be in non-QI regs,
18444 but then they do cause partial register stalls. */
18445 if (regno < 4 || TARGET_64BIT)
18446 return 1;
18447 if (!TARGET_PARTIAL_REG_STALL)
18448 return 1;
18449 return reload_in_progress || reload_completed;
18450 }
18451 /* We handle both integer and floats in the general purpose registers. */
18452 else if (VALID_INT_MODE_P (mode))
18453 return 1;
18454 else if (VALID_FP_MODE_P (mode))
18455 return 1;
18456 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
18457 on to use that value in smaller contexts, this can easily force a
18458 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
18459 supporting DImode, allow it. */
18460 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
18461 return 1;
18462
18463 return 0;
18464 }
18465
18466 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
18467 tieable integer mode. */
18468
18469 static bool
18470 ix86_tieable_integer_mode_p (enum machine_mode mode)
18471 {
18472 switch (mode)
18473 {
18474 case HImode:
18475 case SImode:
18476 return true;
18477
18478 case QImode:
18479 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
18480
18481 case DImode:
18482 return TARGET_64BIT;
18483
18484 default:
18485 return false;
18486 }
18487 }
18488
18489 /* Return true if MODE1 is accessible in a register that can hold MODE2
18490 without copying. That is, all register classes that can hold MODE2
18491 can also hold MODE1. */
18492
18493 bool
18494 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
18495 {
18496 if (mode1 == mode2)
18497 return true;
18498
18499 if (ix86_tieable_integer_mode_p (mode1)
18500 && ix86_tieable_integer_mode_p (mode2))
18501 return true;
18502
18503 /* MODE2 being XFmode implies fp stack or general regs, which means we
18504 can tie any smaller floating point modes to it. Note that we do not
18505 tie this with TFmode. */
18506 if (mode2 == XFmode)
18507 return mode1 == SFmode || mode1 == DFmode;
18508
18509 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
18510 that we can tie it with SFmode. */
18511 if (mode2 == DFmode)
18512 return mode1 == SFmode;
18513
18514 /* If MODE2 is only appropriate for an SSE register, then tie with
18515 any other mode acceptable to SSE registers. */
18516 if (GET_MODE_SIZE (mode2) >= 8
18517 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
18518 return ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1);
18519
18520 /* If MODE2 is appropriate for an MMX (or SSE) register, then tie
18521 with any other mode acceptable to MMX registers. */
18522 if (GET_MODE_SIZE (mode2) == 8
18523 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
18524 return ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1);
18525
18526 return false;
18527 }
18528
18529 /* Return the cost of moving data of mode M between a
18530 register and memory. A value of 2 is the default; this cost is
18531 relative to those in `REGISTER_MOVE_COST'.
18532
18533 If moving between registers and memory is more expensive than
18534 between two registers, you should define this macro to express the
18535 relative cost.
18536
18537 Also model the increased cost of moving QImode registers in non-Q_REGS
18538 classes.
18539 */
18540 int
18541 ix86_memory_move_cost (enum machine_mode mode, enum reg_class class, int in)
18542 {
18543 if (FLOAT_CLASS_P (class))
18544 {
18545 int index;
18546 switch (mode)
18547 {
18548 case SFmode:
18549 index = 0;
18550 break;
18551 case DFmode:
18552 index = 1;
18553 break;
18554 case XFmode:
18555 index = 2;
18556 break;
18557 default:
18558 return 100;
18559 }
18560 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
18561 }
18562 if (SSE_CLASS_P (class))
18563 {
18564 int index;
18565 switch (GET_MODE_SIZE (mode))
18566 {
18567 case 4:
18568 index = 0;
18569 break;
18570 case 8:
18571 index = 1;
18572 break;
18573 case 16:
18574 index = 2;
18575 break;
18576 default:
18577 return 100;
18578 }
18579 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
18580 }
18581 if (MMX_CLASS_P (class))
18582 {
18583 int index;
18584 switch (GET_MODE_SIZE (mode))
18585 {
18586 case 4:
18587 index = 0;
18588 break;
18589 case 8:
18590 index = 1;
18591 break;
18592 default:
18593 return 100;
18594 }
18595 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
18596 }
18597 switch (GET_MODE_SIZE (mode))
18598 {
18599 case 1:
18600 if (in)
18601 return (Q_CLASS_P (class) ? ix86_cost->int_load[0]
18602 : ix86_cost->movzbl_load);
18603 else
18604 return (Q_CLASS_P (class) ? ix86_cost->int_store[0]
18605 : ix86_cost->int_store[0] + 4);
18606 break;
18607 case 2:
18608 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
18609 default:
18610 /* Compute the number of 32-bit moves needed. TFmode is moved as XFmode. */
18611 if (mode == TFmode)
18612 mode = XFmode;
18613 return ((in ? ix86_cost->int_load[2] : ix86_cost->int_store[2])
18614 * (((int) GET_MODE_SIZE (mode)
18615 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
18616 }
18617 }
18618
18619 /* Compute a (partial) cost for rtx X. Return true if the complete
18620 cost has been computed, and false if subexpressions should be
18621 scanned. In either case, *TOTAL contains the cost result. */
18622
18623 static bool
18624 ix86_rtx_costs (rtx x, int code, int outer_code, int *total)
18625 {
18626 enum machine_mode mode = GET_MODE (x);
18627
18628 switch (code)
18629 {
18630 case CONST_INT:
18631 case CONST:
18632 case LABEL_REF:
18633 case SYMBOL_REF:
18634 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
18635 *total = 3;
18636 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
18637 *total = 2;
18638 else if (flag_pic && SYMBOLIC_CONST (x)
18639 && (!TARGET_64BIT
18640 || (GET_CODE (x) != LABEL_REF
18641 && (GET_CODE (x) != SYMBOL_REF
18642 || !SYMBOL_REF_LOCAL_P (x)))))
18643 *total = 1;
18644 else
18645 *total = 0;
18646 return true;
18647
18648 case CONST_DOUBLE:
18649 if (mode == VOIDmode)
18650 *total = 0;
18651 else
18652 switch (standard_80387_constant_p (x))
18653 {
18654 case 1: /* 0.0 */
18655 *total = 1;
18656 break;
18657 default: /* Other constants */
18658 *total = 2;
18659 break;
18660 case 0:
18661 case -1:
18662 /* Start with (MEM (SYMBOL_REF)), since that's where
18663 it'll probably end up. Add a penalty for size. */
18664 *total = (COSTS_N_INSNS (1)
18665 + (flag_pic != 0 && !TARGET_64BIT)
18666 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
18667 break;
18668 }
18669 return true;
18670
18671 case ZERO_EXTEND:
18672 /* The zero extension is often completely free on x86_64, so make
18673 it as cheap as possible. */
18674 if (TARGET_64BIT && mode == DImode
18675 && GET_MODE (XEXP (x, 0)) == SImode)
18676 *total = 1;
18677 else if (TARGET_ZERO_EXTEND_WITH_AND)
18678 *total = ix86_cost->add;
18679 else
18680 *total = ix86_cost->movzx;
18681 return false;
18682
18683 case SIGN_EXTEND:
18684 *total = ix86_cost->movsx;
18685 return false;
18686
18687 case ASHIFT:
18688 if (CONST_INT_P (XEXP (x, 1))
18689 && (GET_MODE (XEXP (x, 0)) != DImode || TARGET_64BIT))
18690 {
18691 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
18692 if (value == 1)
18693 {
18694 *total = ix86_cost->add;
18695 return false;
18696 }
18697 if ((value == 2 || value == 3)
18698 && ix86_cost->lea <= ix86_cost->shift_const)
18699 {
18700 *total = ix86_cost->lea;
18701 return false;
18702 }
18703 }
18704 /* FALLTHRU */
18705
18706 case ROTATE:
18707 case ASHIFTRT:
18708 case LSHIFTRT:
18709 case ROTATERT:
18710 if (!TARGET_64BIT && GET_MODE (XEXP (x, 0)) == DImode)
18711 {
18712 if (CONST_INT_P (XEXP (x, 1)))
18713 {
18714 if (INTVAL (XEXP (x, 1)) > 32)
18715 *total = ix86_cost->shift_const + COSTS_N_INSNS (2);
18716 else
18717 *total = ix86_cost->shift_const * 2;
18718 }
18719 else
18720 {
18721 if (GET_CODE (XEXP (x, 1)) == AND)
18722 *total = ix86_cost->shift_var * 2;
18723 else
18724 *total = ix86_cost->shift_var * 6 + COSTS_N_INSNS (2);
18725 }
18726 }
18727 else
18728 {
18729 if (CONST_INT_P (XEXP (x, 1)))
18730 *total = ix86_cost->shift_const;
18731 else
18732 *total = ix86_cost->shift_var;
18733 }
18734 return false;
18735
18736 case MULT:
18737 if (FLOAT_MODE_P (mode))
18738 {
18739 *total = ix86_cost->fmul;
18740 return false;
18741 }
18742 else
18743 {
18744 rtx op0 = XEXP (x, 0);
18745 rtx op1 = XEXP (x, 1);
18746 int nbits;
18747 if (CONST_INT_P (XEXP (x, 1)))
18748 {
18749 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
18750 for (nbits = 0; value != 0; value &= value - 1)
18751 nbits++;
18752 }
18753 else
18754 /* This is arbitrary. */
18755 nbits = 7;
18756
18757 /* Compute costs correctly for widening multiplication. */
18758 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
18759 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
18760 == GET_MODE_SIZE (mode))
18761 {
18762 int is_mulwiden = 0;
18763 enum machine_mode inner_mode = GET_MODE (op0);
18764
18765 if (GET_CODE (op0) == GET_CODE (op1))
18766 is_mulwiden = 1, op1 = XEXP (op1, 0);
18767 else if (CONST_INT_P (op1))
18768 {
18769 if (GET_CODE (op0) == SIGN_EXTEND)
18770 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
18771 == INTVAL (op1);
18772 else
18773 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
18774 }
18775
18776 if (is_mulwiden)
18777 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
18778 }
18779
18780 *total = (ix86_cost->mult_init[MODE_INDEX (mode)]
18781 + nbits * ix86_cost->mult_bit
18782 + rtx_cost (op0, outer_code) + rtx_cost (op1, outer_code));
18783
18784 return true;
18785 }
18786
18787 case DIV:
18788 case UDIV:
18789 case MOD:
18790 case UMOD:
18791 if (FLOAT_MODE_P (mode))
18792 *total = ix86_cost->fdiv;
18793 else
18794 *total = ix86_cost->divide[MODE_INDEX (mode)];
18795 return false;
18796
18797 case PLUS:
18798 if (FLOAT_MODE_P (mode))
18799 *total = ix86_cost->fadd;
18800 else if (GET_MODE_CLASS (mode) == MODE_INT
18801 && GET_MODE_BITSIZE (mode) <= GET_MODE_BITSIZE (Pmode))
18802 {
18803 if (GET_CODE (XEXP (x, 0)) == PLUS
18804 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
18805 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
18806 && CONSTANT_P (XEXP (x, 1)))
18807 {
18808 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
18809 if (val == 2 || val == 4 || val == 8)
18810 {
18811 *total = ix86_cost->lea;
18812 *total += rtx_cost (XEXP (XEXP (x, 0), 1), outer_code);
18813 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
18814 outer_code);
18815 *total += rtx_cost (XEXP (x, 1), outer_code);
18816 return true;
18817 }
18818 }
18819 else if (GET_CODE (XEXP (x, 0)) == MULT
18820 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
18821 {
18822 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
18823 if (val == 2 || val == 4 || val == 8)
18824 {
18825 *total = ix86_cost->lea;
18826 *total += rtx_cost (XEXP (XEXP (x, 0), 0), outer_code);
18827 *total += rtx_cost (XEXP (x, 1), outer_code);
18828 return true;
18829 }
18830 }
18831 else if (GET_CODE (XEXP (x, 0)) == PLUS)
18832 {
18833 *total = ix86_cost->lea;
18834 *total += rtx_cost (XEXP (XEXP (x, 0), 0), outer_code);
18835 *total += rtx_cost (XEXP (XEXP (x, 0), 1), outer_code);
18836 *total += rtx_cost (XEXP (x, 1), outer_code);
18837 return true;
18838 }
18839 }
18840 /* FALLTHRU */
18841
18842 case MINUS:
18843 if (FLOAT_MODE_P (mode))
18844 {
18845 *total = ix86_cost->fadd;
18846 return false;
18847 }
18848 /* FALLTHRU */
18849
18850 case AND:
18851 case IOR:
18852 case XOR:
18853 if (!TARGET_64BIT && mode == DImode)
18854 {
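/* Without 64-bit registers a DImode logical operation is split into two
   SImode instructions, so charge two adds.  Operand costs are doubled for
   operands whose mode is not DImode (e.g. a CONST_INT, which has VOIDmode),
   since such an operand is needed by both halves. */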
18855 *total = (ix86_cost->add * 2
18856 + (rtx_cost (XEXP (x, 0), outer_code)
18857 << (GET_MODE (XEXP (x, 0)) != DImode))
18858 + (rtx_cost (XEXP (x, 1), outer_code)
18859 << (GET_MODE (XEXP (x, 1)) != DImode)));
18860 return true;
18861 }
18862 /* FALLTHRU */
18863
18864 case NEG:
18865 if (FLOAT_MODE_P (mode))
18866 {
18867 *total = ix86_cost->fchs;
18868 return false;
18869 }
18870 /* FALLTHRU */
18871
18872 case NOT:
18873 if (!TARGET_64BIT && mode == DImode)
18874 *total = ix86_cost->add * 2;
18875 else
18876 *total = ix86_cost->add;
18877 return false;
18878
18879 case COMPARE:
18880 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
18881 && XEXP (XEXP (x, 0), 1) == const1_rtx
18882 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
18883 && XEXP (x, 1) == const0_rtx)
18884 {
18885 /* This kind of construct is implemented using test[bwl].
18886 Treat it as if we had an AND. */
18887 *total = (ix86_cost->add
18888 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code)
18889 + rtx_cost (const1_rtx, outer_code));
18890 return true;
18891 }
18892 return false;
18893
18894 case FLOAT_EXTEND:
18895 if (!TARGET_SSE_MATH
18896 || mode == XFmode
18897 || (mode == DFmode && !TARGET_SSE2))
18898 *total = 0;
18899 return false;
18900
18901 case ABS:
18902 if (FLOAT_MODE_P (mode))
18903 *total = ix86_cost->fabs;
18904 return false;
18905
18906 case SQRT:
18907 if (FLOAT_MODE_P (mode))
18908 *total = ix86_cost->fsqrt;
18909 return false;
18910
18911 case UNSPEC:
18912 if (XINT (x, 1) == UNSPEC_TP)
18913 *total = 0;
18914 return false;
18915
18916 default:
18917 return false;
18918 }
18919 }
18920
18921 #if TARGET_MACHO
18922
18923 static int current_machopic_label_num;
18924
18925 /* Given a symbol name and its associated stub, write out the
18926 definition of the stub. */
18927
18928 void
18929 machopic_output_stub (FILE *file, const char *symb, const char *stub)
18930 {
18931 unsigned int length;
18932 char *binder_name, *symbol_name, lazy_ptr_name[32];
18933 int label = ++current_machopic_label_num;
18934
18935 /* For 64-bit we shouldn't get here. */
18936 gcc_assert (!TARGET_64BIT);
18937
18938 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
18939 symb = (*targetm.strip_name_encoding) (symb);
18940
18941 length = strlen (stub);
18942 binder_name = alloca (length + 32);
18943 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
18944
18945 length = strlen (symb);
18946 symbol_name = alloca (length + 32);
18947 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
18948
18949 sprintf (lazy_ptr_name, "L%d$lz", label);
18950
18951 if (MACHOPIC_PURE)
18952 switch_to_section (darwin_sections[machopic_picsymbol_stub_section]);
18953 else
18954 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
18955
18956 fprintf (file, "%s:\n", stub);
18957 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
18958
18959 if (MACHOPIC_PURE)
18960 {
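/* Pure PIC stubs cannot use absolute addresses: the call/pop pair below
   loads the address of the local label LPC$N into %eax and uses it as the
   base for a PC-relative load of the lazy pointer. */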
18961 fprintf (file, "\tcall\tLPC$%d\nLPC$%d:\tpopl\t%%eax\n", label, label);
18962 fprintf (file, "\tmovl\t%s-LPC$%d(%%eax),%%edx\n", lazy_ptr_name, label);
18963 fprintf (file, "\tjmp\t*%%edx\n");
18964 }
18965 else
18966 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
18967
18968 fprintf (file, "%s:\n", binder_name);
18969
18970 if (MACHOPIC_PURE)
18971 {
18972 fprintf (file, "\tlea\t%s-LPC$%d(%%eax),%%eax\n", lazy_ptr_name, label);
18973 fprintf (file, "\tpushl\t%%eax\n");
18974 }
18975 else
18976 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
18977
18978 fprintf (file, "\tjmp\tdyld_stub_binding_helper\n");
18979
18980 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr_section]);
18981 fprintf (file, "%s:\n", lazy_ptr_name);
18982 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
18983 fprintf (file, "\t.long %s\n", binder_name);
18984 }
18985
18986 void
18987 darwin_x86_file_end (void)
18988 {
18989 darwin_file_end ();
18990 ix86_file_end ();
18991 }
18992 #endif /* TARGET_MACHO */
18993
18994 /* Order the registers for register allocator. */
18995
18996 void
18997 x86_order_regs_for_local_alloc (void)
18998 {
18999 int pos = 0;
19000 int i;
19001
19002 /* First allocate the local general purpose registers. */
19003 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
19004 if (GENERAL_REGNO_P (i) && call_used_regs[i])
19005 reg_alloc_order [pos++] = i;
19006
19007 /* Global general purpose registers. */
19008 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
19009 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
19010 reg_alloc_order [pos++] = i;
19011
19012 /* x87 registers come first in case we are doing FP math
19013 using them. */
19014 if (!TARGET_SSE_MATH)
19015 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
19016 reg_alloc_order [pos++] = i;
19017
19018 /* SSE registers. */
19019 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
19020 reg_alloc_order [pos++] = i;
19021 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
19022 reg_alloc_order [pos++] = i;
19023
19024 /* x87 registers. */
19025 if (TARGET_SSE_MATH)
19026 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
19027 reg_alloc_order [pos++] = i;
19028
19029 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
19030 reg_alloc_order [pos++] = i;
19031
19032 /* Initialize the rest of the array, as we do not allocate some registers
19033 at all. */
19034 while (pos < FIRST_PSEUDO_REGISTER)
19035 reg_alloc_order [pos++] = 0;
19036 }
19037
19038 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
19039 struct attribute_spec.handler. */
19040 static tree
19041 ix86_handle_struct_attribute (tree *node, tree name,
19042 tree args ATTRIBUTE_UNUSED,
19043 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
19044 {
19045 tree *type = NULL;
19046 if (DECL_P (*node))
19047 {
19048 if (TREE_CODE (*node) == TYPE_DECL)
19049 type = &TREE_TYPE (*node);
19050 }
19051 else
19052 type = node;
19053
19054 if (!(type && (TREE_CODE (*type) == RECORD_TYPE
19055 || TREE_CODE (*type) == UNION_TYPE)))
19056 {
19057 warning (OPT_Wattributes, "%qs attribute ignored",
19058 IDENTIFIER_POINTER (name));
19059 *no_add_attrs = true;
19060 }
19061
19062 else if ((is_attribute_p ("ms_struct", name)
19063 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
19064 || ((is_attribute_p ("gcc_struct", name)
19065 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
19066 {
19067 warning (OPT_Wattributes, "%qs incompatible attribute ignored",
19068 IDENTIFIER_POINTER (name));
19069 *no_add_attrs = true;
19070 }
19071
19072 return NULL_TREE;
19073 }
19074
19075 static bool
19076 ix86_ms_bitfield_layout_p (tree record_type)
19077 {
19078 return (TARGET_MS_BITFIELD_LAYOUT &&
19079 !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
19080 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type));
19081 }
19082
19083 /* Returns an expression indicating where the this parameter is
19084 located on entry to the FUNCTION. */
19085
19086 static rtx
19087 x86_this_parameter (tree function)
19088 {
19089 tree type = TREE_TYPE (function);
19090
19091 if (TARGET_64BIT)
19092 {
19093 int n = aggregate_value_p (TREE_TYPE (type), type) != 0;
19094 return gen_rtx_REG (DImode, x86_64_int_parameter_registers[n]);
19095 }
19096
19097 if (ix86_function_regparm (type, function) > 0)
19098 {
19099 tree parm;
19100
19101 parm = TYPE_ARG_TYPES (type);
19102 /* Figure out whether or not the function has a variable number of
19103 arguments. */
19104 for (; parm; parm = TREE_CHAIN (parm))
19105 if (TREE_VALUE (parm) == void_type_node)
19106 break;
19107 /* If not, the this parameter is in the first argument. */
19108 if (parm)
19109 {
19110 int regno = 0;
19111 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type)))
19112 regno = 2;
19113 return gen_rtx_REG (SImode, regno);
19114 }
19115 }
19116
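/* On the stack, the return address occupies the first slot.  When the
   function returns an aggregate in memory, the hidden return-slot pointer
   is passed as the first argument, pushing the this pointer one word
   further up (offset 8 instead of 4). */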
19117 if (aggregate_value_p (TREE_TYPE (type), type))
19118 return gen_rtx_MEM (SImode, plus_constant (stack_pointer_rtx, 8));
19119 else
19120 return gen_rtx_MEM (SImode, plus_constant (stack_pointer_rtx, 4));
19121 }
19122
19123 /* Determine whether x86_output_mi_thunk can succeed. */
19124
19125 static bool
19126 x86_can_output_mi_thunk (tree thunk ATTRIBUTE_UNUSED,
19127 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
19128 HOST_WIDE_INT vcall_offset, tree function)
19129 {
19130 /* 64-bit can handle anything. */
19131 if (TARGET_64BIT)
19132 return true;
19133
19134 /* For 32-bit, everything's fine if we have one free register. */
19135 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
19136 return true;
19137
19138 /* Need a free register for vcall_offset. */
19139 if (vcall_offset)
19140 return false;
19141
19142 /* Need a free register for GOT references. */
19143 if (flag_pic && !(*targetm.binds_local_p) (function))
19144 return false;
19145
19146 /* Otherwise ok. */
19147 return true;
19148 }
19149
19150 /* Output the assembler code for a thunk function. THUNK_DECL is the
19151 declaration for the thunk function itself, FUNCTION is the decl for
19152 the target function. DELTA is an immediate constant offset to be
19153 added to THIS. If VCALL_OFFSET is nonzero, the word at
19154 *(*this + vcall_offset) should be added to THIS. */
19155
19156 static void
19157 x86_output_mi_thunk (FILE *file ATTRIBUTE_UNUSED,
19158 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
19159 HOST_WIDE_INT vcall_offset, tree function)
19160 {
19161 rtx xops[3];
19162 rtx this = x86_this_parameter (function);
19163 rtx this_reg, tmp;
19164
19165 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
19166 pull it in now and let DELTA benefit. */
19167 if (REG_P (this))
19168 this_reg = this;
19169 else if (vcall_offset)
19170 {
19171 /* Put the this parameter into %eax. */
19172 xops[0] = this;
19173 xops[1] = this_reg = gen_rtx_REG (Pmode, 0);
19174 output_asm_insn ("mov{l}\t{%0, %1|%1, %0}", xops);
19175 }
19176 else
19177 this_reg = NULL_RTX;
19178
19179 /* Adjust the this parameter by a fixed constant. */
19180 if (delta)
19181 {
19182 xops[0] = GEN_INT (delta);
19183 xops[1] = this_reg ? this_reg : this;
19184 if (TARGET_64BIT)
19185 {
19186 if (!x86_64_general_operand (xops[0], DImode))
19187 {
19188 tmp = gen_rtx_REG (DImode, R10_REG);
19189 xops[1] = tmp;
19190 output_asm_insn ("mov{q}\t{%1, %0|%0, %1}", xops);
19191 xops[0] = tmp;
19192 xops[1] = this;
19193 }
19194 output_asm_insn ("add{q}\t{%0, %1|%1, %0}", xops);
19195 }
19196 else
19197 output_asm_insn ("add{l}\t{%0, %1|%1, %0}", xops);
19198 }
19199
19200 /* Adjust the this parameter by a value stored in the vtable. */
19201 if (vcall_offset)
19202 {
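/* Pick a scratch register for the vtable pointer: %r10 in 64-bit mode,
   otherwise %ecx, or %eax for fastcall functions where %ecx already holds
   the this pointer. */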
19203 if (TARGET_64BIT)
19204 tmp = gen_rtx_REG (DImode, R10_REG);
19205 else
19206 {
19207 int tmp_regno = 2 /* ECX */;
19208 if (lookup_attribute ("fastcall",
19209 TYPE_ATTRIBUTES (TREE_TYPE (function))))
19210 tmp_regno = 0 /* EAX */;
19211 tmp = gen_rtx_REG (SImode, tmp_regno);
19212 }
19213
19214 xops[0] = gen_rtx_MEM (Pmode, this_reg);
19215 xops[1] = tmp;
19216 if (TARGET_64BIT)
19217 output_asm_insn ("mov{q}\t{%0, %1|%1, %0}", xops);
19218 else
19219 output_asm_insn ("mov{l}\t{%0, %1|%1, %0}", xops);
19220
19221 /* Adjust the this parameter. */
19222 xops[0] = gen_rtx_MEM (Pmode, plus_constant (tmp, vcall_offset));
19223 if (TARGET_64BIT && !memory_operand (xops[0], Pmode))
19224 {
19225 rtx tmp2 = gen_rtx_REG (DImode, R11_REG);
19226 xops[0] = GEN_INT (vcall_offset);
19227 xops[1] = tmp2;
19228 output_asm_insn ("mov{q}\t{%0, %1|%1, %0}", xops);
19229 xops[0] = gen_rtx_MEM (Pmode, gen_rtx_PLUS (Pmode, tmp, tmp2));
19230 }
19231 xops[1] = this_reg;
19232 if (TARGET_64BIT)
19233 output_asm_insn ("add{q}\t{%0, %1|%1, %0}", xops);
19234 else
19235 output_asm_insn ("add{l}\t{%0, %1|%1, %0}", xops);
19236 }
19237
19238 /* If necessary, drop THIS back to its stack slot. */
19239 if (this_reg && this_reg != this)
19240 {
19241 xops[0] = this_reg;
19242 xops[1] = this;
19243 output_asm_insn ("mov{l}\t{%0, %1|%1, %0}", xops);
19244 }
19245
19246 xops[0] = XEXP (DECL_RTL (function), 0);
19247 if (TARGET_64BIT)
19248 {
19249 if (!flag_pic || (*targetm.binds_local_p) (function))
19250 output_asm_insn ("jmp\t%P0", xops);
19251 else
19252 {
19253 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, xops[0]), UNSPEC_GOTPCREL);
19254 tmp = gen_rtx_CONST (Pmode, tmp);
19255 tmp = gen_rtx_MEM (QImode, tmp);
19256 xops[0] = tmp;
19257 output_asm_insn ("jmp\t%A0", xops);
19258 }
19259 }
19260 else
19261 {
19262 if (!flag_pic || (*targetm.binds_local_p) (function))
19263 output_asm_insn ("jmp\t%P0", xops);
19264 else
19265 #if TARGET_MACHO
19266 if (TARGET_MACHO)
19267 {
19268 rtx sym_ref = XEXP (DECL_RTL (function), 0);
19269 tmp = (gen_rtx_SYMBOL_REF
19270 (Pmode,
19271 machopic_indirection_name (sym_ref, /*stub_p=*/true)));
19272 tmp = gen_rtx_MEM (QImode, tmp);
19273 xops[0] = tmp;
19274 output_asm_insn ("jmp\t%0", xops);
19275 }
19276 else
19277 #endif /* TARGET_MACHO */
19278 {
19279 tmp = gen_rtx_REG (SImode, 2 /* ECX */);
19280 output_set_got (tmp, NULL_RTX);
19281
19282 xops[1] = tmp;
19283 output_asm_insn ("mov{l}\t{%0@GOT(%1), %1|%1, %0@GOT[%1]}", xops);
19284 output_asm_insn ("jmp\t{*}%1", xops);
19285 }
19286 }
19287 }
19288
19289 static void
19290 x86_file_start (void)
19291 {
19292 default_file_start ();
19293 #if TARGET_MACHO
19294 darwin_file_start ();
19295 #endif
19296 if (X86_FILE_START_VERSION_DIRECTIVE)
19297 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
19298 if (X86_FILE_START_FLTUSED)
19299 fputs ("\t.global\t__fltused\n", asm_out_file);
19300 if (ix86_asm_dialect == ASM_INTEL)
19301 fputs ("\t.intel_syntax\n", asm_out_file);
19302 }
19303
19304 int
19305 x86_field_alignment (tree field, int computed)
19306 {
19307 enum machine_mode mode;
19308 tree type = TREE_TYPE (field);
19309
19310 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
19311 return computed;
19312 mode = TYPE_MODE (TREE_CODE (type) == ARRAY_TYPE
19313 ? get_inner_array_type (type) : type);
19314 if (mode == DFmode || mode == DCmode
19315 || GET_MODE_CLASS (mode) == MODE_INT
19316 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
19317 return MIN (32, computed);
19318 return computed;
19319 }
19320
19321 /* Output assembler code to FILE to increment profiler label # LABELNO
19322 for profiling a function entry. */
19323 void
19324 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
19325 {
19326 if (TARGET_64BIT)
19327 if (flag_pic)
19328 {
19329 #ifndef NO_PROFILE_COUNTERS
19330 fprintf (file, "\tleaq\t%sP%d@(%%rip),%%r11\n", LPREFIX, labelno);
19331 #endif
19332 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", MCOUNT_NAME);
19333 }
19334 else
19335 {
19336 #ifndef NO_PROFILE_COUNTERS
19337 fprintf (file, "\tmovq\t$%sP%d,%%r11\n", LPREFIX, labelno);
19338 #endif
19339 fprintf (file, "\tcall\t%s\n", MCOUNT_NAME);
19340 }
19341 else if (flag_pic)
19342 {
19343 #ifndef NO_PROFILE_COUNTERS
19344 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%%s\n",
19345 LPREFIX, labelno, PROFILE_COUNT_REGISTER);
19346 #endif
19347 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", MCOUNT_NAME);
19348 }
19349 else
19350 {
19351 #ifndef NO_PROFILE_COUNTERS
19352 fprintf (file, "\tmovl\t$%sP%d,%%%s\n", LPREFIX, labelno,
19353 PROFILE_COUNT_REGISTER);
19354 #endif
19355 fprintf (file, "\tcall\t%s\n", MCOUNT_NAME);
19356 }
19357 }
19358
19359 /* We don't have exact information about the insn sizes, but we may assume
19360 quite safely that we are informed about all 1 byte insns and memory
19361 address sizes. This is enough to eliminate unnecessary padding in
19362 99% of cases. */
19363
19364 static int
19365 min_insn_size (rtx insn)
19366 {
19367 int l = 0;
19368
19369 if (!INSN_P (insn) || !active_insn_p (insn))
19370 return 0;
19371
19372 /* Discard alignments we've emitted ourselves and jump tables. */
19373 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
19374 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
19375 return 0;
19376 if (JUMP_P (insn)
19377 && (GET_CODE (PATTERN (insn)) == ADDR_VEC
19378 || GET_CODE (PATTERN (insn)) == ADDR_DIFF_VEC))
19379 return 0;
19380
19381 /* Important case - calls are always 5 bytes.
19382 It is common to have many calls in a row. */
19383 if (CALL_P (insn)
19384 && symbolic_reference_mentioned_p (PATTERN (insn))
19385 && !SIBLING_CALL_P (insn))
19386 return 5;
19387 if (get_attr_length (insn) <= 1)
19388 return 1;
19389
19390 /* For normal instructions we may rely on the sizes of addresses
19391 and the presence of symbol to require 4 bytes of encoding.
19392 This is not the case for jumps where references are PC relative. */
19393 if (!JUMP_P (insn))
19394 {
19395 l = get_attr_length_address (insn);
19396 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
19397 l = 4;
19398 }
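/* One opcode byte plus the address bytes when the address size is known;
   otherwise assume a conservative two bytes. */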
19399 if (l)
19400 return 1+l;
19401 else
19402 return 2;
19403 }
19404
19405 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
19406 window. */
19407
19408 static void
19409 ix86_avoid_jump_misspredicts (void)
19410 {
19411 rtx insn, start = get_insns ();
19412 int nbytes = 0, njumps = 0;
19413 int isjump = 0;
19414
19415 /* Look for all minimal intervals of instructions containing 4 jumps.
19416 The intervals are bounded by START and INSN. NBYTES is the total
19417 size of instructions in the interval including INSN and not including
19418 START. When the NBYTES is smaller than 16 bytes, it is possible
19419 that the end of START and INSN ends up in the same 16byte page.
19420
19421 The smallest offset in the page INSN can start is the case where START
19422 ends on the offset 0. Offset of INSN is then NBYTES - sizeof (INSN).
19423 We add p2align to 16byte window with maxskip 15 - NBYTES + sizeof (INSN).
19424 */
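/* For example, with NBYTES = 12 and sizeof (INSN) = 2 the code below emits
   a p2align with maxskip 15 - 12 + 2 = 5 bytes, so that INSN cannot share
   a 16-byte window with the three preceding jumps. */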
19425 for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
19426 {
19427
19428 nbytes += min_insn_size (insn);
19429 if (dump_file)
19430 fprintf(dump_file, "Insn %i estimated to %i bytes\n",
19431 INSN_UID (insn), min_insn_size (insn));
19432 if ((JUMP_P (insn)
19433 && GET_CODE (PATTERN (insn)) != ADDR_VEC
19434 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
19435 || CALL_P (insn))
19436 njumps++;
19437 else
19438 continue;
19439
19440 while (njumps > 3)
19441 {
19442 start = NEXT_INSN (start);
19443 if ((JUMP_P (start)
19444 && GET_CODE (PATTERN (start)) != ADDR_VEC
19445 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
19446 || CALL_P (start))
19447 njumps--, isjump = 1;
19448 else
19449 isjump = 0;
19450 nbytes -= min_insn_size (start);
19451 }
19452 gcc_assert (njumps >= 0);
19453 if (dump_file)
19454 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
19455 INSN_UID (start), INSN_UID (insn), nbytes);
19456
19457 if (njumps == 3 && isjump && nbytes < 16)
19458 {
19459 int padsize = 15 - nbytes + min_insn_size (insn);
19460
19461 if (dump_file)
19462 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
19463 INSN_UID (insn), padsize);
19464 emit_insn_before (gen_align (GEN_INT (padsize)), insn);
19465 }
19466 }
19467 }
19468
19469 /* AMD Athlon works faster
19470 when RET is not the destination of a conditional jump and is not directly
19471 preceded by another jump instruction. We avoid the penalty by inserting a
19472 NOP just before the RET instruction in such cases. */
19473 static void
19474 ix86_pad_returns (void)
19475 {
19476 edge e;
19477 edge_iterator ei;
19478
19479 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
19480 {
19481 basic_block bb = e->src;
19482 rtx ret = BB_END (bb);
19483 rtx prev;
19484 bool replace = false;
19485
19486 if (!JUMP_P (ret) || GET_CODE (PATTERN (ret)) != RETURN
19487 || !maybe_hot_bb_p (bb))
19488 continue;
19489 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
19490 if (active_insn_p (prev) || LABEL_P (prev))
19491 break;
19492 if (prev && LABEL_P (prev))
19493 {
19494 edge e;
19495 edge_iterator ei;
19496
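/* The return is preceded by a label; if any predecessor edge other than a
   fallthru reaches that label, some jump targets the return directly and
   the mispredict penalty described above applies. */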
19497 FOR_EACH_EDGE (e, ei, bb->preds)
19498 if (EDGE_FREQUENCY (e) && e->src->index >= 0
19499 && !(e->flags & EDGE_FALLTHRU))
19500 replace = true;
19501 }
19502 if (!replace)
19503 {
19504 prev = prev_active_insn (ret);
19505 if (prev
19506 && ((JUMP_P (prev) && any_condjump_p (prev))
19507 || CALL_P (prev)))
19508 replace = true;
19509 /* Empty functions get a branch mispredict even when the jump destination
19510 is not visible to us. */
19511 if (!prev && cfun->function_frequency > FUNCTION_FREQUENCY_UNLIKELY_EXECUTED)
19512 replace = true;
19513 }
19514 if (replace)
19515 {
19516 emit_insn_before (gen_return_internal_long (), ret);
19517 delete_insn (ret);
19518 }
19519 }
19520 }
19521
19522 /* Implement machine specific optimizations. We implement padding of returns
19523 for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */
19524 static void
19525 ix86_reorg (void)
19526 {
19527 if (TARGET_PAD_RETURNS && optimize && !optimize_size)
19528 ix86_pad_returns ();
19529 if (TARGET_FOUR_JUMP_LIMIT && optimize && !optimize_size)
19530 ix86_avoid_jump_misspredicts ();
19531 }
19532
19533 /* Return nonzero when QImode register that must be represented via REX prefix
19534 is used. */
19535 bool
19536 x86_extended_QIreg_mentioned_p (rtx insn)
19537 {
19538 int i;
19539 extract_insn_cached (insn);
19540 for (i = 0; i < recog_data.n_operands; i++)
19541 if (REG_P (recog_data.operand[i])
19542 && REGNO (recog_data.operand[i]) >= 4)
19543 return true;
19544 return false;
19545 }
19546
19547 /* Return nonzero when P points to register encoded via REX prefix.
19548 Called via for_each_rtx. */
19549 static int
19550 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
19551 {
19552 unsigned int regno;
19553 if (!REG_P (*p))
19554 return 0;
19555 regno = REGNO (*p);
19556 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
19557 }
19558
19559 /* Return true when INSN mentions register that must be encoded using REX
19560 prefix. */
19561 bool
19562 x86_extended_reg_mentioned_p (rtx insn)
19563 {
19564 return for_each_rtx (&PATTERN (insn), extended_reg_mentioned_1, NULL);
19565 }
19566
19567 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
19568 optabs would emit if we didn't have TFmode patterns. */
19569
19570 void
19571 x86_emit_floatuns (rtx operands[2])
19572 {
19573 rtx neglab, donelab, i0, i1, f0, in, out;
19574 enum machine_mode mode, inmode;
19575
19576 inmode = GET_MODE (operands[1]);
19577 gcc_assert (inmode == SImode || inmode == DImode);
19578
19579 out = operands[0];
19580 in = force_reg (inmode, operands[1]);
19581 mode = GET_MODE (out);
19582 neglab = gen_label_rtx ();
19583 donelab = gen_label_rtx ();
19584 i1 = gen_reg_rtx (Pmode);
19585 f0 = gen_reg_rtx (mode);
19586
19587 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, Pmode, 0, neglab);
19588
19589 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_FLOAT (mode, in)));
19590 emit_jump_insn (gen_jump (donelab));
19591 emit_barrier ();
19592
19593 emit_label (neglab);
19594
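/* The value has its sign bit set, so it cannot be converted directly.
   Halve it (folding the discarded low bit back in so the result is not
   biased), convert, and double:  (double) (x >> 1 | (x & 1)) * 2.  */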
19595 i0 = expand_simple_binop (Pmode, LSHIFTRT, in, const1_rtx, NULL, 1, OPTAB_DIRECT);
19596 i1 = expand_simple_binop (Pmode, AND, in, const1_rtx, NULL, 1, OPTAB_DIRECT);
19597 i0 = expand_simple_binop (Pmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
19598 expand_float (f0, i0, 0);
19599 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
19600
19601 emit_label (donelab);
19602 }
19603 \f
19604 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
19605 with all elements equal to VAR. Return true if successful. */
19606
19607 static bool
19608 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
19609 rtx target, rtx val)
19610 {
19611 enum machine_mode smode, wsmode, wvmode;
19612 rtx x;
19613
19614 switch (mode)
19615 {
19616 case V2SImode:
19617 case V2SFmode:
19618 if (!mmx_ok)
19619 return false;
19620 /* FALLTHRU */
19621
19622 case V2DFmode:
19623 case V2DImode:
19624 case V4SFmode:
19625 case V4SImode:
19626 val = force_reg (GET_MODE_INNER (mode), val);
19627 x = gen_rtx_VEC_DUPLICATE (mode, val);
19628 emit_insn (gen_rtx_SET (VOIDmode, target, x));
19629 return true;
19630
19631 case V4HImode:
19632 if (!mmx_ok)
19633 return false;
19634 if (TARGET_SSE || TARGET_3DNOW_A)
19635 {
19636 val = gen_lowpart (SImode, val);
19637 x = gen_rtx_TRUNCATE (HImode, val);
19638 x = gen_rtx_VEC_DUPLICATE (mode, x);
19639 emit_insn (gen_rtx_SET (VOIDmode, target, x));
19640 return true;
19641 }
19642 else
19643 {
19644 smode = HImode;
19645 wsmode = SImode;
19646 wvmode = V2SImode;
19647 goto widen;
19648 }
19649
19650 case V8QImode:
19651 if (!mmx_ok)
19652 return false;
19653 smode = QImode;
19654 wsmode = HImode;
19655 wvmode = V4HImode;
19656 goto widen;
19657 case V8HImode:
19658 if (TARGET_SSE2)
19659 {
19660 rtx tmp1, tmp2;
19661 /* Extend HImode to SImode using a paradoxical SUBREG. */
19662 tmp1 = gen_reg_rtx (SImode);
19663 emit_move_insn (tmp1, gen_lowpart (SImode, val));
19664 /* Insert the SImode value as low element of V4SImode vector. */
19665 tmp2 = gen_reg_rtx (V4SImode);
19666 tmp1 = gen_rtx_VEC_MERGE (V4SImode,
19667 gen_rtx_VEC_DUPLICATE (V4SImode, tmp1),
19668 CONST0_RTX (V4SImode),
19669 const1_rtx);
19670 emit_insn (gen_rtx_SET (VOIDmode, tmp2, tmp1));
19671 /* Cast the V4SImode vector back to a V8HImode vector. */
19672 tmp1 = gen_reg_rtx (V8HImode);
19673 emit_move_insn (tmp1, gen_lowpart (V8HImode, tmp2));
19674 /* Duplicate the low short through the whole low SImode word. */
19675 emit_insn (gen_sse2_punpcklwd (tmp1, tmp1, tmp1));
19676 /* Cast the V8HImode vector back to a V4SImode vector. */
19677 tmp2 = gen_reg_rtx (V4SImode);
19678 emit_move_insn (tmp2, gen_lowpart (V4SImode, tmp1));
19679 /* Replicate the low element of the V4SImode vector. */
19680 emit_insn (gen_sse2_pshufd (tmp2, tmp2, const0_rtx));
19681 /* Cast the V4SImode vector back to V8HImode, and store in target. */
19682 emit_move_insn (target, gen_lowpart (V8HImode, tmp2));
19683 return true;
19684 }
19685 smode = HImode;
19686 wsmode = SImode;
19687 wvmode = V4SImode;
19688 goto widen;
19689 case V16QImode:
19690 if (TARGET_SSE2)
19691 {
19692 rtx tmp1, tmp2;
19693 /* Extend QImode to SImode using a paradoxical SUBREG. */
19694 tmp1 = gen_reg_rtx (SImode);
19695 emit_move_insn (tmp1, gen_lowpart (SImode, val));
19696 /* Insert the SImode value as low element of V4SImode vector. */
19697 tmp2 = gen_reg_rtx (V4SImode);
19698 tmp1 = gen_rtx_VEC_MERGE (V4SImode,
19699 gen_rtx_VEC_DUPLICATE (V4SImode, tmp1),
19700 CONST0_RTX (V4SImode),
19701 const1_rtx);
19702 emit_insn (gen_rtx_SET (VOIDmode, tmp2, tmp1));
19703 /* Cast the V4SImode vector back to a V16QImode vector. */
19704 tmp1 = gen_reg_rtx (V16QImode);
19705 emit_move_insn (tmp1, gen_lowpart (V16QImode, tmp2));
19706 /* Duplicate the low byte through the whole low SImode word. */
19707 emit_insn (gen_sse2_punpcklbw (tmp1, tmp1, tmp1));
19708 emit_insn (gen_sse2_punpcklbw (tmp1, tmp1, tmp1));
19709 /* Cast the V16QImode vector back to a V4SImode vector. */
19710 tmp2 = gen_reg_rtx (V4SImode);
19711 emit_move_insn (tmp2, gen_lowpart (V4SImode, tmp1));
19712 /* Replicate the low element of the V4SImode vector. */
19713 emit_insn (gen_sse2_pshufd (tmp2, tmp2, const0_rtx));
19714 /* Cast the V4SImode vector back to V16QImode, and store in target. */
19715 emit_move_insn (target, gen_lowpart (V16QImode, tmp2));
19716 return true;
19717 }
19718 smode = QImode;
19719 wsmode = HImode;
19720 wvmode = V8HImode;
19721 goto widen;
19722 widen:
19723 /* Replicate the value once into the next wider mode and recurse. */
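/* E.g. for V8QImode the QImode value AB is widened to the HImode value
   ABAB, and the recursion then broadcasts it as a V4HImode vector. */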
19724 val = convert_modes (wsmode, smode, val, true);
19725 x = expand_simple_binop (wsmode, ASHIFT, val,
19726 GEN_INT (GET_MODE_BITSIZE (smode)),
19727 NULL_RTX, 1, OPTAB_LIB_WIDEN);
19728 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
19729
19730 x = gen_reg_rtx (wvmode);
19731 if (!ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val))
19732 gcc_unreachable ();
19733 emit_move_insn (target, gen_lowpart (mode, x));
19734 return true;
19735
19736 default:
19737 return false;
19738 }
19739 }
19740
19741 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
19742 whose ONE_VAR element is VAR, and other elements are zero. Return true
19743 if successful. */
19744
19745 static bool
19746 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
19747 rtx target, rtx var, int one_var)
19748 {
19749 enum machine_mode vsimode;
19750 rtx new_target;
19751 rtx x, tmp;
19752
19753 switch (mode)
19754 {
19755 case V2SFmode:
19756 case V2SImode:
19757 if (!mmx_ok)
19758 return false;
19759 /* FALLTHRU */
19760
19761 case V2DFmode:
19762 case V2DImode:
19763 if (one_var != 0)
19764 return false;
19765 var = force_reg (GET_MODE_INNER (mode), var);
19766 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
19767 emit_insn (gen_rtx_SET (VOIDmode, target, x));
19768 return true;
19769
19770 case V4SFmode:
19771 case V4SImode:
19772 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
19773 new_target = gen_reg_rtx (mode);
19774 else
19775 new_target = target;
19776 var = force_reg (GET_MODE_INNER (mode), var);
19777 x = gen_rtx_VEC_DUPLICATE (mode, var);
19778 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
19779 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
19780 if (one_var != 0)
19781 {
19782 /* We need to shuffle the value to the correct position, so
19783 create a new pseudo to store the intermediate result. */
19784
19785 /* With SSE2, we can use the integer shuffle insns. */
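/* The variable currently sits in element 0 and every other element is
   zero, so select element 1 (a known zero) for all positions except
   ONE_VAR, which takes element 0. */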
19786 if (mode != V4SFmode && TARGET_SSE2)
19787 {
19788 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
19789 GEN_INT (1),
19790 GEN_INT (one_var == 1 ? 0 : 1),
19791 GEN_INT (one_var == 2 ? 0 : 1),
19792 GEN_INT (one_var == 3 ? 0 : 1)));
19793 if (target != new_target)
19794 emit_move_insn (target, new_target);
19795 return true;
19796 }
19797
19798 /* Otherwise convert the intermediate result to V4SFmode and
19799 use the SSE1 shuffle instructions. */
19800 if (mode != V4SFmode)
19801 {
19802 tmp = gen_reg_rtx (V4SFmode);
19803 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
19804 }
19805 else
19806 tmp = new_target;
19807
19808 emit_insn (gen_sse_shufps_1 (tmp, tmp, tmp,
19809 GEN_INT (1),
19810 GEN_INT (one_var == 1 ? 0 : 1),
19811 GEN_INT (one_var == 2 ? 0+4 : 1+4),
19812 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
19813
19814 if (mode != V4SFmode)
19815 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
19816 else if (tmp != target)
19817 emit_move_insn (target, tmp);
19818 }
19819 else if (target != new_target)
19820 emit_move_insn (target, new_target);
19821 return true;
19822
19823 case V8HImode:
19824 case V16QImode:
19825 vsimode = V4SImode;
19826 goto widen;
19827 case V4HImode:
19828 case V8QImode:
19829 if (!mmx_ok)
19830 return false;
19831 vsimode = V2SImode;
19832 goto widen;
19833 widen:
19834 if (one_var != 0)
19835 return false;
19836
19837 /* Zero extend the variable element to SImode and recurse. */
19838 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
19839
19840 x = gen_reg_rtx (vsimode);
19841 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
19842 var, one_var))
19843 gcc_unreachable ();
19844
19845 emit_move_insn (target, gen_lowpart (mode, x));
19846 return true;
19847
19848 default:
19849 return false;
19850 }
19851 }
19852
19853 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
19854 consisting of the values in VALS. It is known that all elements
19855 except ONE_VAR are constants. Return true if successful. */
19856
19857 static bool
19858 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
19859 rtx target, rtx vals, int one_var)
19860 {
19861 rtx var = XVECEXP (vals, 0, one_var);
19862 enum machine_mode wmode;
19863 rtx const_vec, x;
19864
19865 const_vec = copy_rtx (vals);
19866 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
19867 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
19868
19869 switch (mode)
19870 {
19871 case V2DFmode:
19872 case V2DImode:
19873 case V2SFmode:
19874 case V2SImode:
19875 /* For the two element vectors, it's just as easy to use
19876 the general case. */
19877 return false;
19878
19879 case V4SFmode:
19880 case V4SImode:
19881 case V8HImode:
19882 case V4HImode:
19883 break;
19884
19885 case V16QImode:
19886 wmode = V8HImode;
19887 goto widen;
19888 case V8QImode:
19889 wmode = V4HImode;
19890 goto widen;
19891 widen:
19892 /* There's no way to set one QImode entry easily. Combine
19893 the variable value with its adjacent constant value, and
19894 promote to an HImode set. */
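/* If ONE_VAR is odd, the variable byte is the high half of its HImode
   lane: shift it up by 8 and keep only the low byte of the constant
   neighbour.  Otherwise the variable stays in the low byte and the
   constant neighbour is shifted into the high byte. */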
19895 x = XVECEXP (vals, 0, one_var ^ 1);
19896 if (one_var & 1)
19897 {
19898 var = convert_modes (HImode, QImode, var, true);
19899 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
19900 NULL_RTX, 1, OPTAB_LIB_WIDEN);
19901 x = GEN_INT (INTVAL (x) & 0xff);
19902 }
19903 else
19904 {
19905 var = convert_modes (HImode, QImode, var, true);
19906 x = gen_int_mode (INTVAL (x) << 8, HImode);
19907 }
19908 if (x != const0_rtx)
19909 var = expand_simple_binop (HImode, IOR, var, x, var,
19910 1, OPTAB_LIB_WIDEN);
19911
19912 x = gen_reg_rtx (wmode);
19913 emit_move_insn (x, gen_lowpart (wmode, const_vec));
19914 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
19915
19916 emit_move_insn (target, gen_lowpart (mode, x));
19917 return true;
19918
19919 default:
19920 return false;
19921 }
19922
19923 emit_move_insn (target, const_vec);
19924 ix86_expand_vector_set (mmx_ok, target, var, one_var);
19925 return true;
19926 }
19927
19928 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
19929 all values variable, and none identical. */
19930
19931 static void
19932 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
19933 rtx target, rtx vals)
19934 {
19935 enum machine_mode half_mode = GET_MODE_INNER (mode);
19936 rtx op0 = NULL, op1 = NULL;
19937 bool use_vec_concat = false;
19938
19939 switch (mode)
19940 {
19941 case V2SFmode:
19942 case V2SImode:
19943 if (!mmx_ok && !TARGET_SSE)
19944 break;
19945 /* FALLTHRU */
19946
19947 case V2DFmode:
19948 case V2DImode:
19949 /* For the two element vectors, we always implement VEC_CONCAT. */
19950 op0 = XVECEXP (vals, 0, 0);
19951 op1 = XVECEXP (vals, 0, 1);
19952 use_vec_concat = true;
19953 break;
19954
19955 case V4SFmode:
19956 half_mode = V2SFmode;
19957 goto half;
19958 case V4SImode:
19959 half_mode = V2SImode;
19960 goto half;
19961 half:
19962 {
19963 rtvec v;
19964
19965 /* For V4SF and V4SI, we implement a concat of two V2 vectors.
19966 Recurse to load the two halves. */
19967
19968 op0 = gen_reg_rtx (half_mode);
19969 v = gen_rtvec (2, XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1));
19970 ix86_expand_vector_init (false, op0, gen_rtx_PARALLEL (half_mode, v));
19971
19972 op1 = gen_reg_rtx (half_mode);
19973 v = gen_rtvec (2, XVECEXP (vals, 0, 2), XVECEXP (vals, 0, 3));
19974 ix86_expand_vector_init (false, op1, gen_rtx_PARALLEL (half_mode, v));
19975
19976 use_vec_concat = true;
19977 }
19978 break;
19979
19980 case V8HImode:
19981 case V16QImode:
19982 case V4HImode:
19983 case V8QImode:
19984 break;
19985
19986 default:
19987 gcc_unreachable ();
19988 }
19989
19990 if (use_vec_concat)
19991 {
19992 if (!register_operand (op0, half_mode))
19993 op0 = force_reg (half_mode, op0);
19994 if (!register_operand (op1, half_mode))
19995 op1 = force_reg (half_mode, op1);
19996
19997 emit_insn (gen_rtx_SET (VOIDmode, target,
19998 gen_rtx_VEC_CONCAT (mode, op0, op1)));
19999 }
20000 else
20001 {
20002 int i, j, n_elts, n_words, n_elt_per_word;
20003 enum machine_mode inner_mode;
20004 rtx words[4], shift;
20005
20006 inner_mode = GET_MODE_INNER (mode);
20007 n_elts = GET_MODE_NUNITS (mode);
20008 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
20009 n_elt_per_word = n_elts / n_words;
20010 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
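/* Assemble each word-sized chunk in an integer register, folding elements
   in from the highest-numbered lane downwards so that the lowest-numbered
   lane ends up in the least significant bits. */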
20011
20012 for (i = 0; i < n_words; ++i)
20013 {
20014 rtx word = NULL_RTX;
20015
20016 for (j = 0; j < n_elt_per_word; ++j)
20017 {
20018 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
20019 elt = convert_modes (word_mode, inner_mode, elt, true);
20020
20021 if (j == 0)
20022 word = elt;
20023 else
20024 {
20025 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
20026 word, 1, OPTAB_LIB_WIDEN);
20027 word = expand_simple_binop (word_mode, IOR, word, elt,
20028 word, 1, OPTAB_LIB_WIDEN);
20029 }
20030 }
20031
20032 words[i] = word;
20033 }
20034
20035 if (n_words == 1)
20036 emit_move_insn (target, gen_lowpart (mode, words[0]));
20037 else if (n_words == 2)
20038 {
20039 rtx tmp = gen_reg_rtx (mode);
20040 emit_insn (gen_rtx_CLOBBER (VOIDmode, tmp));
20041 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
20042 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
20043 emit_move_insn (target, tmp);
20044 }
20045 else if (n_words == 4)
20046 {
20047 rtx tmp = gen_reg_rtx (V4SImode);
20048 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
20049 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
20050 emit_move_insn (target, gen_lowpart (mode, tmp));
20051 }
20052 else
20053 gcc_unreachable ();
20054 }
20055 }
20056
20057 /* Initialize vector TARGET via VALS. Suppress the use of MMX
20058 instructions unless MMX_OK is true. */
20059
20060 void
20061 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
20062 {
20063 enum machine_mode mode = GET_MODE (target);
20064 enum machine_mode inner_mode = GET_MODE_INNER (mode);
20065 int n_elts = GET_MODE_NUNITS (mode);
20066 int n_var = 0, one_var = -1;
20067 bool all_same = true, all_const_zero = true;
20068 int i;
20069 rtx x;
20070
20071 for (i = 0; i < n_elts; ++i)
20072 {
20073 x = XVECEXP (vals, 0, i);
20074 if (!CONSTANT_P (x))
20075 n_var++, one_var = i;
20076 else if (x != CONST0_RTX (inner_mode))
20077 all_const_zero = false;
20078 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
20079 all_same = false;
20080 }
20081
20082 /* Constants are best loaded from the constant pool. */
20083 if (n_var == 0)
20084 {
20085 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
20086 return;
20087 }
20088
20089 /* If all values are identical, broadcast the value. */
20090 if (all_same
20091 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
20092 XVECEXP (vals, 0, 0)))
20093 return;
20094
20095 /* Values where only one field is non-constant are best loaded from
20096 the pool and overwritten via move later. */
20097 if (n_var == 1)
20098 {
20099 if (all_const_zero
20100 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
20101 XVECEXP (vals, 0, one_var),
20102 one_var))
20103 return;
20104
20105 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
20106 return;
20107 }
20108
20109 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
20110 }
20111
20112 void
20113 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
20114 {
20115 enum machine_mode mode = GET_MODE (target);
20116 enum machine_mode inner_mode = GET_MODE_INNER (mode);
20117 bool use_vec_merge = false;
20118 rtx tmp;
20119
20120 switch (mode)
20121 {
20122 case V2SFmode:
20123 case V2SImode:
20124 if (mmx_ok)
20125 {
20126 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
20127 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
20128 if (elt == 0)
20129 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
20130 else
20131 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
20132 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
20133 return;
20134 }
20135 break;
20136
20137 case V2DFmode:
20138 case V2DImode:
20139 {
20140 rtx op0, op1;
20141
20142 /* For the two element vectors, we implement a VEC_CONCAT with
20143 the extraction of the other element. */
20144
20145 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
20146 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
20147
20148 if (elt == 0)
20149 op0 = val, op1 = tmp;
20150 else
20151 op0 = tmp, op1 = val;
20152
20153 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
20154 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
20155 }
20156 return;
20157
20158 case V4SFmode:
20159 switch (elt)
20160 {
20161 case 0:
20162 use_vec_merge = true;
20163 break;
20164
20165 case 1:
20166 /* tmp = target = A B C D */
20167 tmp = copy_to_reg (target);
20168 /* target = A A B B */
20169 emit_insn (gen_sse_unpcklps (target, target, target));
20170 /* target = X A B B */
20171 ix86_expand_vector_set (false, target, val, 0);
20172 /* target = A X C D */
20173 emit_insn (gen_sse_shufps_1 (target, target, tmp,
20174 GEN_INT (1), GEN_INT (0),
20175 GEN_INT (2+4), GEN_INT (3+4)));
20176 return;
20177
20178 case 2:
20179 /* tmp = target = A B C D */
20180 tmp = copy_to_reg (target);
20181 /* tmp = X B C D */
20182 ix86_expand_vector_set (false, tmp, val, 0);
20183 /* target = A B X D */
20184 emit_insn (gen_sse_shufps_1 (target, target, tmp,
20185 GEN_INT (0), GEN_INT (1),
20186 GEN_INT (0+4), GEN_INT (3+4)));
20187 return;
20188
20189 case 3:
20190 /* tmp = target = A B C D */
20191 tmp = copy_to_reg (target);
20192 /* tmp = X B C D */
20193 ix86_expand_vector_set (false, tmp, val, 0);
20194 /* target = A B C X */
20195 emit_insn (gen_sse_shufps_1 (target, target, tmp,
20196 GEN_INT (0), GEN_INT (1),
20197 GEN_INT (2+4), GEN_INT (0+4)));
20198 return;
20199
20200 default:
20201 gcc_unreachable ();
20202 }
20203 break;
20204
20205 case V4SImode:
20206 /* Element 0 handled by vec_merge below. */
20207 if (elt == 0)
20208 {
20209 use_vec_merge = true;
20210 break;
20211 }
20212
20213 if (TARGET_SSE2)
20214 {
20215 /* With SSE2, use integer shuffles to swap element 0 and ELT,
20216 store into element 0, then shuffle them back. */
20217
20218 rtx order[4];
20219
20220 order[0] = GEN_INT (elt);
20221 order[1] = const1_rtx;
20222 order[2] = const2_rtx;
20223 order[3] = GEN_INT (3);
20224 order[elt] = const0_rtx;
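/* The permutation merely exchanges lanes 0 and ELT, so it is its own
   inverse: applying the same pshufd again after the element-0 store
   restores the original lane order with VAL in lane ELT. */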
20225
20226 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
20227 order[1], order[2], order[3]));
20228
20229 ix86_expand_vector_set (false, target, val, 0);
20230
20231 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
20232 order[1], order[2], order[3]));
20233 }
20234 else
20235 {
20236 /* For SSE1, we have to reuse the V4SF code. */
20237 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
20238 gen_lowpart (SFmode, val), elt);
20239 }
20240 return;
20241
20242 case V8HImode:
20243 use_vec_merge = TARGET_SSE2;
20244 break;
20245 case V4HImode:
20246 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
20247 break;
20248
20249 case V16QImode:
20250 case V8QImode:
20251 default:
20252 break;
20253 }
20254
20255 if (use_vec_merge)
20256 {
20257 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
20258 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
20259 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
20260 }
20261 else
20262 {
20263 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
20264
20265 emit_move_insn (mem, target);
20266
20267 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
20268 emit_move_insn (tmp, val);
20269
20270 emit_move_insn (target, mem);
20271 }
20272 }
20273
20274 void
20275 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
20276 {
20277 enum machine_mode mode = GET_MODE (vec);
20278 enum machine_mode inner_mode = GET_MODE_INNER (mode);
20279 bool use_vec_extr = false;
20280 rtx tmp;
20281
20282 switch (mode)
20283 {
20284 case V2SImode:
20285 case V2SFmode:
20286 if (!mmx_ok)
20287 break;
20288 /* FALLTHRU */
20289
20290 case V2DFmode:
20291 case V2DImode:
20292 use_vec_extr = true;
20293 break;
20294
20295 case V4SFmode:
20296 switch (elt)
20297 {
20298 case 0:
20299 tmp = vec;
20300 break;
20301
20302 case 1:
20303 case 3:
20304 tmp = gen_reg_rtx (mode);
20305 emit_insn (gen_sse_shufps_1 (tmp, vec, vec,
20306 GEN_INT (elt), GEN_INT (elt),
20307 GEN_INT (elt+4), GEN_INT (elt+4)));
20308 break;
20309
20310 case 2:
20311 tmp = gen_reg_rtx (mode);
20312 emit_insn (gen_sse_unpckhps (tmp, vec, vec));
20313 break;
20314
20315 default:
20316 gcc_unreachable ();
20317 }
20318 vec = tmp;
20319 use_vec_extr = true;
20320 elt = 0;
20321 break;
20322
20323 case V4SImode:
20324 if (TARGET_SSE2)
20325 {
20326 switch (elt)
20327 {
20328 case 0:
20329 tmp = vec;
20330 break;
20331
20332 case 1:
20333 case 3:
20334 tmp = gen_reg_rtx (mode);
20335 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
20336 GEN_INT (elt), GEN_INT (elt),
20337 GEN_INT (elt), GEN_INT (elt)));
20338 break;
20339
20340 case 2:
20341 tmp = gen_reg_rtx (mode);
20342 emit_insn (gen_sse2_punpckhdq (tmp, vec, vec));
20343 break;
20344
20345 default:
20346 gcc_unreachable ();
20347 }
20348 vec = tmp;
20349 use_vec_extr = true;
20350 elt = 0;
20351 }
20352 else
20353 {
20354 /* For SSE1, we have to reuse the V4SF code. */
20355 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
20356 gen_lowpart (V4SFmode, vec), elt);
20357 return;
20358 }
20359 break;
20360
20361 case V8HImode:
20362 use_vec_extr = TARGET_SSE2;
20363 break;
20364 case V4HImode:
20365 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
20366 break;
20367
20368 case V16QImode:
20369 case V8QImode:
20370 /* ??? Could extract the appropriate HImode element and shift. */
20371 default:
20372 break;
20373 }
20374
20375 if (use_vec_extr)
20376 {
20377 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
20378 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
20379
20380 /* Let the rtl optimizers know about the zero extension performed. */
20381 if (inner_mode == HImode)
20382 {
20383 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
20384 target = gen_lowpart (SImode, target);
20385 }
20386
20387 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
20388 }
20389 else
20390 {
20391 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
20392
20393 emit_move_insn (mem, vec);
20394
20395 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
20396 emit_move_insn (target, tmp);
20397 }
20398 }
20399
20400 /* Expand a vector reduction on V4SFmode for SSE1. FN is the binary
20401 pattern to reduce; DEST is the destination; IN is the input vector. */
20402
20403 void
20404 ix86_expand_reduc_v4sf (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
20405 {
20406 rtx tmp1, tmp2, tmp3;
20407
20408 tmp1 = gen_reg_rtx (V4SFmode);
20409 tmp2 = gen_reg_rtx (V4SFmode);
20410 tmp3 = gen_reg_rtx (V4SFmode);
20411
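/* movhlps copies lanes 2 and 3 of IN into lanes 0 and 1 of TMP1, so the
   first FN reduces the pairs (0,2) and (1,3) into lanes 0 and 1 of TMP2.
   The shufps then replicates lane 1, and the final FN leaves the complete
   reduction in lane 0 of DEST. */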
20412 emit_insn (gen_sse_movhlps (tmp1, in, in));
20413 emit_insn (fn (tmp2, tmp1, in));
20414
20415 emit_insn (gen_sse_shufps_1 (tmp3, tmp2, tmp2,
20416 GEN_INT (1), GEN_INT (1),
20417 GEN_INT (1+4), GEN_INT (1+4)));
20418 emit_insn (fn (dest, tmp2, tmp3));
20419 }
20420 \f
20421 /* Target hook for scalar_mode_supported_p. */
20422 static bool
20423 ix86_scalar_mode_supported_p (enum machine_mode mode)
20424 {
20425 if (DECIMAL_FLOAT_MODE_P (mode))
20426 return true;
20427 else
20428 return default_scalar_mode_supported_p (mode);
20429 }
20430
20431 /* Implements target hook vector_mode_supported_p. */
20432 static bool
20433 ix86_vector_mode_supported_p (enum machine_mode mode)
20434 {
20435 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
20436 return true;
20437 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
20438 return true;
20439 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
20440 return true;
20441 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
20442 return true;
20443 return false;
20444 }
20445
20446 /* Worker function for TARGET_MD_ASM_CLOBBERS.
20447
20448 We do this in the new i386 backend to maintain source compatibility
20449 with the old cc0-based compiler. */
20450
20451 static tree
20452 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
20453 tree inputs ATTRIBUTE_UNUSED,
20454 tree clobbers)
20455 {
20456 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
20457 clobbers);
20458 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
20459 clobbers);
20460 return clobbers;
20461 }
20462
20463 /* Return true if this goes in large data/bss. */
20464
20465 static bool
20466 ix86_in_large_data_p (tree exp)
20467 {
20468 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
20469 return false;
20470
20471 /* Functions are never large data. */
20472 if (TREE_CODE (exp) == FUNCTION_DECL)
20473 return false;
20474
20475 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
20476 {
20477 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
20478 if (strcmp (section, ".ldata") == 0
20479 || strcmp (section, ".lbss") == 0)
20480 return true;
20481 return false;
20482 }
20483 else
20484 {
20485 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
20486
20487 /* If this is an incomplete type with size 0, then we can't put it
20488 in data because it might be too big when completed. */
20489 if (!size || size > ix86_section_threshold)
20490 return true;
20491 }
20492
20493 return false;
20494 }
20495 static void
20496 ix86_encode_section_info (tree decl, rtx rtl, int first)
20497 {
20498 default_encode_section_info (decl, rtl, first);
20499
20500 if (TREE_CODE (decl) == VAR_DECL
20501 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
20502 && ix86_in_large_data_p (decl))
20503 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
20504 }
20505
20506 /* Worker function for REVERSE_CONDITION. */
20507
20508 enum rtx_code
20509 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
20510 {
20511 return (mode != CCFPmode && mode != CCFPUmode
20512 ? reverse_condition (code)
20513 : reverse_condition_maybe_unordered (code));
20514 }
20515
20516 /* Output code to perform an x87 FP register move, from OPERANDS[1]
20517 to OPERANDS[0]. */
20518
20519 const char *
20520 output_387_reg_move (rtx insn, rtx *operands)
20521 {
20522 if (REG_P (operands[1])
20523 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
20524 {
20525 if (REGNO (operands[0]) == FIRST_STACK_REG)
20526 return output_387_ffreep (operands, 0);
20527 return "fstp\t%y0";
20528 }
20529 if (STACK_TOP_P (operands[0]))
20530 return "fld%z1\t%y1";
20531 return "fst\t%y0";
20532 }
20533
20534 /* Output code to perform a conditional jump to LABEL, if C2 flag in
20535 FP status register is set. */
20536
20537 void
20538 ix86_emit_fp_unordered_jump (rtx label)
20539 {
20540 rtx reg = gen_reg_rtx (HImode);
20541 rtx temp;
20542
20543 emit_insn (gen_x86_fnstsw_1 (reg));
20544
20545 if (TARGET_USE_SAHF)
20546 {
20547 emit_insn (gen_x86_sahf_1 (reg));
20548
20549 temp = gen_rtx_REG (CCmode, FLAGS_REG);
20550 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
20551 }
20552 else
20553 {
20554 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
20555
20556 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
20557 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
20558 }
20559
20560 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
20561 gen_rtx_LABEL_REF (VOIDmode, label),
20562 pc_rtx);
20563 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
20564 emit_jump_insn (temp);
20565 }
20566
20567 /* Output code to perform a log1p XFmode calculation. */
20568
20569 void ix86_emit_i387_log1p (rtx op0, rtx op1)
20570 {
20571 rtx label1 = gen_label_rtx ();
20572 rtx label2 = gen_label_rtx ();
20573
20574 rtx tmp = gen_reg_rtx (XFmode);
20575 rtx tmp2 = gen_reg_rtx (XFmode);
20576
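/* fyl2xp1 is only accurate for |x| < 1 - sqrt(2)/2 ~= 0.2929; for larger
   magnitudes fall back to fyl2x on 1 + x.  Both paths multiply by the
   fldln2 constant to turn the base-2 logarithm into a natural one. */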
20577 emit_insn (gen_absxf2 (tmp, op1));
20578 emit_insn (gen_cmpxf (tmp,
20579 CONST_DOUBLE_FROM_REAL_VALUE (
20580 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
20581 XFmode)));
20582 emit_jump_insn (gen_bge (label1));
20583
20584 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
20585 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
20586 emit_jump (label2);
20587
20588 emit_label (label1);
20589 emit_move_insn (tmp, CONST1_RTX (XFmode));
20590 emit_insn (gen_addxf3 (tmp, op1, tmp));
20591 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
20592 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
20593
20594 emit_label (label2);
20595 }
20596
20597 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
20598
20599 static void
20600 i386_solaris_elf_named_section (const char *name, unsigned int flags,
20601 tree decl)
20602 {
20603 /* With Binutils 2.15, the "@unwind" marker must be specified on
20604 every occurrence of the ".eh_frame" section, not just the first
20605 one. */
20606 if (TARGET_64BIT
20607 && strcmp (name, ".eh_frame") == 0)
20608 {
20609 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
20610 flags & SECTION_WRITE ? "aw" : "a");
20611 return;
20612 }
20613 default_elf_asm_named_section (name, flags, decl);
20614 }
20615
20616 /* Return the mangling of TYPE if it is an extended fundamental type. */
20617
20618 static const char *
20619 ix86_mangle_fundamental_type (tree type)
20620 {
20621 switch (TYPE_MODE (type))
20622 {
20623 case TFmode:
20624 /* __float128 is "g". */
20625 return "g";
20626 case XFmode:
20627 /* "long double" or __float80 is "e". */
20628 return "e";
20629 default:
20630 return NULL;
20631 }
20632 }
20633
20634 /* For 32-bit code we can save PIC register setup by using
20635 __stack_chk_fail_local hidden function instead of calling
20636 __stack_chk_fail directly. 64-bit code doesn't need to setup any PIC
20637 register, so it is better to call __stack_chk_fail directly. */
20638
20639 static tree
20640 ix86_stack_protect_fail (void)
20641 {
20642 return TARGET_64BIT
20643 ? default_external_stack_protect_fail ()
20644 : default_hidden_stack_protect_fail ();
20645 }
20646
20647 /* Select a format to encode pointers in exception handling data. CODE
20648 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
20649 true if the symbol may be affected by dynamic relocations.
20650
20651 ??? All x86 object file formats are capable of representing this.
20652 After all, the relocation needed is the same as for the call insn.
20653 Whether or not a particular assembler allows us to enter such, I
20654 guess we'll have to see. */
20655 int
20656 asm_preferred_eh_data_format (int code, int global)
20657 {
20658 if (flag_pic)
20659 {
20660 int type = DW_EH_PE_sdata8;
20661 if (!TARGET_64BIT
20662 || ix86_cmodel == CM_SMALL_PIC
20663 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
20664 type = DW_EH_PE_sdata4;
20665 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
20666 }
20667 if (ix86_cmodel == CM_SMALL
20668 || (ix86_cmodel == CM_MEDIUM && code))
20669 return DW_EH_PE_udata4;
20670 return DW_EH_PE_absptr;
20671 }
20672 \f
20673 /* Expand copysign from SIGN to the positive value ABS_VALUE
20674 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
20675 the sign-bit. */
20676 static void
20677 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
20678 {
20679 enum machine_mode mode = GET_MODE (sign);
20680 rtx sgn = gen_reg_rtx (mode);
20681 if (mask == NULL_RTX)
20682 {
20683 mask = ix86_build_signbit_mask (mode, VECTOR_MODE_P (mode), false);
20684 if (!VECTOR_MODE_P (mode))
20685 {
20686 /* We need to generate a scalar mode mask in this case. */
20687 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
20688 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
20689 mask = gen_reg_rtx (mode);
20690 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
20691 }
20692 }
20693 else
20694 mask = gen_rtx_NOT (mode, mask);
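/* In either branch MASK now has only the sign bit set (possibly expressed
   as a NOT), so AND-ing it with SIGN isolates SIGN's sign bit; OR-ing that
   into the non-negative ABS_VALUE yields copysign (ABS_VALUE, SIGN). */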
20695 emit_insn (gen_rtx_SET (VOIDmode, sgn,
20696 gen_rtx_AND (mode, mask, sign)));
20697 emit_insn (gen_rtx_SET (VOIDmode, result,
20698 gen_rtx_IOR (mode, abs_value, sgn)));
20699 }
20700
20701 /* Expand fabs (OP0) and return a new rtx that holds the result. The
20702 mask for masking out the sign-bit is stored in *SMASK, if that is
20703 non-null. */
20704 static rtx
20705 ix86_expand_sse_fabs (rtx op0, rtx *smask)
20706 {
20707 enum machine_mode mode = GET_MODE (op0);
20708 rtx xa, mask;
20709
20710 xa = gen_reg_rtx (mode);
20711 mask = ix86_build_signbit_mask (mode, VECTOR_MODE_P (mode), true);
20712 if (!VECTOR_MODE_P (mode))
20713 {
20714 /* We need to generate a scalar mode mask in this case. */
20715 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
20716 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
20717 mask = gen_reg_rtx (mode);
20718 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
20719 }
20720 emit_insn (gen_rtx_SET (VOIDmode, xa,
20721 gen_rtx_AND (mode, op0, mask)));
20722
20723 if (smask)
20724 *smask = mask;
20725
20726 return xa;
20727 }
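/* Companion editorial sketch (not compiled): fabs as a bit operation, the
   scalar analogue of the AND emitted above.  fabs_sketch and the optional
   SMASK_OUT parameter are illustrative names only.  */
#if 0
#include <stdint.h>
#include <string.h>

static double
fabs_sketch (double x, uint64_t *smask_out)
{
  uint64_t bits;
  const uint64_t mask = ~((uint64_t) 1 << 63);	/* every bit but the sign */

  memcpy (&bits, &x, sizeof bits);
  bits &= mask;				/* clear the sign bit */
  memcpy (&x, &bits, sizeof bits);
  if (smask_out)
    *smask_out = mask;			/* hand the mask back, like *SMASK above */
  return x;
}
#endif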
20728
20729 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
20730 swapping the operands if SWAP_OPERANDS is true. The expanded
20731 code is a forward jump to a newly created label in case the
20732 comparison is true. The generated label rtx is returned. */
20733 static rtx
20734 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
20735 bool swap_operands)
20736 {
20737 rtx label, tmp;
20738
20739 if (swap_operands)
20740 {
20741 tmp = op0;
20742 op0 = op1;
20743 op1 = tmp;
20744 }
20745
20746 label = gen_label_rtx ();
20747 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
20748 emit_insn (gen_rtx_SET (VOIDmode, tmp,
20749 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
20750 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
20751 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
20752 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
20753 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
20754 JUMP_LABEL (tmp) = label;
20755
20756 return label;
20757 }
20758
20759 /* Expand a mask-generating SSE comparison instruction comparing OP0 with OP1
20760 using comparison code CODE. Operands are swapped for the comparison if
20761 SWAP_OPERANDS is true. Returns an rtx for the generated mask. */
20762 static rtx
20763 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
20764 bool swap_operands)
20765 {
20766 enum machine_mode mode = GET_MODE (op0);
20767 rtx mask = gen_reg_rtx (mode);
20768
20769 if (swap_operands)
20770 {
20771 rtx tmp = op0;
20772 op0 = op1;
20773 op1 = tmp;
20774 }
20775
20776 if (mode == DFmode)
20777 emit_insn (gen_sse2_maskcmpdf3 (mask, op0, op1,
20778 gen_rtx_fmt_ee (code, mode, op0, op1)));
20779 else
20780 emit_insn (gen_sse_maskcmpsf3 (mask, op0, op1,
20781 gen_rtx_fmt_ee (code, mode, op0, op1)));
20782
20783 return mask;
20784 }
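/* Editorial sketch (not compiled) of how the callers below use the
   all-ones/all-zeros mask produced by cmpsd/cmpss: AND-ing the mask with a
   constant such as 1.0 yields either that constant or +0.0, which is then
   added or subtracted without a branch.  The helper names are hypothetical.  */
#if 0
#include <stdint.h>
#include <string.h>

/* a > b ? all-ones : all-zeros, viewed as a double.  */
static double
compare_gt_mask_sketch (double a, double b)
{
  uint64_t bits = (a > b) ? ~(uint64_t) 0 : 0;
  double mask;

  memcpy (&mask, &bits, sizeof mask);
  return mask;
}

/* Bitwise AND of two doubles.  */
static double
and_double_sketch (double x, double y)
{
  uint64_t a, b;

  memcpy (&a, &x, sizeof a);
  memcpy (&b, &y, sizeof b);
  a &= b;
  memcpy (&x, &a, sizeof a);
  return x;
}

/* The floor compensation "if (x2 > x) x2 -= 1;" then becomes the branch-free
     x2 = x2 - and_double_sketch (compare_gt_mask_sketch (x2, x), 1.0);  */
#endif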
20785
20786 /* Generate and return a rtx of mode MODE for 2**n where n is the number
20787 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
20788 static rtx
20789 ix86_gen_TWO52 (enum machine_mode mode)
20790 {
20791 REAL_VALUE_TYPE TWO52r;
20792 rtx TWO52;
20793
20794 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
20795 TWO52 = const_double_from_real_value (TWO52r, mode);
20796 TWO52 = force_reg (mode, TWO52);
20797
20798 return TWO52;
20799 }
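/* Editorial note with a scalar illustration (not compiled): adding and then
   subtracting 2**52 rounds any binary64 value of magnitude below 2**52 to an
   integer, because after the addition no mantissa bits remain for the
   fractional part.  Under the default round-to-nearest mode this rounds to
   the nearest integer, which is what the expanders below rely on.  */
#if 0
#include <stdio.h>

int
main (void)
{
  const double TWO52 = 4503599627370496.0;	/* 2**52 */
  double x = 3.7;
  double r = (x + TWO52) - TWO52;		/* 3.7 -> 4.0 */

  printf ("%g -> %g\n", x, r);
  return 0;
}
#endif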
20800
20801 /* Expand SSE sequence for computing lround from OP1 storing
20802 into OP0. */
20803 void
20804 ix86_expand_lround (rtx op0, rtx op1)
20805 {
20806 /* C code for the stuff we're doing below:
20807 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
20808 return (long)tmp;
20809 */
20810 enum machine_mode mode = GET_MODE (op1);
20811 const struct real_format *fmt;
20812 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
20813 rtx adj;
20814
20815 /* load nextafter (0.5, 0.0) */
20816 fmt = REAL_MODE_FORMAT (mode);
20817 real_2expN (&half_minus_pred_half, -(fmt->p) - 1);
20818 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
20819
20820 /* adj = copysign (0.5, op1) */
20821 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
20822 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
20823
20824 /* adj = op1 + adj */
20825 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
20826
20827 /* op0 = (imode)adj */
20828 expand_fix (op0, adj, 0);
20829 }
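/* Scalar C model of the lround sequence above (editorial sketch, not
   compiled; lround_sketch is an illustrative name): adding
   copysign (nextafter (0.5, 0.0), x) before converting implements
   round-half-away-from-zero, and using the predecessor of 0.5 keeps values
   just below a halfway point from being pushed up by the addition.  */
#if 0
#include <math.h>

static long
lround_sketch (double x)
{
  double adj = copysign (nextafter (0.5, 0.0), x);

  return (long) (x + adj);	/* the conversion truncates toward zero */
}
#endif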
20830
20831 /* Expand SSE2 sequence for computing lfloor or lceil (depending on
20832 DO_FLOOR) from OP1, storing the result into OP0. */
20833 void
20834 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
20835 {
20836 /* C code for the stuff we're doing below (for do_floor):
20837 xi = (long)op1;
20838 xi -= (double)xi > op1 ? 1 : 0;
20839 return xi;
20840 */
20841 enum machine_mode fmode = GET_MODE (op1);
20842 enum machine_mode imode = GET_MODE (op0);
20843 rtx ireg, freg, label, tmp;
20844
20845 /* reg = (long)op1 */
20846 ireg = gen_reg_rtx (imode);
20847 expand_fix (ireg, op1, 0);
20848
20849 /* freg = (double)reg */
20850 freg = gen_reg_rtx (fmode);
20851 expand_float (freg, ireg, 0);
20852
20853 /* ireg = (freg > op1) ? ireg - 1 : ireg */
20854 label = ix86_expand_sse_compare_and_jump (UNLE,
20855 freg, op1, !do_floor);
20856 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
20857 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
20858 emit_move_insn (ireg, tmp);
20859
20860 emit_label (label);
20861 LABEL_NUSES (label) = 1;
20862
20863 emit_move_insn (op0, ireg);
20864 }
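/* Scalar C model of the lfloor/lceil sequence above (editorial sketch, not
   compiled; the helper names are illustrative): truncate toward zero, then
   compensate by one when the truncation went the wrong way.  */
#if 0
static long
lfloor_sketch (double x)
{
  long xi = (long) x;		/* truncates toward zero */

  if ((double) xi > x)		/* negative non-integers truncate upward */
    xi -= 1;
  return xi;
}

static long
lceil_sketch (double x)
{
  long xi = (long) x;

  if ((double) xi < x)		/* positive non-integers truncate downward */
    xi += 1;
  return xi;
}
#endif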
20865
20866 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
20867 result in OPERAND0. */
20868 void
20869 ix86_expand_rint (rtx operand0, rtx operand1)
20870 {
20871 /* C code for the stuff we're doing below:
20872 xa = fabs (operand1);
20873 if (!isless (xa, 2**52))
20874 return operand1;
20875 xa = xa + 2**52 - 2**52;
20876 return copysign (xa, operand1);
20877 */
20878 enum machine_mode mode = GET_MODE (operand0);
20879 rtx res, xa, label, TWO52, mask;
20880
20881 res = gen_reg_rtx (mode);
20882 emit_move_insn (res, operand1);
20883
20884 /* xa = abs (operand1) */
20885 xa = ix86_expand_sse_fabs (res, &mask);
20886
20887 /* if (!isless (xa, TWO52)) goto label; */
20888 TWO52 = ix86_gen_TWO52 (mode);
20889 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
20890
20891 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
20892 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
20893
20894 ix86_sse_copysign_to_positive (res, xa, res, mask);
20895
20896 emit_label (label);
20897 LABEL_NUSES (label) = 1;
20898
20899 emit_move_insn (operand0, res);
20900 }
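/* Scalar C model of the rint expansion above (editorial sketch, not
   compiled; assumes binary64 and the default rounding mode): magnitudes of
   2**52 or more, and NaNs, pass through unchanged; otherwise the 2**52
   trick rounds and the sign is copied back so -0.0 survives.  */
#if 0
#include <math.h>

static double
rint_sketch (double x)
{
  const double TWO52 = 4503599627370496.0;
  double xa = fabs (x);

  if (!(xa < TWO52))		/* large values and NaN */
    return x;
  xa = (xa + TWO52) - TWO52;	/* round to the nearest integer */
  return copysign (xa, x);
}
#endif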
20901
20902 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
20903 into OPERAND0. */
20904 void
20905 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
20906 {
20907 /* C code for the stuff we expand below.
20908 double xa = fabs (x), x2;
20909 if (!isless (xa, TWO52))
20910 return x;
20911 xa = xa + TWO52 - TWO52;
20912 x2 = copysign (xa, x);
20913 Compensate. Floor:
20914 if (x2 > x)
20915 x2 -= 1;
20916 Compensate. Ceil:
20917 if (x2 < x)
20918 x2 -= -1; (subtract -1 rather than add 1 so that -0.0 is preserved)
20919 return x2;
20920 */
20921 enum machine_mode mode = GET_MODE (operand0);
20922 rtx xa, TWO52, tmp, label, one, res, mask;
20923
20924 TWO52 = ix86_gen_TWO52 (mode);
20925
20926 /* Temporary for holding the result, initialized to the input
20927 operand to ease control flow. */
20928 res = gen_reg_rtx (mode);
20929 emit_move_insn (res, operand1);
20930
20931 /* xa = abs (operand1) */
20932 xa = ix86_expand_sse_fabs (res, &mask);
20933
20934 /* if (!isless (xa, TWO52)) goto label; */
20935 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
20936
20937 /* xa = xa + TWO52 - TWO52; */
20938 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
20939 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
20940
20941 /* xa = copysign (xa, operand1) */
20942 ix86_sse_copysign_to_positive (xa, xa, res, mask);
20943
20944 /* generate 1.0 or -1.0 */
20945 one = force_reg (mode,
20946 const_double_from_real_value (do_floor
20947 ? dconst1 : dconstm1, mode));
20948
20949 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
20950 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
20951 emit_insn (gen_rtx_SET (VOIDmode, tmp,
20952 gen_rtx_AND (mode, one, tmp)));
20953 /* We always need to subtract here to preserve signed zero. */
20954 tmp = expand_simple_binop (mode, MINUS,
20955 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
20956 emit_move_insn (res, tmp);
20957
20958 emit_label (label);
20959 LABEL_NUSES (label) = 1;
20960
20961 emit_move_insn (operand0, res);
20962 }
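/* Scalar C model of the floor variant above (editorial sketch, not
   compiled; floor_df32_sketch is an illustrative name).  The expander
   replaces the conditional subtraction with the mask-and-subtract shown
   after ix86_expand_sse_compare_mask.  */
#if 0
#include <math.h>

static double
floor_df32_sketch (double x)
{
  const double TWO52 = 4503599627370496.0;
  double xa = fabs (x), x2;

  if (!(xa < TWO52))
    return x;
  xa = (xa + TWO52) - TWO52;	/* round to the nearest integer */
  x2 = copysign (xa, x);
  /* Always subtract rather than conditionally add: subtracting +0.0
     leaves -0.0 intact.  */
  x2 -= (x2 > x) ? 1.0 : 0.0;
  return x2;
}
#endif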
20963
20964 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
20965 into OPERAND0. */
20966 void
20967 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
20968 {
20969 /* C code for the stuff we expand below.
20970 double xa = fabs (x), x2;
20971 if (!isless (xa, TWO52))
20972 return x;
20973 x2 = (double)(long)x;
20974 Compensate. Floor:
20975 if (x2 > x)
20976 x2 -= 1;
20977 Compensate. Ceil:
20978 if (x2 < x)
20979 x2 += 1;
20980 if (HONOR_SIGNED_ZEROS (mode))
20981 return copysign (x2, x);
20982 return x2;
20983 */
20984 enum machine_mode mode = GET_MODE (operand0);
20985 rtx xa, xi, TWO52, tmp, label, one, res, mask;
20986
20987 TWO52 = ix86_gen_TWO52 (mode);
20988
20989 /* Temporary for holding the result, initialized to the input
20990 operand to ease control flow. */
20991 res = gen_reg_rtx (mode);
20992 emit_move_insn (res, operand1);
20993
20994 /* xa = abs (operand1) */
20995 xa = ix86_expand_sse_fabs (res, &mask);
20996
20997 /* if (!isless (xa, TWO52)) goto label; */
20998 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
20999
21000 /* xa = (double)(long)x */
21001 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
21002 expand_fix (xi, res, 0);
21003 expand_float (xa, xi, 0);
21004
21005 /* generate 1.0 */
21006 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
21007
21008 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
21009 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
21010 emit_insn (gen_rtx_SET (VOIDmode, tmp,
21011 gen_rtx_AND (mode, one, tmp)));
21012 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
21013 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
21014 emit_move_insn (res, tmp);
21015
21016 if (HONOR_SIGNED_ZEROS (mode))
21017 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
21018
21019 emit_label (label);
21020 LABEL_NUSES (label) = 1;
21021
21022 emit_move_insn (operand0, res);
21023 }
21024
21025 /* Expand SSE sequence for computing round from OPERAND1, storing the
21026 result into OPERAND0. This sequence works without relying on DImode
21027 truncation via cvttsd2siq, which is only available on 64-bit targets. */
21028 void
21029 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
21030 {
21031 /* C code for the stuff we expand below.
21032 double xa = fabs (x), xa2, x2;
21033 if (!isless (xa, TWO52))
21034 return x;
21035 Using the absolute value and copying back sign makes
21036 -0.0 -> -0.0 correct.
21037 xa2 = xa + TWO52 - TWO52;
21038 Compensate.
21039 dxa = xa2 - xa;
21040 if (dxa <= -0.5)
21041 xa2 += 1;
21042 else if (dxa > 0.5)
21043 xa2 -= 1;
21044 x2 = copysign (xa2, x);
21045 return x2;
21046 */
21047 enum machine_mode mode = GET_MODE (operand0);
21048 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
21049
21050 TWO52 = ix86_gen_TWO52 (mode);
21051
21052 /* Temporary for holding the result, initialized to the input
21053 operand to ease control flow. */
21054 res = gen_reg_rtx (mode);
21055 emit_move_insn (res, operand1);
21056
21057 /* xa = abs (operand1) */
21058 xa = ix86_expand_sse_fabs (res, &mask);
21059
21060 /* if (!isless (xa, TWO52)) goto label; */
21061 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
21062
21063 /* xa2 = xa + TWO52 - TWO52; */
21064 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
21065 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
21066
21067 /* dxa = xa2 - xa; */
21068 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
21069
21070 /* generate 0.5, 1.0 and -0.5 */
21071 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
21072 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
21073 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
21074 0, OPTAB_DIRECT);
21075
21076 /* Compensate. */
21077 tmp = gen_reg_rtx (mode);
21078 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
21079 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
21080 emit_insn (gen_rtx_SET (VOIDmode, tmp,
21081 gen_rtx_AND (mode, one, tmp)));
21082 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
21083 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
21084 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
21085 emit_insn (gen_rtx_SET (VOIDmode, tmp,
21086 gen_rtx_AND (mode, one, tmp)));
21087 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
21088
21089 /* res = copysign (xa2, operand1) */
21090 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
21091
21092 emit_label (label);
21093 LABEL_NUSES (label) = 1;
21094
21095 emit_move_insn (operand0, res);
21096 }
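/* Scalar C model of the 32-bit-safe round sequence above (editorial sketch,
   not compiled; round_df32_sketch is an illustrative name): round to
   nearest with the 2**52 trick, then nudge by one whenever that landed on
   the wrong neighbour for round-half-away-from-zero.  */
#if 0
#include <math.h>

static double
round_df32_sketch (double x)
{
  const double TWO52 = 4503599627370496.0;
  double xa = fabs (x), xa2, dxa;

  if (!(xa < TWO52))
    return x;
  xa2 = (xa + TWO52) - TWO52;	/* round to nearest, even on ties */
  dxa = xa2 - xa;		/* signed rounding error */
  if (dxa > 0.5)
    xa2 -= 1.0;			/* rounded up by more than half */
  else if (dxa <= -0.5)
    xa2 += 1.0;			/* a tie (or rounded down by more than half):
				   push the result away from zero */
  return copysign (xa2, x);
}
#endif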
21097
21098 /* Expand SSE sequence for computing trunc from OPERAND1 storing
21099 into OPERAND0. */
21100 void
21101 ix86_expand_trunc (rtx operand0, rtx operand1)
21102 {
21103 /* C code for SSE variant we expand below.
21104 double xa = fabs (x), x2;
21105 if (!isless (xa, TWO52))
21106 return x;
21107 x2 = (double)(long)x;
21108 if (HONOR_SIGNED_ZEROS (mode))
21109 return copysign (x2, x);
21110 return x2;
21111 */
21112 enum machine_mode mode = GET_MODE (operand0);
21113 rtx xa, xi, TWO52, label, res, mask;
21114
21115 TWO52 = ix86_gen_TWO52 (mode);
21116
21117 /* Temporary for holding the result, initialized to the input
21118 operand to ease control flow. */
21119 res = gen_reg_rtx (mode);
21120 emit_move_insn (res, operand1);
21121
21122 /* xa = abs (operand1) */
21123 xa = ix86_expand_sse_fabs (res, &mask);
21124
21125 /* if (!isless (xa, TWO52)) goto label; */
21126 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
21127
21128 /* x = (double)(long)x */
21129 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
21130 expand_fix (xi, res, 0);
21131 expand_float (res, xi, 0);
21132
21133 if (HONOR_SIGNED_ZEROS (mode))
21134 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
21135
21136 emit_label (label);
21137 LABEL_NUSES (label) = 1;
21138
21139 emit_move_insn (operand0, res);
21140 }
21141
21142 /* Expand SSE sequence for computing trunc from OPERAND1 storing into
21143 OPERAND0; this variant avoids DImode truncation and works on 32-bit targets. */
21144 void
21145 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
21146 {
21147 enum machine_mode mode = GET_MODE (operand0);
21148 rtx xa, mask, TWO52, label, one, res, smask, tmp;
21149
21150 /* C code for SSE variant we expand below.
21151 double xa = fabs (x), xa2, x2;
21152 if (!isless (xa, TWO52))
21153 return x;
21154 xa2 = xa + TWO52 - TWO52;
21155 Compensate:
21156 if (xa2 > xa)
21157 xa2 -= 1.0;
21158 x2 = copysign (xa2, x);
21159 return x2;
21160 */
21161
21162 TWO52 = ix86_gen_TWO52 (mode);
21163
21164 /* Temporary for holding the result, initialized to the input
21165 operand to ease control flow. */
21166 res = gen_reg_rtx (mode);
21167 emit_move_insn (res, operand1);
21168
21169 /* xa = abs (operand1) */
21170 xa = ix86_expand_sse_fabs (res, &smask);
21171
21172 /* if (!isless (xa, TWO52)) goto label; */
21173 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
21174
21175 /* res = xa + TWO52 - TWO52; */
21176 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
21177 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
21178 emit_move_insn (res, tmp);
21179
21180 /* generate 1.0 */
21181 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
21182
21183 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
21184 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
21185 emit_insn (gen_rtx_SET (VOIDmode, mask,
21186 gen_rtx_AND (mode, mask, one)));
21187 tmp = expand_simple_binop (mode, MINUS,
21188 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
21189 emit_move_insn (res, tmp);
21190
21191 /* res = copysign (res, operand1) */
21192 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
21193
21194 emit_label (label);
21195 LABEL_NUSES (label) = 1;
21196
21197 emit_move_insn (operand0, res);
21198 }
21199
21200 /* Expand SSE sequence for computing round from OPERAND1 storing
21201 into OPERAND0. */
21202 void
21203 ix86_expand_round (rtx operand0, rtx operand1)
21204 {
21205 /* C code for the stuff we're doing below:
21206 double xa = fabs (x);
21207 if (!isless (xa, TWO52))
21208 return x;
21209 xa = (double)(long)(xa + nextafter (0.5, 0.0));
21210 return copysign (xa, x);
21211 */
21212 enum machine_mode mode = GET_MODE (operand0);
21213 rtx res, TWO52, xa, label, xi, half, mask;
21214 const struct real_format *fmt;
21215 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
21216
21217 /* Temporary for holding the result, initialized to the input
21218 operand to ease control flow. */
21219 res = gen_reg_rtx (mode);
21220 emit_move_insn (res, operand1);
21221
21222 TWO52 = ix86_gen_TWO52 (mode);
21223 xa = ix86_expand_sse_fabs (res, &mask);
21224 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
21225
21226 /* load nextafter (0.5, 0.0) */
21227 fmt = REAL_MODE_FORMAT (mode);
21228 real_2expN (&half_minus_pred_half, -(fmt->p) - 1);
21229 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
21230
21231 /* xa = xa + 0.5 */
21232 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
21233 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
21234
21235 /* xa = (double)(int64_t)xa */
21236 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
21237 expand_fix (xi, xa, 0);
21238 expand_float (xa, xi, 0);
21239
21240 /* res = copysign (xa, operand1) */
21241 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
21242
21243 emit_label (label);
21244 LABEL_NUSES (label) = 1;
21245
21246 emit_move_insn (operand0, res);
21247 }
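/* Scalar C model of the round expansion above (editorial sketch, not
   compiled; round_sketch is an illustrative name): the same
   nextafter (0.5, 0.0) adjustment as ix86_expand_lround, but the result is
   converted back to floating point and given the sign of the input, so
   e.g. -0.3 rounds to -0.0 rather than +0.0.  */
#if 0
#include <math.h>
#include <stdint.h>

static double
round_sketch (double x)
{
  const double TWO52 = 4503599627370496.0;
  double xa = fabs (x);

  if (!(xa < TWO52))
    return x;
  xa = (double) (int64_t) (xa + nextafter (0.5, 0.0));
  return copysign (xa, x);
}
#endif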
21248
21249 #include "gt-i386.h"