[gcc.git] gcc/config/i386/i386.c @ fb0ac8de0b092e99124c0852bedf2618a98b9487
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
3 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
4 Free Software Foundation, Inc.
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
12
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "tm.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "tm_p.h"
29 #include "regs.h"
30 #include "hard-reg-set.h"
31 #include "insn-config.h"
32 #include "conditions.h"
33 #include "output.h"
34 #include "insn-codes.h"
35 #include "insn-attr.h"
36 #include "flags.h"
37 #include "except.h"
38 #include "function.h"
39 #include "recog.h"
40 #include "expr.h"
41 #include "optabs.h"
42 #include "diagnostic-core.h"
43 #include "toplev.h"
44 #include "basic-block.h"
45 #include "ggc.h"
46 #include "target.h"
47 #include "target-def.h"
48 #include "common/common-target.h"
49 #include "langhooks.h"
50 #include "cgraph.h"
51 #include "gimple.h"
52 #include "dwarf2.h"
53 #include "df.h"
54 #include "tm-constrs.h"
55 #include "params.h"
56 #include "cselib.h"
57 #include "debug.h"
58 #include "sched-int.h"
59 #include "sbitmap.h"
60 #include "fibheap.h"
61 #include "opts.h"
62 #include "diagnostic.h"
63
64 enum upper_128bits_state
65 {
66 unknown = 0,
67 unused,
68 used
69 };
70
71 typedef struct block_info_def
72 {
73 /* State of the upper 128bits of AVX registers at exit. */
74 enum upper_128bits_state state;
75 /* TRUE if state of the upper 128bits of AVX registers is unchanged
76 in this block. */
77 bool unchanged;
78 /* TRUE if block has been processed. */
79 bool processed;
80 /* TRUE if block has been scanned. */
81 bool scanned;
82 /* Previous state of the upper 128bits of AVX registers at entry. */
83 enum upper_128bits_state prev;
84 } *block_info;
85
86 #define BLOCK_INFO(B) ((block_info) (B)->aux)
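/* Editorial annotation (not part of the original source): BLOCK_INFO is the
   typed accessor for the per-basic-block scratch record that the vzeroupper
   pass below hangs off bb->aux via alloc_aux_for_blocks, e.g.

     if (BLOCK_INFO (bb)->scanned && BLOCK_INFO (bb)->prev == state)
       ...

   as used in move_or_delete_vzeroupper_2.  */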
87
88 enum call_avx256_state
89 {
90 /* Callee returns 256bit AVX register. */
91 callee_return_avx256 = -1,
92 /* Callee returns and passes 256bit AVX register. */
93 callee_return_pass_avx256,
94 /* Callee passes 256bit AVX register. */
95 callee_pass_avx256,
96 /* Callee neither returns nor passes 256bit AVX register, or no
97 256bit AVX register in function return. */
98 call_no_avx256,
99 /* vzeroupper intrinsic. */
100 vzeroupper_intrinsic
101 };
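/* Editorial annotation (not part of the original source): these values travel
   as element 0 of the vzeroupper UNSPEC_VOLATILE pattern (presumably set up by
   the vzeroupper expander), describing what the surrounding call does with
   256bit AVX registers.  move_or_delete_vzeroupper_2 below recovers the value
   with

     avx256 = INTVAL (XVECEXP (pat, 0, 0));

   and, for example, deletes the vzeroupper when the callee passes 256bit AVX
   arguments, since clearing the upper halves would corrupt them.  */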
102
103 /* Check if a 256bit AVX register is referenced in stores. */
104
105 static void
106 check_avx256_stores (rtx dest, const_rtx set, void *data)
107 {
108 if ((REG_P (dest)
109 && VALID_AVX256_REG_MODE (GET_MODE (dest)))
110 || (GET_CODE (set) == SET
111 && REG_P (SET_SRC (set))
112 && VALID_AVX256_REG_MODE (GET_MODE (SET_SRC (set)))))
113 {
114 enum upper_128bits_state *state
115 = (enum upper_128bits_state *) data;
116 *state = used;
117 }
118 }
119
120 /* Helper function for move_or_delete_vzeroupper_1. Look for vzeroupper
121 in basic block BB. Delete it if upper 128bit AVX registers are
122 unused. If it isn't deleted, move it to just before a jump insn.
123
124 STATE is state of the upper 128bits of AVX registers at entry. */
125
126 static void
127 move_or_delete_vzeroupper_2 (basic_block bb,
128 enum upper_128bits_state state)
129 {
130 rtx insn, bb_end;
131 rtx vzeroupper_insn = NULL_RTX;
132 rtx pat;
133 int avx256;
134 bool unchanged;
135
136 if (BLOCK_INFO (bb)->unchanged)
137 {
138 if (dump_file)
139 fprintf (dump_file, " [bb %i] unchanged: upper 128bits: %d\n",
140 bb->index, state);
141
142 BLOCK_INFO (bb)->state = state;
143 return;
144 }
145
146 if (BLOCK_INFO (bb)->scanned && BLOCK_INFO (bb)->prev == state)
147 {
148 if (dump_file)
149 fprintf (dump_file, " [bb %i] scanned: upper 128bits: %d\n",
150 bb->index, BLOCK_INFO (bb)->state);
151 return;
152 }
153
154 BLOCK_INFO (bb)->prev = state;
155
156 if (dump_file)
157 fprintf (dump_file, " [bb %i] entry: upper 128bits: %d\n",
158 bb->index, state);
159
160 unchanged = true;
161
162 /* BB_END changes when it is deleted. */
163 bb_end = BB_END (bb);
164 insn = BB_HEAD (bb);
165 while (insn != bb_end)
166 {
167 insn = NEXT_INSN (insn);
168
169 if (!NONDEBUG_INSN_P (insn))
170 continue;
171
172 /* Move vzeroupper before jump/call. */
173 if (JUMP_P (insn) || CALL_P (insn))
174 {
175 if (!vzeroupper_insn)
176 continue;
177
178 if (PREV_INSN (insn) != vzeroupper_insn)
179 {
180 if (dump_file)
181 {
182 fprintf (dump_file, "Move vzeroupper after:\n");
183 print_rtl_single (dump_file, PREV_INSN (insn));
184 fprintf (dump_file, "before:\n");
185 print_rtl_single (dump_file, insn);
186 }
187 reorder_insns_nobb (vzeroupper_insn, vzeroupper_insn,
188 PREV_INSN (insn));
189 }
190 vzeroupper_insn = NULL_RTX;
191 continue;
192 }
193
194 pat = PATTERN (insn);
195
196 /* Check insn for vzeroupper intrinsic. */
197 if (GET_CODE (pat) == UNSPEC_VOLATILE
198 && XINT (pat, 1) == UNSPECV_VZEROUPPER)
199 {
200 if (dump_file)
201 {
202 /* Found vzeroupper intrinsic. */
203 fprintf (dump_file, "Found vzeroupper:\n");
204 print_rtl_single (dump_file, insn);
205 }
206 }
207 else
208 {
209 /* Check insn for vzeroall intrinsic. */
210 if (GET_CODE (pat) == PARALLEL
211 && GET_CODE (XVECEXP (pat, 0, 0)) == UNSPEC_VOLATILE
212 && XINT (XVECEXP (pat, 0, 0), 1) == UNSPECV_VZEROALL)
213 {
214 state = unused;
215 unchanged = false;
216
217 /* Delete pending vzeroupper insertion. */
218 if (vzeroupper_insn)
219 {
220 delete_insn (vzeroupper_insn);
221 vzeroupper_insn = NULL_RTX;
222 }
223 }
224 else if (state != used)
225 {
226 note_stores (pat, check_avx256_stores, &state);
227 if (state == used)
228 unchanged = false;
229 }
230 continue;
231 }
232
233 /* Process vzeroupper intrinsic. */
234 avx256 = INTVAL (XVECEXP (pat, 0, 0));
235
236 if (state == unused)
237 {
238 /* Since the upper 128bits are cleared, callee must not pass
239 256bit AVX register. We only need to check if callee
240 returns 256bit AVX register. */
241 if (avx256 == callee_return_avx256)
242 {
243 state = used;
244 unchanged = false;
245 }
246
247 /* Remove unnecessary vzeroupper since upper 128bits are
248 cleared. */
249 if (dump_file)
250 {
251 fprintf (dump_file, "Delete redundant vzeroupper:\n");
252 print_rtl_single (dump_file, insn);
253 }
254 delete_insn (insn);
255 }
256 else
257 {
258 /* Set state to UNUSED if callee doesn't return 256bit AVX
259 register. */
260 if (avx256 != callee_return_pass_avx256)
261 state = unused;
262
263 if (avx256 == callee_return_pass_avx256
264 || avx256 == callee_pass_avx256)
265 {
266 /* Must remove vzeroupper since callee passes in 256bit
267 AVX register. */
268 if (dump_file)
269 {
270 fprintf (dump_file, "Delete callee pass vzeroupper:\n");
271 print_rtl_single (dump_file, insn);
272 }
273 delete_insn (insn);
274 }
275 else
276 {
277 vzeroupper_insn = insn;
278 unchanged = false;
279 }
280 }
281 }
282
283 BLOCK_INFO (bb)->state = state;
284 BLOCK_INFO (bb)->unchanged = unchanged;
285 BLOCK_INFO (bb)->scanned = true;
286
287 if (dump_file)
288 fprintf (dump_file, " [bb %i] exit: %s: upper 128bits: %d\n",
289 bb->index, unchanged ? "unchanged" : "changed",
290 state);
291 }
292
293 /* Helper function for move_or_delete_vzeroupper. Process vzeroupper
294 in BLOCK and check its predecessor blocks. Treat UNKNOWN state
295 as UNUSED if UNKNOWN_IS_UNUSED is true.  Return TRUE if the exit
296 state is changed. */
297
298 static bool
299 move_or_delete_vzeroupper_1 (basic_block block, bool unknown_is_unused)
300 {
301 edge e;
302 edge_iterator ei;
303 enum upper_128bits_state state, old_state, new_state;
304 bool seen_unknown;
305
306 if (dump_file)
307 fprintf (dump_file, " Process [bb %i]: status: %d\n",
308 block->index, BLOCK_INFO (block)->processed);
309
310 if (BLOCK_INFO (block)->processed)
311 return false;
312
313 state = unused;
314
315 /* Check all predecessor edges of this block. */
316 seen_unknown = false;
317 FOR_EACH_EDGE (e, ei, block->preds)
318 {
319 if (e->src == block)
320 continue;
321 switch (BLOCK_INFO (e->src)->state)
322 {
323 case unknown:
324 if (!unknown_is_unused)
325 seen_unknown = true;
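	  /* FALLTHRU (editorial note: the drop-through into the unused case
	     is intentional; an unknown predecessor only matters when
	     UNKNOWN_IS_UNUSED is false, which is recorded via seen_unknown).  */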
326 case unused:
327 break;
328 case used:
329 state = used;
330 goto done;
331 }
332 }
333
334 if (seen_unknown)
335 state = unknown;
336
337 done:
338 old_state = BLOCK_INFO (block)->state;
339 move_or_delete_vzeroupper_2 (block, state);
340 new_state = BLOCK_INFO (block)->state;
341
342 if (state != unknown || new_state == used)
343 BLOCK_INFO (block)->processed = true;
344
345 /* Need to rescan if the upper 128bits of AVX registers are changed
346 to USED at exit. */
347 if (new_state != old_state)
348 {
349 if (new_state == used)
350 cfun->machine->rescan_vzeroupper_p = 1;
351 return true;
352 }
353 else
354 return false;
355 }
356
357 /* Go through the instruction stream looking for vzeroupper. Delete
358 it if upper 128bit AVX registers are unused. If it isn't deleted,
359 move it to just before a jump insn. */
360
361 static void
362 move_or_delete_vzeroupper (void)
363 {
364 edge e;
365 edge_iterator ei;
366 basic_block bb;
367 fibheap_t worklist, pending, fibheap_swap;
368 sbitmap visited, in_worklist, in_pending, sbitmap_swap;
369 int *bb_order;
370 int *rc_order;
371 int i;
372
373 /* Set up block info for each basic block. */
374 alloc_aux_for_blocks (sizeof (struct block_info_def));
375
376 /* Process outgoing edges of entry point. */
377 if (dump_file)
378 fprintf (dump_file, "Process outgoing edges of entry point\n");
379
380 FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR->succs)
381 {
382 move_or_delete_vzeroupper_2 (e->dest,
383 cfun->machine->caller_pass_avx256_p
384 ? used : unused);
385 BLOCK_INFO (e->dest)->processed = true;
386 }
387
388 /* Compute reverse completion order of depth first search of the CFG
389 so that the data-flow runs faster. */
390 rc_order = XNEWVEC (int, n_basic_blocks - NUM_FIXED_BLOCKS);
391 bb_order = XNEWVEC (int, last_basic_block);
392 pre_and_rev_post_order_compute (NULL, rc_order, false);
393 for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; i++)
394 bb_order[rc_order[i]] = i;
395 free (rc_order);
396
397 worklist = fibheap_new ();
398 pending = fibheap_new ();
399 visited = sbitmap_alloc (last_basic_block);
400 in_worklist = sbitmap_alloc (last_basic_block);
401 in_pending = sbitmap_alloc (last_basic_block);
402 sbitmap_zero (in_worklist);
403
404 /* Don't check outgoing edges of entry point. */
405 sbitmap_ones (in_pending);
406 FOR_EACH_BB (bb)
407 if (BLOCK_INFO (bb)->processed)
408 RESET_BIT (in_pending, bb->index);
409 else
410 {
411 move_or_delete_vzeroupper_1 (bb, false);
412 fibheap_insert (pending, bb_order[bb->index], bb);
413 }
414
415 if (dump_file)
416 fprintf (dump_file, "Check remaining basic blocks\n");
417
418 while (!fibheap_empty (pending))
419 {
420 fibheap_swap = pending;
421 pending = worklist;
422 worklist = fibheap_swap;
423 sbitmap_swap = in_pending;
424 in_pending = in_worklist;
425 in_worklist = sbitmap_swap;
426
427 sbitmap_zero (visited);
428
429 cfun->machine->rescan_vzeroupper_p = 0;
430
431 while (!fibheap_empty (worklist))
432 {
433 bb = (basic_block) fibheap_extract_min (worklist);
434 RESET_BIT (in_worklist, bb->index);
435 gcc_assert (!TEST_BIT (visited, bb->index));
436 if (!TEST_BIT (visited, bb->index))
437 {
438 edge_iterator ei;
439
440 SET_BIT (visited, bb->index);
441
442 if (move_or_delete_vzeroupper_1 (bb, false))
443 FOR_EACH_EDGE (e, ei, bb->succs)
444 {
445 if (e->dest == EXIT_BLOCK_PTR
446 || BLOCK_INFO (e->dest)->processed)
447 continue;
448
449 if (TEST_BIT (visited, e->dest->index))
450 {
451 if (!TEST_BIT (in_pending, e->dest->index))
452 {
453 /* Send E->DEST to next round. */
454 SET_BIT (in_pending, e->dest->index);
455 fibheap_insert (pending,
456 bb_order[e->dest->index],
457 e->dest);
458 }
459 }
460 else if (!TEST_BIT (in_worklist, e->dest->index))
461 {
462 /* Add E->DEST to current round. */
463 SET_BIT (in_worklist, e->dest->index);
464 fibheap_insert (worklist, bb_order[e->dest->index],
465 e->dest);
466 }
467 }
468 }
469 }
470
471 if (!cfun->machine->rescan_vzeroupper_p)
472 break;
473 }
474
475 free (bb_order);
476 fibheap_delete (worklist);
477 fibheap_delete (pending);
478 sbitmap_free (visited);
479 sbitmap_free (in_worklist);
480 sbitmap_free (in_pending);
481
482 if (dump_file)
483 fprintf (dump_file, "Process remaining basic blocks\n");
484
485 FOR_EACH_BB (bb)
486 move_or_delete_vzeroupper_1 (bb, true);
487
488 free_aux_for_blocks ();
489 }
490
491 static rtx legitimize_dllimport_symbol (rtx, bool);
492
493 #ifndef CHECK_STACK_LIMIT
494 #define CHECK_STACK_LIMIT (-1)
495 #endif
496
497 /* Return index of given mode in mult and division cost tables. */
498 #define MODE_INDEX(mode) \
499 ((mode) == QImode ? 0 \
500 : (mode) == HImode ? 1 \
501 : (mode) == SImode ? 2 \
502 : (mode) == DImode ? 3 \
503 : 4)
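/* Editorial annotation (worked example, not part of the original source):
   MODE_INDEX maps QImode/HImode/SImode/DImode to 0..3 and any other mode
   to 4, matching the five-entry mult and divide arrays in processor_costs.
   A lookup such as

     cost->mult_init[MODE_INDEX (DImode)]

   therefore picks the "DI" entry (index 3) of the multiply-start cost table;
   the field name follows the processor_costs layout and is shown here for
   illustration only.  */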
504
505 /* Processor costs (relative to an add) */
506 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
507 #define COSTS_N_BYTES(N) ((N) * 2)
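/* Editorial annotation (worked example, not part of the original source):
   with COSTS_N_INSNS (N) assumed to be (N) * 4 and an add taken as 2 bytes,
   an add costs COSTS_N_BYTES (2) == 4 == COSTS_N_INSNS (1), so the size
   costs below share the same scale as the speed costs; e.g. a 3-byte
   instruction, COSTS_N_BYTES (3) == 6, is rated at 1.5 adds.  */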
508
509 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
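/* Editorial annotation (not part of the original source): each memcpy/memset
   table below holds two stringop_algs entries.  Tunings for CPUs without a
   64-bit mode (i386, i486, pentium, ...) plug DUMMY_STRINGOP_ALGS, a bare
   libcall fallback, into the second slot, while 64-bit-capable tunings such
   as k8_cost fill both; the slot assignment is inferred from the tables in
   this file rather than stated here.  */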
510
511 const
512 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
513 COSTS_N_BYTES (2), /* cost of an add instruction */
514 COSTS_N_BYTES (3), /* cost of a lea instruction */
515 COSTS_N_BYTES (2), /* variable shift costs */
516 COSTS_N_BYTES (3), /* constant shift costs */
517 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
518 COSTS_N_BYTES (3), /* HI */
519 COSTS_N_BYTES (3), /* SI */
520 COSTS_N_BYTES (3), /* DI */
521 COSTS_N_BYTES (5)}, /* other */
522 0, /* cost of multiply per each bit set */
523 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
524 COSTS_N_BYTES (3), /* HI */
525 COSTS_N_BYTES (3), /* SI */
526 COSTS_N_BYTES (3), /* DI */
527 COSTS_N_BYTES (5)}, /* other */
528 COSTS_N_BYTES (3), /* cost of movsx */
529 COSTS_N_BYTES (3), /* cost of movzx */
530 0, /* "large" insn */
531 2, /* MOVE_RATIO */
532 2, /* cost for loading QImode using movzbl */
533 {2, 2, 2}, /* cost of loading integer registers
534 in QImode, HImode and SImode.
535 Relative to reg-reg move (2). */
536 {2, 2, 2}, /* cost of storing integer registers */
537 2, /* cost of reg,reg fld/fst */
538 {2, 2, 2}, /* cost of loading fp registers
539 in SFmode, DFmode and XFmode */
540 {2, 2, 2}, /* cost of storing fp registers
541 in SFmode, DFmode and XFmode */
542 3, /* cost of moving MMX register */
543 {3, 3}, /* cost of loading MMX registers
544 in SImode and DImode */
545 {3, 3}, /* cost of storing MMX registers
546 in SImode and DImode */
547 3, /* cost of moving SSE register */
548 {3, 3, 3}, /* cost of loading SSE registers
549 in SImode, DImode and TImode */
550 {3, 3, 3}, /* cost of storing SSE registers
551 in SImode, DImode and TImode */
552 3, /* MMX or SSE register to integer */
553 0, /* size of l1 cache */
554 0, /* size of l2 cache */
555 0, /* size of prefetch block */
556 0, /* number of parallel prefetches */
557 2, /* Branch cost */
558 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
559 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
560 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
561 COSTS_N_BYTES (2), /* cost of FABS instruction. */
562 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
563 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
564 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
565 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
566 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
567 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
568 1, /* scalar_stmt_cost. */
569 1, /* scalar load_cost. */
570 1, /* scalar_store_cost. */
571 1, /* vec_stmt_cost. */
572 1, /* vec_to_scalar_cost. */
573 1, /* scalar_to_vec_cost. */
574 1, /* vec_align_load_cost. */
575 1, /* vec_unalign_load_cost. */
576 1, /* vec_store_cost. */
577 1, /* cond_taken_branch_cost. */
578 1, /* cond_not_taken_branch_cost. */
579 };
580
581 /* Processor costs (relative to an add) */
582 static const
583 struct processor_costs i386_cost = { /* 386 specific costs */
584 COSTS_N_INSNS (1), /* cost of an add instruction */
585 COSTS_N_INSNS (1), /* cost of a lea instruction */
586 COSTS_N_INSNS (3), /* variable shift costs */
587 COSTS_N_INSNS (2), /* constant shift costs */
588 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
589 COSTS_N_INSNS (6), /* HI */
590 COSTS_N_INSNS (6), /* SI */
591 COSTS_N_INSNS (6), /* DI */
592 COSTS_N_INSNS (6)}, /* other */
593 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
594 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
595 COSTS_N_INSNS (23), /* HI */
596 COSTS_N_INSNS (23), /* SI */
597 COSTS_N_INSNS (23), /* DI */
598 COSTS_N_INSNS (23)}, /* other */
599 COSTS_N_INSNS (3), /* cost of movsx */
600 COSTS_N_INSNS (2), /* cost of movzx */
601 15, /* "large" insn */
602 3, /* MOVE_RATIO */
603 4, /* cost for loading QImode using movzbl */
604 {2, 4, 2}, /* cost of loading integer registers
605 in QImode, HImode and SImode.
606 Relative to reg-reg move (2). */
607 {2, 4, 2}, /* cost of storing integer registers */
608 2, /* cost of reg,reg fld/fst */
609 {8, 8, 8}, /* cost of loading fp registers
610 in SFmode, DFmode and XFmode */
611 {8, 8, 8}, /* cost of storing fp registers
612 in SFmode, DFmode and XFmode */
613 2, /* cost of moving MMX register */
614 {4, 8}, /* cost of loading MMX registers
615 in SImode and DImode */
616 {4, 8}, /* cost of storing MMX registers
617 in SImode and DImode */
618 2, /* cost of moving SSE register */
619 {4, 8, 16}, /* cost of loading SSE registers
620 in SImode, DImode and TImode */
621 {4, 8, 16}, /* cost of storing SSE registers
622 in SImode, DImode and TImode */
623 3, /* MMX or SSE register to integer */
624 0, /* size of l1 cache */
625 0, /* size of l2 cache */
626 0, /* size of prefetch block */
627 0, /* number of parallel prefetches */
628 1, /* Branch cost */
629 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
630 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
631 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
632 COSTS_N_INSNS (22), /* cost of FABS instruction. */
633 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
634 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
635 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
636 DUMMY_STRINGOP_ALGS},
637 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
638 DUMMY_STRINGOP_ALGS},
639 1, /* scalar_stmt_cost. */
640 1, /* scalar load_cost. */
641 1, /* scalar_store_cost. */
642 1, /* vec_stmt_cost. */
643 1, /* vec_to_scalar_cost. */
644 1, /* scalar_to_vec_cost. */
645 1, /* vec_align_load_cost. */
646 2, /* vec_unalign_load_cost. */
647 1, /* vec_store_cost. */
648 3, /* cond_taken_branch_cost. */
649 1, /* cond_not_taken_branch_cost. */
650 };
651
652 static const
653 struct processor_costs i486_cost = { /* 486 specific costs */
654 COSTS_N_INSNS (1), /* cost of an add instruction */
655 COSTS_N_INSNS (1), /* cost of a lea instruction */
656 COSTS_N_INSNS (3), /* variable shift costs */
657 COSTS_N_INSNS (2), /* constant shift costs */
658 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
659 COSTS_N_INSNS (12), /* HI */
660 COSTS_N_INSNS (12), /* SI */
661 COSTS_N_INSNS (12), /* DI */
662 COSTS_N_INSNS (12)}, /* other */
663 1, /* cost of multiply per each bit set */
664 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
665 COSTS_N_INSNS (40), /* HI */
666 COSTS_N_INSNS (40), /* SI */
667 COSTS_N_INSNS (40), /* DI */
668 COSTS_N_INSNS (40)}, /* other */
669 COSTS_N_INSNS (3), /* cost of movsx */
670 COSTS_N_INSNS (2), /* cost of movzx */
671 15, /* "large" insn */
672 3, /* MOVE_RATIO */
673 4, /* cost for loading QImode using movzbl */
674 {2, 4, 2}, /* cost of loading integer registers
675 in QImode, HImode and SImode.
676 Relative to reg-reg move (2). */
677 {2, 4, 2}, /* cost of storing integer registers */
678 2, /* cost of reg,reg fld/fst */
679 {8, 8, 8}, /* cost of loading fp registers
680 in SFmode, DFmode and XFmode */
681 {8, 8, 8}, /* cost of storing fp registers
682 in SFmode, DFmode and XFmode */
683 2, /* cost of moving MMX register */
684 {4, 8}, /* cost of loading MMX registers
685 in SImode and DImode */
686 {4, 8}, /* cost of storing MMX registers
687 in SImode and DImode */
688 2, /* cost of moving SSE register */
689 {4, 8, 16}, /* cost of loading SSE registers
690 in SImode, DImode and TImode */
691 {4, 8, 16}, /* cost of storing SSE registers
692 in SImode, DImode and TImode */
693 3, /* MMX or SSE register to integer */
694 4, /* size of l1 cache. 486 has 8kB cache
695 shared for code and data, so 4kB is
696 not really precise. */
697 4, /* size of l2 cache */
698 0, /* size of prefetch block */
699 0, /* number of parallel prefetches */
700 1, /* Branch cost */
701 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
702 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
703 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
704 COSTS_N_INSNS (3), /* cost of FABS instruction. */
705 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
706 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
707 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
708 DUMMY_STRINGOP_ALGS},
709 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
710 DUMMY_STRINGOP_ALGS},
711 1, /* scalar_stmt_cost. */
712 1, /* scalar load_cost. */
713 1, /* scalar_store_cost. */
714 1, /* vec_stmt_cost. */
715 1, /* vec_to_scalar_cost. */
716 1, /* scalar_to_vec_cost. */
717 1, /* vec_align_load_cost. */
718 2, /* vec_unalign_load_cost. */
719 1, /* vec_store_cost. */
720 3, /* cond_taken_branch_cost. */
721 1, /* cond_not_taken_branch_cost. */
722 };
723
724 static const
725 struct processor_costs pentium_cost = {
726 COSTS_N_INSNS (1), /* cost of an add instruction */
727 COSTS_N_INSNS (1), /* cost of a lea instruction */
728 COSTS_N_INSNS (4), /* variable shift costs */
729 COSTS_N_INSNS (1), /* constant shift costs */
730 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
731 COSTS_N_INSNS (11), /* HI */
732 COSTS_N_INSNS (11), /* SI */
733 COSTS_N_INSNS (11), /* DI */
734 COSTS_N_INSNS (11)}, /* other */
735 0, /* cost of multiply per each bit set */
736 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
737 COSTS_N_INSNS (25), /* HI */
738 COSTS_N_INSNS (25), /* SI */
739 COSTS_N_INSNS (25), /* DI */
740 COSTS_N_INSNS (25)}, /* other */
741 COSTS_N_INSNS (3), /* cost of movsx */
742 COSTS_N_INSNS (2), /* cost of movzx */
743 8, /* "large" insn */
744 6, /* MOVE_RATIO */
745 6, /* cost for loading QImode using movzbl */
746 {2, 4, 2}, /* cost of loading integer registers
747 in QImode, HImode and SImode.
748 Relative to reg-reg move (2). */
749 {2, 4, 2}, /* cost of storing integer registers */
750 2, /* cost of reg,reg fld/fst */
751 {2, 2, 6}, /* cost of loading fp registers
752 in SFmode, DFmode and XFmode */
753 {4, 4, 6}, /* cost of storing fp registers
754 in SFmode, DFmode and XFmode */
755 8, /* cost of moving MMX register */
756 {8, 8}, /* cost of loading MMX registers
757 in SImode and DImode */
758 {8, 8}, /* cost of storing MMX registers
759 in SImode and DImode */
760 2, /* cost of moving SSE register */
761 {4, 8, 16}, /* cost of loading SSE registers
762 in SImode, DImode and TImode */
763 {4, 8, 16}, /* cost of storing SSE registers
764 in SImode, DImode and TImode */
765 3, /* MMX or SSE register to integer */
766 8, /* size of l1 cache. */
767 8, /* size of l2 cache */
768 0, /* size of prefetch block */
769 0, /* number of parallel prefetches */
770 2, /* Branch cost */
771 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
772 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
773 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
774 COSTS_N_INSNS (1), /* cost of FABS instruction. */
775 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
776 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
777 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
778 DUMMY_STRINGOP_ALGS},
779 {{libcall, {{-1, rep_prefix_4_byte}}},
780 DUMMY_STRINGOP_ALGS},
781 1, /* scalar_stmt_cost. */
782 1, /* scalar load_cost. */
783 1, /* scalar_store_cost. */
784 1, /* vec_stmt_cost. */
785 1, /* vec_to_scalar_cost. */
786 1, /* scalar_to_vec_cost. */
787 1, /* vec_align_load_cost. */
788 2, /* vec_unalign_load_cost. */
789 1, /* vec_store_cost. */
790 3, /* cond_taken_branch_cost. */
791 1, /* cond_not_taken_branch_cost. */
792 };
793
794 static const
795 struct processor_costs pentiumpro_cost = {
796 COSTS_N_INSNS (1), /* cost of an add instruction */
797 COSTS_N_INSNS (1), /* cost of a lea instruction */
798 COSTS_N_INSNS (1), /* variable shift costs */
799 COSTS_N_INSNS (1), /* constant shift costs */
800 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
801 COSTS_N_INSNS (4), /* HI */
802 COSTS_N_INSNS (4), /* SI */
803 COSTS_N_INSNS (4), /* DI */
804 COSTS_N_INSNS (4)}, /* other */
805 0, /* cost of multiply per each bit set */
806 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
807 COSTS_N_INSNS (17), /* HI */
808 COSTS_N_INSNS (17), /* SI */
809 COSTS_N_INSNS (17), /* DI */
810 COSTS_N_INSNS (17)}, /* other */
811 COSTS_N_INSNS (1), /* cost of movsx */
812 COSTS_N_INSNS (1), /* cost of movzx */
813 8, /* "large" insn */
814 6, /* MOVE_RATIO */
815 2, /* cost for loading QImode using movzbl */
816 {4, 4, 4}, /* cost of loading integer registers
817 in QImode, HImode and SImode.
818 Relative to reg-reg move (2). */
819 {2, 2, 2}, /* cost of storing integer registers */
820 2, /* cost of reg,reg fld/fst */
821 {2, 2, 6}, /* cost of loading fp registers
822 in SFmode, DFmode and XFmode */
823 {4, 4, 6}, /* cost of storing fp registers
824 in SFmode, DFmode and XFmode */
825 2, /* cost of moving MMX register */
826 {2, 2}, /* cost of loading MMX registers
827 in SImode and DImode */
828 {2, 2}, /* cost of storing MMX registers
829 in SImode and DImode */
830 2, /* cost of moving SSE register */
831 {2, 2, 8}, /* cost of loading SSE registers
832 in SImode, DImode and TImode */
833 {2, 2, 8}, /* cost of storing SSE registers
834 in SImode, DImode and TImode */
835 3, /* MMX or SSE register to integer */
836 8, /* size of l1 cache. */
837 256, /* size of l2 cache */
838 32, /* size of prefetch block */
839 6, /* number of parallel prefetches */
840 2, /* Branch cost */
841 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
842 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
843 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
844 COSTS_N_INSNS (2), /* cost of FABS instruction. */
845 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
846 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
847 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
848 (we ensure the alignment). For small blocks inline loop is still a
849 noticeable win, for bigger blocks either rep movsl or rep movsb is
850 the way to go.  Rep movsb has apparently more expensive startup time in CPU,
851 but after 4K the difference is down in the noise. */
852 {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
853 {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
854 DUMMY_STRINGOP_ALGS},
855 {{rep_prefix_4_byte, {{1024, unrolled_loop},
856 {8192, rep_prefix_4_byte}, {-1, libcall}}},
857 DUMMY_STRINGOP_ALGS},
858 1, /* scalar_stmt_cost. */
859 1, /* scalar load_cost. */
860 1, /* scalar_store_cost. */
861 1, /* vec_stmt_cost. */
862 1, /* vec_to_scalar_cost. */
863 1, /* scalar_to_vec_cost. */
864 1, /* vec_align_load_cost. */
865 2, /* vec_unalign_load_cost. */
866 1, /* vec_store_cost. */
867 3, /* cond_taken_branch_cost. */
868 1, /* cond_not_taken_branch_cost. */
869 };
870
871 static const
872 struct processor_costs geode_cost = {
873 COSTS_N_INSNS (1), /* cost of an add instruction */
874 COSTS_N_INSNS (1), /* cost of a lea instruction */
875 COSTS_N_INSNS (2), /* variable shift costs */
876 COSTS_N_INSNS (1), /* constant shift costs */
877 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
878 COSTS_N_INSNS (4), /* HI */
879 COSTS_N_INSNS (7), /* SI */
880 COSTS_N_INSNS (7), /* DI */
881 COSTS_N_INSNS (7)}, /* other */
882 0, /* cost of multiply per each bit set */
883 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
884 COSTS_N_INSNS (23), /* HI */
885 COSTS_N_INSNS (39), /* SI */
886 COSTS_N_INSNS (39), /* DI */
887 COSTS_N_INSNS (39)}, /* other */
888 COSTS_N_INSNS (1), /* cost of movsx */
889 COSTS_N_INSNS (1), /* cost of movzx */
890 8, /* "large" insn */
891 4, /* MOVE_RATIO */
892 1, /* cost for loading QImode using movzbl */
893 {1, 1, 1}, /* cost of loading integer registers
894 in QImode, HImode and SImode.
895 Relative to reg-reg move (2). */
896 {1, 1, 1}, /* cost of storing integer registers */
897 1, /* cost of reg,reg fld/fst */
898 {1, 1, 1}, /* cost of loading fp registers
899 in SFmode, DFmode and XFmode */
900 {4, 6, 6}, /* cost of storing fp registers
901 in SFmode, DFmode and XFmode */
902
903 1, /* cost of moving MMX register */
904 {1, 1}, /* cost of loading MMX registers
905 in SImode and DImode */
906 {1, 1}, /* cost of storing MMX registers
907 in SImode and DImode */
908 1, /* cost of moving SSE register */
909 {1, 1, 1}, /* cost of loading SSE registers
910 in SImode, DImode and TImode */
911 {1, 1, 1}, /* cost of storing SSE registers
912 in SImode, DImode and TImode */
913 1, /* MMX or SSE register to integer */
914 64, /* size of l1 cache. */
915 128, /* size of l2 cache. */
916 32, /* size of prefetch block */
917 1, /* number of parallel prefetches */
918 1, /* Branch cost */
919 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
920 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
921 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
922 COSTS_N_INSNS (1), /* cost of FABS instruction. */
923 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
924 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
925 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
926 DUMMY_STRINGOP_ALGS},
927 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
928 DUMMY_STRINGOP_ALGS},
929 1, /* scalar_stmt_cost. */
930 1, /* scalar load_cost. */
931 1, /* scalar_store_cost. */
932 1, /* vec_stmt_cost. */
933 1, /* vec_to_scalar_cost. */
934 1, /* scalar_to_vec_cost. */
935 1, /* vec_align_load_cost. */
936 2, /* vec_unalign_load_cost. */
937 1, /* vec_store_cost. */
938 3, /* cond_taken_branch_cost. */
939 1, /* cond_not_taken_branch_cost. */
940 };
941
942 static const
943 struct processor_costs k6_cost = {
944 COSTS_N_INSNS (1), /* cost of an add instruction */
945 COSTS_N_INSNS (2), /* cost of a lea instruction */
946 COSTS_N_INSNS (1), /* variable shift costs */
947 COSTS_N_INSNS (1), /* constant shift costs */
948 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
949 COSTS_N_INSNS (3), /* HI */
950 COSTS_N_INSNS (3), /* SI */
951 COSTS_N_INSNS (3), /* DI */
952 COSTS_N_INSNS (3)}, /* other */
953 0, /* cost of multiply per each bit set */
954 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
955 COSTS_N_INSNS (18), /* HI */
956 COSTS_N_INSNS (18), /* SI */
957 COSTS_N_INSNS (18), /* DI */
958 COSTS_N_INSNS (18)}, /* other */
959 COSTS_N_INSNS (2), /* cost of movsx */
960 COSTS_N_INSNS (2), /* cost of movzx */
961 8, /* "large" insn */
962 4, /* MOVE_RATIO */
963 3, /* cost for loading QImode using movzbl */
964 {4, 5, 4}, /* cost of loading integer registers
965 in QImode, HImode and SImode.
966 Relative to reg-reg move (2). */
967 {2, 3, 2}, /* cost of storing integer registers */
968 4, /* cost of reg,reg fld/fst */
969 {6, 6, 6}, /* cost of loading fp registers
970 in SFmode, DFmode and XFmode */
971 {4, 4, 4}, /* cost of storing fp registers
972 in SFmode, DFmode and XFmode */
973 2, /* cost of moving MMX register */
974 {2, 2}, /* cost of loading MMX registers
975 in SImode and DImode */
976 {2, 2}, /* cost of storing MMX registers
977 in SImode and DImode */
978 2, /* cost of moving SSE register */
979 {2, 2, 8}, /* cost of loading SSE registers
980 in SImode, DImode and TImode */
981 {2, 2, 8}, /* cost of storing SSE registers
982 in SImode, DImode and TImode */
983 6, /* MMX or SSE register to integer */
984 32, /* size of l1 cache. */
985 32, /* size of l2 cache. Some models
986 have integrated l2 cache, but
987 optimizing for k6 is not important
988 enough to worry about that. */
989 32, /* size of prefetch block */
990 1, /* number of parallel prefetches */
991 1, /* Branch cost */
992 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
993 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
994 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
995 COSTS_N_INSNS (2), /* cost of FABS instruction. */
996 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
997 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
998 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
999 DUMMY_STRINGOP_ALGS},
1000 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1001 DUMMY_STRINGOP_ALGS},
1002 1, /* scalar_stmt_cost. */
1003 1, /* scalar load_cost. */
1004 1, /* scalar_store_cost. */
1005 1, /* vec_stmt_cost. */
1006 1, /* vec_to_scalar_cost. */
1007 1, /* scalar_to_vec_cost. */
1008 1, /* vec_align_load_cost. */
1009 2, /* vec_unalign_load_cost. */
1010 1, /* vec_store_cost. */
1011 3, /* cond_taken_branch_cost. */
1012 1, /* cond_not_taken_branch_cost. */
1013 };
1014
1015 static const
1016 struct processor_costs athlon_cost = {
1017 COSTS_N_INSNS (1), /* cost of an add instruction */
1018 COSTS_N_INSNS (2), /* cost of a lea instruction */
1019 COSTS_N_INSNS (1), /* variable shift costs */
1020 COSTS_N_INSNS (1), /* constant shift costs */
1021 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
1022 COSTS_N_INSNS (5), /* HI */
1023 COSTS_N_INSNS (5), /* SI */
1024 COSTS_N_INSNS (5), /* DI */
1025 COSTS_N_INSNS (5)}, /* other */
1026 0, /* cost of multiply per each bit set */
1027 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1028 COSTS_N_INSNS (26), /* HI */
1029 COSTS_N_INSNS (42), /* SI */
1030 COSTS_N_INSNS (74), /* DI */
1031 COSTS_N_INSNS (74)}, /* other */
1032 COSTS_N_INSNS (1), /* cost of movsx */
1033 COSTS_N_INSNS (1), /* cost of movzx */
1034 8, /* "large" insn */
1035 9, /* MOVE_RATIO */
1036 4, /* cost for loading QImode using movzbl */
1037 {3, 4, 3}, /* cost of loading integer registers
1038 in QImode, HImode and SImode.
1039 Relative to reg-reg move (2). */
1040 {3, 4, 3}, /* cost of storing integer registers */
1041 4, /* cost of reg,reg fld/fst */
1042 {4, 4, 12}, /* cost of loading fp registers
1043 in SFmode, DFmode and XFmode */
1044 {6, 6, 8}, /* cost of storing fp registers
1045 in SFmode, DFmode and XFmode */
1046 2, /* cost of moving MMX register */
1047 {4, 4}, /* cost of loading MMX registers
1048 in SImode and DImode */
1049 {4, 4}, /* cost of storing MMX registers
1050 in SImode and DImode */
1051 2, /* cost of moving SSE register */
1052 {4, 4, 6}, /* cost of loading SSE registers
1053 in SImode, DImode and TImode */
1054 {4, 4, 5}, /* cost of storing SSE registers
1055 in SImode, DImode and TImode */
1056 5, /* MMX or SSE register to integer */
1057 64, /* size of l1 cache. */
1058 256, /* size of l2 cache. */
1059 64, /* size of prefetch block */
1060 6, /* number of parallel prefetches */
1061 5, /* Branch cost */
1062 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1063 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1064 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
1065 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1066 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1067 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1068 /* For some reason, Athlon deals better with REP prefix (relative to loops)
1069 compared to K8. Alignment becomes important after 8 bytes for memcpy and
1070 128 bytes for memset. */
1071 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1072 DUMMY_STRINGOP_ALGS},
1073 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1074 DUMMY_STRINGOP_ALGS},
1075 1, /* scalar_stmt_cost. */
1076 1, /* scalar load_cost. */
1077 1, /* scalar_store_cost. */
1078 1, /* vec_stmt_cost. */
1079 1, /* vec_to_scalar_cost. */
1080 1, /* scalar_to_vec_cost. */
1081 1, /* vec_align_load_cost. */
1082 2, /* vec_unalign_load_cost. */
1083 1, /* vec_store_cost. */
1084 3, /* cond_taken_branch_cost. */
1085 1, /* cond_not_taken_branch_cost. */
1086 };
1087
1088 static const
1089 struct processor_costs k8_cost = {
1090 COSTS_N_INSNS (1), /* cost of an add instruction */
1091 COSTS_N_INSNS (2), /* cost of a lea instruction */
1092 COSTS_N_INSNS (1), /* variable shift costs */
1093 COSTS_N_INSNS (1), /* constant shift costs */
1094 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1095 COSTS_N_INSNS (4), /* HI */
1096 COSTS_N_INSNS (3), /* SI */
1097 COSTS_N_INSNS (4), /* DI */
1098 COSTS_N_INSNS (5)}, /* other */
1099 0, /* cost of multiply per each bit set */
1100 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1101 COSTS_N_INSNS (26), /* HI */
1102 COSTS_N_INSNS (42), /* SI */
1103 COSTS_N_INSNS (74), /* DI */
1104 COSTS_N_INSNS (74)}, /* other */
1105 COSTS_N_INSNS (1), /* cost of movsx */
1106 COSTS_N_INSNS (1), /* cost of movzx */
1107 8, /* "large" insn */
1108 9, /* MOVE_RATIO */
1109 4, /* cost for loading QImode using movzbl */
1110 {3, 4, 3}, /* cost of loading integer registers
1111 in QImode, HImode and SImode.
1112 Relative to reg-reg move (2). */
1113 {3, 4, 3}, /* cost of storing integer registers */
1114 4, /* cost of reg,reg fld/fst */
1115 {4, 4, 12}, /* cost of loading fp registers
1116 in SFmode, DFmode and XFmode */
1117 {6, 6, 8}, /* cost of storing fp registers
1118 in SFmode, DFmode and XFmode */
1119 2, /* cost of moving MMX register */
1120 {3, 3}, /* cost of loading MMX registers
1121 in SImode and DImode */
1122 {4, 4}, /* cost of storing MMX registers
1123 in SImode and DImode */
1124 2, /* cost of moving SSE register */
1125 {4, 3, 6}, /* cost of loading SSE registers
1126 in SImode, DImode and TImode */
1127 {4, 4, 5}, /* cost of storing SSE registers
1128 in SImode, DImode and TImode */
1129 5, /* MMX or SSE register to integer */
1130 64, /* size of l1 cache. */
1131 512, /* size of l2 cache. */
1132 64, /* size of prefetch block */
1133 /* New AMD processors never drop prefetches; if they cannot be performed
1134 immediately, they are queued. We set number of simultaneous prefetches
1135 to a large constant to reflect this (it probably is not a good idea not
1136 to limit number of prefetches at all, as their execution also takes some
1137 time). */
1138 100, /* number of parallel prefetches */
1139 3, /* Branch cost */
1140 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1141 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1142 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1143 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1144 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1145 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1146 /* K8 has optimized REP instruction for medium sized blocks, but for very
1147 small blocks it is better to use loop. For large blocks, libcall can
1148 do non-temporal accesses and beat inline considerably.  */
1149 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1150 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1151 {{libcall, {{8, loop}, {24, unrolled_loop},
1152 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1153 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1154 4, /* scalar_stmt_cost. */
1155 2, /* scalar load_cost. */
1156 2, /* scalar_store_cost. */
1157 5, /* vec_stmt_cost. */
1158 0, /* vec_to_scalar_cost. */
1159 2, /* scalar_to_vec_cost. */
1160 2, /* vec_align_load_cost. */
1161 3, /* vec_unalign_load_cost. */
1162 3, /* vec_store_cost. */
1163 3, /* cond_taken_branch_cost. */
1164 2, /* cond_not_taken_branch_cost. */
1165 };
1166
1167 struct processor_costs amdfam10_cost = {
1168 COSTS_N_INSNS (1), /* cost of an add instruction */
1169 COSTS_N_INSNS (2), /* cost of a lea instruction */
1170 COSTS_N_INSNS (1), /* variable shift costs */
1171 COSTS_N_INSNS (1), /* constant shift costs */
1172 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1173 COSTS_N_INSNS (4), /* HI */
1174 COSTS_N_INSNS (3), /* SI */
1175 COSTS_N_INSNS (4), /* DI */
1176 COSTS_N_INSNS (5)}, /* other */
1177 0, /* cost of multiply per each bit set */
1178 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1179 COSTS_N_INSNS (35), /* HI */
1180 COSTS_N_INSNS (51), /* SI */
1181 COSTS_N_INSNS (83), /* DI */
1182 COSTS_N_INSNS (83)}, /* other */
1183 COSTS_N_INSNS (1), /* cost of movsx */
1184 COSTS_N_INSNS (1), /* cost of movzx */
1185 8, /* "large" insn */
1186 9, /* MOVE_RATIO */
1187 4, /* cost for loading QImode using movzbl */
1188 {3, 4, 3}, /* cost of loading integer registers
1189 in QImode, HImode and SImode.
1190 Relative to reg-reg move (2). */
1191 {3, 4, 3}, /* cost of storing integer registers */
1192 4, /* cost of reg,reg fld/fst */
1193 {4, 4, 12}, /* cost of loading fp registers
1194 in SFmode, DFmode and XFmode */
1195 {6, 6, 8}, /* cost of storing fp registers
1196 in SFmode, DFmode and XFmode */
1197 2, /* cost of moving MMX register */
1198 {3, 3}, /* cost of loading MMX registers
1199 in SImode and DImode */
1200 {4, 4}, /* cost of storing MMX registers
1201 in SImode and DImode */
1202 2, /* cost of moving SSE register */
1203 {4, 4, 3}, /* cost of loading SSE registers
1204 in SImode, DImode and TImode */
1205 {4, 4, 5}, /* cost of storing SSE registers
1206 in SImode, DImode and TImode */
1207 3, /* MMX or SSE register to integer */
1208 /* On K8:
1209 MOVD reg64, xmmreg Double FSTORE 4
1210 MOVD reg32, xmmreg Double FSTORE 4
1211 On AMDFAM10:
1212 MOVD reg64, xmmreg Double FADD 3
1213 1/1 1/1
1214 MOVD reg32, xmmreg Double FADD 3
1215 1/1 1/1 */
1216 64, /* size of l1 cache. */
1217 512, /* size of l2 cache. */
1218 64, /* size of prefetch block */
1219 /* New AMD processors never drop prefetches; if they cannot be performed
1220 immediately, they are queued. We set number of simultaneous prefetches
1221 to a large constant to reflect this (it probably is not a good idea not
1222 to limit number of prefetches at all, as their execution also takes some
1223 time). */
1224 100, /* number of parallel prefetches */
1225 2, /* Branch cost */
1226 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1227 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1228 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1229 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1230 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1231 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1232
1233 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
1234 very small blocks it is better to use loop. For large blocks, libcall can
1235 do non-temporal accesses and beat inline considerably.  */
1236 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1237 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1238 {{libcall, {{8, loop}, {24, unrolled_loop},
1239 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1240 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1241 4, /* scalar_stmt_cost. */
1242 2, /* scalar load_cost. */
1243 2, /* scalar_store_cost. */
1244 6, /* vec_stmt_cost. */
1245 0, /* vec_to_scalar_cost. */
1246 2, /* scalar_to_vec_cost. */
1247 2, /* vec_align_load_cost. */
1248 2, /* vec_unalign_load_cost. */
1249 2, /* vec_store_cost. */
1250 2, /* cond_taken_branch_cost. */
1251 1, /* cond_not_taken_branch_cost. */
1252 };
1253
1254 struct processor_costs bdver1_cost = {
1255 COSTS_N_INSNS (1), /* cost of an add instruction */
1256 COSTS_N_INSNS (1), /* cost of a lea instruction */
1257 COSTS_N_INSNS (1), /* variable shift costs */
1258 COSTS_N_INSNS (1), /* constant shift costs */
1259 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1260 COSTS_N_INSNS (4), /* HI */
1261 COSTS_N_INSNS (4), /* SI */
1262 COSTS_N_INSNS (6), /* DI */
1263 COSTS_N_INSNS (6)}, /* other */
1264 0, /* cost of multiply per each bit set */
1265 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1266 COSTS_N_INSNS (35), /* HI */
1267 COSTS_N_INSNS (51), /* SI */
1268 COSTS_N_INSNS (83), /* DI */
1269 COSTS_N_INSNS (83)}, /* other */
1270 COSTS_N_INSNS (1), /* cost of movsx */
1271 COSTS_N_INSNS (1), /* cost of movzx */
1272 8, /* "large" insn */
1273 9, /* MOVE_RATIO */
1274 4, /* cost for loading QImode using movzbl */
1275 {5, 5, 4}, /* cost of loading integer registers
1276 in QImode, HImode and SImode.
1277 Relative to reg-reg move (2). */
1278 {4, 4, 4}, /* cost of storing integer registers */
1279 2, /* cost of reg,reg fld/fst */
1280 {5, 5, 12}, /* cost of loading fp registers
1281 in SFmode, DFmode and XFmode */
1282 {4, 4, 8}, /* cost of storing fp registers
1283 in SFmode, DFmode and XFmode */
1284 2, /* cost of moving MMX register */
1285 {4, 4}, /* cost of loading MMX registers
1286 in SImode and DImode */
1287 {4, 4}, /* cost of storing MMX registers
1288 in SImode and DImode */
1289 2, /* cost of moving SSE register */
1290 {4, 4, 4}, /* cost of loading SSE registers
1291 in SImode, DImode and TImode */
1292 {4, 4, 4}, /* cost of storing SSE registers
1293 in SImode, DImode and TImode */
1294 2, /* MMX or SSE register to integer */
1295 /* On K8:
1296 MOVD reg64, xmmreg Double FSTORE 4
1297 MOVD reg32, xmmreg Double FSTORE 4
1298 On AMDFAM10:
1299 MOVD reg64, xmmreg Double FADD 3
1300 1/1 1/1
1301 MOVD reg32, xmmreg Double FADD 3
1302 1/1 1/1 */
1303 16, /* size of l1 cache. */
1304 2048, /* size of l2 cache. */
1305 64, /* size of prefetch block */
1306 /* New AMD processors never drop prefetches; if they cannot be performed
1307 immediately, they are queued. We set number of simultaneous prefetches
1308 to a large constant to reflect this (it probably is not a good idea not
1309 to limit number of prefetches at all, as their execution also takes some
1310 time). */
1311 100, /* number of parallel prefetches */
1312 2, /* Branch cost */
1313 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1314 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1315 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1316 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1317 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1318 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1319
1320 /* BDVER1 has optimized REP instruction for medium sized blocks, but for
1321 very small blocks it is better to use loop. For large blocks, libcall
1322 can do non-temporal accesses and beat inline considerably.  */
1323 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1324 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1325 {{libcall, {{8, loop}, {24, unrolled_loop},
1326 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1327 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1328 6, /* scalar_stmt_cost. */
1329 4, /* scalar load_cost. */
1330 4, /* scalar_store_cost. */
1331 6, /* vec_stmt_cost. */
1332 0, /* vec_to_scalar_cost. */
1333 2, /* scalar_to_vec_cost. */
1334 4, /* vec_align_load_cost. */
1335 4, /* vec_unalign_load_cost. */
1336 4, /* vec_store_cost. */
1337 2, /* cond_taken_branch_cost. */
1338 1, /* cond_not_taken_branch_cost. */
1339 };
1340
1341 struct processor_costs bdver2_cost = {
1342 COSTS_N_INSNS (1), /* cost of an add instruction */
1343 COSTS_N_INSNS (1), /* cost of a lea instruction */
1344 COSTS_N_INSNS (1), /* variable shift costs */
1345 COSTS_N_INSNS (1), /* constant shift costs */
1346 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1347 COSTS_N_INSNS (4), /* HI */
1348 COSTS_N_INSNS (4), /* SI */
1349 COSTS_N_INSNS (6), /* DI */
1350 COSTS_N_INSNS (6)}, /* other */
1351 0, /* cost of multiply per each bit set */
1352 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1353 COSTS_N_INSNS (35), /* HI */
1354 COSTS_N_INSNS (51), /* SI */
1355 COSTS_N_INSNS (83), /* DI */
1356 COSTS_N_INSNS (83)}, /* other */
1357 COSTS_N_INSNS (1), /* cost of movsx */
1358 COSTS_N_INSNS (1), /* cost of movzx */
1359 8, /* "large" insn */
1360 9, /* MOVE_RATIO */
1361 4, /* cost for loading QImode using movzbl */
1362 {5, 5, 4}, /* cost of loading integer registers
1363 in QImode, HImode and SImode.
1364 Relative to reg-reg move (2). */
1365 {4, 4, 4}, /* cost of storing integer registers */
1366 2, /* cost of reg,reg fld/fst */
1367 {5, 5, 12}, /* cost of loading fp registers
1368 in SFmode, DFmode and XFmode */
1369 {4, 4, 8}, /* cost of storing fp registers
1370 in SFmode, DFmode and XFmode */
1371 2, /* cost of moving MMX register */
1372 {4, 4}, /* cost of loading MMX registers
1373 in SImode and DImode */
1374 {4, 4}, /* cost of storing MMX registers
1375 in SImode and DImode */
1376 2, /* cost of moving SSE register */
1377 {4, 4, 4}, /* cost of loading SSE registers
1378 in SImode, DImode and TImode */
1379 {4, 4, 4}, /* cost of storing SSE registers
1380 in SImode, DImode and TImode */
1381 2, /* MMX or SSE register to integer */
1382 /* On K8:
1383 MOVD reg64, xmmreg Double FSTORE 4
1384 MOVD reg32, xmmreg Double FSTORE 4
1385 On AMDFAM10:
1386 MOVD reg64, xmmreg Double FADD 3
1387 1/1 1/1
1388 MOVD reg32, xmmreg Double FADD 3
1389 1/1 1/1 */
1390 16, /* size of l1 cache. */
1391 2048, /* size of l2 cache. */
1392 64, /* size of prefetch block */
1393 /* New AMD processors never drop prefetches; if they cannot be performed
1394 immediately, they are queued. We set number of simultaneous prefetches
1395 to a large constant to reflect this (it probably is not a good idea not
1396 to limit number of prefetches at all, as their execution also takes some
1397 time). */
1398 100, /* number of parallel prefetches */
1399 2, /* Branch cost */
1400 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1401 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1402 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1403 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1404 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1405 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1406
1407 /* BDVER2 has optimized REP instruction for medium sized blocks, but for
1408 very small blocks it is better to use loop. For large blocks, libcall
1409 can do non-temporal accesses and beat inline considerably.  */
1410 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1411 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1412 {{libcall, {{8, loop}, {24, unrolled_loop},
1413 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1414 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1415 6, /* scalar_stmt_cost. */
1416 4, /* scalar load_cost. */
1417 4, /* scalar_store_cost. */
1418 6, /* vec_stmt_cost. */
1419 0, /* vec_to_scalar_cost. */
1420 2, /* scalar_to_vec_cost. */
1421 4, /* vec_align_load_cost. */
1422 4, /* vec_unalign_load_cost. */
1423 4, /* vec_store_cost. */
1424 2, /* cond_taken_branch_cost. */
1425 1, /* cond_not_taken_branch_cost. */
1426 };
1427
1428 struct processor_costs btver1_cost = {
1429 COSTS_N_INSNS (1), /* cost of an add instruction */
1430 COSTS_N_INSNS (2), /* cost of a lea instruction */
1431 COSTS_N_INSNS (1), /* variable shift costs */
1432 COSTS_N_INSNS (1), /* constant shift costs */
1433 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1434 COSTS_N_INSNS (4), /* HI */
1435 COSTS_N_INSNS (3), /* SI */
1436 COSTS_N_INSNS (4), /* DI */
1437 COSTS_N_INSNS (5)}, /* other */
1438 0, /* cost of multiply per each bit set */
1439 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1440 COSTS_N_INSNS (35), /* HI */
1441 COSTS_N_INSNS (51), /* SI */
1442 COSTS_N_INSNS (83), /* DI */
1443 COSTS_N_INSNS (83)}, /* other */
1444 COSTS_N_INSNS (1), /* cost of movsx */
1445 COSTS_N_INSNS (1), /* cost of movzx */
1446 8, /* "large" insn */
1447 9, /* MOVE_RATIO */
1448 4, /* cost for loading QImode using movzbl */
1449 {3, 4, 3}, /* cost of loading integer registers
1450 in QImode, HImode and SImode.
1451 Relative to reg-reg move (2). */
1452 {3, 4, 3}, /* cost of storing integer registers */
1453 4, /* cost of reg,reg fld/fst */
1454 {4, 4, 12}, /* cost of loading fp registers
1455 in SFmode, DFmode and XFmode */
1456 {6, 6, 8}, /* cost of storing fp registers
1457 in SFmode, DFmode and XFmode */
1458 2, /* cost of moving MMX register */
1459 {3, 3}, /* cost of loading MMX registers
1460 in SImode and DImode */
1461 {4, 4}, /* cost of storing MMX registers
1462 in SImode and DImode */
1463 2, /* cost of moving SSE register */
1464 {4, 4, 3}, /* cost of loading SSE registers
1465 in SImode, DImode and TImode */
1466 {4, 4, 5}, /* cost of storing SSE registers
1467 in SImode, DImode and TImode */
1468 3, /* MMX or SSE register to integer */
1469 /* On K8:
1470 MOVD reg64, xmmreg Double FSTORE 4
1471 MOVD reg32, xmmreg Double FSTORE 4
1472 On AMDFAM10:
1473 MOVD reg64, xmmreg Double FADD 3
1474 1/1 1/1
1475 MOVD reg32, xmmreg Double FADD 3
1476 1/1 1/1 */
1477 32, /* size of l1 cache. */
1478 512, /* size of l2 cache. */
1479 64, /* size of prefetch block */
1480 100, /* number of parallel prefetches */
1481 2, /* Branch cost */
1482 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1483 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1484 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1485 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1486 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1487 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1488
1489 /* BTVER1 has optimized REP instruction for medium sized blocks, but for
1490 very small blocks it is better to use loop. For large blocks, libcall can
1491 do non-temporal accesses and beat inline considerably.  */
1492 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1493 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1494 {{libcall, {{8, loop}, {24, unrolled_loop},
1495 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1496 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1497 4, /* scalar_stmt_cost. */
1498 2, /* scalar load_cost. */
1499 2, /* scalar_store_cost. */
1500 6, /* vec_stmt_cost. */
1501 0, /* vec_to_scalar_cost. */
1502 2, /* scalar_to_vec_cost. */
1503 2, /* vec_align_load_cost. */
1504 2, /* vec_unalign_load_cost. */
1505 2, /* vec_store_cost. */
1506 2, /* cond_taken_branch_cost. */
1507 1, /* cond_not_taken_branch_cost. */
1508 };
1509
1510 static const
1511 struct processor_costs pentium4_cost = {
1512 COSTS_N_INSNS (1), /* cost of an add instruction */
1513 COSTS_N_INSNS (3), /* cost of a lea instruction */
1514 COSTS_N_INSNS (4), /* variable shift costs */
1515 COSTS_N_INSNS (4), /* constant shift costs */
1516 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1517 COSTS_N_INSNS (15), /* HI */
1518 COSTS_N_INSNS (15), /* SI */
1519 COSTS_N_INSNS (15), /* DI */
1520 COSTS_N_INSNS (15)}, /* other */
1521 0, /* cost of multiply per each bit set */
1522 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1523 COSTS_N_INSNS (56), /* HI */
1524 COSTS_N_INSNS (56), /* SI */
1525 COSTS_N_INSNS (56), /* DI */
1526 COSTS_N_INSNS (56)}, /* other */
1527 COSTS_N_INSNS (1), /* cost of movsx */
1528 COSTS_N_INSNS (1), /* cost of movzx */
1529 16, /* "large" insn */
1530 6, /* MOVE_RATIO */
1531 2, /* cost for loading QImode using movzbl */
1532 {4, 5, 4}, /* cost of loading integer registers
1533 in QImode, HImode and SImode.
1534 Relative to reg-reg move (2). */
1535 {2, 3, 2}, /* cost of storing integer registers */
1536 2, /* cost of reg,reg fld/fst */
1537 {2, 2, 6}, /* cost of loading fp registers
1538 in SFmode, DFmode and XFmode */
1539 {4, 4, 6}, /* cost of storing fp registers
1540 in SFmode, DFmode and XFmode */
1541 2, /* cost of moving MMX register */
1542 {2, 2}, /* cost of loading MMX registers
1543 in SImode and DImode */
1544 {2, 2}, /* cost of storing MMX registers
1545 in SImode and DImode */
1546 12, /* cost of moving SSE register */
1547 {12, 12, 12}, /* cost of loading SSE registers
1548 in SImode, DImode and TImode */
1549 {2, 2, 8}, /* cost of storing SSE registers
1550 in SImode, DImode and TImode */
1551 10, /* MMX or SSE register to integer */
1552 8, /* size of l1 cache. */
1553 256, /* size of l2 cache. */
1554 64, /* size of prefetch block */
1555 6, /* number of parallel prefetches */
1556 2, /* Branch cost */
1557 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1558 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1559 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1560 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1561 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1562 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1563 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1564 DUMMY_STRINGOP_ALGS},
1565 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1566 {-1, libcall}}},
1567 DUMMY_STRINGOP_ALGS},
1568 1, /* scalar_stmt_cost. */
1569 1, /* scalar load_cost. */
1570 1, /* scalar_store_cost. */
1571 1, /* vec_stmt_cost. */
1572 1, /* vec_to_scalar_cost. */
1573 1, /* scalar_to_vec_cost. */
1574 1, /* vec_align_load_cost. */
1575 2, /* vec_unalign_load_cost. */
1576 1, /* vec_store_cost. */
1577 3, /* cond_taken_branch_cost. */
1578 1, /* cond_not_taken_branch_cost. */
1579 };
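
/* A note on units, assuming the standard definition of COSTS_N_INSNS in
   rtl.h, COSTS_N_INSNS (N) == (N) * 4:

     COSTS_N_INSNS (1)  == 4     one simple instruction
     COSTS_N_INSNS (43) == 172   e.g. the pentium4 FDIV entry above

   so the Pentium 4 table weights one FP divide like 43 back-to-back adds.
   The plain integers in these tables (cache sizes, MOVE_RATIO, the register
   move costs measured relative to a reg-reg move of 2, ...) are not scaled
   by this macro.  */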
1580
1581 static const
1582 struct processor_costs nocona_cost = {
1583 COSTS_N_INSNS (1), /* cost of an add instruction */
1584 COSTS_N_INSNS (1), /* cost of a lea instruction */
1585 COSTS_N_INSNS (1), /* variable shift costs */
1586 COSTS_N_INSNS (1), /* constant shift costs */
1587 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1588 COSTS_N_INSNS (10), /* HI */
1589 COSTS_N_INSNS (10), /* SI */
1590 COSTS_N_INSNS (10), /* DI */
1591 COSTS_N_INSNS (10)}, /* other */
1592 0, /* cost of multiply per each bit set */
1593 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1594 COSTS_N_INSNS (66), /* HI */
1595 COSTS_N_INSNS (66), /* SI */
1596 COSTS_N_INSNS (66), /* DI */
1597 COSTS_N_INSNS (66)}, /* other */
1598 COSTS_N_INSNS (1), /* cost of movsx */
1599 COSTS_N_INSNS (1), /* cost of movzx */
1600 16, /* "large" insn */
1601 17, /* MOVE_RATIO */
1602 4, /* cost for loading QImode using movzbl */
1603 {4, 4, 4}, /* cost of loading integer registers
1604 in QImode, HImode and SImode.
1605 Relative to reg-reg move (2). */
1606 {4, 4, 4}, /* cost of storing integer registers */
1607 3, /* cost of reg,reg fld/fst */
1608 {12, 12, 12}, /* cost of loading fp registers
1609 in SFmode, DFmode and XFmode */
1610 {4, 4, 4}, /* cost of storing fp registers
1611 in SFmode, DFmode and XFmode */
1612 6, /* cost of moving MMX register */
1613 {12, 12}, /* cost of loading MMX registers
1614 in SImode and DImode */
1615 {12, 12}, /* cost of storing MMX registers
1616 in SImode and DImode */
1617 6, /* cost of moving SSE register */
1618 {12, 12, 12}, /* cost of loading SSE registers
1619 in SImode, DImode and TImode */
1620 {12, 12, 12}, /* cost of storing SSE registers
1621 in SImode, DImode and TImode */
1622 8, /* MMX or SSE register to integer */
1623 8, /* size of l1 cache. */
1624 1024, /* size of l2 cache. */
1625 128, /* size of prefetch block */
1626 8, /* number of parallel prefetches */
1627 1, /* Branch cost */
1628 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1629 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1630 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1631 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1632 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1633 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1634 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1635 {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
1636 {100000, unrolled_loop}, {-1, libcall}}}},
1637 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1638 {-1, libcall}}},
1639 {libcall, {{24, loop}, {64, unrolled_loop},
1640 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1641 1, /* scalar_stmt_cost. */
1642 1, /* scalar load_cost. */
1643 1, /* scalar_store_cost. */
1644 1, /* vec_stmt_cost. */
1645 1, /* vec_to_scalar_cost. */
1646 1, /* scalar_to_vec_cost. */
1647 1, /* vec_align_load_cost. */
1648 2, /* vec_unalign_load_cost. */
1649 1, /* vec_store_cost. */
1650 3, /* cond_taken_branch_cost. */
1651 1, /* cond_not_taken_branch_cost. */
1652 };
1653
1654 static const
1655 struct processor_costs atom_cost = {
1656 COSTS_N_INSNS (1), /* cost of an add instruction */
1657 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1658 COSTS_N_INSNS (1), /* variable shift costs */
1659 COSTS_N_INSNS (1), /* constant shift costs */
1660 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1661 COSTS_N_INSNS (4), /* HI */
1662 COSTS_N_INSNS (3), /* SI */
1663 COSTS_N_INSNS (4), /* DI */
1664 COSTS_N_INSNS (2)}, /* other */
1665 0, /* cost of multiply per each bit set */
1666 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1667 COSTS_N_INSNS (26), /* HI */
1668 COSTS_N_INSNS (42), /* SI */
1669 COSTS_N_INSNS (74), /* DI */
1670 COSTS_N_INSNS (74)}, /* other */
1671 COSTS_N_INSNS (1), /* cost of movsx */
1672 COSTS_N_INSNS (1), /* cost of movzx */
1673 8, /* "large" insn */
1674 17, /* MOVE_RATIO */
1675 4, /* cost for loading QImode using movzbl */
1676 {4, 4, 4}, /* cost of loading integer registers
1677 in QImode, HImode and SImode.
1678 Relative to reg-reg move (2). */
1679 {4, 4, 4}, /* cost of storing integer registers */
1680 4, /* cost of reg,reg fld/fst */
1681 {12, 12, 12}, /* cost of loading fp registers
1682 in SFmode, DFmode and XFmode */
1683 {6, 6, 8}, /* cost of storing fp registers
1684 in SFmode, DFmode and XFmode */
1685 2, /* cost of moving MMX register */
1686 {8, 8}, /* cost of loading MMX registers
1687 in SImode and DImode */
1688 {8, 8}, /* cost of storing MMX registers
1689 in SImode and DImode */
1690 2, /* cost of moving SSE register */
1691 {8, 8, 8}, /* cost of loading SSE registers
1692 in SImode, DImode and TImode */
1693 {8, 8, 8}, /* cost of storing SSE registers
1694 in SImode, DImode and TImode */
1695 5, /* MMX or SSE register to integer */
1696 32, /* size of l1 cache. */
1697 256, /* size of l2 cache. */
1698 64, /* size of prefetch block */
1699 6, /* number of parallel prefetches */
1700 3, /* Branch cost */
1701 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1702 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1703 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1704 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1705 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1706 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1707 {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
1708 {libcall, {{32, loop}, {64, rep_prefix_4_byte},
1709 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1710 {{libcall, {{8, loop}, {15, unrolled_loop},
1711 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1712 {libcall, {{24, loop}, {32, unrolled_loop},
1713 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1714 1, /* scalar_stmt_cost. */
1715 1, /* scalar load_cost. */
1716 1, /* scalar_store_cost. */
1717 1, /* vec_stmt_cost. */
1718 1, /* vec_to_scalar_cost. */
1719 1, /* scalar_to_vec_cost. */
1720 1, /* vec_align_load_cost. */
1721 2, /* vec_unalign_load_cost. */
1722 1, /* vec_store_cost. */
1723 3, /* cond_taken_branch_cost. */
1724 1, /* cond_not_taken_branch_cost. */
1725 };
1726
1727 /* Generic64 should produce code tuned for Nocona and K8. */
1728 static const
1729 struct processor_costs generic64_cost = {
1730 COSTS_N_INSNS (1), /* cost of an add instruction */
1731 /* On all chips taken into consideration lea is 2 cycles and more. With
1732 this cost however our current implementation of synth_mult results in
1733 use of unnecessary temporary registers causing regression on several
1734 SPECfp benchmarks. */
1735 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1736 COSTS_N_INSNS (1), /* variable shift costs */
1737 COSTS_N_INSNS (1), /* constant shift costs */
1738 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1739 COSTS_N_INSNS (4), /* HI */
1740 COSTS_N_INSNS (3), /* SI */
1741 COSTS_N_INSNS (4), /* DI */
1742 COSTS_N_INSNS (2)}, /* other */
1743 0, /* cost of multiply per each bit set */
1744 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1745 COSTS_N_INSNS (26), /* HI */
1746 COSTS_N_INSNS (42), /* SI */
1747 COSTS_N_INSNS (74), /* DI */
1748 COSTS_N_INSNS (74)}, /* other */
1749 COSTS_N_INSNS (1), /* cost of movsx */
1750 COSTS_N_INSNS (1), /* cost of movzx */
1751 8, /* "large" insn */
1752 17, /* MOVE_RATIO */
1753 4, /* cost for loading QImode using movzbl */
1754 {4, 4, 4}, /* cost of loading integer registers
1755 in QImode, HImode and SImode.
1756 Relative to reg-reg move (2). */
1757 {4, 4, 4}, /* cost of storing integer registers */
1758 4, /* cost of reg,reg fld/fst */
1759 {12, 12, 12}, /* cost of loading fp registers
1760 in SFmode, DFmode and XFmode */
1761 {6, 6, 8}, /* cost of storing fp registers
1762 in SFmode, DFmode and XFmode */
1763 2, /* cost of moving MMX register */
1764 {8, 8}, /* cost of loading MMX registers
1765 in SImode and DImode */
1766 {8, 8}, /* cost of storing MMX registers
1767 in SImode and DImode */
1768 2, /* cost of moving SSE register */
1769 {8, 8, 8}, /* cost of loading SSE registers
1770 in SImode, DImode and TImode */
1771 {8, 8, 8}, /* cost of storing SSE registers
1772 in SImode, DImode and TImode */
1773 5, /* MMX or SSE register to integer */
1774 32, /* size of l1 cache. */
1775 512, /* size of l2 cache. */
1776 64, /* size of prefetch block */
1777 6, /* number of parallel prefetches */
 1778	  /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
 1779	     value is increased to the perhaps more appropriate value of 5.  */
1780 3, /* Branch cost */
1781 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1782 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1783 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1784 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1785 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1786 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1787 {DUMMY_STRINGOP_ALGS,
1788 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1789 {DUMMY_STRINGOP_ALGS,
1790 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1791 1, /* scalar_stmt_cost. */
1792 1, /* scalar load_cost. */
1793 1, /* scalar_store_cost. */
1794 1, /* vec_stmt_cost. */
1795 1, /* vec_to_scalar_cost. */
1796 1, /* scalar_to_vec_cost. */
1797 1, /* vec_align_load_cost. */
1798 2, /* vec_unalign_load_cost. */
1799 1, /* vec_store_cost. */
1800 3, /* cond_taken_branch_cost. */
1801 1, /* cond_not_taken_branch_cost. */
1802 };
1803
1804 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1805 Athlon and K8. */
1806 static const
1807 struct processor_costs generic32_cost = {
1808 COSTS_N_INSNS (1), /* cost of an add instruction */
1809 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1810 COSTS_N_INSNS (1), /* variable shift costs */
1811 COSTS_N_INSNS (1), /* constant shift costs */
1812 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1813 COSTS_N_INSNS (4), /* HI */
1814 COSTS_N_INSNS (3), /* SI */
1815 COSTS_N_INSNS (4), /* DI */
1816 COSTS_N_INSNS (2)}, /* other */
1817 0, /* cost of multiply per each bit set */
1818 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1819 COSTS_N_INSNS (26), /* HI */
1820 COSTS_N_INSNS (42), /* SI */
1821 COSTS_N_INSNS (74), /* DI */
1822 COSTS_N_INSNS (74)}, /* other */
1823 COSTS_N_INSNS (1), /* cost of movsx */
1824 COSTS_N_INSNS (1), /* cost of movzx */
1825 8, /* "large" insn */
1826 17, /* MOVE_RATIO */
1827 4, /* cost for loading QImode using movzbl */
1828 {4, 4, 4}, /* cost of loading integer registers
1829 in QImode, HImode and SImode.
1830 Relative to reg-reg move (2). */
1831 {4, 4, 4}, /* cost of storing integer registers */
1832 4, /* cost of reg,reg fld/fst */
1833 {12, 12, 12}, /* cost of loading fp registers
1834 in SFmode, DFmode and XFmode */
1835 {6, 6, 8}, /* cost of storing fp registers
1836 in SFmode, DFmode and XFmode */
1837 2, /* cost of moving MMX register */
1838 {8, 8}, /* cost of loading MMX registers
1839 in SImode and DImode */
1840 {8, 8}, /* cost of storing MMX registers
1841 in SImode and DImode */
1842 2, /* cost of moving SSE register */
1843 {8, 8, 8}, /* cost of loading SSE registers
1844 in SImode, DImode and TImode */
1845 {8, 8, 8}, /* cost of storing SSE registers
1846 in SImode, DImode and TImode */
1847 5, /* MMX or SSE register to integer */
1848 32, /* size of l1 cache. */
1849 256, /* size of l2 cache. */
1850 64, /* size of prefetch block */
1851 6, /* number of parallel prefetches */
1852 3, /* Branch cost */
1853 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1854 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1855 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1856 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1857 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1858 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1859 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1860 DUMMY_STRINGOP_ALGS},
1861 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1862 DUMMY_STRINGOP_ALGS},
1863 1, /* scalar_stmt_cost. */
1864 1, /* scalar load_cost. */
1865 1, /* scalar_store_cost. */
1866 1, /* vec_stmt_cost. */
1867 1, /* vec_to_scalar_cost. */
1868 1, /* scalar_to_vec_cost. */
1869 1, /* vec_align_load_cost. */
1870 2, /* vec_unalign_load_cost. */
1871 1, /* vec_store_cost. */
1872 3, /* cond_taken_branch_cost. */
1873 1, /* cond_not_taken_branch_cost. */
1874 };
1875
1876 const struct processor_costs *ix86_cost = &pentium_cost;
1877
1878 /* Processor feature/optimization bitmasks. */
1879 #define m_386 (1<<PROCESSOR_I386)
1880 #define m_486 (1<<PROCESSOR_I486)
1881 #define m_PENT (1<<PROCESSOR_PENTIUM)
1882 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1883 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1884 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1885 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1886 #define m_CORE2_32 (1<<PROCESSOR_CORE2_32)
1887 #define m_CORE2_64 (1<<PROCESSOR_CORE2_64)
1888 #define m_COREI7_32 (1<<PROCESSOR_COREI7_32)
1889 #define m_COREI7_64 (1<<PROCESSOR_COREI7_64)
1890 #define m_COREI7 (m_COREI7_32 | m_COREI7_64)
1891 #define m_CORE2I7_32 (m_CORE2_32 | m_COREI7_32)
1892 #define m_CORE2I7_64 (m_CORE2_64 | m_COREI7_64)
1893 #define m_CORE2I7 (m_CORE2I7_32 | m_CORE2I7_64)
1894 #define m_ATOM (1<<PROCESSOR_ATOM)
1895
1896 #define m_GEODE (1<<PROCESSOR_GEODE)
1897 #define m_K6 (1<<PROCESSOR_K6)
1898 #define m_K6_GEODE (m_K6 | m_GEODE)
1899 #define m_K8 (1<<PROCESSOR_K8)
1900 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1901 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1902 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1903 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1904 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1905 #define m_BDVER (m_BDVER1 | m_BDVER2)
1906 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1907 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1)
1908
1909 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1910 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1911
1912 /* Generic instruction choice should be common subset of supported CPUs
1913 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1914 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1915
1916 /* Feature tests against the various tunings. */
1917 unsigned char ix86_tune_features[X86_TUNE_LAST];
1918
1919 /* Feature tests against the various tunings used to create ix86_tune_features
1920 based on the processor mask. */
1921 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1922 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
 1923	     negatively, so enabling it for Generic64 seems like a good code size
 1924	     tradeoff.  We can't enable it for 32bit generic because it does not
 1925	     work well with PPro based chips. */
1926 m_386 | m_CORE2I7_64 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
1927
1928 /* X86_TUNE_PUSH_MEMORY */
1929 m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1930
1931 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1932 m_486 | m_PENT,
1933
1934 /* X86_TUNE_UNROLL_STRLEN */
1935 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE2I7 | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
1936
 1937	  /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in the P4 based
 1938	     on simulation results, but after the P4 shipped no performance benefit
 1939	     was observed from branch hints, and they also increase code size.
 1940	     As a result, icc never generates branch hints. */
1941 0,
1942
1943 /* X86_TUNE_DOUBLE_WITH_ADD */
1944 ~m_386,
1945
1946 /* X86_TUNE_USE_SAHF */
1947 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC,
1948
1949 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1950 partial dependencies. */
1951 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1952
1953 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1954 register stalls on Generic32 compilation setting as well. However
1955 in current implementation the partial register stalls are not eliminated
1956 very well - they can be introduced via subregs synthesized by combine
1957 and can happen in caller/callee saving sequences. Because this option
1958 pays back little on PPro based chips and is in conflict with partial reg
1959 dependencies used by Athlon/P4 based chips, it is better to leave it off
1960 for generic32 for now. */
1961 m_PPRO,
1962
1963 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1964 m_CORE2I7 | m_GENERIC,
1965
1966 /* X86_TUNE_USE_HIMODE_FIOP */
1967 m_386 | m_486 | m_K6_GEODE,
1968
1969 /* X86_TUNE_USE_SIMODE_FIOP */
1970 ~(m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
1971
1972 /* X86_TUNE_USE_MOV0 */
1973 m_K6,
1974
1975 /* X86_TUNE_USE_CLTD */
1976 ~(m_PENT | m_CORE2I7 | m_ATOM | m_K6 | m_GENERIC),
1977
1978 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1979 m_PENT4,
1980
1981 /* X86_TUNE_SPLIT_LONG_MOVES */
1982 m_PPRO,
1983
1984 /* X86_TUNE_READ_MODIFY_WRITE */
1985 ~m_PENT,
1986
1987 /* X86_TUNE_READ_MODIFY */
1988 ~(m_PENT | m_PPRO),
1989
1990 /* X86_TUNE_PROMOTE_QIMODE */
1991 m_386 | m_486 | m_PENT | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1992
1993 /* X86_TUNE_FAST_PREFIX */
1994 ~(m_386 | m_486 | m_PENT),
1995
1996 /* X86_TUNE_SINGLE_STRINGOP */
1997 m_386 | m_P4_NOCONA,
1998
1999 /* X86_TUNE_QIMODE_MATH */
2000 ~0,
2001
2002 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
2003 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
2004 might be considered for Generic32 if our scheme for avoiding partial
2005 stalls was more effective. */
2006 ~m_PPRO,
2007
2008 /* X86_TUNE_PROMOTE_QI_REGS */
2009 0,
2010
2011 /* X86_TUNE_PROMOTE_HI_REGS */
2012 m_PPRO,
2013
2014 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
2015 over esp addition. */
2016 m_386 | m_486 | m_PENT | m_PPRO,
2017
2018 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
2019 over esp addition. */
2020 m_PENT,
2021
2022 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
2023 over esp subtraction. */
2024 m_386 | m_486 | m_PENT | m_K6_GEODE,
2025
 2026	  /* X86_TUNE_DOUBLE_PUSH: Enable if double push insn is preferred
2027 over esp subtraction. */
2028 m_PENT | m_K6_GEODE,
2029
2030 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
2031 for DFmode copies */
 2032	  ~(m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
2033
2034 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
2035 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2036
 2037	  /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
 2038	     conflict between PPro/Pentium4 based chips that treat 128bit
 2039	     SSE registers as single units and K8 based chips that divide SSE
 2040	     registers into two 64bit halves.  This knob promotes all store destinations
 2041	     to be 128bit to allow register renaming on 128bit SSE units, but usually
 2042	     results in one extra micro-op on 64bit SSE units.  Experimental results
 2043	     show that disabling this option on P4 brings over a 20% SPECfp regression,
 2044	     while enabling it on K8 brings roughly a 2.4% regression that can be partly
 2045	     masked by careful scheduling of moves. */
2046 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
2047
2048 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
2049 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER1,
2050
2051 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
2052 m_COREI7 | m_BDVER,
2053
2054 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
 2055	  m_BDVER,
2056
2057 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
2058 are resolved on SSE register parts instead of whole registers, so we may
2059 maintain just lower part of scalar values in proper format leaving the
2060 upper part undefined. */
2061 m_ATHLON_K8,
2062
2063 /* X86_TUNE_SSE_TYPELESS_STORES */
2064 m_AMD_MULTIPLE,
2065
2066 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
2067 m_PPRO | m_P4_NOCONA,
2068
2069 /* X86_TUNE_MEMORY_MISMATCH_STALL */
2070 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2071
2072 /* X86_TUNE_PROLOGUE_USING_MOVE */
2073 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2074
2075 /* X86_TUNE_EPILOGUE_USING_MOVE */
2076 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2077
2078 /* X86_TUNE_SHIFT1 */
2079 ~m_486,
2080
2081 /* X86_TUNE_USE_FFREEP */
2082 m_AMD_MULTIPLE,
2083
2084 /* X86_TUNE_INTER_UNIT_MOVES */
2085 ~(m_AMD_MULTIPLE | m_GENERIC),
2086
2087 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
 2088	  ~(m_AMDFAM10 | m_BDVER),
2089
2090 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
2091 than 4 branch instructions in the 16 byte window. */
2092 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2093
2094 /* X86_TUNE_SCHEDULE */
2095 m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2096
2097 /* X86_TUNE_USE_BT */
2098 m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2099
2100 /* X86_TUNE_USE_INCDEC */
2101 ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC),
2102
2103 /* X86_TUNE_PAD_RETURNS */
2104 m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC,
2105
 2106	  /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short function. */
2107 m_ATOM,
2108
2109 /* X86_TUNE_EXT_80387_CONSTANTS */
2110 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
2111
2112 /* X86_TUNE_SHORTEN_X87_SSE */
2113 ~m_K8,
2114
2115 /* X86_TUNE_AVOID_VECTOR_DECODE */
2116 m_CORE2I7_64 | m_K8 | m_GENERIC64,
2117
 2118	  /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
 2119	     and SImode multiply, but the 386 and 486 do HImode multiply faster.  */
2120 ~(m_386 | m_486),
2121
2122 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
2123 vector path on AMD machines. */
2124 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2125
2126 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
2127 machines. */
2128 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2129
2130 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
2131 than a MOV. */
2132 m_PENT,
2133
2134 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
2135 but one byte longer. */
2136 m_PENT,
2137
2138 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
2139 operand that cannot be represented using a modRM byte. The XOR
2140 replacement is long decoded, so this split helps here as well. */
2141 m_K6,
2142
2143 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
2144 from FP to FP. */
2145 m_CORE2I7 | m_AMDFAM10 | m_GENERIC,
2146
2147 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
2148 from integer to FP. */
2149 m_AMDFAM10,
2150
2151 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
2152 with a subsequent conditional jump instruction into a single
2153 compare-and-branch uop. */
2154 m_BDVER,
2155
2156 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2157 will impact LEA instruction selection. */
2158 m_ATOM,
2159
2160 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2161 instructions. */
2162 ~m_ATOM,
2163
 2164	  /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2165 at -O3. For the moment, the prefetching seems badly tuned for Intel
2166 chips. */
2167 m_K6_GEODE | m_AMD_MULTIPLE,
2168
2169 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2170 the auto-vectorizer. */
2171 m_BDVER,
2172
2173 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2174 during reassociation of integer computation. */
2175 m_ATOM,
2176
2177 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2178 during reassociation of fp computation. */
2179 m_ATOM
2180 };
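
/* For orientation, a sketch of how the table above is consumed (the actual
   code is in ix86_option_override_internal below, and the accessor macros
   live in i386.h): each entry is a bitmask over the m_* processor bits, and
   the per-feature byte array is filled in once the tuning target is known,
   along the lines of

     ix86_tune_mask = 1u << ix86_tune;
     for (i = 0; i < X86_TUNE_LAST; ++i)
       ix86_tune_features[i]
	 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);

   after which i386.h exposes the individual bits through macros such as
   TARGET_USE_LEAVE, which simply reads
   ix86_tune_features[X86_TUNE_USE_LEAVE].  */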
2181
2182 /* Feature tests against the various architecture variations. */
2183 unsigned char ix86_arch_features[X86_ARCH_LAST];
2184
2185 /* Feature tests against the various architecture variations, used to create
2186 ix86_arch_features based on the processor mask. */
2187 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2188 /* X86_ARCH_CMOVE: Conditional move was added for pentiumpro. */
2189 ~(m_386 | m_486 | m_PENT | m_K6),
2190
2191 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2192 ~m_386,
2193
2194 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2195 ~(m_386 | m_486),
2196
2197 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2198 ~m_386,
2199
2200 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2201 ~m_386,
2202 };
2203
2204 static const unsigned int x86_accumulate_outgoing_args
2205 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC;
2206
2207 static const unsigned int x86_arch_always_fancy_math_387
2208 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2209
2210 static const unsigned int x86_avx256_split_unaligned_load
2211 = m_COREI7 | m_GENERIC;
2212
2213 static const unsigned int x86_avx256_split_unaligned_store
2214 = m_COREI7 | m_BDVER | m_GENERIC;
2215
 2216	/* In case the average insn count for a single function invocation is
2217 lower than this constant, emit fast (but longer) prologue and
2218 epilogue code. */
2219 #define FAST_PROLOGUE_INSN_COUNT 20
2220
2221 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
2222 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2223 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2224 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2225
2226 /* Array of the smallest class containing reg number REGNO, indexed by
2227 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2228
2229 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2230 {
2231 /* ax, dx, cx, bx */
2232 AREG, DREG, CREG, BREG,
2233 /* si, di, bp, sp */
2234 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2235 /* FP registers */
2236 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2237 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2238 /* arg pointer */
2239 NON_Q_REGS,
2240 /* flags, fpsr, fpcr, frame */
2241 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2242 /* SSE registers */
2243 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2244 SSE_REGS, SSE_REGS,
2245 /* MMX registers */
2246 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2247 MMX_REGS, MMX_REGS,
2248 /* REX registers */
2249 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2250 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2251 /* SSE REX registers */
2252 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2253 SSE_REGS, SSE_REGS,
2254 };
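
/* A few worked readings of the table above, assuming the usual i386.h
   definition REGNO_REG_CLASS (REGNO) == regclass_map[REGNO]:

     REGNO_REG_CLASS (0)  == AREG           %eax
     REGNO_REG_CLASS (7)  == NON_Q_REGS     %esp, no QImode subregisters
     REGNO_REG_CLASS (21) == SSE_FIRST_REG  %xmm0

   The indexing follows the hard register order above: 8 general registers,
   8 FP stack registers, then argp, flags, fpsr, fpcr, frame, and the SSE
   registers starting at 21.  */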
2255
2256 /* The "default" register map used in 32bit mode. */
2257
2258 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2259 {
2260 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2261 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2262 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2263 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2264 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2265 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2266 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2267 };
2268
2269 /* The "default" register map used in 64bit mode. */
2270
2271 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2272 {
2273 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2274 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2275 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2276 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2277 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
 2278	  8, 9, 10, 11, 12, 13, 14, 15,		/* extended integer registers */
2279 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2280 };
2281
2282 /* Define the register numbers to be used in Dwarf debugging information.
2283 The SVR4 reference port C compiler uses the following register numbers
2284 in its Dwarf output code:
2285 0 for %eax (gcc regno = 0)
2286 1 for %ecx (gcc regno = 2)
2287 2 for %edx (gcc regno = 1)
2288 3 for %ebx (gcc regno = 3)
2289 4 for %esp (gcc regno = 7)
2290 5 for %ebp (gcc regno = 6)
2291 6 for %esi (gcc regno = 4)
2292 7 for %edi (gcc regno = 5)
2293 The following three DWARF register numbers are never generated by
2294 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2295 believes these numbers have these meanings.
2296 8 for %eip (no gcc equivalent)
2297 9 for %eflags (gcc regno = 17)
2298 10 for %trapno (no gcc equivalent)
2299 It is not at all clear how we should number the FP stack registers
2300 for the x86 architecture. If the version of SDB on x86/svr4 were
2301 a bit less brain dead with respect to floating-point then we would
2302 have a precedent to follow with respect to DWARF register numbers
2303 for x86 FP registers, but the SDB on x86/svr4 is so completely
2304 broken with respect to FP registers that it is hardly worth thinking
2305 of it as something to strive for compatibility with.
2306 The version of x86/svr4 SDB I have at the moment does (partially)
2307 seem to believe that DWARF register number 11 is associated with
2308 the x86 register %st(0), but that's about all. Higher DWARF
2309 register numbers don't seem to be associated with anything in
2310 particular, and even for DWARF regno 11, SDB only seems to under-
2311 stand that it should say that a variable lives in %st(0) (when
2312 asked via an `=' command) if we said it was in DWARF regno 11,
2313 but SDB still prints garbage when asked for the value of the
2314 variable in question (via a `/' command).
2315 (Also note that the labels SDB prints for various FP stack regs
2316 when doing an `x' command are all wrong.)
2317 Note that these problems generally don't affect the native SVR4
2318 C compiler because it doesn't allow the use of -O with -g and
2319 because when it is *not* optimizing, it allocates a memory
2320 location for each floating-point variable, and the memory
2321 location is what gets described in the DWARF AT_location
2322 attribute for the variable in question.
2323 Regardless of the severe mental illness of the x86/svr4 SDB, we
2324 do something sensible here and we use the following DWARF
2325 register numbers. Note that these are all stack-top-relative
2326 numbers.
2327 11 for %st(0) (gcc regno = 8)
2328 12 for %st(1) (gcc regno = 9)
2329 13 for %st(2) (gcc regno = 10)
2330 14 for %st(3) (gcc regno = 11)
2331 15 for %st(4) (gcc regno = 12)
2332 16 for %st(5) (gcc regno = 13)
2333 17 for %st(6) (gcc regno = 14)
2334 18 for %st(7) (gcc regno = 15)
2335 */
2336 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2337 {
2338 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2339 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2340 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2341 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2342 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2343 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2344 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2345 };
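
/* A few concrete readings of the SVR4 map above (indexing by the gcc
   register number yields the DWARF register number):

     svr4_dbx_register_map[1]  == 2    %edx
     svr4_dbx_register_map[4]  == 6    %esi
     svr4_dbx_register_map[6]  == 5    %ebp
     svr4_dbx_register_map[8]  == 11   %st(0), stack-top-relative
     svr4_dbx_register_map[17] == 9    %eflags

   which matches the numbering spelled out in the long comment above.  */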
2346
2347 /* Define parameter passing and return registers. */
2348
2349 static int const x86_64_int_parameter_registers[6] =
2350 {
2351 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2352 };
2353
2354 static int const x86_64_ms_abi_int_parameter_registers[4] =
2355 {
2356 CX_REG, DX_REG, R8_REG, R9_REG
2357 };
2358
2359 static int const x86_64_int_return_registers[4] =
2360 {
2361 AX_REG, DX_REG, DI_REG, SI_REG
2362 };
2363
2364 /* Define the structure for the machine field in struct function. */
2365
2366 struct GTY(()) stack_local_entry {
2367 unsigned short mode;
2368 unsigned short n;
2369 rtx rtl;
2370 struct stack_local_entry *next;
2371 };
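
/* Sketch of how these entries are used; the authoritative code is
   assign_386_stack_local further down in this file, and the list head is
   assumed here to be the stack_locals field of struct machine_function.
   A scratch stack slot is cached per (mode, slot number) pair so that
   repeated requests reuse the same location, roughly:

     struct stack_local_entry *s;
     for (s = cfun->machine->stack_locals; s; s = s->next)
       if (s->mode == mode && s->n == n)
	 return copy_rtx (s->rtl);

   and otherwise a fresh slot is allocated with assign_stack_local and a new
   entry is pushed onto the list.  */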
2372
2373 /* Structure describing stack frame layout.
2374 Stack grows downward:
2375
2376 [arguments]
2377 <- ARG_POINTER
2378 saved pc
2379
2380 saved static chain if ix86_static_chain_on_stack
2381
2382 saved frame pointer if frame_pointer_needed
2383 <- HARD_FRAME_POINTER
2384 [saved regs]
2385 <- regs_save_offset
2386 [padding0]
2387
2388 [saved SSE regs]
2389 <- sse_regs_save_offset
2390 [padding1] |
2391 | <- FRAME_POINTER
2392 [va_arg registers] |
2393 |
2394 [frame] |
2395 |
2396 [padding2] | = to_allocate
2397 <- STACK_POINTER
2398 */
2399 struct ix86_frame
2400 {
2401 int nsseregs;
2402 int nregs;
2403 int va_arg_size;
2404 int red_zone_size;
2405 int outgoing_arguments_size;
2406 HOST_WIDE_INT frame;
2407
2408 /* The offsets relative to ARG_POINTER. */
2409 HOST_WIDE_INT frame_pointer_offset;
2410 HOST_WIDE_INT hard_frame_pointer_offset;
2411 HOST_WIDE_INT stack_pointer_offset;
2412 HOST_WIDE_INT hfp_save_offset;
2413 HOST_WIDE_INT reg_save_offset;
2414 HOST_WIDE_INT sse_reg_save_offset;
2415
2416 /* When save_regs_using_mov is set, emit prologue using
2417 move instead of push instructions. */
2418 bool save_regs_using_mov;
2419 };
2420
2421 /* Which cpu are we scheduling for. */
2422 enum attr_cpu ix86_schedule;
2423
2424 /* Which cpu are we optimizing for. */
2425 enum processor_type ix86_tune;
2426
2427 /* Which instruction set architecture to use. */
2428 enum processor_type ix86_arch;
2429
 2430	/* True if the SSE prefetch instruction is not a NOP. */
2431 int x86_prefetch_sse;
2432
2433 /* -mstackrealign option */
2434 static const char ix86_force_align_arg_pointer_string[]
2435 = "force_align_arg_pointer";
2436
2437 static rtx (*ix86_gen_leave) (void);
2438 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2439 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2440 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2441 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2442 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2443 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2444 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2445 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2446 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2447
2448 /* Preferred alignment for stack boundary in bits. */
2449 unsigned int ix86_preferred_stack_boundary;
2450
2451 /* Alignment for incoming stack boundary in bits specified at
2452 command line. */
2453 static unsigned int ix86_user_incoming_stack_boundary;
2454
2455 /* Default alignment for incoming stack boundary in bits. */
2456 static unsigned int ix86_default_incoming_stack_boundary;
2457
2458 /* Alignment for incoming stack boundary in bits. */
2459 unsigned int ix86_incoming_stack_boundary;
2460
2461 /* Calling abi specific va_list type nodes. */
2462 static GTY(()) tree sysv_va_list_type_node;
2463 static GTY(()) tree ms_va_list_type_node;
2464
2465 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2466 char internal_label_prefix[16];
2467 int internal_label_prefix_len;
2468
2469 /* Fence to use after loop using movnt. */
2470 tree x86_mfence;
2471
 2472	/* Register class used for passing a given 64bit part of the argument.
 2473	   These represent the classes documented by the psABI, with the exception of
 2474	   the SSESF and SSEDF classes, which are basically the SSE class except that
 2475	   gcc will use SFmode or DFmode moves instead of DImode to avoid reformatting
 2476	   penalties.
 2477	   Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
 2478	   whenever possible (the upper half then contains only padding). */
2479 enum x86_64_reg_class
2480 {
2481 X86_64_NO_CLASS,
2482 X86_64_INTEGER_CLASS,
2483 X86_64_INTEGERSI_CLASS,
2484 X86_64_SSE_CLASS,
2485 X86_64_SSESF_CLASS,
2486 X86_64_SSEDF_CLASS,
2487 X86_64_SSEUP_CLASS,
2488 X86_64_X87_CLASS,
2489 X86_64_X87UP_CLASS,
2490 X86_64_COMPLEX_X87_CLASS,
2491 X86_64_MEMORY_CLASS
2492 };
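
/* A worked example of the classification, purely as an illustration (the
   classify_argument code below is authoritative): for

     struct example { double d; int i; };

   the first eightbyte holds only the double and classifies as
   X86_64_SSEDF_CLASS, while the second holds the int in its low half and
   classifies as X86_64_INTEGERSI_CLASS, so the struct is passed in one SSE
   register and one integer register (%xmm0 and %edi for the first such
   argument).  Any eightbyte that ends up in X86_64_MEMORY_CLASS forces the
   whole value onto the stack instead.  */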
2493
2494 #define MAX_CLASSES 4
2495
2496 /* Table of constants used by fldpi, fldln2, etc.... */
2497 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2498 static bool ext_80387_constants_init = 0;
2499
2500 \f
2501 static struct machine_function * ix86_init_machine_status (void);
2502 static rtx ix86_function_value (const_tree, const_tree, bool);
2503 static bool ix86_function_value_regno_p (const unsigned int);
2504 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2505 const_tree);
2506 static rtx ix86_static_chain (const_tree, bool);
2507 static int ix86_function_regparm (const_tree, const_tree);
2508 static void ix86_compute_frame_layout (struct ix86_frame *);
2509 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2510 rtx, rtx, int);
2511 static void ix86_add_new_builtins (HOST_WIDE_INT);
2512 static tree ix86_canonical_va_list_type (tree);
2513 static void predict_jump (int);
2514 static unsigned int split_stack_prologue_scratch_regno (void);
2515 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2516
2517 enum ix86_function_specific_strings
2518 {
2519 IX86_FUNCTION_SPECIFIC_ARCH,
2520 IX86_FUNCTION_SPECIFIC_TUNE,
2521 IX86_FUNCTION_SPECIFIC_MAX
2522 };
2523
2524 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2525 const char *, enum fpmath_unit, bool);
2526 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2527 static void ix86_function_specific_save (struct cl_target_option *);
2528 static void ix86_function_specific_restore (struct cl_target_option *);
2529 static void ix86_function_specific_print (FILE *, int,
2530 struct cl_target_option *);
2531 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2532 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2533 struct gcc_options *);
2534 static bool ix86_can_inline_p (tree, tree);
2535 static void ix86_set_current_function (tree);
2536 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2537
2538 static enum calling_abi ix86_function_abi (const_tree);
2539
2540 \f
2541 #ifndef SUBTARGET32_DEFAULT_CPU
2542 #define SUBTARGET32_DEFAULT_CPU "i386"
2543 #endif
2544
2545 /* The svr4 ABI for the i386 says that records and unions are returned
2546 in memory. */
2547 #ifndef DEFAULT_PCC_STRUCT_RETURN
2548 #define DEFAULT_PCC_STRUCT_RETURN 1
2549 #endif
2550
 2551	/* Whether -mtune= or -march= were specified.  */
2552 static int ix86_tune_defaulted;
2553 static int ix86_arch_specified;
2554
2555 /* Vectorization library interface and handlers. */
2556 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2557
2558 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2559 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2560
2561 /* Processor target table, indexed by processor number */
2562 struct ptt
2563 {
2564 const struct processor_costs *cost; /* Processor costs */
2565 const int align_loop; /* Default alignments. */
2566 const int align_loop_max_skip;
2567 const int align_jump;
2568 const int align_jump_max_skip;
2569 const int align_func;
2570 };
2571
2572 static const struct ptt processor_target_table[PROCESSOR_max] =
2573 {
2574 {&i386_cost, 4, 3, 4, 3, 4},
2575 {&i486_cost, 16, 15, 16, 15, 16},
2576 {&pentium_cost, 16, 7, 16, 7, 16},
2577 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2578 {&geode_cost, 0, 0, 0, 0, 0},
2579 {&k6_cost, 32, 7, 32, 7, 32},
2580 {&athlon_cost, 16, 7, 16, 7, 16},
2581 {&pentium4_cost, 0, 0, 0, 0, 0},
2582 {&k8_cost, 16, 7, 16, 7, 16},
2583 {&nocona_cost, 0, 0, 0, 0, 0},
2584 /* Core 2 32-bit. */
2585 {&generic32_cost, 16, 10, 16, 10, 16},
2586 /* Core 2 64-bit. */
2587 {&generic64_cost, 16, 10, 16, 10, 16},
2588 /* Core i7 32-bit. */
2589 {&generic32_cost, 16, 10, 16, 10, 16},
2590 /* Core i7 64-bit. */
2591 {&generic64_cost, 16, 10, 16, 10, 16},
2592 {&generic32_cost, 16, 7, 16, 7, 16},
2593 {&generic64_cost, 16, 10, 16, 10, 16},
2594 {&amdfam10_cost, 32, 24, 32, 7, 32},
2595 {&bdver1_cost, 32, 24, 32, 7, 32},
2596 {&bdver2_cost, 32, 24, 32, 7, 32},
2597 {&btver1_cost, 32, 24, 32, 7, 32},
2598 {&atom_cost, 16, 15, 16, 7, 16}
2599 };
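
/* A rough sketch of how this table is consumed later in
   ix86_option_override_internal: the tuning processor picks both the cost
   model and the default alignment parameters, along the lines of

     ix86_cost = processor_target_table[ix86_tune].cost;
     if (align_loops == 0)
       {
	 align_loops = processor_target_table[ix86_tune].align_loop;
	 align_loops_max_skip
	   = processor_target_table[ix86_tune].align_loop_max_skip;
       }

   with align_jumps and align_functions handled the same way, and a separate
   size-oriented cost table substituted when optimizing for size.  */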
2600
2601 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2602 {
2603 "generic",
2604 "i386",
2605 "i486",
2606 "pentium",
2607 "pentium-mmx",
2608 "pentiumpro",
2609 "pentium2",
2610 "pentium3",
2611 "pentium4",
2612 "pentium-m",
2613 "prescott",
2614 "nocona",
2615 "core2",
2616 "corei7",
2617 "atom",
2618 "geode",
2619 "k6",
2620 "k6-2",
2621 "k6-3",
2622 "athlon",
2623 "athlon-4",
2624 "k8",
2625 "amdfam10",
2626 "bdver1",
2627 "bdver2",
2628 "btver1"
2629 };
2630 \f
2631 /* Return true if a red-zone is in use. */
2632
2633 static inline bool
2634 ix86_using_red_zone (void)
2635 {
2636 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2637 }
2638 \f
2639 /* Return a string that documents the current -m options. The caller is
2640 responsible for freeing the string. */
2641
2642 static char *
2643 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2644 const char *tune, enum fpmath_unit fpmath,
2645 bool add_nl_p)
2646 {
2647 struct ix86_target_opts
2648 {
2649 const char *option; /* option string */
2650 HOST_WIDE_INT mask; /* isa mask options */
2651 };
2652
 2653	  /* This table is ordered so that options like -msse4.2, which imply
 2654	     preceding options, are matched first. */
2655 static struct ix86_target_opts isa_opts[] =
2656 {
2657 { "-m64", OPTION_MASK_ISA_64BIT },
2658 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2659 { "-mfma", OPTION_MASK_ISA_FMA },
2660 { "-mxop", OPTION_MASK_ISA_XOP },
2661 { "-mlwp", OPTION_MASK_ISA_LWP },
2662 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2663 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2664 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2665 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2666 { "-msse3", OPTION_MASK_ISA_SSE3 },
2667 { "-msse2", OPTION_MASK_ISA_SSE2 },
2668 { "-msse", OPTION_MASK_ISA_SSE },
2669 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2670 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2671 { "-mmmx", OPTION_MASK_ISA_MMX },
2672 { "-mabm", OPTION_MASK_ISA_ABM },
2673 { "-mbmi", OPTION_MASK_ISA_BMI },
2674 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2675 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2676 { "-mtbm", OPTION_MASK_ISA_TBM },
2677 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2678 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2679 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2680 { "-maes", OPTION_MASK_ISA_AES },
2681 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2682 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2683 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2684 { "-mf16c", OPTION_MASK_ISA_F16C },
2685 };
2686
2687 /* Flag options. */
2688 static struct ix86_target_opts flag_opts[] =
2689 {
2690 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2691 { "-m80387", MASK_80387 },
2692 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2693 { "-malign-double", MASK_ALIGN_DOUBLE },
2694 { "-mcld", MASK_CLD },
2695 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2696 { "-mieee-fp", MASK_IEEE_FP },
2697 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2698 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2699 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2700 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2701 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2702 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2703 { "-mno-red-zone", MASK_NO_RED_ZONE },
2704 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2705 { "-mrecip", MASK_RECIP },
2706 { "-mrtd", MASK_RTD },
2707 { "-msseregparm", MASK_SSEREGPARM },
2708 { "-mstack-arg-probe", MASK_STACK_PROBE },
2709 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2710 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2711 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2712 { "-mvzeroupper", MASK_VZEROUPPER },
2713 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2714 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2715 { "-mprefer-avx128", MASK_PREFER_AVX128},
2716 };
2717
2718 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2719
2720 char isa_other[40];
2721 char target_other[40];
2722 unsigned num = 0;
2723 unsigned i, j;
2724 char *ret;
2725 char *ptr;
2726 size_t len;
2727 size_t line_len;
2728 size_t sep_len;
2729
2730 memset (opts, '\0', sizeof (opts));
2731
2732 /* Add -march= option. */
2733 if (arch)
2734 {
2735 opts[num][0] = "-march=";
2736 opts[num++][1] = arch;
2737 }
2738
2739 /* Add -mtune= option. */
2740 if (tune)
2741 {
2742 opts[num][0] = "-mtune=";
2743 opts[num++][1] = tune;
2744 }
2745
2746 /* Pick out the options in isa options. */
2747 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2748 {
2749 if ((isa & isa_opts[i].mask) != 0)
2750 {
2751 opts[num++][0] = isa_opts[i].option;
2752 isa &= ~ isa_opts[i].mask;
2753 }
2754 }
2755
2756 if (isa && add_nl_p)
2757 {
2758 opts[num++][0] = isa_other;
2759 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2760 isa);
2761 }
2762
2763 /* Add flag options. */
2764 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2765 {
2766 if ((flags & flag_opts[i].mask) != 0)
2767 {
2768 opts[num++][0] = flag_opts[i].option;
2769 flags &= ~ flag_opts[i].mask;
2770 }
2771 }
2772
2773 if (flags && add_nl_p)
2774 {
2775 opts[num++][0] = target_other;
2776 sprintf (target_other, "(other flags: %#x)", flags);
2777 }
2778
2779 /* Add -fpmath= option. */
2780 if (fpmath)
2781 {
2782 opts[num][0] = "-mfpmath=";
2783 switch ((int) fpmath)
2784 {
2785 case FPMATH_387:
2786 opts[num++][1] = "387";
2787 break;
2788
2789 case FPMATH_SSE:
2790 opts[num++][1] = "sse";
2791 break;
2792
2793 case FPMATH_387 | FPMATH_SSE:
2794 opts[num++][1] = "sse+387";
2795 break;
2796
2797 default:
2798 gcc_unreachable ();
2799 }
2800 }
2801
2802 /* Any options? */
2803 if (num == 0)
2804 return NULL;
2805
2806 gcc_assert (num < ARRAY_SIZE (opts));
2807
2808 /* Size the string. */
2809 len = 0;
2810 sep_len = (add_nl_p) ? 3 : 1;
2811 for (i = 0; i < num; i++)
2812 {
2813 len += sep_len;
2814 for (j = 0; j < 2; j++)
2815 if (opts[i][j])
2816 len += strlen (opts[i][j]);
2817 }
2818
2819 /* Build the string. */
2820 ret = ptr = (char *) xmalloc (len);
2821 line_len = 0;
2822
2823 for (i = 0; i < num; i++)
2824 {
2825 size_t len2[2];
2826
2827 for (j = 0; j < 2; j++)
2828 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2829
2830 if (i != 0)
2831 {
2832 *ptr++ = ' ';
2833 line_len++;
2834
2835 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2836 {
2837 *ptr++ = '\\';
2838 *ptr++ = '\n';
2839 line_len = 0;
2840 }
2841 }
2842
2843 for (j = 0; j < 2; j++)
2844 if (opts[i][j])
2845 {
2846 memcpy (ptr, opts[i][j], len2[j]);
2847 ptr += len2[j];
2848 line_len += len2[j];
2849 }
2850 }
2851
2852 *ptr = '\0';
2853 gcc_assert (ret + len >= ptr);
2854
2855 return ret;
2856 }
2857
 2858	/* Return true if profiling code should be emitted before the prologue
 2859	   and false otherwise.  On x86 this is the case when -mfentry is in use,
 2860	   which is what "hotfix"-style runtime patching expects.  */
2861 static bool
2862 ix86_profile_before_prologue (void)
2863 {
2864 return flag_fentry != 0;
2865 }
2866
2867 /* Function that is callable from the debugger to print the current
2868 options. */
2869 void
2870 ix86_debug_options (void)
2871 {
2872 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2873 ix86_arch_string, ix86_tune_string,
2874 ix86_fpmath, true);
2875
2876 if (opts)
2877 {
2878 fprintf (stderr, "%s\n\n", opts);
2879 free (opts);
2880 }
2881 else
2882 fputs ("<no options>\n\n", stderr);
2883
2884 return;
2885 }
2886 \f
2887 /* Override various settings based on options. If MAIN_ARGS_P, the
2888 options are from the command line, otherwise they are from
2889 attributes. */
2890
2891 static void
2892 ix86_option_override_internal (bool main_args_p)
2893 {
2894 int i;
2895 unsigned int ix86_arch_mask, ix86_tune_mask;
2896 const bool ix86_tune_specified = (ix86_tune_string != NULL);
2897 const char *prefix;
2898 const char *suffix;
2899 const char *sw;
2900
2901 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2902 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2903 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2904 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2905 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2906 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2907 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2908 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2909 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2910 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2911 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2912 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2913 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
2914 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2915 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
2916 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2917 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2918 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2919 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2920 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2921 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2922 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
2923 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2924 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2925 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2926 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2927 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2928 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2929 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
2930 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
2931 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2932 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2933 /* if this reaches 64, need to widen struct pta flags below */
2934
2935 static struct pta
2936 {
2937 const char *const name; /* processor name or nickname. */
2938 const enum processor_type processor;
2939 const enum attr_cpu schedule;
2940 const unsigned HOST_WIDE_INT flags;
2941 }
2942 const processor_alias_table[] =
2943 {
2944 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2945 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2946 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2947 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2948 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2949 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2950 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2951 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2952 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
2953 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2954 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2955 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX},
2956 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2957 PTA_MMX | PTA_SSE},
2958 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2959 PTA_MMX | PTA_SSE},
2960 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2961 PTA_MMX | PTA_SSE | PTA_SSE2},
2962 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
 2963	       PTA_MMX | PTA_SSE | PTA_SSE2},
2964 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
2965 PTA_MMX | PTA_SSE | PTA_SSE2},
2966 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
2967 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3},
2968 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
2969 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2970 | PTA_CX16 | PTA_NO_SAHF},
2971 {"core2", PROCESSOR_CORE2_64, CPU_CORE2,
2972 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2973 | PTA_SSSE3 | PTA_CX16},
2974 {"corei7", PROCESSOR_COREI7_64, CPU_COREI7,
2975 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2976 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16},
2977 {"corei7-avx", PROCESSOR_COREI7_64, CPU_COREI7,
2978 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2979 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2980 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL},
2981 {"core-avx-i", PROCESSOR_COREI7_64, CPU_COREI7,
2982 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2983 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2984 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2985 | PTA_RDRND | PTA_F16C},
2986 {"core-avx2", PROCESSOR_COREI7_64, CPU_COREI7,
2987 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2988 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
2989 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2990 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
2991 | PTA_FMA | PTA_MOVBE},
2992 {"atom", PROCESSOR_ATOM, CPU_ATOM,
2993 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2994 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
2995 {"geode", PROCESSOR_GEODE, CPU_GEODE,
 2996	       PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2997 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
2998 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
2999 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3000 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3001 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3002 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3003 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3004 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3005 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3006 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3007 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3008 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3009 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3010 {"x86-64", PROCESSOR_K8, CPU_K8,
3011 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
3012 {"k8", PROCESSOR_K8, CPU_K8,
3013 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3014 | PTA_SSE2 | PTA_NO_SAHF},
3015 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3016 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3017 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3018 {"opteron", PROCESSOR_K8, CPU_K8,
3019 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3020 | PTA_SSE2 | PTA_NO_SAHF},
3021 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3022 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3023 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3024 {"athlon64", PROCESSOR_K8, CPU_K8,
3025 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3026 | PTA_SSE2 | PTA_NO_SAHF},
3027 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3028 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3029 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3030 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3031 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3032 | PTA_SSE2 | PTA_NO_SAHF},
3033 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3034 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3035 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3036 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3037 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3038 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3039 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3040 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3041 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3042 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3043 | PTA_XOP | PTA_LWP},
3044 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3045 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3046 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3047 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3048 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3049 | PTA_FMA},
3050 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
3051 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
 3052	        | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16},
3053 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3054 0 /* flags are only used for -march switch. */ },
3055 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3056 PTA_64BIT /* flags are only used for -march switch. */ },
3057 };
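
  /* How the alias table is used (a compressed sketch of the lookup loop
     that appears later in this function): -march=NAME is resolved by a
     linear scan, and each PTA_* bit switches on the corresponding ISA flag
     unless the user already set it explicitly, e.g.

       for (i = 0; i < pta_size; i++)
	 if (!strcmp (ix86_arch_string, processor_alias_table[i].name))
	   {
	     ix86_schedule = processor_alias_table[i].schedule;
	     ix86_arch = processor_alias_table[i].processor;
	     if (processor_alias_table[i].flags & PTA_MMX
		 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
	       ix86_isa_flags |= OPTION_MASK_ISA_MMX;
	   }

     and likewise for the other PTA_* bits.  -mtune= goes through the same
     table but only uses the processor and schedule fields; the PTA_* flags
     matter for -march only, as the generic32/generic64 entries note.  */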
3058
3059 /* -mrecip options. */
3060 static struct
3061 {
3062 const char *string; /* option name */
3063 unsigned int mask; /* mask bits to set */
3064 }
3065 const recip_options[] =
3066 {
3067 { "all", RECIP_MASK_ALL },
3068 { "none", RECIP_MASK_NONE },
3069 { "div", RECIP_MASK_DIV },
3070 { "sqrt", RECIP_MASK_SQRT },
3071 { "vec-div", RECIP_MASK_VEC_DIV },
3072 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3073 };
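
  /* The -mrecip= argument is later split on commas and each token,
     optionally prefixed with '!' to negate it, is looked up in the table
     above to set or clear bits in recip_mask; a sketch of the core of that
     loop (q, mask and invert stand in for locals of the real parsing code):

       for (i = 0; i < ARRAY_SIZE (recip_options); i++)
	 if (!strcmp (q, recip_options[i].string))
	   {
	     mask = recip_options[i].mask;
	     break;
	   }
       if (invert)
	 recip_mask &= ~mask;
       else
	 recip_mask |= mask;

     so later tokens can refine or undo the effect of earlier ones.  */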
3074
3075 int const pta_size = ARRAY_SIZE (processor_alias_table);
3076
3077 /* Set up prefix/suffix so the error messages refer to either the command
3078 line argument, or the attribute(target). */
3079 if (main_args_p)
3080 {
3081 prefix = "-m";
3082 suffix = "";
3083 sw = "switch";
3084 }
3085 else
3086 {
3087 prefix = "option(\"";
3088 suffix = "\")";
3089 sw = "attribute";
3090 }
3091
3092 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3093 SUBTARGET_OVERRIDE_OPTIONS;
3094 #endif
3095
3096 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3097 SUBSUBTARGET_OVERRIDE_OPTIONS;
3098 #endif
3099
3100 if (TARGET_X32)
3101 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3102
3103 /* On x86_64 Darwin, -fPIC is the default; force it on. */
3104 if (TARGET_MACHO && TARGET_64BIT)
3105 flag_pic = 2;
3106
3107 /* Need to check -mtune=generic first. */
3108 if (ix86_tune_string)
3109 {
3110 if (!strcmp (ix86_tune_string, "generic")
3111 || !strcmp (ix86_tune_string, "i686")
3112 /* As special support for cross compilers we read -mtune=native
3113 as -mtune=generic. With native compilers we won't see the
3114 -mtune=native, as it was changed by the driver. */
3115 || !strcmp (ix86_tune_string, "native"))
3116 {
3117 if (TARGET_64BIT)
3118 ix86_tune_string = "generic64";
3119 else
3120 ix86_tune_string = "generic32";
3121 }
3122 /* If this call is for setting the option attribute, allow the
3123 generic32/generic64 that was previously set. */
3124 else if (!main_args_p
3125 && (!strcmp (ix86_tune_string, "generic32")
3126 || !strcmp (ix86_tune_string, "generic64")))
3127 ;
3128 else if (!strncmp (ix86_tune_string, "generic", 7))
3129 error ("bad value (%s) for %stune=%s %s",
3130 ix86_tune_string, prefix, suffix, sw);
3131 else if (!strcmp (ix86_tune_string, "x86-64"))
3132 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3133 "%stune=k8%s or %stune=generic%s instead as appropriate",
3134 prefix, suffix, prefix, suffix, prefix, suffix);
3135 }
3136 else
3137 {
3138 if (ix86_arch_string)
3139 ix86_tune_string = ix86_arch_string;
3140 if (!ix86_tune_string)
3141 {
3142 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3143 ix86_tune_defaulted = 1;
3144 }
3145
3146 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3147 need to use a sensible tune option. */
3148 if (!strcmp (ix86_tune_string, "generic")
3149 || !strcmp (ix86_tune_string, "x86-64")
3150 || !strcmp (ix86_tune_string, "i686"))
3151 {
3152 if (TARGET_64BIT)
3153 ix86_tune_string = "generic64";
3154 else
3155 ix86_tune_string = "generic32";
3156 }
3157 }
3158
3159 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3160 {
3161 /* rep; movq isn't available in 32-bit code. */
3162 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3163 ix86_stringop_alg = no_stringop;
3164 }
3165
3166 if (!ix86_arch_string)
3167 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3168 else
3169 ix86_arch_specified = 1;
3170
3171 if (!global_options_set.x_ix86_abi)
3172 ix86_abi = DEFAULT_ABI;
3173
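  /* An explicit -mcmodel= setting is validated against the target mode:
     small, medium and large require 64-bit mode (medium and large also
     reject x32) and switch to their PIC variants under -fpic/-fPIC,
     while the 32 and kernel models reject PIC, with 32 limited to
     32-bit mode and kernel to 64-bit mode.  */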
3174 if (global_options_set.x_ix86_cmodel)
3175 {
3176 switch (ix86_cmodel)
3177 {
3178 case CM_SMALL:
3179 case CM_SMALL_PIC:
3180 if (flag_pic)
3181 ix86_cmodel = CM_SMALL_PIC;
3182 if (!TARGET_64BIT)
3183 error ("code model %qs not supported in the %s bit mode",
3184 "small", "32");
3185 break;
3186
3187 case CM_MEDIUM:
3188 case CM_MEDIUM_PIC:
3189 if (flag_pic)
3190 ix86_cmodel = CM_MEDIUM_PIC;
3191 if (!TARGET_64BIT)
3192 error ("code model %qs not supported in the %s bit mode",
3193 "medium", "32");
3194 else if (TARGET_X32)
3195 error ("code model %qs not supported in x32 mode",
3196 "medium");
3197 break;
3198
3199 case CM_LARGE:
3200 case CM_LARGE_PIC:
3201 if (flag_pic)
3202 ix86_cmodel = CM_LARGE_PIC;
3203 if (!TARGET_64BIT)
3204 error ("code model %qs not supported in the %s bit mode",
3205 "large", "32");
3206 else if (TARGET_X32)
3207 error ("code model %qs not supported in x32 mode",
3208 "medium");
3209 break;
3210
3211 case CM_32:
3212 if (flag_pic)
3213 error ("code model %s does not support PIC mode", "32");
3214 if (TARGET_64BIT)
3215 error ("code model %qs not supported in the %s bit mode",
3216 "32", "64");
3217 break;
3218
3219 case CM_KERNEL:
3220 if (flag_pic)
3221 {
3222 error ("code model %s does not support PIC mode", "kernel");
3223 ix86_cmodel = CM_32;
3224 }
3225 if (!TARGET_64BIT)
3226 error ("code model %qs not supported in the %s bit mode",
3227 "kernel", "32");
3228 break;
3229
3230 default:
3231 gcc_unreachable ();
3232 }
3233 }
3234 else
3235 {
3236 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3237 use of rip-relative addressing. This eliminates fixups that
3238 would otherwise be needed if this object is to be placed in a
3239 DLL, and is essentially just as efficient as direct addressing. */
3240 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3241 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3242 else if (TARGET_64BIT)
3243 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3244 else
3245 ix86_cmodel = CM_32;
3246 }
3247 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3248 {
3249 error ("-masm=intel not supported in this configuration");
3250 ix86_asm_dialect = ASM_ATT;
3251 }
3252 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3253 sorry ("%i-bit mode not compiled in",
3254 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3255
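  /* Look up -march= in processor_alias_table.  The matching entry
     selects ix86_arch (and the default ix86_tune), and each PTA_* flag
     it carries enables the corresponding OPTION_MASK_ISA_* bit unless
     that ISA option was set explicitly on the command line, as recorded
     in ix86_isa_flags_explicit.  */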
3256 for (i = 0; i < pta_size; i++)
3257 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3258 {
3259 ix86_schedule = processor_alias_table[i].schedule;
3260 ix86_arch = processor_alias_table[i].processor;
3261 /* Default cpu tuning to the architecture. */
3262 ix86_tune = ix86_arch;
3263
3264 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3265 error ("CPU you selected does not support x86-64 "
3266 "instruction set");
3267
3268 if (processor_alias_table[i].flags & PTA_MMX
3269 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3270 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3271 if (processor_alias_table[i].flags & PTA_3DNOW
3272 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3273 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3274 if (processor_alias_table[i].flags & PTA_3DNOW_A
3275 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3276 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3277 if (processor_alias_table[i].flags & PTA_SSE
3278 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3279 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3280 if (processor_alias_table[i].flags & PTA_SSE2
3281 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3282 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3283 if (processor_alias_table[i].flags & PTA_SSE3
3284 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3285 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3286 if (processor_alias_table[i].flags & PTA_SSSE3
3287 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3288 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3289 if (processor_alias_table[i].flags & PTA_SSE4_1
3290 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3291 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3292 if (processor_alias_table[i].flags & PTA_SSE4_2
3293 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3294 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3295 if (processor_alias_table[i].flags & PTA_AVX
3296 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3297 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3298 if (processor_alias_table[i].flags & PTA_AVX2
3299 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3300 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3301 if (processor_alias_table[i].flags & PTA_FMA
3302 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3303 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3304 if (processor_alias_table[i].flags & PTA_SSE4A
3305 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3306 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3307 if (processor_alias_table[i].flags & PTA_FMA4
3308 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3309 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3310 if (processor_alias_table[i].flags & PTA_XOP
3311 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3312 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3313 if (processor_alias_table[i].flags & PTA_LWP
3314 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3315 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3316 if (processor_alias_table[i].flags & PTA_ABM
3317 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3318 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3319 if (processor_alias_table[i].flags & PTA_BMI
3320 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3321 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3322 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3323 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3324 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3325 if (processor_alias_table[i].flags & PTA_TBM
3326 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3327 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3328 if (processor_alias_table[i].flags & PTA_BMI2
3329 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3330 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3331 if (processor_alias_table[i].flags & PTA_CX16
3332 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3333 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3334 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3335 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3336 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3337 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3338 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3339 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3340 if (processor_alias_table[i].flags & PTA_MOVBE
3341 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3342 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3343 if (processor_alias_table[i].flags & PTA_AES
3344 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3345 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3346 if (processor_alias_table[i].flags & PTA_PCLMUL
3347 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3348 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3349 if (processor_alias_table[i].flags & PTA_FSGSBASE
3350 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3351 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3352 if (processor_alias_table[i].flags & PTA_RDRND
3353 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3354 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3355 if (processor_alias_table[i].flags & PTA_F16C
3356 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3357 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3358 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3359 x86_prefetch_sse = true;
3360
3361 break;
3362 }
3363
3364 if (!strcmp (ix86_arch_string, "generic"))
3365 error ("generic CPU can be used only for %stune=%s %s",
3366 prefix, suffix, sw);
3367 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3368 error ("bad value (%s) for %sarch=%s %s",
3369 ix86_arch_string, prefix, suffix, sw);
3370
3371 ix86_arch_mask = 1u << ix86_arch;
3372 for (i = 0; i < X86_ARCH_LAST; ++i)
3373 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3374
3375 for (i = 0; i < pta_size; i++)
3376 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3377 {
3378 ix86_schedule = processor_alias_table[i].schedule;
3379 ix86_tune = processor_alias_table[i].processor;
3380 if (TARGET_64BIT)
3381 {
3382 if (!(processor_alias_table[i].flags & PTA_64BIT))
3383 {
3384 if (ix86_tune_defaulted)
3385 {
3386 ix86_tune_string = "x86-64";
3387 for (i = 0; i < pta_size; i++)
3388 if (! strcmp (ix86_tune_string,
3389 processor_alias_table[i].name))
3390 break;
3391 ix86_schedule = processor_alias_table[i].schedule;
3392 ix86_tune = processor_alias_table[i].processor;
3393 }
3394 else
3395 error ("CPU you selected does not support x86-64 "
3396 "instruction set");
3397 }
3398 }
3399 else
3400 {
3401 /* Adjust tuning when compiling for 32-bit ABI. */
3402 switch (ix86_tune)
3403 {
3404 case PROCESSOR_GENERIC64:
3405 ix86_tune = PROCESSOR_GENERIC32;
3406 ix86_schedule = CPU_PENTIUMPRO;
3407 break;
3408
3409 case PROCESSOR_CORE2_64:
3410 ix86_tune = PROCESSOR_CORE2_32;
3411 break;
3412
3413 case PROCESSOR_COREI7_64:
3414 ix86_tune = PROCESSOR_COREI7_32;
3415 break;
3416
3417 default:
3418 break;
3419 }
3420 }
3421 /* Intel CPUs have always interpreted SSE prefetch instructions as
3422 NOPs, so we can enable SSE prefetch instructions even when
3423 -mtune (rather than -march) points us to a processor that has them.
3424 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3425 higher processors. */
3426 if (TARGET_CMOVE
3427 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3428 x86_prefetch_sse = true;
3429 break;
3430 }
3431
3432 if (ix86_tune_specified && i == pta_size)
3433 error ("bad value (%s) for %stune=%s %s",
3434 ix86_tune_string, prefix, suffix, sw);
3435
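  /* initial_ix86_tune_features[] holds one bitmask per tuning flag,
     with one bit per processor; testing the bit for the selected
     tuning yields the boolean ix86_tune_features[] table that the
     rest of the backend consults.  */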
3436 ix86_tune_mask = 1u << ix86_tune;
3437 for (i = 0; i < X86_TUNE_LAST; ++i)
3438 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3439
3440 #ifndef USE_IX86_FRAME_POINTER
3441 #define USE_IX86_FRAME_POINTER 0
3442 #endif
3443
3444 #ifndef USE_X86_64_FRAME_POINTER
3445 #define USE_X86_64_FRAME_POINTER 0
3446 #endif
3447
3448 /* Set the default values for switches whose default depends on TARGET_64BIT
3449 in case they weren't overwritten by command line options. */
3450 if (TARGET_64BIT)
3451 {
3452 if (optimize > 1 && !global_options_set.x_flag_zee)
3453 flag_zee = 1;
3454 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3455 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3456 if (flag_asynchronous_unwind_tables == 2)
3457 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3458 if (flag_pcc_struct_return == 2)
3459 flag_pcc_struct_return = 0;
3460 }
3461 else
3462 {
3463 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3464 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3465 if (flag_asynchronous_unwind_tables == 2)
3466 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3467 if (flag_pcc_struct_return == 2)
3468 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3469 }
3470
3471 if (optimize_size)
3472 ix86_cost = &ix86_size_cost;
3473 else
3474 ix86_cost = processor_target_table[ix86_tune].cost;
3475
3476 /* Arrange to set up i386_stack_locals for all functions. */
3477 init_machine_status = ix86_init_machine_status;
3478
3479 /* Validate -mregparm= value. */
3480 if (global_options_set.x_ix86_regparm)
3481 {
3482 if (TARGET_64BIT)
3483 warning (0, "-mregparm is ignored in 64-bit mode");
3484 if (ix86_regparm > REGPARM_MAX)
3485 {
3486 error ("-mregparm=%d is not between 0 and %d",
3487 ix86_regparm, REGPARM_MAX);
3488 ix86_regparm = 0;
3489 }
3490 }
3491 if (TARGET_64BIT)
3492 ix86_regparm = REGPARM_MAX;
3493
3494 /* Default align_* from the processor table. */
3495 if (align_loops == 0)
3496 {
3497 align_loops = processor_target_table[ix86_tune].align_loop;
3498 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3499 }
3500 if (align_jumps == 0)
3501 {
3502 align_jumps = processor_target_table[ix86_tune].align_jump;
3503 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3504 }
3505 if (align_functions == 0)
3506 {
3507 align_functions = processor_target_table[ix86_tune].align_func;
3508 }
3509
3510 /* Provide default for -mbranch-cost= value. */
3511 if (!global_options_set.x_ix86_branch_cost)
3512 ix86_branch_cost = ix86_cost->branch_cost;
3513
3514 if (TARGET_64BIT)
3515 {
3516 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3517
3518 /* Enable by default the SSE and MMX builtins. Do allow the user to
3519 explicitly disable any of these. In particular, disabling SSE and
3520 MMX for kernel code is extremely useful. */
3521 if (!ix86_arch_specified)
3522 ix86_isa_flags
3523 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3524 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3525
3526 if (TARGET_RTD)
3527 warning (0, "%srtd%s is ignored in 64-bit mode", prefix, suffix);
3528 }
3529 else
3530 {
3531 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3532
3533 if (!ix86_arch_specified)
3534 ix86_isa_flags
3535 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3536
3537 /* The i386 ABI does not specify a red zone. It still makes sense to use
3538 one when the programmer takes care to keep the stack from being destroyed. */
3539 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3540 target_flags |= MASK_NO_RED_ZONE;
3541 }
3542
3543 /* Keep nonleaf frame pointers. */
3544 if (flag_omit_frame_pointer)
3545 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3546 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3547 flag_omit_frame_pointer = 1;
3548
3549 /* If we're doing fast math, we don't care about comparison order
3550 wrt NaNs. This lets us use a shorter comparison sequence. */
3551 if (flag_finite_math_only)
3552 target_flags &= ~MASK_IEEE_FP;
3553
3554 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3555 since the insns won't need emulation. */
3556 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3557 target_flags &= ~MASK_NO_FANCY_MATH_387;
3558
3559 /* Likewise, if the target doesn't have a 387, or we've specified
3560 software floating point, don't use 387 inline intrinsics. */
3561 if (!TARGET_80387)
3562 target_flags |= MASK_NO_FANCY_MATH_387;
3563
3564 /* Turn on MMX builtins for -msse. */
3565 if (TARGET_SSE)
3566 {
3567 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3568 x86_prefetch_sse = true;
3569 }
3570
3571 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
3572 if (TARGET_SSE4_2 || TARGET_ABM)
3573 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3574
3575 /* Turn on lzcnt instruction for -mabm. */
3576 if (TARGET_ABM)
3577 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3578
3579 /* Validate -mpreferred-stack-boundary= value or default it to
3580 PREFERRED_STACK_BOUNDARY_DEFAULT. */
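  /* The argument is the log2 of the boundary in bytes; for example
     (illustrative), -mpreferred-stack-boundary=4 requests a 16-byte
     boundary, converted below as (1 << 4) * BITS_PER_UNIT = 128 bits.  */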
3581 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3582 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3583 {
3584 int min = (TARGET_64BIT ? 4 : 2);
3585 int max = (TARGET_SEH ? 4 : 12);
3586
3587 if (ix86_preferred_stack_boundary_arg < min
3588 || ix86_preferred_stack_boundary_arg > max)
3589 {
3590 if (min == max)
3591 error ("-mpreferred-stack-boundary is not supported "
3592 "for this target");
3593 else
3594 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3595 ix86_preferred_stack_boundary_arg, min, max);
3596 }
3597 else
3598 ix86_preferred_stack_boundary
3599 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
3600 }
3601
3602 /* Set the default value for -mstackrealign. */
3603 if (ix86_force_align_arg_pointer == -1)
3604 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3605
3606 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3607
3608 /* Validate -mincoming-stack-boundary= value or default it to
3609 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3610 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3611 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3612 {
3613 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3614 || ix86_incoming_stack_boundary_arg > 12)
3615 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3616 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3617 else
3618 {
3619 ix86_user_incoming_stack_boundary
3620 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3621 ix86_incoming_stack_boundary
3622 = ix86_user_incoming_stack_boundary;
3623 }
3624 }
3625
3626 /* Accept -msseregparm only if at least SSE support is enabled. */
3627 if (TARGET_SSEREGPARM
3628 && ! TARGET_SSE)
3629 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3630
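  /* Sanity-check an explicit -mfpmath= request: fall back to 387 math
     if SSE math was requested but SSE is not enabled, and to SSE-only
     math if both units were requested but the 80387 is unavailable.  */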
3631 if (global_options_set.x_ix86_fpmath)
3632 {
3633 if (ix86_fpmath & FPMATH_SSE)
3634 {
3635 if (!TARGET_SSE)
3636 {
3637 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3638 ix86_fpmath = FPMATH_387;
3639 }
3640 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3641 {
3642 warning (0, "387 instruction set disabled, using SSE arithmetics");
3643 ix86_fpmath = FPMATH_SSE;
3644 }
3645 }
3646 }
3647 else
3648 ix86_fpmath = TARGET_FPMATH_DEFAULT;
3649
3650 /* If the i387 is disabled, then do not return values in it. */
3651 if (!TARGET_80387)
3652 target_flags &= ~MASK_FLOAT_RETURNS;
3653
3654 /* Use external vectorized library in vectorizing intrinsics. */
3655 if (global_options_set.x_ix86_veclibabi_type)
3656 switch (ix86_veclibabi_type)
3657 {
3658 case ix86_veclibabi_type_svml:
3659 ix86_veclib_handler = ix86_veclibabi_svml;
3660 break;
3661
3662 case ix86_veclibabi_type_acml:
3663 ix86_veclib_handler = ix86_veclibabi_acml;
3664 break;
3665
3666 default:
3667 gcc_unreachable ();
3668 }
3669
3670 if ((!USE_IX86_FRAME_POINTER
3671 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3672 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3673 && !optimize_size)
3674 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3675
3676 /* ??? Unwind info is not correct around the CFG unless either a frame
3677 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3678 unwind info generation to be aware of the CFG and propagating states
3679 around edges. */
3680 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3681 || flag_exceptions || flag_non_call_exceptions)
3682 && flag_omit_frame_pointer
3683 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3684 {
3685 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3686 warning (0, "unwind tables currently require either a frame pointer "
3687 "or %saccumulate-outgoing-args%s for correctness",
3688 prefix, suffix);
3689 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3690 }
3691
3692 /* If stack probes are required, the space used for large function
3693 arguments on the stack must also be probed, so enable
3694 -maccumulate-outgoing-args so this happens in the prologue. */
3695 if (TARGET_STACK_PROBE
3696 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3697 {
3698 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3699 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3700 "for correctness", prefix, suffix);
3701 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3702 }
3703
3704 /* For sane SSE instruction set generation we need the fcomi instruction.
3705 It is safe to enable all CMOVE instructions. Also, the RDRAND intrinsic
3706 expands to a sequence that includes a conditional move. */
3707 if (TARGET_SSE || TARGET_RDRND)
3708 TARGET_CMOVE = 1;
3709
3710 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3711 {
3712 char *p;
3713 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3714 p = strchr (internal_label_prefix, 'X');
3715 internal_label_prefix_len = p - internal_label_prefix;
3716 *p = '\0';
3717 }
3718
3719 /* When no scheduling description is available, disable the scheduler passes
3720 so they don't slow down compilation or make x87 code slower. */
3721 if (!TARGET_SCHEDULE)
3722 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3723
3724 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3725 ix86_cost->simultaneous_prefetches,
3726 global_options.x_param_values,
3727 global_options_set.x_param_values);
3728 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE, ix86_cost->prefetch_block,
3729 global_options.x_param_values,
3730 global_options_set.x_param_values);
3731 maybe_set_param_value (PARAM_L1_CACHE_SIZE, ix86_cost->l1_cache_size,
3732 global_options.x_param_values,
3733 global_options_set.x_param_values);
3734 maybe_set_param_value (PARAM_L2_CACHE_SIZE, ix86_cost->l2_cache_size,
3735 global_options.x_param_values,
3736 global_options_set.x_param_values);
3737
3738 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
3739 if (flag_prefetch_loop_arrays < 0
3740 && HAVE_prefetch
3741 && optimize >= 3
3742 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3743 flag_prefetch_loop_arrays = 1;
3744
3745 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3746 can be optimized to ap = __builtin_next_arg (0). */
3747 if (!TARGET_64BIT && !flag_split_stack)
3748 targetm.expand_builtin_va_start = NULL;
3749
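  /* Bind the ix86_gen_* helpers to the DImode insn generators in 64-bit
     mode and to their SImode counterparts otherwise, so later code can
     emit word-size operations without re-checking TARGET_64BIT.  */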
3750 if (TARGET_64BIT)
3751 {
3752 ix86_gen_leave = gen_leave_rex64;
3753 ix86_gen_add3 = gen_adddi3;
3754 ix86_gen_sub3 = gen_subdi3;
3755 ix86_gen_sub3_carry = gen_subdi3_carry;
3756 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3757 ix86_gen_monitor = gen_sse3_monitor64;
3758 ix86_gen_andsp = gen_anddi3;
3759 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3760 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3761 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3762 }
3763 else
3764 {
3765 ix86_gen_leave = gen_leave;
3766 ix86_gen_add3 = gen_addsi3;
3767 ix86_gen_sub3 = gen_subsi3;
3768 ix86_gen_sub3_carry = gen_subsi3_carry;
3769 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3770 ix86_gen_monitor = gen_sse3_monitor;
3771 ix86_gen_andsp = gen_andsi3;
3772 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3773 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3774 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3775 }
3776
3777 #ifdef USE_IX86_CLD
3778 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3779 if (!TARGET_64BIT)
3780 target_flags |= MASK_CLD & ~target_flags_explicit;
3781 #endif
3782
3783 if (!TARGET_64BIT && flag_pic)
3784 {
3785 if (flag_fentry > 0)
3786 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3787 "with -fpic");
3788 flag_fentry = 0;
3789 }
3790 else if (TARGET_SEH)
3791 {
3792 if (flag_fentry == 0)
3793 sorry ("-mno-fentry isn%'t compatible with SEH");
3794 flag_fentry = 1;
3795 }
3796 else if (flag_fentry < 0)
3797 {
3798 #if defined(PROFILE_BEFORE_PROLOGUE)
3799 flag_fentry = 1;
3800 #else
3801 flag_fentry = 0;
3802 #endif
3803 }
3804
3805 if (TARGET_AVX)
3806 {
3807 /* When not optimizing for size, enable the vzeroupper optimization for
3808 TARGET_AVX with -fexpensive-optimizations, and split 32-byte
3809 AVX unaligned loads/stores. */
3810 if (!optimize_size)
3811 {
3812 if (flag_expensive_optimizations
3813 && !(target_flags_explicit & MASK_VZEROUPPER))
3814 target_flags |= MASK_VZEROUPPER;
3815 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
3816 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3817 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3818 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
3819 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3820 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3821 /* Enable 128-bit AVX instruction generation for the auto-vectorizer. */
3822 if (TARGET_AVX128_OPTIMAL && !(target_flags_explicit & MASK_PREFER_AVX128))
3823 target_flags |= MASK_PREFER_AVX128;
3824 }
3825 }
3826 else
3827 {
3828 /* Disable vzeroupper pass if TARGET_AVX is disabled. */
3829 target_flags &= ~MASK_VZEROUPPER;
3830 }
3831
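  /* Parse the comma-separated -mrecip= string.  A leading '!' on a
     token clears the corresponding bits instead of setting them, and
     "default" stands for RECIP_MASK_ALL.  For example (illustrative),
     -mrecip=all,!sqrt enables every reciprocal approximation except
     the scalar square root.  */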
3832 if (ix86_recip_name)
3833 {
3834 char *p = ASTRDUP (ix86_recip_name);
3835 char *q;
3836 unsigned int mask, i;
3837 bool invert;
3838
3839 while ((q = strtok (p, ",")) != NULL)
3840 {
3841 p = NULL;
3842 if (*q == '!')
3843 {
3844 invert = true;
3845 q++;
3846 }
3847 else
3848 invert = false;
3849
3850 if (!strcmp (q, "default"))
3851 mask = RECIP_MASK_ALL;
3852 else
3853 {
3854 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
3855 if (!strcmp (q, recip_options[i].string))
3856 {
3857 mask = recip_options[i].mask;
3858 break;
3859 }
3860
3861 if (i == ARRAY_SIZE (recip_options))
3862 {
3863 error ("unknown option for -mrecip=%s", q);
3864 invert = false;
3865 mask = RECIP_MASK_NONE;
3866 }
3867 }
3868
3869 recip_mask_explicit |= mask;
3870 if (invert)
3871 recip_mask &= ~mask;
3872 else
3873 recip_mask |= mask;
3874 }
3875 }
3876
3877 if (TARGET_RECIP)
3878 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
3879 else if (target_flags_explicit & MASK_RECIP)
3880 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
3881
3882 /* Save the initial options in case the user uses function-specific
3883 options. */
3884 if (main_args_p)
3885 target_option_default_node = target_option_current_node
3886 = build_target_option_node ();
3887 }
3888
3889 /* Return TRUE if VAL is passed in register with 256bit AVX modes. */
3890
3891 static bool
3892 function_pass_avx256_p (const_rtx val)
3893 {
3894 if (!val)
3895 return false;
3896
3897 if (REG_P (val) && VALID_AVX256_REG_MODE (GET_MODE (val)))
3898 return true;
3899
3900 if (GET_CODE (val) == PARALLEL)
3901 {
3902 int i;
3903 rtx r;
3904
3905 for (i = XVECLEN (val, 0) - 1; i >= 0; i--)
3906 {
3907 r = XVECEXP (val, 0, i);
3908 if (GET_CODE (r) == EXPR_LIST
3909 && XEXP (r, 0)
3910 && REG_P (XEXP (r, 0))
3911 && (GET_MODE (XEXP (r, 0)) == OImode
3912 || VALID_AVX256_REG_MODE (GET_MODE (XEXP (r, 0)))))
3913 return true;
3914 }
3915 }
3916
3917 return false;
3918 }
3919
3920 /* Implement the TARGET_OPTION_OVERRIDE hook. */
3921
3922 static void
3923 ix86_option_override (void)
3924 {
3925 ix86_option_override_internal (true);
3926 }
3927
3928 /* Update register usage after having seen the compiler flags. */
3929
3930 static void
3931 ix86_conditional_register_usage (void)
3932 {
3933 int i;
3934 unsigned int j;
3935
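  /* Entries in fixed_regs[] and call_used_regs[] greater than 1 encode a
     mode-dependent default: 2 means the register is fixed (or call-used)
     only in 32-bit mode, 3 only in 64-bit mode.  The loop below collapses
     them to 0 or 1 for the current mode.  */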
3936 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3937 {
3938 if (fixed_regs[i] > 1)
3939 fixed_regs[i] = (fixed_regs[i] == (TARGET_64BIT ? 3 : 2));
3940 if (call_used_regs[i] > 1)
3941 call_used_regs[i] = (call_used_regs[i] == (TARGET_64BIT ? 3 : 2));
3942 }
3943
3944 /* The PIC register, if it exists, is fixed. */
3945 j = PIC_OFFSET_TABLE_REGNUM;
3946 if (j != INVALID_REGNUM)
3947 fixed_regs[j] = call_used_regs[j] = 1;
3948
3949 /* The 64-bit MS_ABI changes the set of call-used registers. */
3950 if (TARGET_64BIT_MS_ABI)
3951 {
3952 call_used_regs[SI_REG] = 0;
3953 call_used_regs[DI_REG] = 0;
3954 call_used_regs[XMM6_REG] = 0;
3955 call_used_regs[XMM7_REG] = 0;
3956 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3957 call_used_regs[i] = 0;
3958 }
3959
3960 /* The default setting of CLOBBERED_REGS is for 32-bit; add in the
3961 other call-clobbered regs for 64-bit. */
3962 if (TARGET_64BIT)
3963 {
3964 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
3965
3966 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3967 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
3968 && call_used_regs[i])
3969 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
3970 }
3971
3972 /* If MMX is disabled, squash the registers. */
3973 if (! TARGET_MMX)
3974 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3975 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
3976 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3977
3978 /* If SSE is disabled, squash the registers. */
3979 if (! TARGET_SSE)
3980 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3981 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
3982 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3983
3984 /* If the FPU is disabled, squash the registers. */
3985 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
3986 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3987 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
3988 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3989
3990 /* If 32-bit, squash the 64-bit registers. */
3991 if (! TARGET_64BIT)
3992 {
3993 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
3994 reg_names[i] = "";
3995 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3996 reg_names[i] = "";
3997 }
3998 }
3999
4000 \f
4001 /* Save the current options */
4002
4003 static void
4004 ix86_function_specific_save (struct cl_target_option *ptr)
4005 {
4006 ptr->arch = ix86_arch;
4007 ptr->schedule = ix86_schedule;
4008 ptr->tune = ix86_tune;
4009 ptr->branch_cost = ix86_branch_cost;
4010 ptr->tune_defaulted = ix86_tune_defaulted;
4011 ptr->arch_specified = ix86_arch_specified;
4012 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4013 ptr->ix86_target_flags_explicit = target_flags_explicit;
4014 ptr->x_recip_mask_explicit = recip_mask_explicit;
4015
4016 /* The fields are char but the variables are not; make sure the
4017 values fit in the fields. */
4018 gcc_assert (ptr->arch == ix86_arch);
4019 gcc_assert (ptr->schedule == ix86_schedule);
4020 gcc_assert (ptr->tune == ix86_tune);
4021 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4022 }
4023
4024 /* Restore the current options */
4025
4026 static void
4027 ix86_function_specific_restore (struct cl_target_option *ptr)
4028 {
4029 enum processor_type old_tune = ix86_tune;
4030 enum processor_type old_arch = ix86_arch;
4031 unsigned int ix86_arch_mask, ix86_tune_mask;
4032 int i;
4033
4034 ix86_arch = (enum processor_type) ptr->arch;
4035 ix86_schedule = (enum attr_cpu) ptr->schedule;
4036 ix86_tune = (enum processor_type) ptr->tune;
4037 ix86_branch_cost = ptr->branch_cost;
4038 ix86_tune_defaulted = ptr->tune_defaulted;
4039 ix86_arch_specified = ptr->arch_specified;
4040 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4041 target_flags_explicit = ptr->ix86_target_flags_explicit;
4042 recip_mask_explicit = ptr->x_recip_mask_explicit;
4043
4044 /* Recreate the arch feature tests if the arch changed */
4045 if (old_arch != ix86_arch)
4046 {
4047 ix86_arch_mask = 1u << ix86_arch;
4048 for (i = 0; i < X86_ARCH_LAST; ++i)
4049 ix86_arch_features[i]
4050 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4051 }
4052
4053 /* Recreate the tune optimization tests */
4054 if (old_tune != ix86_tune)
4055 {
4056 ix86_tune_mask = 1u << ix86_tune;
4057 for (i = 0; i < X86_TUNE_LAST; ++i)
4058 ix86_tune_features[i]
4059 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4060 }
4061 }
4062
4063 /* Print the current options */
4064
4065 static void
4066 ix86_function_specific_print (FILE *file, int indent,
4067 struct cl_target_option *ptr)
4068 {
4069 char *target_string
4070 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4071 NULL, NULL, ptr->x_ix86_fpmath, false);
4072
4073 fprintf (file, "%*sarch = %d (%s)\n",
4074 indent, "",
4075 ptr->arch,
4076 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4077 ? cpu_names[ptr->arch]
4078 : "<unknown>"));
4079
4080 fprintf (file, "%*stune = %d (%s)\n",
4081 indent, "",
4082 ptr->tune,
4083 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4084 ? cpu_names[ptr->tune]
4085 : "<unknown>"));
4086
4087 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4088
4089 if (target_string)
4090 {
4091 fprintf (file, "%*s%s\n", indent, "", target_string);
4092 free (target_string);
4093 }
4094 }
4095
4096 \f
4097 /* Inner function to process the attribute((target(...))), take an argument and
4098 set the current options from the argument. If we have a list, recursively go
4099 over the list. */
4100
4101 static bool
4102 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4103 struct gcc_options *enum_opts_set)
4104 {
4105 char *next_optstr;
4106 bool ret = true;
4107
4108 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4109 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4110 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4111 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4112 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4113
4114 enum ix86_opt_type
4115 {
4116 ix86_opt_unknown,
4117 ix86_opt_yes,
4118 ix86_opt_no,
4119 ix86_opt_str,
4120 ix86_opt_enum,
4121 ix86_opt_isa
4122 };
4123
4124 static const struct
4125 {
4126 const char *string;
4127 size_t len;
4128 enum ix86_opt_type type;
4129 int opt;
4130 int mask;
4131 } attrs[] = {
4132 /* isa options */
4133 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4134 IX86_ATTR_ISA ("abm", OPT_mabm),
4135 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4136 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4137 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4138 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4139 IX86_ATTR_ISA ("aes", OPT_maes),
4140 IX86_ATTR_ISA ("avx", OPT_mavx),
4141 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4142 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4143 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4144 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4145 IX86_ATTR_ISA ("sse", OPT_msse),
4146 IX86_ATTR_ISA ("sse2", OPT_msse2),
4147 IX86_ATTR_ISA ("sse3", OPT_msse3),
4148 IX86_ATTR_ISA ("sse4", OPT_msse4),
4149 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4150 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4151 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4152 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4153 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4154 IX86_ATTR_ISA ("fma", OPT_mfma),
4155 IX86_ATTR_ISA ("xop", OPT_mxop),
4156 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4157 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4158 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4159 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4160
4161 /* enum options */
4162 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4163
4164 /* string options */
4165 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4166 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4167
4168 /* flag options */
4169 IX86_ATTR_YES ("cld",
4170 OPT_mcld,
4171 MASK_CLD),
4172
4173 IX86_ATTR_NO ("fancy-math-387",
4174 OPT_mfancy_math_387,
4175 MASK_NO_FANCY_MATH_387),
4176
4177 IX86_ATTR_YES ("ieee-fp",
4178 OPT_mieee_fp,
4179 MASK_IEEE_FP),
4180
4181 IX86_ATTR_YES ("inline-all-stringops",
4182 OPT_minline_all_stringops,
4183 MASK_INLINE_ALL_STRINGOPS),
4184
4185 IX86_ATTR_YES ("inline-stringops-dynamically",
4186 OPT_minline_stringops_dynamically,
4187 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4188
4189 IX86_ATTR_NO ("align-stringops",
4190 OPT_mno_align_stringops,
4191 MASK_NO_ALIGN_STRINGOPS),
4192
4193 IX86_ATTR_YES ("recip",
4194 OPT_mrecip,
4195 MASK_RECIP),
4196
4197 };
4198
4199 /* If this is a list, recurse to get the options. */
4200 if (TREE_CODE (args) == TREE_LIST)
4201 {
4202 bool ret = true;
4203
4204 for (; args; args = TREE_CHAIN (args))
4205 if (TREE_VALUE (args)
4206 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4207 p_strings, enum_opts_set))
4208 ret = false;
4209
4210 return ret;
4211 }
4212
4213 else if (TREE_CODE (args) != STRING_CST)
4214 gcc_unreachable ();
4215
4216 /* Handle multiple arguments separated by commas. */
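  /* For example (illustrative only),
       __attribute__((target ("sse4.1,no-ieee-fp")))
     enables the SSE4.1 ISA option for the function and clears
     MASK_IEEE_FP, while "arch=" and "tune=" values are collected into
     p_strings[] for later processing.  */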
4217 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4218
4219 while (next_optstr && *next_optstr != '\0')
4220 {
4221 char *p = next_optstr;
4222 char *orig_p = p;
4223 char *comma = strchr (next_optstr, ',');
4224 const char *opt_string;
4225 size_t len, opt_len;
4226 int opt;
4227 bool opt_set_p;
4228 char ch;
4229 unsigned i;
4230 enum ix86_opt_type type = ix86_opt_unknown;
4231 int mask = 0;
4232
4233 if (comma)
4234 {
4235 *comma = '\0';
4236 len = comma - next_optstr;
4237 next_optstr = comma + 1;
4238 }
4239 else
4240 {
4241 len = strlen (p);
4242 next_optstr = NULL;
4243 }
4244
4245 /* Recognize no-xxx. */
4246 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4247 {
4248 opt_set_p = false;
4249 p += 3;
4250 len -= 3;
4251 }
4252 else
4253 opt_set_p = true;
4254
4255 /* Find the option. */
4256 ch = *p;
4257 opt = N_OPTS;
4258 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4259 {
4260 type = attrs[i].type;
4261 opt_len = attrs[i].len;
4262 if (ch == attrs[i].string[0]
4263 && ((type != ix86_opt_str && type != ix86_opt_enum)
4264 ? len == opt_len
4265 : len > opt_len)
4266 && memcmp (p, attrs[i].string, opt_len) == 0)
4267 {
4268 opt = attrs[i].opt;
4269 mask = attrs[i].mask;
4270 opt_string = attrs[i].string;
4271 break;
4272 }
4273 }
4274
4275 /* Process the option. */
4276 if (opt == N_OPTS)
4277 {
4278 error ("attribute(target(\"%s\")) is unknown", orig_p);
4279 ret = false;
4280 }
4281
4282 else if (type == ix86_opt_isa)
4283 {
4284 struct cl_decoded_option decoded;
4285
4286 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4287 ix86_handle_option (&global_options, &global_options_set,
4288 &decoded, input_location);
4289 }
4290
4291 else if (type == ix86_opt_yes || type == ix86_opt_no)
4292 {
4293 if (type == ix86_opt_no)
4294 opt_set_p = !opt_set_p;
4295
4296 if (opt_set_p)
4297 target_flags |= mask;
4298 else
4299 target_flags &= ~mask;
4300 }
4301
4302 else if (type == ix86_opt_str)
4303 {
4304 if (p_strings[opt])
4305 {
4306 error ("option(\"%s\") was already specified", opt_string);
4307 ret = false;
4308 }
4309 else
4310 p_strings[opt] = xstrdup (p + opt_len);
4311 }
4312
4313 else if (type == ix86_opt_enum)
4314 {
4315 bool arg_ok;
4316 int value;
4317
4318 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4319 if (arg_ok)
4320 set_option (&global_options, enum_opts_set, opt, value,
4321 p + opt_len, DK_UNSPECIFIED, input_location,
4322 global_dc);
4323 else
4324 {
4325 error ("attribute(target(\"%s\")) is unknown", orig_p);
4326 ret = false;
4327 }
4328 }
4329
4330 else
4331 gcc_unreachable ();
4332 }
4333
4334 return ret;
4335 }
4336
4337 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4338
4339 tree
4340 ix86_valid_target_attribute_tree (tree args)
4341 {
4342 const char *orig_arch_string = ix86_arch_string;
4343 const char *orig_tune_string = ix86_tune_string;
4344 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4345 int orig_tune_defaulted = ix86_tune_defaulted;
4346 int orig_arch_specified = ix86_arch_specified;
4347 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4348 tree t = NULL_TREE;
4349 int i;
4350 struct cl_target_option *def
4351 = TREE_TARGET_OPTION (target_option_default_node);
4352 struct gcc_options enum_opts_set;
4353
4354 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4355
4356 /* Process each of the options on the chain. */
4357 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4358 &enum_opts_set))
4359 return NULL_TREE;
4360
4361 /* If the changed options are different from the default, rerun
4362 ix86_option_override_internal, and then save the options away.
4363 The string options are attribute options, and will be undone
4364 when we copy the save structure. */
4365 if (ix86_isa_flags != def->x_ix86_isa_flags
4366 || target_flags != def->x_target_flags
4367 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4368 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4369 || enum_opts_set.x_ix86_fpmath)
4370 {
4371 /* If we are using the default tune= or arch=, undo the string assigned,
4372 and use the default. */
4373 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4374 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4375 else if (!orig_arch_specified)
4376 ix86_arch_string = NULL;
4377
4378 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4379 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4380 else if (orig_tune_defaulted)
4381 ix86_tune_string = NULL;
4382
4383 /* If fpmath= is not set, and we now have SSE on 32-bit, use it. */
4384 if (enum_opts_set.x_ix86_fpmath)
4385 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4386 else if (!TARGET_64BIT && TARGET_SSE)
4387 {
4388 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4389 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4390 }
4391
4392 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4393 ix86_option_override_internal (false);
4394
4395 /* Add any builtin functions with the new isa if any. */
4396 ix86_add_new_builtins (ix86_isa_flags);
4397
4398 /* Save the current options unless we are validating options for
4399 #pragma. */
4400 t = build_target_option_node ();
4401
4402 ix86_arch_string = orig_arch_string;
4403 ix86_tune_string = orig_tune_string;
4404 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4405
4406 /* Free up memory allocated to hold the strings */
4407 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4408 free (option_strings[i]);
4409 }
4410
4411 return t;
4412 }
4413
4414 /* Hook to validate attribute((target("string"))). */
4415
4416 static bool
4417 ix86_valid_target_attribute_p (tree fndecl,
4418 tree ARG_UNUSED (name),
4419 tree args,
4420 int ARG_UNUSED (flags))
4421 {
4422 struct cl_target_option cur_target;
4423 bool ret = true;
4424 tree old_optimize = build_optimization_node ();
4425 tree new_target, new_optimize;
4426 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4427
4428 /* If the function changed the optimization levels as well as setting target
4429 options, start with the optimizations specified. */
4430 if (func_optimize && func_optimize != old_optimize)
4431 cl_optimization_restore (&global_options,
4432 TREE_OPTIMIZATION (func_optimize));
4433
4434 /* The target attributes may also change some optimization flags, so update
4435 the optimization options if necessary. */
4436 cl_target_option_save (&cur_target, &global_options);
4437 new_target = ix86_valid_target_attribute_tree (args);
4438 new_optimize = build_optimization_node ();
4439
4440 if (!new_target)
4441 ret = false;
4442
4443 else if (fndecl)
4444 {
4445 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4446
4447 if (old_optimize != new_optimize)
4448 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4449 }
4450
4451 cl_target_option_restore (&global_options, &cur_target);
4452
4453 if (old_optimize != new_optimize)
4454 cl_optimization_restore (&global_options,
4455 TREE_OPTIMIZATION (old_optimize));
4456
4457 return ret;
4458 }
4459
4460 \f
4461 /* Hook to determine if one function can safely inline another. */
4462
4463 static bool
4464 ix86_can_inline_p (tree caller, tree callee)
4465 {
4466 bool ret = false;
4467 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4468 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4469
4470 /* If callee has no option attributes, then it is ok to inline. */
4471 if (!callee_tree)
4472 ret = true;
4473
4474 /* If caller has no option attributes, but callee does then it is not ok to
4475 inline. */
4476 else if (!caller_tree)
4477 ret = false;
4478
4479 else
4480 {
4481 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4482 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4483
4484 /* The callee's ISA options should be a subset of the caller's, i.e. an SSE4
4485 function can inline an SSE2 function, but an SSE2 function can't inline
4486 an SSE4 function. */
4487 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4488 != callee_opts->x_ix86_isa_flags)
4489 ret = false;
4490
4491 /* See if we have the same non-isa options. */
4492 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4493 ret = false;
4494
4495 /* See if arch, tune, etc. are the same. */
4496 else if (caller_opts->arch != callee_opts->arch)
4497 ret = false;
4498
4499 else if (caller_opts->tune != callee_opts->tune)
4500 ret = false;
4501
4502 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4503 ret = false;
4504
4505 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4506 ret = false;
4507
4508 else
4509 ret = true;
4510 }
4511
4512 return ret;
4513 }
4514
4515 \f
4516 /* Remember the last target of ix86_set_current_function. */
4517 static GTY(()) tree ix86_previous_fndecl;
4518
4519 /* Establish appropriate back-end context for processing the function
4520 FNDECL. The argument might be NULL to indicate processing at top
4521 level, outside of any function scope. */
4522 static void
4523 ix86_set_current_function (tree fndecl)
4524 {
4525 /* Only change the context if the function changes. This hook is called
4526 several times in the course of compiling a function, and we don't want to
4527 slow things down too much or call target_reinit when it isn't safe. */
4528 if (fndecl && fndecl != ix86_previous_fndecl)
4529 {
4530 tree old_tree = (ix86_previous_fndecl
4531 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4532 : NULL_TREE);
4533
4534 tree new_tree = (fndecl
4535 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4536 : NULL_TREE);
4537
4538 ix86_previous_fndecl = fndecl;
4539 if (old_tree == new_tree)
4540 ;
4541
4542 else if (new_tree)
4543 {
4544 cl_target_option_restore (&global_options,
4545 TREE_TARGET_OPTION (new_tree));
4546 target_reinit ();
4547 }
4548
4549 else if (old_tree)
4550 {
4551 struct cl_target_option *def
4552 = TREE_TARGET_OPTION (target_option_current_node);
4553
4554 cl_target_option_restore (&global_options, def);
4555 target_reinit ();
4556 }
4557 }
4558 }
4559
4560 \f
4561 /* Return true if this goes in large data/bss. */
4562
4563 static bool
4564 ix86_in_large_data_p (tree exp)
4565 {
4566 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4567 return false;
4568
4569 /* Functions are never large data. */
4570 if (TREE_CODE (exp) == FUNCTION_DECL)
4571 return false;
4572
4573 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4574 {
4575 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4576 if (strcmp (section, ".ldata") == 0
4577 || strcmp (section, ".lbss") == 0)
4578 return true;
4579 return false;
4580 }
4581 else
4582 {
4583 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4584
4585 /* If this is an incomplete type with size 0, then we can't put it
4586 in data because it might be too big when completed. */
4587 if (!size || size > ix86_section_threshold)
4588 return true;
4589 }
4590
4591 return false;
4592 }
4593
4594 /* Switch to the appropriate section for output of DECL.
4595 DECL is either a `VAR_DECL' node or a constant of some sort.
4596 RELOC indicates whether forming the initial value of DECL requires
4597 link-time relocations. */
4598
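/* For the medium code model, objects larger than ix86_section_threshold
   (or explicitly placed in .ldata/.lbss) are routed to the large-data
   sections chosen below rather than the default ELF sections.  */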
4599 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4600 ATTRIBUTE_UNUSED;
4601
4602 static section *
4603 x86_64_elf_select_section (tree decl, int reloc,
4604 unsigned HOST_WIDE_INT align)
4605 {
4606 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4607 && ix86_in_large_data_p (decl))
4608 {
4609 const char *sname = NULL;
4610 unsigned int flags = SECTION_WRITE;
4611 switch (categorize_decl_for_section (decl, reloc))
4612 {
4613 case SECCAT_DATA:
4614 sname = ".ldata";
4615 break;
4616 case SECCAT_DATA_REL:
4617 sname = ".ldata.rel";
4618 break;
4619 case SECCAT_DATA_REL_LOCAL:
4620 sname = ".ldata.rel.local";
4621 break;
4622 case SECCAT_DATA_REL_RO:
4623 sname = ".ldata.rel.ro";
4624 break;
4625 case SECCAT_DATA_REL_RO_LOCAL:
4626 sname = ".ldata.rel.ro.local";
4627 break;
4628 case SECCAT_BSS:
4629 sname = ".lbss";
4630 flags |= SECTION_BSS;
4631 break;
4632 case SECCAT_RODATA:
4633 case SECCAT_RODATA_MERGE_STR:
4634 case SECCAT_RODATA_MERGE_STR_INIT:
4635 case SECCAT_RODATA_MERGE_CONST:
4636 sname = ".lrodata";
4637 flags = 0;
4638 break;
4639 case SECCAT_SRODATA:
4640 case SECCAT_SDATA:
4641 case SECCAT_SBSS:
4642 gcc_unreachable ();
4643 case SECCAT_TEXT:
4644 case SECCAT_TDATA:
4645 case SECCAT_TBSS:
4646 /* We don't split these for the medium model. Place them into
4647 default sections and hope for the best. */
4648 break;
4649 }
4650 if (sname)
4651 {
4652 /* We might get called with string constants, but get_named_section
4653 doesn't like them as they are not DECLs. Also, we need to set
4654 flags in that case. */
4655 if (!DECL_P (decl))
4656 return get_section (sname, flags, NULL);
4657 return get_named_section (decl, sname, reloc);
4658 }
4659 }
4660 return default_elf_select_section (decl, reloc, align);
4661 }
4662
4663 /* Build up a unique section name, expressed as a
4664 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4665 RELOC indicates whether the initial value of EXP requires
4666 link-time relocations. */
4667
4668 static void ATTRIBUTE_UNUSED
4669 x86_64_elf_unique_section (tree decl, int reloc)
4670 {
4671 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4672 && ix86_in_large_data_p (decl))
4673 {
4674 const char *prefix = NULL;
4675 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4676 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4677
4678 switch (categorize_decl_for_section (decl, reloc))
4679 {
4680 case SECCAT_DATA:
4681 case SECCAT_DATA_REL:
4682 case SECCAT_DATA_REL_LOCAL:
4683 case SECCAT_DATA_REL_RO:
4684 case SECCAT_DATA_REL_RO_LOCAL:
4685 prefix = one_only ? ".ld" : ".ldata";
4686 break;
4687 case SECCAT_BSS:
4688 prefix = one_only ? ".lb" : ".lbss";
4689 break;
4690 case SECCAT_RODATA:
4691 case SECCAT_RODATA_MERGE_STR:
4692 case SECCAT_RODATA_MERGE_STR_INIT:
4693 case SECCAT_RODATA_MERGE_CONST:
4694 prefix = one_only ? ".lr" : ".lrodata";
4695 break;
4696 case SECCAT_SRODATA:
4697 case SECCAT_SDATA:
4698 case SECCAT_SBSS:
4699 gcc_unreachable ();
4700 case SECCAT_TEXT:
4701 case SECCAT_TDATA:
4702 case SECCAT_TBSS:
4703 /* We don't split these for the medium model. Place them into
4704 default sections and hope for the best. */
4705 break;
4706 }
4707 if (prefix)
4708 {
4709 const char *name, *linkonce;
4710 char *string;
4711
4712 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4713 name = targetm.strip_name_encoding (name);
4714
4715 /* If we're using one_only, then there needs to be a .gnu.linkonce
4716 prefix to the section name. */
4717 linkonce = one_only ? ".gnu.linkonce" : "";
4718
4719 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4720
4721 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4722 return;
4723 }
4724 }
4725 default_unique_section (decl, reloc);
4726 }
4727
4728 #ifdef COMMON_ASM_OP
4729 /* This says how to output assembler code to declare an
4730 uninitialized external linkage data object.
4731
4732 For medium-model x86-64 we need to use the .largecomm pseudo-op for
4733 large objects. */
4734 void
4735 x86_elf_aligned_common (FILE *file,
4736 const char *name, unsigned HOST_WIDE_INT size,
4737 int align)
4738 {
4739 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4740 && size > (unsigned int)ix86_section_threshold)
4741 fputs (".largecomm\t", file);
4742 else
4743 fputs (COMMON_ASM_OP, file);
4744 assemble_name (file, name);
4745 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4746 size, align / BITS_PER_UNIT);
4747 }
4748 #endif
4749
4750 /* Utility function for targets to use in implementing
4751 ASM_OUTPUT_ALIGNED_BSS. */
4752
4753 void
4754 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4755 const char *name, unsigned HOST_WIDE_INT size,
4756 int align)
4757 {
4758 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4759 && size > (unsigned int)ix86_section_threshold)
4760 switch_to_section (get_named_section (decl, ".lbss", 0));
4761 else
4762 switch_to_section (bss_section);
4763 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4764 #ifdef ASM_DECLARE_OBJECT_NAME
4765 last_assemble_variable_decl = decl;
4766 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4767 #else
4768 /* The standard thing is to just output a label for the object. */
4769 ASM_OUTPUT_LABEL (file, name);
4770 #endif /* ASM_DECLARE_OBJECT_NAME */
4771 ASM_OUTPUT_SKIP (file, size ? size : 1);
4772 }
4773 \f
4774 /* Decide whether we must probe the stack before any space allocation
4775 on this target. It's essentially TARGET_STACK_PROBE except when
4776 -fstack-check causes the stack to be already probed differently. */
4777
4778 bool
4779 ix86_target_stack_probe (void)
4780 {
4781 /* Do not probe the stack twice if static stack checking is enabled. */
4782 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4783 return false;
4784
4785 return TARGET_STACK_PROBE;
4786 }
4787 \f
4788 /* Decide whether we can make a sibling call to a function. DECL is the
4789 declaration of the function being targeted by the call and EXP is the
4790 CALL_EXPR representing the call. */
4791
4792 static bool
4793 ix86_function_ok_for_sibcall (tree decl, tree exp)
4794 {
4795 tree type, decl_or_type;
4796 rtx a, b;
4797
4798 /* If we are generating position-independent code, we cannot sibcall
4799 optimize any indirect call, or a direct call to a global function,
4800 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
4801 if (!TARGET_MACHO
4802 && !TARGET_64BIT
4803 && flag_pic
4804 && (!decl || !targetm.binds_local_p (decl)))
4805 return false;
4806
4807 /* If we need to align the outgoing stack, then sibcalling would
4808 unalign the stack, which may break the called function. */
4809 if (ix86_minimum_incoming_stack_boundary (true)
4810 < PREFERRED_STACK_BOUNDARY)
4811 return false;
4812
4813 if (decl)
4814 {
4815 decl_or_type = decl;
4816 type = TREE_TYPE (decl);
4817 }
4818 else
4819 {
4820 /* We're looking at the CALL_EXPR, we need the type of the function. */
4821 type = CALL_EXPR_FN (exp); /* pointer expression */
4822 type = TREE_TYPE (type); /* pointer type */
4823 type = TREE_TYPE (type); /* function type */
4824 decl_or_type = type;
4825 }
4826
4827 /* Check that the return value locations are the same. Like
4828 if we are returning floats on the 80387 register stack, we cannot
4829 make a sibcall from a function that doesn't return a float to a
4830 function that does or, conversely, from a function that does return
4831 a float to a function that doesn't; the necessary stack adjustment
4832 would not be executed. This is also the place we notice
4833 differences in the return value ABI. Note that it is ok for one
4834 of the functions to have void return type as long as the return
4835 value of the other is passed in a register. */
4836 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
4837 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
4838 cfun->decl, false);
4839 if (STACK_REG_P (a) || STACK_REG_P (b))
4840 {
4841 if (!rtx_equal_p (a, b))
4842 return false;
4843 }
4844 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
4845 {
4846 /* Disable sibcall if we need to generate vzeroupper after
4847 callee returns. */
4848 if (TARGET_VZEROUPPER
4849 && cfun->machine->callee_return_avx256_p
4850 && !cfun->machine->caller_return_avx256_p)
4851 return false;
4852 }
4853 else if (!rtx_equal_p (a, b))
4854 return false;
4855
4856 if (TARGET_64BIT)
4857 {
4858 /* The SYSV ABI has more call-clobbered registers;
4859 disallow sibcalls from MS to SYSV. */
4860 if (cfun->machine->call_abi == MS_ABI
4861 && ix86_function_type_abi (type) == SYSV_ABI)
4862 return false;
4863 }
4864 else
4865 {
4866 /* If this call is indirect, we'll need to be able to use a
4867 call-clobbered register for the address of the target function.
4868 Make sure that all such registers are not used for passing
4869 parameters. Note that DLLIMPORT functions are indirect. */
4870 if (!decl
4871 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
4872 {
4873 if (ix86_function_regparm (type, NULL) >= 3)
4874 {
4875 /* ??? Need to count the actual number of registers to be used,
4876 not the possible number of registers. Fix later. */
4877 return false;
4878 }
4879 }
4880 }
4881
4882 /* Otherwise okay. That also includes certain types of indirect calls. */
4883 return true;
4884 }
4885
4886 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
4887 and "sseregparm" calling convention attributes;
4888 arguments as in struct attribute_spec.handler. */
4889
4890 static tree
4891 ix86_handle_cconv_attribute (tree *node, tree name,
4892 tree args,
4893 int flags ATTRIBUTE_UNUSED,
4894 bool *no_add_attrs)
4895 {
4896 if (TREE_CODE (*node) != FUNCTION_TYPE
4897 && TREE_CODE (*node) != METHOD_TYPE
4898 && TREE_CODE (*node) != FIELD_DECL
4899 && TREE_CODE (*node) != TYPE_DECL)
4900 {
4901 warning (OPT_Wattributes, "%qE attribute only applies to functions",
4902 name);
4903 *no_add_attrs = true;
4904 return NULL_TREE;
4905 }
4906
4907 /* Can combine regparm with all attributes but fastcall and thiscall. */
4908 if (is_attribute_p ("regparm", name))
4909 {
4910 tree cst;
4911
4912 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4913 {
4914 error ("fastcall and regparm attributes are not compatible");
4915 }
4916
4917 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4918 {
4919 error ("regparm and thiscall attributes are not compatible");
4920 }
4921
4922 cst = TREE_VALUE (args);
4923 if (TREE_CODE (cst) != INTEGER_CST)
4924 {
4925 warning (OPT_Wattributes,
4926 "%qE attribute requires an integer constant argument",
4927 name);
4928 *no_add_attrs = true;
4929 }
4930 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
4931 {
4932 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
4933 name, REGPARM_MAX);
4934 *no_add_attrs = true;
4935 }
4936
4937 return NULL_TREE;
4938 }
4939
4940 if (TARGET_64BIT)
4941 {
4942 /* Do not warn when emulating the MS ABI. */
4943 if ((TREE_CODE (*node) != FUNCTION_TYPE
4944 && TREE_CODE (*node) != METHOD_TYPE)
4945 || ix86_function_type_abi (*node) != MS_ABI)
4946 warning (OPT_Wattributes, "%qE attribute ignored",
4947 name);
4948 *no_add_attrs = true;
4949 return NULL_TREE;
4950 }
4951
4952 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
4953 if (is_attribute_p ("fastcall", name))
4954 {
4955 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4956 {
4957 error ("fastcall and cdecl attributes are not compatible");
4958 }
4959 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4960 {
4961 error ("fastcall and stdcall attributes are not compatible");
4962 }
4963 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
4964 {
4965 error ("fastcall and regparm attributes are not compatible");
4966 }
4967 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4968 {
4969 error ("fastcall and thiscall attributes are not compatible");
4970 }
4971 }
4972
4973 /* Can combine stdcall with fastcall (redundant), regparm and
4974 sseregparm. */
4975 else if (is_attribute_p ("stdcall", name))
4976 {
4977 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4978 {
4979 error ("stdcall and cdecl attributes are not compatible");
4980 }
4981 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4982 {
4983 error ("stdcall and fastcall attributes are not compatible");
4984 }
4985 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4986 {
4987 error ("stdcall and thiscall attributes are not compatible");
4988 }
4989 }
4990
4991 /* Can combine cdecl with regparm and sseregparm. */
4992 else if (is_attribute_p ("cdecl", name))
4993 {
4994 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4995 {
4996 error ("stdcall and cdecl attributes are not compatible");
4997 }
4998 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4999 {
5000 error ("fastcall and cdecl attributes are not compatible");
5001 }
5002 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5003 {
5004 error ("cdecl and thiscall attributes are not compatible");
5005 }
5006 }
5007 else if (is_attribute_p ("thiscall", name))
5008 {
5009 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5010 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5011 name);
5012 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5013 {
5014 error ("stdcall and thiscall attributes are not compatible");
5015 }
5016 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5017 {
5018 error ("fastcall and thiscall attributes are not compatible");
5019 }
5020 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5021 {
5022 error ("cdecl and thiscall attributes are not compatible");
5023 }
5024 }
5025
5026 /* Can combine sseregparm with all attributes. */
5027
5028 return NULL_TREE;
5029 }
5030
5031 /* This function determines from TYPE the calling-convention. */
5032
5033 unsigned int
5034 ix86_get_callcvt (const_tree type)
5035 {
5036 unsigned int ret = 0;
5037 bool is_stdarg;
5038 tree attrs;
5039
5040 if (TARGET_64BIT)
5041 return IX86_CALLCVT_CDECL;
5042
5043 attrs = TYPE_ATTRIBUTES (type);
5044 if (attrs != NULL_TREE)
5045 {
5046 if (lookup_attribute ("cdecl", attrs))
5047 ret |= IX86_CALLCVT_CDECL;
5048 else if (lookup_attribute ("stdcall", attrs))
5049 ret |= IX86_CALLCVT_STDCALL;
5050 else if (lookup_attribute ("fastcall", attrs))
5051 ret |= IX86_CALLCVT_FASTCALL;
5052 else if (lookup_attribute ("thiscall", attrs))
5053 ret |= IX86_CALLCVT_THISCALL;
5054
5055 /* Regparm isn't allowed for thiscall and fastcall. */
5056 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5057 {
5058 if (lookup_attribute ("regparm", attrs))
5059 ret |= IX86_CALLCVT_REGPARM;
5060 if (lookup_attribute ("sseregparm", attrs))
5061 ret |= IX86_CALLCVT_SSEREGPARM;
5062 }
5063
5064 if (IX86_BASE_CALLCVT(ret) != 0)
5065 return ret;
5066 }
5067
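/* No explicit base calling-convention attribute was found. With -mrtd,
non-variadic functions default to stdcall. */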
5068 is_stdarg = stdarg_p (type);
5069 if (TARGET_RTD && !is_stdarg)
5070 return IX86_CALLCVT_STDCALL | ret;
5071
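/* Otherwise non-variadic methods with the MS ABI default to thiscall;
everything else defaults to cdecl. */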
5072 if (ret != 0
5073 || is_stdarg
5074 || TREE_CODE (type) != METHOD_TYPE
5075 || ix86_function_type_abi (type) != MS_ABI)
5076 return IX86_CALLCVT_CDECL | ret;
5077
5078 return IX86_CALLCVT_THISCALL;
5079 }
5080
5081 /* Return 0 if the attributes for two types are incompatible, 1 if they
5082 are compatible, and 2 if they are nearly compatible (which causes a
5083 warning to be generated). */
5084
5085 static int
5086 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5087 {
5088 unsigned int ccvt1, ccvt2;
5089
5090 if (TREE_CODE (type1) != FUNCTION_TYPE
5091 && TREE_CODE (type1) != METHOD_TYPE)
5092 return 1;
5093
5094 ccvt1 = ix86_get_callcvt (type1);
5095 ccvt2 = ix86_get_callcvt (type2);
5096 if (ccvt1 != ccvt2)
5097 return 0;
5098 if (ix86_function_regparm (type1, NULL)
5099 != ix86_function_regparm (type2, NULL))
5100 return 0;
5101
5102 return 1;
5103 }
5104 \f
5105 /* Return the regparm value for a function with the indicated TYPE and DECL.
5106 DECL may be NULL when calling function indirectly
5107 or considering a libcall. */
5108
5109 static int
5110 ix86_function_regparm (const_tree type, const_tree decl)
5111 {
5112 tree attr;
5113 int regparm;
5114 unsigned int ccvt;
5115
5116 if (TARGET_64BIT)
5117 return (ix86_function_type_abi (type) == SYSV_ABI
5118 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5119 ccvt = ix86_get_callcvt (type);
5120 regparm = ix86_regparm;
5121
5122 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5123 {
5124 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5125 if (attr)
5126 {
5127 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5128 return regparm;
5129 }
5130 }
5131 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5132 return 2;
5133 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5134 return 1;
5135
5136 /* Use register calling convention for local functions when possible. */
5137 if (decl
5138 && TREE_CODE (decl) == FUNCTION_DECL
5139 && optimize
5140 && !(profile_flag && !flag_fentry))
5141 {
5142 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5143 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5144 if (i && i->local && i->can_change_signature)
5145 {
5146 int local_regparm, globals = 0, regno;
5147
5148 /* Make sure no regparm register is taken by a
5149 fixed register variable. */
5150 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5151 if (fixed_regs[local_regparm])
5152 break;
5153
5154 /* We don't want to use regparm(3) for nested functions as
5155 these use a static chain pointer in the third argument. */
5156 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5157 local_regparm = 2;
5158
5159 /* In 32-bit mode save a register for the split stack. */
5160 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5161 local_regparm = 2;
5162
5163 /* Each fixed register usage increases register pressure,
5164 so fewer registers should be used for argument passing.
5165 This functionality can be overridden by an explicit
5166 regparm value. */
5167 for (regno = 0; regno <= DI_REG; regno++)
5168 if (fixed_regs[regno])
5169 globals++;
5170
5171 local_regparm
5172 = globals < local_regparm ? local_regparm - globals : 0;
5173
5174 if (local_regparm > regparm)
5175 regparm = local_regparm;
5176 }
5177 }
5178
5179 return regparm;
5180 }
5181
5182 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5183 DFmode (2) arguments in SSE registers for a function with the
5184 indicated TYPE and DECL. DECL may be NULL when calling function
5185 indirectly or considering a libcall. Otherwise return 0. */
5186
5187 static int
5188 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5189 {
5190 gcc_assert (!TARGET_64BIT);
5191
5192 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5193 by the sseregparm attribute. */
5194 if (TARGET_SSEREGPARM
5195 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5196 {
5197 if (!TARGET_SSE)
5198 {
5199 if (warn)
5200 {
5201 if (decl)
5202 error ("calling %qD with attribute sseregparm without "
5203 "SSE/SSE2 enabled", decl);
5204 else
5205 error ("calling %qT with attribute sseregparm without "
5206 "SSE/SSE2 enabled", type);
5207 }
5208 return 0;
5209 }
5210
5211 return 2;
5212 }
5213
5214 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5215 (and DFmode for SSE2) arguments in SSE registers. */
5216 if (decl && TARGET_SSE_MATH && optimize
5217 && !(profile_flag && !flag_fentry))
5218 {
5219 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5220 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5221 if (i && i->local && i->can_change_signature)
5222 return TARGET_SSE2 ? 2 : 1;
5223 }
5224
5225 return 0;
5226 }
5227
5228 /* Return true if EAX is live at the start of the function. Used by
5229 ix86_expand_prologue to determine if we need special help before
5230 calling allocate_stack_worker. */
5231
5232 static bool
5233 ix86_eax_live_at_start_p (void)
5234 {
5235 /* Cheat. Don't bother working forward from ix86_function_regparm
5236 to the function type to whether an actual argument is located in
5237 eax. Instead just look at cfg info, which is still close enough
5238 to correct at this point. This gives false positives for broken
5239 functions that might use uninitialized data that happens to be
5240 allocated in eax, but who cares? */
5241 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5242 }
5243
5244 static bool
5245 ix86_keep_aggregate_return_pointer (tree fntype)
5246 {
5247 tree attr;
5248
5249 if (!TARGET_64BIT)
5250 {
5251 attr = lookup_attribute ("callee_pop_aggregate_return",
5252 TYPE_ATTRIBUTES (fntype));
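/* A zero argument to callee_pop_aggregate_return means the caller pops
the hidden return pointer, i.e. it is kept on return. */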
5253 if (attr)
5254 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5255
5256 /* For 32-bit MS-ABI the default is to keep aggregate
5257 return pointer. */
5258 if (ix86_function_type_abi (fntype) == MS_ABI)
5259 return true;
5260 }
5261 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5262 }
5263
5264 /* Value is the number of bytes of arguments automatically
5265 popped when returning from a subroutine call.
5266 FUNDECL is the declaration node of the function (as a tree),
5267 FUNTYPE is the data type of the function (as a tree),
5268 or for a library call it is an identifier node for the subroutine name.
5269 SIZE is the number of bytes of arguments passed on the stack.
5270
5271 On the 80386, the RTD insn may be used to pop them if the number
5272 of args is fixed, but if the number is variable then the caller
5273 must pop them all. RTD can't be used for library calls now
5274 because the library is compiled with the Unix compiler.
5275 Use of RTD is a selectable option, since it is incompatible with
5276 standard Unix calling sequences. If the option is not selected,
5277 the caller must always pop the args.
5278
5279 The attribute stdcall is equivalent to RTD on a per module basis. */
5280
5281 static int
5282 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5283 {
5284 unsigned int ccvt;
5285
5286 /* None of the 64-bit ABIs pop arguments. */
5287 if (TARGET_64BIT)
5288 return 0;
5289
5290 ccvt = ix86_get_callcvt (funtype);
5291
5292 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5293 | IX86_CALLCVT_THISCALL)) != 0
5294 && ! stdarg_p (funtype))
5295 return size;
5296
5297 /* Lose any fake structure return argument if it is passed on the stack. */
5298 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5299 && !ix86_keep_aggregate_return_pointer (funtype))
5300 {
5301 int nregs = ix86_function_regparm (funtype, fundecl);
5302 if (nregs == 0)
5303 return GET_MODE_SIZE (Pmode);
5304 }
5305
5306 return 0;
5307 }
5308 \f
5309 /* Argument support functions. */
5310
5311 /* Return true when register may be used to pass function parameters. */
5312 bool
5313 ix86_function_arg_regno_p (int regno)
5314 {
5315 int i;
5316 const int *parm_regs;
5317
5318 if (!TARGET_64BIT)
5319 {
5320 if (TARGET_MACHO)
5321 return (regno < REGPARM_MAX
5322 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5323 else
5324 return (regno < REGPARM_MAX
5325 || (TARGET_MMX && MMX_REGNO_P (regno)
5326 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5327 || (TARGET_SSE && SSE_REGNO_P (regno)
5328 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5329 }
5330
5331 if (TARGET_MACHO)
5332 {
5333 if (SSE_REGNO_P (regno) && TARGET_SSE)
5334 return true;
5335 }
5336 else
5337 {
5338 if (TARGET_SSE && SSE_REGNO_P (regno)
5339 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5340 return true;
5341 }
5342
5343 /* TODO: The function should depend on current function ABI but
5344 builtins.c would need updating then. Therefore we use the
5345 default ABI. */
5346
5347 /* RAX is used as a hidden argument to varargs functions. */
5348 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5349 return true;
5350
5351 if (ix86_abi == MS_ABI)
5352 parm_regs = x86_64_ms_abi_int_parameter_registers;
5353 else
5354 parm_regs = x86_64_int_parameter_registers;
5355 for (i = 0; i < (ix86_abi == MS_ABI
5356 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5357 if (regno == parm_regs[i])
5358 return true;
5359 return false;
5360 }
5361
5362 /* Return true if we do not know how to pass TYPE solely in registers. */
5363
5364 static bool
5365 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5366 {
5367 if (must_pass_in_stack_var_size_or_pad (mode, type))
5368 return true;
5369
5370 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5371 The layout_type routine is crafty and tries to trick us into passing
5372 currently unsupported vector types on the stack by using TImode. */
5373 return (!TARGET_64BIT && mode == TImode
5374 && type && TREE_CODE (type) != VECTOR_TYPE);
5375 }
5376
5377 /* Return the size, in bytes, of the area reserved for arguments passed
5378 in registers for the function represented by FNDECL, depending on the
5379 ABI used. */
5380 int
5381 ix86_reg_parm_stack_space (const_tree fndecl)
5382 {
5383 enum calling_abi call_abi = SYSV_ABI;
5384 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5385 call_abi = ix86_function_abi (fndecl);
5386 else
5387 call_abi = ix86_function_type_abi (fndecl);
5388 if (TARGET_64BIT && call_abi == MS_ABI)
5389 return 32;
5390 return 0;
5391 }
5392
5393 /* Returns value SYSV_ABI, MS_ABI dependent on fntype, specifying the
5394 call abi used. */
5395 enum calling_abi
5396 ix86_function_type_abi (const_tree fntype)
5397 {
5398 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5399 {
5400 enum calling_abi abi = ix86_abi;
5401 if (abi == SYSV_ABI)
5402 {
5403 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5404 abi = MS_ABI;
5405 }
5406 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5407 abi = SYSV_ABI;
5408 return abi;
5409 }
5410 return ix86_abi;
5411 }
5412
5413 static bool
5414 ix86_function_ms_hook_prologue (const_tree fn)
5415 {
5416 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5417 {
5418 if (decl_function_context (fn) != NULL_TREE)
5419 error_at (DECL_SOURCE_LOCATION (fn),
5420 "ms_hook_prologue is not compatible with nested function");
5421 else
5422 return true;
5423 }
5424 return false;
5425 }
5426
5427 static enum calling_abi
5428 ix86_function_abi (const_tree fndecl)
5429 {
5430 if (! fndecl)
5431 return ix86_abi;
5432 return ix86_function_type_abi (TREE_TYPE (fndecl));
5433 }
5434
5435 /* Returns value SYSV_ABI, MS_ABI dependent on cfun, specifying the
5436 call abi used. */
5437 enum calling_abi
5438 ix86_cfun_abi (void)
5439 {
5440 if (! cfun)
5441 return ix86_abi;
5442 return cfun->machine->call_abi;
5443 }
5444
5445 /* Write the extra assembler code needed to declare a function properly. */
5446
5447 void
5448 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5449 tree decl)
5450 {
5451 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5452
5453 if (is_ms_hook)
5454 {
5455 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5456 unsigned int filler_cc = 0xcccccccc;
5457
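/* Emit int3 (0xcc) filler bytes before the label to reserve room
for hot patching. */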
5458 for (i = 0; i < filler_count; i += 4)
5459 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5460 }
5461
5462 #ifdef SUBTARGET_ASM_UNWIND_INIT
5463 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5464 #endif
5465
5466 ASM_OUTPUT_LABEL (asm_out_file, fname);
5467
5468 /* Output magic byte marker, if hot-patch attribute is set. */
5469 if (is_ms_hook)
5470 {
5471 if (TARGET_64BIT)
5472 {
5473 /* leaq [%rsp + 0], %rsp */
5474 asm_fprintf (asm_out_file, ASM_BYTE
5475 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5476 }
5477 else
5478 {
5479 /* movl.s %edi, %edi
5480 push %ebp
5481 movl.s %esp, %ebp */
5482 asm_fprintf (asm_out_file, ASM_BYTE
5483 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5484 }
5485 }
5486 }
5487
5488 /* regclass.c */
5489 extern void init_regs (void);
5490
5491 /* Implementation of the call ABI switching target hook. Sets the
5492 call register sets specific to FNDECL. See also
5493 ix86_conditional_register_usage for more details. */
5494 void
5495 ix86_call_abi_override (const_tree fndecl)
5496 {
5497 if (fndecl == NULL_TREE)
5498 cfun->machine->call_abi = ix86_abi;
5499 else
5500 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5501 }
5502
5503 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
5504 Avoid expensive re-initialization of init_regs each time we switch function
5505 context, since this is needed only during RTL expansion. */
5506 static void
5507 ix86_maybe_switch_abi (void)
5508 {
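/* SI_REG is call-used under the SYSV ABI but call-saved under the MS ABI,
so its current setting reveals which ABI the register tables were last
initialized for; reinitialize them on a mismatch. */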
5509 if (TARGET_64BIT &&
5510 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5511 reinit_regs ();
5512 }
5513
5514 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5515 for a call to a function whose data type is FNTYPE.
5516 For a library call, FNTYPE is 0. */
5517
5518 void
5519 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5520 tree fntype, /* tree ptr for function decl */
5521 rtx libname, /* SYMBOL_REF of library name or 0 */
5522 tree fndecl,
5523 int caller)
5524 {
5525 struct cgraph_local_info *i;
5526 tree fnret_type;
5527
5528 memset (cum, 0, sizeof (*cum));
5529
5530 /* Initialize for the current callee. */
5531 if (caller)
5532 {
5533 cfun->machine->callee_pass_avx256_p = false;
5534 cfun->machine->callee_return_avx256_p = false;
5535 }
5536
5537 if (fndecl)
5538 {
5539 i = cgraph_local_info (fndecl);
5540 cum->call_abi = ix86_function_abi (fndecl);
5541 fnret_type = TREE_TYPE (TREE_TYPE (fndecl));
5542 }
5543 else
5544 {
5545 i = NULL;
5546 cum->call_abi = ix86_function_type_abi (fntype);
5547 if (fntype)
5548 fnret_type = TREE_TYPE (fntype);
5549 else
5550 fnret_type = NULL;
5551 }
5552
5553 if (TARGET_VZEROUPPER && fnret_type)
5554 {
5555 rtx fnret_value = ix86_function_value (fnret_type, fntype,
5556 false);
5557 if (function_pass_avx256_p (fnret_value))
5558 {
5559 /* The return value of this function uses 256bit AVX modes. */
5560 if (caller)
5561 cfun->machine->callee_return_avx256_p = true;
5562 else
5563 cfun->machine->caller_return_avx256_p = true;
5564 }
5565 }
5566
5567 cum->caller = caller;
5568
5569 /* Set up the number of registers to use for passing arguments. */
5570
5571 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5572 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5573 "or subtarget optimization implying it");
5574 cum->nregs = ix86_regparm;
5575 if (TARGET_64BIT)
5576 {
5577 cum->nregs = (cum->call_abi == SYSV_ABI
5578 ? X86_64_REGPARM_MAX
5579 : X86_64_MS_REGPARM_MAX);
5580 }
5581 if (TARGET_SSE)
5582 {
5583 cum->sse_nregs = SSE_REGPARM_MAX;
5584 if (TARGET_64BIT)
5585 {
5586 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5587 ? X86_64_SSE_REGPARM_MAX
5588 : X86_64_MS_SSE_REGPARM_MAX);
5589 }
5590 }
5591 if (TARGET_MMX)
5592 cum->mmx_nregs = MMX_REGPARM_MAX;
5593 cum->warn_avx = true;
5594 cum->warn_sse = true;
5595 cum->warn_mmx = true;
5596
5597 /* Because the type might mismatch between caller and callee, we need to
5598 use the actual function type for local calls.
5599 FIXME: cgraph_analyze can be told to actually record whether a function
5600 uses va_start, so for local functions maybe_vaarg can be made more
5601 aggressive, helping K&R code.
5602 FIXME: once the type system is fixed, we won't need this code anymore. */
5603 if (i && i->local && i->can_change_signature)
5604 fntype = TREE_TYPE (fndecl);
5605 cum->maybe_vaarg = (fntype
5606 ? (!prototype_p (fntype) || stdarg_p (fntype))
5607 : !libname);
5608
5609 if (!TARGET_64BIT)
5610 {
5611 /* If there are variable arguments, then we won't pass anything
5612 in registers in 32-bit mode. */
5613 if (stdarg_p (fntype))
5614 {
5615 cum->nregs = 0;
5616 cum->sse_nregs = 0;
5617 cum->mmx_nregs = 0;
5618 cum->warn_avx = 0;
5619 cum->warn_sse = 0;
5620 cum->warn_mmx = 0;
5621 return;
5622 }
5623
5624 /* Use ecx and edx registers if function has fastcall attribute,
5625 else look for regparm information. */
5626 if (fntype)
5627 {
5628 unsigned int ccvt = ix86_get_callcvt (fntype);
5629 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5630 {
5631 cum->nregs = 1;
5632 cum->fastcall = 1; /* Same first register as in fastcall. */
5633 }
5634 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5635 {
5636 cum->nregs = 2;
5637 cum->fastcall = 1;
5638 }
5639 else
5640 cum->nregs = ix86_function_regparm (fntype, fndecl);
5641 }
5642
5643 /* Set up the number of SSE registers used for passing SFmode
5644 and DFmode arguments. Warn for mismatching ABI. */
5645 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5646 }
5647 }
5648
5649 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5650 But in the case of vector types, it is some vector mode.
5651
5652 When we have only some of our vector isa extensions enabled, then there
5653 are some modes for which vector_mode_supported_p is false. For these
5654 modes, the generic vector support in gcc will choose some non-vector mode
5655 in order to implement the type. By computing the natural mode, we'll
5656 select the proper ABI location for the operand and not depend on whatever
5657 the middle-end decides to do with these vector types.
5658
5659 The middle-end can't deal with vector types > 16 bytes. In this
5660 case, we return the original mode and warn about the ABI change if
5661 CUM isn't NULL. */
5662
5663 static enum machine_mode
5664 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5665 {
5666 enum machine_mode mode = TYPE_MODE (type);
5667
5668 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5669 {
5670 HOST_WIDE_INT size = int_size_in_bytes (type);
5671 if ((size == 8 || size == 16 || size == 32)
5672 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5673 && TYPE_VECTOR_SUBPARTS (type) > 1)
5674 {
5675 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5676
5677 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5678 mode = MIN_MODE_VECTOR_FLOAT;
5679 else
5680 mode = MIN_MODE_VECTOR_INT;
5681
5682 /* Get the mode which has this inner mode and number of units. */
5683 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5684 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5685 && GET_MODE_INNER (mode) == innermode)
5686 {
5687 if (size == 32 && !TARGET_AVX)
5688 {
5689 static bool warnedavx;
5690
5691 if (cum
5692 && !warnedavx
5693 && cum->warn_avx)
5694 {
5695 warnedavx = true;
5696 warning (0, "AVX vector argument without AVX "
5697 "enabled changes the ABI");
5698 }
5699 return TYPE_MODE (type);
5700 }
5701 else
5702 return mode;
5703 }
5704
5705 gcc_unreachable ();
5706 }
5707 }
5708
5709 return mode;
5710 }
5711
5712 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
5713 this may not agree with the mode that the type system has chosen for the
5714 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
5715 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
5716
5717 static rtx
5718 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5719 unsigned int regno)
5720 {
5721 rtx tmp;
5722
5723 if (orig_mode != BLKmode)
5724 tmp = gen_rtx_REG (orig_mode, regno);
5725 else
5726 {
5727 tmp = gen_rtx_REG (mode, regno);
5728 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5729 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5730 }
5731
5732 return tmp;
5733 }
5734
5735 /* x86-64 register passing implementation. See the x86-64 ABI for details.
5736 The goal of this code is to classify each eightbyte of the incoming
5737 argument by register class and assign registers accordingly. */
5738
5739 /* Return the union class of CLASS1 and CLASS2.
5740 See the x86-64 PS ABI for details. */
5741
5742 static enum x86_64_reg_class
5743 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5744 {
5745 /* Rule #1: If both classes are equal, this is the resulting class. */
5746 if (class1 == class2)
5747 return class1;
5748
5749 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5750 the other class. */
5751 if (class1 == X86_64_NO_CLASS)
5752 return class2;
5753 if (class2 == X86_64_NO_CLASS)
5754 return class1;
5755
5756 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
5757 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
5758 return X86_64_MEMORY_CLASS;
5759
5760 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
5761 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
5762 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
5763 return X86_64_INTEGERSI_CLASS;
5764 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
5765 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
5766 return X86_64_INTEGER_CLASS;
5767
5768 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
5769 MEMORY is used. */
5770 if (class1 == X86_64_X87_CLASS
5771 || class1 == X86_64_X87UP_CLASS
5772 || class1 == X86_64_COMPLEX_X87_CLASS
5773 || class2 == X86_64_X87_CLASS
5774 || class2 == X86_64_X87UP_CLASS
5775 || class2 == X86_64_COMPLEX_X87_CLASS)
5776 return X86_64_MEMORY_CLASS;
5777
5778 /* Rule #6: Otherwise class SSE is used. */
5779 return X86_64_SSE_CLASS;
5780 }
5781
5782 /* Classify the argument of type TYPE and mode MODE.
5783 CLASSES will be filled by the register class used to pass each word
5784 of the operand. The number of words is returned. In case the parameter
5785 should be passed in memory, 0 is returned. As a special case for zero
5786 sized containers, classes[0] will be NO_CLASS and 1 is returned.
5787
5788 BIT_OFFSET is used internally for handling records; it gives the
5789 offset in bits modulo 256 to avoid overflow cases.
5790
5791 See the x86-64 PS ABI for details.
5792 */
5793
5794 static int
5795 classify_argument (enum machine_mode mode, const_tree type,
5796 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
5797 {
5798 HOST_WIDE_INT bytes =
5799 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
5800 int words = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5801
5802 /* Variable sized entities are always passed/returned in memory. */
5803 if (bytes < 0)
5804 return 0;
5805
5806 if (mode != VOIDmode
5807 && targetm.calls.must_pass_in_stack (mode, type))
5808 return 0;
5809
5810 if (type && AGGREGATE_TYPE_P (type))
5811 {
5812 int i;
5813 tree field;
5814 enum x86_64_reg_class subclasses[MAX_CLASSES];
5815
5816 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
5817 if (bytes > 32)
5818 return 0;
5819
5820 for (i = 0; i < words; i++)
5821 classes[i] = X86_64_NO_CLASS;
5822
5823 /* Zero-sized arrays or structures are NO_CLASS. We return 0 to
5824 signal the memory class, so handle this as a special case. */
5825 if (!words)
5826 {
5827 classes[0] = X86_64_NO_CLASS;
5828 return 1;
5829 }
5830
5831 /* Classify each field of record and merge classes. */
5832 switch (TREE_CODE (type))
5833 {
5834 case RECORD_TYPE:
5835 /* And now merge the fields of structure. */
5836 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5837 {
5838 if (TREE_CODE (field) == FIELD_DECL)
5839 {
5840 int num;
5841
5842 if (TREE_TYPE (field) == error_mark_node)
5843 continue;
5844
5845 /* Bitfields are always classified as integer. Handle them
5846 early, since later code would consider them to be
5847 misaligned integers. */
5848 if (DECL_BIT_FIELD (field))
5849 {
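/* Both bounds are converted from bit positions to eightbyte
indices (two divisions by 8), and every eightbyte covered by
the bitfield is merged with the INTEGER class. */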
5850 for (i = (int_bit_position (field) + (bit_offset % 64)) / 8 / 8;
5851 i < ((int_bit_position (field) + (bit_offset % 64))
5852 + tree_low_cst (DECL_SIZE (field), 0)
5853 + 63) / 8 / 8; i++)
5854 classes[i] =
5855 merge_classes (X86_64_INTEGER_CLASS,
5856 classes[i]);
5857 }
5858 else
5859 {
5860 int pos;
5861
5862 type = TREE_TYPE (field);
5863
5864 /* Flexible array member is ignored. */
5865 if (TYPE_MODE (type) == BLKmode
5866 && TREE_CODE (type) == ARRAY_TYPE
5867 && TYPE_SIZE (type) == NULL_TREE
5868 && TYPE_DOMAIN (type) != NULL_TREE
5869 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
5870 == NULL_TREE))
5871 {
5872 static bool warned;
5873
5874 if (!warned && warn_psabi)
5875 {
5876 warned = true;
5877 inform (input_location,
5878 "the ABI of passing struct with"
5879 " a flexible array member has"
5880 " changed in GCC 4.4");
5881 }
5882 continue;
5883 }
5884 num = classify_argument (TYPE_MODE (type), type,
5885 subclasses,
5886 (int_bit_position (field)
5887 + bit_offset) % 256);
5888 if (!num)
5889 return 0;
5890 pos = (int_bit_position (field) + (bit_offset % 64)) / 8 / 8;
5891 for (i = 0; i < num && (i + pos) < words; i++)
5892 classes[i + pos] =
5893 merge_classes (subclasses[i], classes[i + pos]);
5894 }
5895 }
5896 }
5897 break;
5898
5899 case ARRAY_TYPE:
5900 /* Arrays are handled as small records. */
5901 {
5902 int num;
5903 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
5904 TREE_TYPE (type), subclasses, bit_offset);
5905 if (!num)
5906 return 0;
5907
5908 /* The partial classes are now full classes. */
5909 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
5910 subclasses[0] = X86_64_SSE_CLASS;
5911 if (subclasses[0] == X86_64_INTEGERSI_CLASS
5912 && !((bit_offset % 64) == 0 && bytes == 4))
5913 subclasses[0] = X86_64_INTEGER_CLASS;
5914
5915 for (i = 0; i < words; i++)
5916 classes[i] = subclasses[i % num];
5917
5918 break;
5919 }
5920 case UNION_TYPE:
5921 case QUAL_UNION_TYPE:
5922 /* Unions are similar to RECORD_TYPE but the offset is always 0. */
5924 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5925 {
5926 if (TREE_CODE (field) == FIELD_DECL)
5927 {
5928 int num;
5929
5930 if (TREE_TYPE (field) == error_mark_node)
5931 continue;
5932
5933 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
5934 TREE_TYPE (field), subclasses,
5935 bit_offset);
5936 if (!num)
5937 return 0;
5938 for (i = 0; i < num; i++)
5939 classes[i] = merge_classes (subclasses[i], classes[i]);
5940 }
5941 }
5942 break;
5943
5944 default:
5945 gcc_unreachable ();
5946 }
5947
5948 if (words > 2)
5949 {
5950 /* When the size is > 16 bytes, if the first class isn't
5951 X86_64_SSE_CLASS or any of the others isn't
5952 X86_64_SSEUP_CLASS, everything should be passed in
5953 memory. */
5954 if (classes[0] != X86_64_SSE_CLASS)
5955 return 0;
5956
5957 for (i = 1; i < words; i++)
5958 if (classes[i] != X86_64_SSEUP_CLASS)
5959 return 0;
5960 }
5961
5962 /* Final merger cleanup. */
5963 for (i = 0; i < words; i++)
5964 {
5965 /* If one class is MEMORY, everything should be passed in
5966 memory. */
5967 if (classes[i] == X86_64_MEMORY_CLASS)
5968 return 0;
5969
5970 /* The X86_64_SSEUP_CLASS should be always preceded by
5971 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
5972 if (classes[i] == X86_64_SSEUP_CLASS
5973 && classes[i - 1] != X86_64_SSE_CLASS
5974 && classes[i - 1] != X86_64_SSEUP_CLASS)
5975 {
5976 /* The first one should never be X86_64_SSEUP_CLASS. */
5977 gcc_assert (i != 0);
5978 classes[i] = X86_64_SSE_CLASS;
5979 }
5980
5981 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
5982 everything should be passed in memory. */
5983 if (classes[i] == X86_64_X87UP_CLASS
5984 && (classes[i - 1] != X86_64_X87_CLASS))
5985 {
5986 static bool warned;
5987
5988 /* The first one should never be X86_64_X87UP_CLASS. */
5989 gcc_assert (i != 0);
5990 if (!warned && warn_psabi)
5991 {
5992 warned = true;
5993 inform (input_location,
5994 "the ABI of passing union with long double"
5995 " has changed in GCC 4.4");
5996 }
5997 return 0;
5998 }
5999 }
6000 return words;
6001 }
6002
6003 /* Compute the alignment needed. We align all types to natural boundaries,
6004 with the exception of XFmode, which is aligned to 64 bits. */
6005 if (mode != VOIDmode && mode != BLKmode)
6006 {
6007 int mode_alignment = GET_MODE_BITSIZE (mode);
6008
6009 if (mode == XFmode)
6010 mode_alignment = 128;
6011 else if (mode == XCmode)
6012 mode_alignment = 256;
6013 if (COMPLEX_MODE_P (mode))
6014 mode_alignment /= 2;
6015 /* Misaligned fields are always returned in memory. */
6016 if (bit_offset % mode_alignment)
6017 return 0;
6018 }
6019
6020 /* For V1xx modes, just use the base mode. */
6021 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6022 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6023 mode = GET_MODE_INNER (mode);
6024
6025 /* Classification of atomic types. */
6026 switch (mode)
6027 {
6028 case SDmode:
6029 case DDmode:
6030 classes[0] = X86_64_SSE_CLASS;
6031 return 1;
6032 case TDmode:
6033 classes[0] = X86_64_SSE_CLASS;
6034 classes[1] = X86_64_SSEUP_CLASS;
6035 return 2;
6036 case DImode:
6037 case SImode:
6038 case HImode:
6039 case QImode:
6040 case CSImode:
6041 case CHImode:
6042 case CQImode:
6043 {
6044 int size = (bit_offset % 64) + (int) GET_MODE_BITSIZE (mode);
6045
6046 if (size <= 32)
6047 {
6048 classes[0] = X86_64_INTEGERSI_CLASS;
6049 return 1;
6050 }
6051 else if (size <= 64)
6052 {
6053 classes[0] = X86_64_INTEGER_CLASS;
6054 return 1;
6055 }
6056 else if (size <= 64+32)
6057 {
6058 classes[0] = X86_64_INTEGER_CLASS;
6059 classes[1] = X86_64_INTEGERSI_CLASS;
6060 return 2;
6061 }
6062 else if (size <= 64+64)
6063 {
6064 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6065 return 2;
6066 }
6067 else
6068 gcc_unreachable ();
6069 }
6070 case CDImode:
6071 case TImode:
6072 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6073 return 2;
6074 case COImode:
6075 case OImode:
6076 /* OImode shouldn't be used directly. */
6077 gcc_unreachable ();
6078 case CTImode:
6079 return 0;
6080 case SFmode:
6081 if (!(bit_offset % 64))
6082 classes[0] = X86_64_SSESF_CLASS;
6083 else
6084 classes[0] = X86_64_SSE_CLASS;
6085 return 1;
6086 case DFmode:
6087 classes[0] = X86_64_SSEDF_CLASS;
6088 return 1;
6089 case XFmode:
6090 classes[0] = X86_64_X87_CLASS;
6091 classes[1] = X86_64_X87UP_CLASS;
6092 return 2;
6093 case TFmode:
6094 classes[0] = X86_64_SSE_CLASS;
6095 classes[1] = X86_64_SSEUP_CLASS;
6096 return 2;
6097 case SCmode:
6098 classes[0] = X86_64_SSE_CLASS;
6099 if (!(bit_offset % 64))
6100 return 1;
6101 else
6102 {
6103 static bool warned;
6104
6105 if (!warned && warn_psabi)
6106 {
6107 warned = true;
6108 inform (input_location,
6109 "the ABI of passing structure with complex float"
6110 " member has changed in GCC 4.4");
6111 }
6112 classes[1] = X86_64_SSESF_CLASS;
6113 return 2;
6114 }
6115 case DCmode:
6116 classes[0] = X86_64_SSEDF_CLASS;
6117 classes[1] = X86_64_SSEDF_CLASS;
6118 return 2;
6119 case XCmode:
6120 classes[0] = X86_64_COMPLEX_X87_CLASS;
6121 return 1;
6122 case TCmode:
6123 /* This mode is larger than 16 bytes. */
6124 return 0;
6125 case V8SFmode:
6126 case V8SImode:
6127 case V32QImode:
6128 case V16HImode:
6129 case V4DFmode:
6130 case V4DImode:
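/* 256-bit vector modes occupy four eightbytes: one SSE class followed
by three SSEUP classes. */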
6131 classes[0] = X86_64_SSE_CLASS;
6132 classes[1] = X86_64_SSEUP_CLASS;
6133 classes[2] = X86_64_SSEUP_CLASS;
6134 classes[3] = X86_64_SSEUP_CLASS;
6135 return 4;
6136 case V4SFmode:
6137 case V4SImode:
6138 case V16QImode:
6139 case V8HImode:
6140 case V2DFmode:
6141 case V2DImode:
6142 classes[0] = X86_64_SSE_CLASS;
6143 classes[1] = X86_64_SSEUP_CLASS;
6144 return 2;
6145 case V1TImode:
6146 case V1DImode:
6147 case V2SFmode:
6148 case V2SImode:
6149 case V4HImode:
6150 case V8QImode:
6151 classes[0] = X86_64_SSE_CLASS;
6152 return 1;
6153 case BLKmode:
6154 case VOIDmode:
6155 return 0;
6156 default:
6157 gcc_assert (VECTOR_MODE_P (mode));
6158
6159 if (bytes > 16)
6160 return 0;
6161
6162 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6163
6164 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6165 classes[0] = X86_64_INTEGERSI_CLASS;
6166 else
6167 classes[0] = X86_64_INTEGER_CLASS;
6168 classes[1] = X86_64_INTEGER_CLASS;
6169 return 1 + (bytes > 8);
6170 }
6171 }
6172
6173 /* Examine the argument and set the number of registers required in each
6174 class. Return 0 iff the parameter should be passed in memory. */
6175 static int
6176 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6177 int *int_nregs, int *sse_nregs)
6178 {
6179 enum x86_64_reg_class regclass[MAX_CLASSES];
6180 int n = classify_argument (mode, type, regclass, 0);
6181
6182 *int_nregs = 0;
6183 *sse_nregs = 0;
6184 if (!n)
6185 return 0;
6186 for (n--; n >= 0; n--)
6187 switch (regclass[n])
6188 {
6189 case X86_64_INTEGER_CLASS:
6190 case X86_64_INTEGERSI_CLASS:
6191 (*int_nregs)++;
6192 break;
6193 case X86_64_SSE_CLASS:
6194 case X86_64_SSESF_CLASS:
6195 case X86_64_SSEDF_CLASS:
6196 (*sse_nregs)++;
6197 break;
6198 case X86_64_NO_CLASS:
6199 case X86_64_SSEUP_CLASS:
6200 break;
6201 case X86_64_X87_CLASS:
6202 case X86_64_X87UP_CLASS:
6203 if (!in_return)
6204 return 0;
6205 break;
6206 case X86_64_COMPLEX_X87_CLASS:
6207 return in_return ? 2 : 0;
6208 case X86_64_MEMORY_CLASS:
6209 gcc_unreachable ();
6210 }
6211 return 1;
6212 }
6213
6214 /* Construct container for the argument used by GCC interface. See
6215 FUNCTION_ARG for the detailed description. */
6216
6217 static rtx
6218 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6219 const_tree type, int in_return, int nintregs, int nsseregs,
6220 const int *intreg, int sse_regno)
6221 {
6222 /* The following variables hold the static issued_error state. */
6223 static bool issued_sse_arg_error;
6224 static bool issued_sse_ret_error;
6225 static bool issued_x87_ret_error;
6226
6227 enum machine_mode tmpmode;
6228 int bytes =
6229 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6230 enum x86_64_reg_class regclass[MAX_CLASSES];
6231 int n;
6232 int i;
6233 int nexps = 0;
6234 int needed_sseregs, needed_intregs;
6235 rtx exp[MAX_CLASSES];
6236 rtx ret;
6237
6238 n = classify_argument (mode, type, regclass, 0);
6239 if (!n)
6240 return NULL;
6241 if (!examine_argument (mode, type, in_return, &needed_intregs,
6242 &needed_sseregs))
6243 return NULL;
6244 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6245 return NULL;
6246
6247 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6248 some less clueful developer tries to use floating-point anyway. */
6249 if (needed_sseregs && !TARGET_SSE)
6250 {
6251 if (in_return)
6252 {
6253 if (!issued_sse_ret_error)
6254 {
6255 error ("SSE register return with SSE disabled");
6256 issued_sse_ret_error = true;
6257 }
6258 }
6259 else if (!issued_sse_arg_error)
6260 {
6261 error ("SSE register argument with SSE disabled");
6262 issued_sse_arg_error = true;
6263 }
6264 return NULL;
6265 }
6266
6267 /* Likewise, error if the ABI requires us to return values in the
6268 x87 registers and the user specified -mno-80387. */
6269 if (!TARGET_80387 && in_return)
6270 for (i = 0; i < n; i++)
6271 if (regclass[i] == X86_64_X87_CLASS
6272 || regclass[i] == X86_64_X87UP_CLASS
6273 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6274 {
6275 if (!issued_x87_ret_error)
6276 {
6277 error ("x87 register return with x87 disabled");
6278 issued_x87_ret_error = true;
6279 }
6280 return NULL;
6281 }
6282
6283 /* First construct simple cases. Avoid SCmode, since we want to use
6284 a single register to pass this type. */
6285 if (n == 1 && mode != SCmode)
6286 switch (regclass[0])
6287 {
6288 case X86_64_INTEGER_CLASS:
6289 case X86_64_INTEGERSI_CLASS:
6290 return gen_rtx_REG (mode, intreg[0]);
6291 case X86_64_SSE_CLASS:
6292 case X86_64_SSESF_CLASS:
6293 case X86_64_SSEDF_CLASS:
6294 if (mode != BLKmode)
6295 return gen_reg_or_parallel (mode, orig_mode,
6296 SSE_REGNO (sse_regno));
6297 break;
6298 case X86_64_X87_CLASS:
6299 case X86_64_COMPLEX_X87_CLASS:
6300 return gen_rtx_REG (mode, FIRST_STACK_REG);
6301 case X86_64_NO_CLASS:
6302 /* Zero sized array, struct or class. */
6303 return NULL;
6304 default:
6305 gcc_unreachable ();
6306 }
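/* Vectors classified as SSE followed only by SSEUP eightbytes are
passed in a single SSE register. */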
6307 if (n == 2 && regclass[0] == X86_64_SSE_CLASS
6308 && regclass[1] == X86_64_SSEUP_CLASS && mode != BLKmode)
6309 return gen_rtx_REG (mode, SSE_REGNO (sse_regno));
6310 if (n == 4
6311 && regclass[0] == X86_64_SSE_CLASS
6312 && regclass[1] == X86_64_SSEUP_CLASS
6313 && regclass[2] == X86_64_SSEUP_CLASS
6314 && regclass[3] == X86_64_SSEUP_CLASS
6315 && mode != BLKmode)
6316 return gen_rtx_REG (mode, SSE_REGNO (sse_regno));
6317
6318 if (n == 2
6319 && regclass[0] == X86_64_X87_CLASS && regclass[1] == X86_64_X87UP_CLASS)
6320 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6321 if (n == 2 && regclass[0] == X86_64_INTEGER_CLASS
6322 && regclass[1] == X86_64_INTEGER_CLASS
6323 && (mode == CDImode || mode == TImode || mode == TFmode)
6324 && intreg[0] + 1 == intreg[1])
6325 return gen_rtx_REG (mode, intreg[0]);
6326
6327 /* Otherwise figure out the entries of the PARALLEL. */
6328 for (i = 0; i < n; i++)
6329 {
6330 int pos;
6331
6332 switch (regclass[i])
6333 {
6334 case X86_64_NO_CLASS:
6335 break;
6336 case X86_64_INTEGER_CLASS:
6337 case X86_64_INTEGERSI_CLASS:
6338 /* Merge TImodes on aligned occasions here too. */
6339 if (i * 8 + 8 > bytes)
6340 tmpmode = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6341 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6342 tmpmode = SImode;
6343 else
6344 tmpmode = DImode;
6345 /* We've requested 24 bytes for which we don't have a mode. Use DImode. */
6346 if (tmpmode == BLKmode)
6347 tmpmode = DImode;
6348 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6349 gen_rtx_REG (tmpmode, *intreg),
6350 GEN_INT (i*8));
6351 intreg++;
6352 break;
6353 case X86_64_SSESF_CLASS:
6354 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6355 gen_rtx_REG (SFmode,
6356 SSE_REGNO (sse_regno)),
6357 GEN_INT (i*8));
6358 sse_regno++;
6359 break;
6360 case X86_64_SSEDF_CLASS:
6361 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6362 gen_rtx_REG (DFmode,
6363 SSE_REGNO (sse_regno)),
6364 GEN_INT (i*8));
6365 sse_regno++;
6366 break;
6367 case X86_64_SSE_CLASS:
6368 pos = i;
6369 switch (n)
6370 {
6371 case 1:
6372 tmpmode = DImode;
6373 break;
6374 case 2:
6375 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6376 {
6377 tmpmode = TImode;
6378 i++;
6379 }
6380 else
6381 tmpmode = DImode;
6382 break;
6383 case 4:
6384 gcc_assert (i == 0
6385 && regclass[1] == X86_64_SSEUP_CLASS
6386 && regclass[2] == X86_64_SSEUP_CLASS
6387 && regclass[3] == X86_64_SSEUP_CLASS);
6388 tmpmode = OImode;
6389 i += 3;
6390 break;
6391 default:
6392 gcc_unreachable ();
6393 }
6394 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6395 gen_rtx_REG (tmpmode,
6396 SSE_REGNO (sse_regno)),
6397 GEN_INT (pos*8));
6398 sse_regno++;
6399 break;
6400 default:
6401 gcc_unreachable ();
6402 }
6403 }
6404
6405 /* Empty aligned struct, union or class. */
6406 if (nexps == 0)
6407 return NULL;
6408
6409 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6410 for (i = 0; i < nexps; i++)
6411 XVECEXP (ret, 0, i) = exp [i];
6412 return ret;
6413 }
6414
6415 /* Update the data in CUM to advance over an argument of mode MODE
6416 and data type TYPE. (TYPE is null for libcalls where that information
6417 may not be available.) */
6418
6419 static void
6420 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6421 const_tree type, HOST_WIDE_INT bytes,
6422 HOST_WIDE_INT words)
6423 {
6424 switch (mode)
6425 {
6426 default:
6427 break;
6428
6429 case BLKmode:
6430 if (bytes < 0)
6431 break;
6432 /* FALLTHRU */
6433
6434 case DImode:
6435 case SImode:
6436 case HImode:
6437 case QImode:
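/* Integer and BLKmode arguments consume one general register per word;
clamp the counts at zero once the registers are exhausted. */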
6438 cum->words += words;
6439 cum->nregs -= words;
6440 cum->regno += words;
6441
6442 if (cum->nregs <= 0)
6443 {
6444 cum->nregs = 0;
6445 cum->regno = 0;
6446 }
6447 break;
6448
6449 case OImode:
6450 /* OImode shouldn't be used directly. */
6451 gcc_unreachable ();
6452
6453 case DFmode:
6454 if (cum->float_in_sse < 2)
6455 break;
6456 case SFmode:
6457 if (cum->float_in_sse < 1)
6458 break;
6459 /* FALLTHRU */
6460
6461 case V8SFmode:
6462 case V8SImode:
6463 case V32QImode:
6464 case V16HImode:
6465 case V4DFmode:
6466 case V4DImode:
6467 case TImode:
6468 case V16QImode:
6469 case V8HImode:
6470 case V4SImode:
6471 case V2DImode:
6472 case V4SFmode:
6473 case V2DFmode:
6474 if (!type || !AGGREGATE_TYPE_P (type))
6475 {
6476 cum->sse_words += words;
6477 cum->sse_nregs -= 1;
6478 cum->sse_regno += 1;
6479 if (cum->sse_nregs <= 0)
6480 {
6481 cum->sse_nregs = 0;
6482 cum->sse_regno = 0;
6483 }
6484 }
6485 break;
6486
6487 case V8QImode:
6488 case V4HImode:
6489 case V2SImode:
6490 case V2SFmode:
6491 case V1TImode:
6492 case V1DImode:
6493 if (!type || !AGGREGATE_TYPE_P (type))
6494 {
6495 cum->mmx_words += words;
6496 cum->mmx_nregs -= 1;
6497 cum->mmx_regno += 1;
6498 if (cum->mmx_nregs <= 0)
6499 {
6500 cum->mmx_nregs = 0;
6501 cum->mmx_regno = 0;
6502 }
6503 }
6504 break;
6505 }
6506 }
6507
6508 static void
6509 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6510 const_tree type, HOST_WIDE_INT words, bool named)
6511 {
6512 int int_nregs, sse_nregs;
6513
6514 /* Unnamed 256bit vector mode parameters are passed on stack. */
6515 if (!named && VALID_AVX256_REG_MODE (mode))
6516 return;
6517
6518 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6519 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6520 {
6521 cum->nregs -= int_nregs;
6522 cum->sse_nregs -= sse_nregs;
6523 cum->regno += int_nregs;
6524 cum->sse_regno += sse_nregs;
6525 }
6526 else
6527 {
6528 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6529 cum->words = (cum->words + align - 1) & ~(align - 1);
6530 cum->words += words;
6531 }
6532 }
6533
6534 static void
6535 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6536 HOST_WIDE_INT words)
6537 {
6538 /* Everything else should have been passed indirectly (by reference). */
6539 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6540
6541 cum->words += words;
6542 if (cum->nregs > 0)
6543 {
6544 cum->nregs -= 1;
6545 cum->regno += 1;
6546 }
6547 }
6548
6549 /* Update the data in CUM to advance over an argument of mode MODE and
6550 data type TYPE. (TYPE is null for libcalls where that information
6551 may not be available.) */
6552
6553 static void
6554 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6555 const_tree type, bool named)
6556 {
6557 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6558 HOST_WIDE_INT bytes, words;
6559
6560 if (mode == BLKmode)
6561 bytes = int_size_in_bytes (type);
6562 else
6563 bytes = GET_MODE_SIZE (mode);
6564 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6565
6566 if (type)
6567 mode = type_natural_mode (type, NULL);
6568
6569 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6570 function_arg_advance_ms_64 (cum, bytes, words);
6571 else if (TARGET_64BIT)
6572 function_arg_advance_64 (cum, mode, type, words, named);
6573 else
6574 function_arg_advance_32 (cum, mode, type, bytes, words);
6575 }
6576
6577 /* Define where to put the arguments to a function.
6578 Value is zero to push the argument on the stack,
6579 or a hard register in which to store the argument.
6580
6581 MODE is the argument's machine mode.
6582 TYPE is the data type of the argument (as a tree).
6583 This is null for libcalls where that information may
6584 not be available.
6585 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6586 the preceding args and about the function being called.
6587 NAMED is nonzero if this argument is a named parameter
6588 (otherwise it is an extra parameter matching an ellipsis). */
6589
6590 static rtx
6591 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6592 enum machine_mode orig_mode, const_tree type,
6593 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6594 {
6595 static bool warnedsse, warnedmmx;
6596
6597 /* Avoid the AL settings for the Unix64 ABI. */
6598 if (mode == VOIDmode)
6599 return constm1_rtx;
6600
6601 switch (mode)
6602 {
6603 default:
6604 break;
6605
6606 case BLKmode:
6607 if (bytes < 0)
6608 break;
6609 /* FALLTHRU */
6610 case DImode:
6611 case SImode:
6612 case HImode:
6613 case QImode:
6614 if (words <= cum->nregs)
6615 {
6616 int regno = cum->regno;
6617
6618 /* Fastcall allocates the first two DWORD (SImode) or
6619 smaller arguments to ECX and EDX if it isn't an
6620 aggregate type. */
6621 if (cum->fastcall)
6622 {
6623 if (mode == BLKmode
6624 || mode == DImode
6625 || (type && AGGREGATE_TYPE_P (type)))
6626 break;
6627
6628 /* ECX, not EAX, is the first allocated register. */
6629 if (regno == AX_REG)
6630 regno = CX_REG;
6631 }
6632 return gen_rtx_REG (mode, regno);
6633 }
6634 break;
6635
6636 case DFmode:
6637 if (cum->float_in_sse < 2)
6638 break;
6639 case SFmode:
6640 if (cum->float_in_sse < 1)
6641 break;
6642 /* FALLTHRU */
6643 case TImode:
6644 /* In 32bit, we pass TImode in xmm registers. */
6645 case V16QImode:
6646 case V8HImode:
6647 case V4SImode:
6648 case V2DImode:
6649 case V4SFmode:
6650 case V2DFmode:
6651 if (!type || !AGGREGATE_TYPE_P (type))
6652 {
6653 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6654 {
6655 warnedsse = true;
6656 warning (0, "SSE vector argument without SSE enabled "
6657 "changes the ABI");
6658 }
6659 if (cum->sse_nregs)
6660 return gen_reg_or_parallel (mode, orig_mode,
6661 cum->sse_regno + FIRST_SSE_REG);
6662 }
6663 break;
6664
6665 case OImode:
6666 /* OImode shouldn't be used directly. */
6667 gcc_unreachable ();
6668
6669 case V8SFmode:
6670 case V8SImode:
6671 case V32QImode:
6672 case V16HImode:
6673 case V4DFmode:
6674 case V4DImode:
6675 if (!type || !AGGREGATE_TYPE_P (type))
6676 {
6677 if (cum->sse_nregs)
6678 return gen_reg_or_parallel (mode, orig_mode,
6679 cum->sse_regno + FIRST_SSE_REG);
6680 }
6681 break;
6682
6683 case V8QImode:
6684 case V4HImode:
6685 case V2SImode:
6686 case V2SFmode:
6687 case V1TImode:
6688 case V1DImode:
6689 if (!type || !AGGREGATE_TYPE_P (type))
6690 {
6691 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
6692 {
6693 warnedmmx = true;
6694 warning (0, "MMX vector argument without MMX enabled "
6695 "changes the ABI");
6696 }
6697 if (cum->mmx_nregs)
6698 return gen_reg_or_parallel (mode, orig_mode,
6699 cum->mmx_regno + FIRST_MMX_REG);
6700 }
6701 break;
6702 }
6703
6704 return NULL_RTX;
6705 }
6706
6707 static rtx
6708 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6709 enum machine_mode orig_mode, const_tree type, bool named)
6710 {
6711 /* Handle the hidden AL argument containing the number of SSE registers
6712 used by varargs x86-64 functions. */
6713 if (mode == VOIDmode)
6714 return GEN_INT (cum->maybe_vaarg
6715 ? (cum->sse_nregs < 0
6716 ? X86_64_SSE_REGPARM_MAX
6717 : cum->sse_regno)
6718 : -1);
6719
6720 switch (mode)
6721 {
6722 default:
6723 break;
6724
6725 case V8SFmode:
6726 case V8SImode:
6727 case V32QImode:
6728 case V16HImode:
6729 case V4DFmode:
6730 case V4DImode:
6731 /* Unnamed 256bit vector mode parameters are passed on stack. */
6732 if (!named)
6733 return NULL;
6734 break;
6735 }
6736
6737 return construct_container (mode, orig_mode, type, 0, cum->nregs,
6738 cum->sse_nregs,
6739 &x86_64_int_parameter_registers [cum->regno],
6740 cum->sse_regno);
6741 }
6742
6743 static rtx
6744 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6745 enum machine_mode orig_mode, bool named,
6746 HOST_WIDE_INT bytes)
6747 {
6748 unsigned int regno;
6749
6750 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
6751 We use a value of -2 to specify that the current function call is MS ABI. */
6752 if (mode == VOIDmode)
6753 return GEN_INT (-2);
6754
6755 /* If we've run out of registers, it goes on the stack. */
6756 if (cum->nregs == 0)
6757 return NULL_RTX;
6758
6759 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
6760
6761 /* Only floating point modes are passed in anything but integer regs. */
6762 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
6763 {
6764 if (named)
6765 regno = cum->regno + FIRST_SSE_REG;
6766 else
6767 {
6768 rtx t1, t2;
6769
6770 /* Unnamed floating parameters are passed in both the
6771 SSE and integer registers. */
6772 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
6773 t2 = gen_rtx_REG (mode, regno);
6774 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
6775 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
6776 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
6777 }
6778 }
6779 /* Handle aggregate types passed in a register. */
6780 if (orig_mode == BLKmode)
6781 {
6782 if (bytes > 0 && bytes <= 8)
6783 mode = (bytes > 4 ? DImode : SImode);
6784 if (mode == BLKmode)
6785 mode = DImode;
6786 }
6787
6788 return gen_reg_or_parallel (mode, orig_mode, regno);
6789 }
6790
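/* As a rough illustration of the MS_ABI handling above: for the third
   argument slot of a varargs call, a named double lands in a single SSE
   register, while an unnamed double is described by a PARALLEL so the
   value is available in both register files:

       named:    (reg:DF xmm2)
       unnamed:  (parallel:DF [(expr_list (reg:DF xmm2) (const_int 0))
                               (expr_list (reg:DF r8) (const_int 0))])  */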
6791 /* Return where to put the arguments to a function.
6792 Return zero to push the argument on the stack, or a hard register in which to store the argument.
6793
6794 MODE is the argument's machine mode. TYPE is the data type of the
6795 argument. It is null for libcalls where that information may not be
6796 available. CUM gives information about the preceding args and about
6797 the function being called. NAMED is nonzero if this argument is a
6798 named parameter (otherwise it is an extra parameter matching an
6799 ellipsis). */
6800
6801 static rtx
6802 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
6803 const_tree type, bool named)
6804 {
6805 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6806 enum machine_mode mode = omode;
6807 HOST_WIDE_INT bytes, words;
6808 rtx arg;
6809
6810 if (mode == BLKmode)
6811 bytes = int_size_in_bytes (type);
6812 else
6813 bytes = GET_MODE_SIZE (mode);
6814 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6815
6816 /* To simplify the code below, represent vector types with a vector mode
6817 even if MMX/SSE are not active. */
6818 if (type && TREE_CODE (type) == VECTOR_TYPE)
6819 mode = type_natural_mode (type, cum);
6820
6821 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6822 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
6823 else if (TARGET_64BIT)
6824 arg = function_arg_64 (cum, mode, omode, type, named);
6825 else
6826 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
6827
6828 if (TARGET_VZEROUPPER && function_pass_avx256_p (arg))
6829 {
6830 /* This argument uses 256bit AVX modes. */
6831 if (cum->caller)
6832 cfun->machine->callee_pass_avx256_p = true;
6833 else
6834 cfun->machine->caller_pass_avx256_p = true;
6835 }
6836
6837 return arg;
6838 }
6839
6840 /* A C expression that indicates when an argument must be passed by
6841 reference. If nonzero for an argument, a copy of that argument is
6842 made in memory and a pointer to the argument is passed instead of
6843 the argument itself. The pointer is passed in whatever way is
6844 appropriate for passing a pointer to that type. */
6845
6846 static bool
6847 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
6848 enum machine_mode mode ATTRIBUTE_UNUSED,
6849 const_tree type, bool named ATTRIBUTE_UNUSED)
6850 {
6851 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6852
6853 /* See Windows x64 Software Convention. */
6854 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6855 {
6856 int msize = (int) GET_MODE_SIZE (mode);
6857 if (type)
6858 {
6859 /* Arrays are passed by reference. */
6860 if (TREE_CODE (type) == ARRAY_TYPE)
6861 return true;
6862
6863 if (AGGREGATE_TYPE_P (type))
6864 {
6865 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
6866 are passed by reference. */
6867 msize = int_size_in_bytes (type);
6868 }
6869 }
6870
6871 /* __m128 is passed by reference. */
6872 switch (msize) {
6873 case 1: case 2: case 4: case 8:
6874 break;
6875 default:
6876 return true;
6877 }
6878 }
6879 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
6880 return 1;
6881
6882 return 0;
6883 }
6884
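/* For illustration, how the Windows x64 rule above classifies a few
   argument types (usual x86-64 sizes assumed):

       struct { int a, b; }       8 bytes  -> passed by value in a register
       struct { int a, b, c; }    12 bytes -> passed by reference
       __m128                     16 bytes -> passed by reference
       double                     8 bytes  -> passed by value            */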
6885 /* Return true when TYPE should be 128bit aligned for 32bit argument
6886 passing ABI. XXX: This function is obsolete and is only used for
6887 checking psABI compatibility with previous versions of GCC. */
6888
6889 static bool
6890 ix86_compat_aligned_value_p (const_tree type)
6891 {
6892 enum machine_mode mode = TYPE_MODE (type);
6893 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
6894 || mode == TDmode
6895 || mode == TFmode
6896 || mode == TCmode)
6897 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
6898 return true;
6899 if (TYPE_ALIGN (type) < 128)
6900 return false;
6901
6902 if (AGGREGATE_TYPE_P (type))
6903 {
6904 /* Walk the aggregates recursively. */
6905 switch (TREE_CODE (type))
6906 {
6907 case RECORD_TYPE:
6908 case UNION_TYPE:
6909 case QUAL_UNION_TYPE:
6910 {
6911 tree field;
6912
6913 /* Walk all the structure fields. */
6914 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6915 {
6916 if (TREE_CODE (field) == FIELD_DECL
6917 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
6918 return true;
6919 }
6920 break;
6921 }
6922
6923 case ARRAY_TYPE:
6924 	/* Just for use if some language passes arrays by value.  */
6925 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
6926 return true;
6927 break;
6928
6929 default:
6930 gcc_unreachable ();
6931 }
6932 }
6933 return false;
6934 }
6935
6936 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
6937 XXX: This function is obsolete and is only used for checking psABI
6938 compatibility with previous versions of GCC. */
6939
6940 static unsigned int
6941 ix86_compat_function_arg_boundary (enum machine_mode mode,
6942 const_tree type, unsigned int align)
6943 {
6944 /* In 32bit, only _Decimal128 and __float128 are aligned to their
6945 natural boundaries. */
6946 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
6947 {
6948 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
6949 make an exception for SSE modes since these require 128bit
6950 alignment.
6951
6952 The handling here differs from field_alignment. ICC aligns MMX
6953 arguments to 4 byte boundaries, while structure fields are aligned
6954 to 8 byte boundaries. */
6955 if (!type)
6956 {
6957 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
6958 align = PARM_BOUNDARY;
6959 }
6960 else
6961 {
6962 if (!ix86_compat_aligned_value_p (type))
6963 align = PARM_BOUNDARY;
6964 }
6965 }
6966 if (align > BIGGEST_ALIGNMENT)
6967 align = BIGGEST_ALIGNMENT;
6968 return align;
6969 }
6970
6971 /* Return true when TYPE should be 128bit aligned for 32bit argument
6972 passing ABI. */
6973
6974 static bool
6975 ix86_contains_aligned_value_p (const_tree type)
6976 {
6977 enum machine_mode mode = TYPE_MODE (type);
6978
6979 if (mode == XFmode || mode == XCmode)
6980 return false;
6981
6982 if (TYPE_ALIGN (type) < 128)
6983 return false;
6984
6985 if (AGGREGATE_TYPE_P (type))
6986 {
6987 /* Walk the aggregates recursively. */
6988 switch (TREE_CODE (type))
6989 {
6990 case RECORD_TYPE:
6991 case UNION_TYPE:
6992 case QUAL_UNION_TYPE:
6993 {
6994 tree field;
6995
6996 /* Walk all the structure fields. */
6997 for (field = TYPE_FIELDS (type);
6998 field;
6999 field = DECL_CHAIN (field))
7000 {
7001 if (TREE_CODE (field) == FIELD_DECL
7002 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7003 return true;
7004 }
7005 break;
7006 }
7007
7008 case ARRAY_TYPE:
7009 	/* Just for use if some language passes arrays by value.  */
7010 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7011 return true;
7012 break;
7013
7014 default:
7015 gcc_unreachable ();
7016 }
7017 }
7018 else
7019 return TYPE_ALIGN (type) >= 128;
7020
7021 return false;
7022 }
7023
7024 /* Gives the alignment boundary, in bits, of an argument with the
7025 specified mode and type. */
7026
7027 static unsigned int
7028 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7029 {
7030 unsigned int align;
7031 if (type)
7032 {
7033 	 /* Since the main variant type is used for the call, convert the
7034 	    type to its main variant.  */
7035 type = TYPE_MAIN_VARIANT (type);
7036 align = TYPE_ALIGN (type);
7037 }
7038 else
7039 align = GET_MODE_ALIGNMENT (mode);
7040 if (align < PARM_BOUNDARY)
7041 align = PARM_BOUNDARY;
7042 else
7043 {
7044 static bool warned;
7045 unsigned int saved_align = align;
7046
7047 if (!TARGET_64BIT)
7048 {
7049 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7050 if (!type)
7051 {
7052 if (mode == XFmode || mode == XCmode)
7053 align = PARM_BOUNDARY;
7054 }
7055 else if (!ix86_contains_aligned_value_p (type))
7056 align = PARM_BOUNDARY;
7057
7058 if (align < 128)
7059 align = PARM_BOUNDARY;
7060 }
7061
7062 if (warn_psabi
7063 && !warned
7064 && align != ix86_compat_function_arg_boundary (mode, type,
7065 saved_align))
7066 {
7067 warned = true;
7068 inform (input_location,
7069 "The ABI for passing parameters with %d-byte"
7070 " alignment has changed in GCC 4.6",
7071 align / BITS_PER_UNIT);
7072 }
7073 }
7074
7075 return align;
7076 }
7077
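/* For illustration, typical results of the boundary computation above
   for the 32-bit ABI (values in bits):

       int                      -> 32  (PARM_BOUNDARY)
       double                   -> 32  (alignment below 128 bits)
       __m128 / 16-byte vector  -> 128
       long double (XFmode)     -> 32  (explicit i386 ABI exception)    */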
7078 /* Return true if N is a possible register number of function value. */
7079
7080 static bool
7081 ix86_function_value_regno_p (const unsigned int regno)
7082 {
7083 switch (regno)
7084 {
7085 case AX_REG:
7086 return true;
7087
7088 case FIRST_FLOAT_REG:
7089 	 /* TODO: The function should depend on the current function's ABI,
7090 	    but builtins.c would then need updating.  Therefore we use the
7091 	    default ABI.  */
7092 if (TARGET_64BIT && ix86_abi == MS_ABI)
7093 return false;
7094 return TARGET_FLOAT_RETURNS_IN_80387;
7095
7096 case FIRST_SSE_REG:
7097 return TARGET_SSE;
7098
7099 case FIRST_MMX_REG:
7100 if (TARGET_MACHO || TARGET_64BIT)
7101 return false;
7102 return TARGET_MMX;
7103 }
7104
7105 return false;
7106 }
7107
7108 /* Define how to find the value returned by a function.
7109 VALTYPE is the data type of the value (as a tree).
7110 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7111 otherwise, FUNC is 0. */
7112
7113 static rtx
7114 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7115 const_tree fntype, const_tree fn)
7116 {
7117 unsigned int regno;
7118
7119 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7120 we normally prevent this case when mmx is not available. However
7121 some ABIs may require the result to be returned like DImode. */
7122 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7123 regno = FIRST_MMX_REG;
7124
7125 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7126 we prevent this case when sse is not available. However some ABIs
7127 may require the result to be returned like integer TImode. */
7128 else if (mode == TImode
7129 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7130 regno = FIRST_SSE_REG;
7131
7132 /* 32-byte vector modes in %ymm0. */
7133 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7134 regno = FIRST_SSE_REG;
7135
7136 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7137 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7138 regno = FIRST_FLOAT_REG;
7139 else
7140 /* Most things go in %eax. */
7141 regno = AX_REG;
7142
7143 /* Override FP return register with %xmm0 for local functions when
7144 SSE math is enabled or for functions with sseregparm attribute. */
7145 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7146 {
7147 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7148 if ((sse_level >= 1 && mode == SFmode)
7149 || (sse_level == 2 && mode == DFmode))
7150 regno = FIRST_SSE_REG;
7151 }
7152
7153 /* OImode shouldn't be used directly. */
7154 gcc_assert (mode != OImode);
7155
7156 return gen_rtx_REG (orig_mode, regno);
7157 }
7158
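/* For illustration, where the 32-bit code above places common return
   values (assuming default options and no sseregparm attribute):

       int, pointers           -> %eax
       float, double           -> %st(0)  (unless -mno-fp-ret-in-387)
       8-byte vector (__m64)   -> %mm0
       16-byte vector (__m128) -> %xmm0
       32-byte vector (__m256) -> %ymm0                                 */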
7159 static rtx
7160 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7161 const_tree valtype)
7162 {
7163 rtx ret;
7164
7165 /* Handle libcalls, which don't provide a type node. */
7166 if (valtype == NULL)
7167 {
7168 unsigned int regno;
7169
7170 switch (mode)
7171 {
7172 case SFmode:
7173 case SCmode:
7174 case DFmode:
7175 case DCmode:
7176 case TFmode:
7177 case SDmode:
7178 case DDmode:
7179 case TDmode:
7180 regno = FIRST_SSE_REG;
7181 break;
7182 case XFmode:
7183 case XCmode:
7184 regno = FIRST_FLOAT_REG;
7185 break;
7186 case TCmode:
7187 return NULL;
7188 default:
7189 regno = AX_REG;
7190 }
7191
7192 return gen_rtx_REG (mode, regno);
7193 }
7194 else if (POINTER_TYPE_P (valtype))
7195 {
7196 /* Pointers are always returned in Pmode. */
7197 mode = Pmode;
7198 }
7199
7200 ret = construct_container (mode, orig_mode, valtype, 1,
7201 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7202 x86_64_int_return_registers, 0);
7203
7204 	 /* For zero-sized structures, construct_container returns NULL, but we
7205 	    need to keep the rest of the compiler happy by returning a meaningful value.  */
7206 if (!ret)
7207 ret = gen_rtx_REG (orig_mode, AX_REG);
7208
7209 return ret;
7210 }
7211
7212 static rtx
7213 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode)
7214 {
7215 unsigned int regno = AX_REG;
7216
7217 if (TARGET_SSE)
7218 {
7219 switch (GET_MODE_SIZE (mode))
7220 {
7221 case 16:
7222 if((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7223 && !COMPLEX_MODE_P (mode))
7224 regno = FIRST_SSE_REG;
7225 break;
7226 case 8:
7227 case 4:
7228 if (mode == SFmode || mode == DFmode)
7229 regno = FIRST_SSE_REG;
7230 break;
7231 default:
7232 break;
7233 }
7234 }
7235 return gen_rtx_REG (orig_mode, regno);
7236 }
7237
7238 static rtx
7239 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7240 enum machine_mode orig_mode, enum machine_mode mode)
7241 {
7242 const_tree fn, fntype;
7243
7244 fn = NULL_TREE;
7245 if (fntype_or_decl && DECL_P (fntype_or_decl))
7246 fn = fntype_or_decl;
7247 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7248
7249 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7250 return function_value_ms_64 (orig_mode, mode);
7251 else if (TARGET_64BIT)
7252 return function_value_64 (orig_mode, mode, valtype);
7253 else
7254 return function_value_32 (orig_mode, mode, fntype, fn);
7255 }
7256
7257 static rtx
7258 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7259 bool outgoing ATTRIBUTE_UNUSED)
7260 {
7261 enum machine_mode mode, orig_mode;
7262
7263 orig_mode = TYPE_MODE (valtype);
7264 mode = type_natural_mode (valtype, NULL);
7265 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7266 }
7267
7268 /* Pointer function arguments and return values are promoted to Pmode. */
7269
7270 static enum machine_mode
7271 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7272 int *punsignedp, const_tree fntype,
7273 int for_return)
7274 {
7275 if (type != NULL_TREE && POINTER_TYPE_P (type))
7276 {
7277 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7278 return Pmode;
7279 }
7280 return default_promote_function_mode (type, mode, punsignedp, fntype,
7281 for_return);
7282 }
7283
7284 rtx
7285 ix86_libcall_value (enum machine_mode mode)
7286 {
7287 return ix86_function_value_1 (NULL, NULL, mode, mode);
7288 }
7289
7290 /* Return true iff type is returned in memory. */
7291
7292 static bool ATTRIBUTE_UNUSED
7293 return_in_memory_32 (const_tree type, enum machine_mode mode)
7294 {
7295 HOST_WIDE_INT size;
7296
7297 if (mode == BLKmode)
7298 return true;
7299
7300 size = int_size_in_bytes (type);
7301
7302 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7303 return false;
7304
7305 if (VECTOR_MODE_P (mode) || mode == TImode)
7306 {
7307 /* User-created vectors small enough to fit in EAX. */
7308 if (size < 8)
7309 return false;
7310
7311 	 /* MMX/3dNow values are returned in MM0,
7312 	    except when it doesn't exist or the ABI prescribes otherwise.  */
7313 if (size == 8)
7314 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7315
7316 /* SSE values are returned in XMM0, except when it doesn't exist. */
7317 if (size == 16)
7318 return !TARGET_SSE;
7319
7320 /* AVX values are returned in YMM0, except when it doesn't exist. */
7321 if (size == 32)
7322 return !TARGET_AVX;
7323 }
7324
7325 if (mode == XFmode)
7326 return false;
7327
7328 if (size > 12)
7329 return true;
7330
7331 /* OImode shouldn't be used directly. */
7332 gcc_assert (mode != OImode);
7333
7334 return false;
7335 }
7336
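/* For illustration, how the 32-bit rules above classify a few return
   types (no SUBTARGET overrides assumed):

       long double (XFmode)        -> returned in registers (%st(0))
       16-byte struct              -> returned in memory (BLKmode, size > 12)
       __m128 with SSE enabled     -> returned in %xmm0
       __m128 without SSE enabled  -> returned in memory                */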
7337 static bool ATTRIBUTE_UNUSED
7338 return_in_memory_64 (const_tree type, enum machine_mode mode)
7339 {
7340 int needed_intregs, needed_sseregs;
7341 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7342 }
7343
7344 static bool ATTRIBUTE_UNUSED
7345 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7346 {
7347 HOST_WIDE_INT size = int_size_in_bytes (type);
7348
7349 /* __m128 is returned in xmm0. */
7350 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7351 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7352 return false;
7353
7354 /* Otherwise, the size must be exactly in [1248]. */
7355 return size != 1 && size != 2 && size != 4 && size != 8;
7356 }
7357
7358 static bool
7359 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7360 {
7361 #ifdef SUBTARGET_RETURN_IN_MEMORY
7362 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7363 #else
7364 const enum machine_mode mode = type_natural_mode (type, NULL);
7365
7366 if (TARGET_64BIT)
7367 {
7368 if (ix86_function_type_abi (fntype) == MS_ABI)
7369 return return_in_memory_ms_64 (type, mode);
7370 else
7371 return return_in_memory_64 (type, mode);
7372 }
7373 else
7374 return return_in_memory_32 (type, mode);
7375 #endif
7376 }
7377
7378 /* When returning SSE vector types, we have a choice of either
7379 (1) being abi incompatible with a -march switch, or
7380 (2) generating an error.
7381 Given no good solution, I think the safest thing is one warning.
7382 The user won't be able to use -Werror, but....
7383
7384 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7385 called in response to actually generating a caller or callee that
7386 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7387 via aggregate_value_p for general type probing from tree-ssa. */
7388
7389 static rtx
7390 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7391 {
7392 static bool warnedsse, warnedmmx;
7393
7394 if (!TARGET_64BIT && type)
7395 {
7396 /* Look at the return type of the function, not the function type. */
7397 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7398
7399 if (!TARGET_SSE && !warnedsse)
7400 {
7401 if (mode == TImode
7402 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7403 {
7404 warnedsse = true;
7405 warning (0, "SSE vector return without SSE enabled "
7406 "changes the ABI");
7407 }
7408 }
7409
7410 if (!TARGET_MMX && !warnedmmx)
7411 {
7412 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7413 {
7414 warnedmmx = true;
7415 warning (0, "MMX vector return without MMX enabled "
7416 "changes the ABI");
7417 }
7418 }
7419 }
7420
7421 return NULL;
7422 }
7423
7424 \f
7425 /* Create the va_list data type. */
7426
7427 /* Returns the calling convention specific va_list data type.
7428    The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI.  */
7429
7430 static tree
7431 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7432 {
7433 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7434
7435   /* For i386 we use a plain pointer to the argument area.  */
7436 if (!TARGET_64BIT || abi == MS_ABI)
7437 return build_pointer_type (char_type_node);
7438
7439 record = lang_hooks.types.make_type (RECORD_TYPE);
7440 type_decl = build_decl (BUILTINS_LOCATION,
7441 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7442
7443 f_gpr = build_decl (BUILTINS_LOCATION,
7444 FIELD_DECL, get_identifier ("gp_offset"),
7445 unsigned_type_node);
7446 f_fpr = build_decl (BUILTINS_LOCATION,
7447 FIELD_DECL, get_identifier ("fp_offset"),
7448 unsigned_type_node);
7449 f_ovf = build_decl (BUILTINS_LOCATION,
7450 FIELD_DECL, get_identifier ("overflow_arg_area"),
7451 ptr_type_node);
7452 f_sav = build_decl (BUILTINS_LOCATION,
7453 FIELD_DECL, get_identifier ("reg_save_area"),
7454 ptr_type_node);
7455
7456 va_list_gpr_counter_field = f_gpr;
7457 va_list_fpr_counter_field = f_fpr;
7458
7459 DECL_FIELD_CONTEXT (f_gpr) = record;
7460 DECL_FIELD_CONTEXT (f_fpr) = record;
7461 DECL_FIELD_CONTEXT (f_ovf) = record;
7462 DECL_FIELD_CONTEXT (f_sav) = record;
7463
7464 TYPE_STUB_DECL (record) = type_decl;
7465 TYPE_NAME (record) = type_decl;
7466 TYPE_FIELDS (record) = f_gpr;
7467 DECL_CHAIN (f_gpr) = f_fpr;
7468 DECL_CHAIN (f_fpr) = f_ovf;
7469 DECL_CHAIN (f_ovf) = f_sav;
7470
7471 layout_type (record);
7472
7473 /* The correct type is an array type of one element. */
7474 return build_array_type (record, build_index_type (size_zero_node));
7475 }
7476
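/* For reference, the SYSV_ABI record built above corresponds to the usual
   x86-64 va_list layout, roughly:

       typedef struct __va_list_tag {
         unsigned int gp_offset;
         unsigned int fp_offset;
         void *overflow_arg_area;
         void *reg_save_area;
       } __builtin_va_list[1];                                           */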
7477 /* Set up the builtin va_list data type and, for 64-bit, the additional
7478    calling convention specific va_list data types.  */
7479
7480 static tree
7481 ix86_build_builtin_va_list (void)
7482 {
7483 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7484
7485 /* Initialize abi specific va_list builtin types. */
7486 if (TARGET_64BIT)
7487 {
7488 tree t;
7489 if (ix86_abi == MS_ABI)
7490 {
7491 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7492 if (TREE_CODE (t) != RECORD_TYPE)
7493 t = build_variant_type_copy (t);
7494 sysv_va_list_type_node = t;
7495 }
7496 else
7497 {
7498 t = ret;
7499 if (TREE_CODE (t) != RECORD_TYPE)
7500 t = build_variant_type_copy (t);
7501 sysv_va_list_type_node = t;
7502 }
7503 if (ix86_abi != MS_ABI)
7504 {
7505 t = ix86_build_builtin_va_list_abi (MS_ABI);
7506 if (TREE_CODE (t) != RECORD_TYPE)
7507 t = build_variant_type_copy (t);
7508 ms_va_list_type_node = t;
7509 }
7510 else
7511 {
7512 t = ret;
7513 if (TREE_CODE (t) != RECORD_TYPE)
7514 t = build_variant_type_copy (t);
7515 ms_va_list_type_node = t;
7516 }
7517 }
7518
7519 return ret;
7520 }
7521
7522 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7523
7524 static void
7525 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7526 {
7527 rtx save_area, mem;
7528 alias_set_type set;
7529 int i, max;
7530
7531 /* GPR size of varargs save area. */
7532 if (cfun->va_list_gpr_size)
7533 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7534 else
7535 ix86_varargs_gpr_size = 0;
7536
7537 /* FPR size of varargs save area. We don't need it if we don't pass
7538 anything in SSE registers. */
7539 if (TARGET_SSE && cfun->va_list_fpr_size)
7540 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7541 else
7542 ix86_varargs_fpr_size = 0;
7543
7544 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7545 return;
7546
7547 save_area = frame_pointer_rtx;
7548 set = get_varargs_alias_set ();
7549
7550 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7551 if (max > X86_64_REGPARM_MAX)
7552 max = X86_64_REGPARM_MAX;
7553
7554 for (i = cum->regno; i < max; i++)
7555 {
7556 mem = gen_rtx_MEM (Pmode,
7557 plus_constant (save_area, i * UNITS_PER_WORD));
7558 MEM_NOTRAP_P (mem) = 1;
7559 set_mem_alias_set (mem, set);
7560 emit_move_insn (mem, gen_rtx_REG (Pmode,
7561 x86_64_int_parameter_registers[i]));
7562 }
7563
7564 if (ix86_varargs_fpr_size)
7565 {
7566 enum machine_mode smode;
7567 rtx label, test;
7568
7569       /* Now emit code to save SSE registers.  The AX parameter contains the
7570 	 number of SSE parameter registers used to call this function, though all
7571 	 we actually check here is the zero/non-zero status.  */
7572
7573 label = gen_label_rtx ();
7574 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7575 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7576 label));
7577
7578 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7579 we used movdqa (i.e. TImode) instead? Perhaps even better would
7580 be if we could determine the real mode of the data, via a hook
7581 into pass_stdarg. Ignore all that for now. */
7582 smode = V4SFmode;
7583 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7584 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7585
7586 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7587 if (max > X86_64_SSE_REGPARM_MAX)
7588 max = X86_64_SSE_REGPARM_MAX;
7589
7590 for (i = cum->sse_regno; i < max; ++i)
7591 {
7592 mem = plus_constant (save_area, i * 16 + ix86_varargs_gpr_size);
7593 mem = gen_rtx_MEM (smode, mem);
7594 MEM_NOTRAP_P (mem) = 1;
7595 set_mem_alias_set (mem, set);
7596 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7597
7598 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7599 }
7600
7601 emit_label (label);
7602 }
7603 }
7604
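/* For illustration, the register save area laid out by the code above when
   both the GPR and FPR parts are needed (X86_64_REGPARM_MAX == 6,
   X86_64_SSE_REGPARM_MAX == 8):

       offset   0 ..  47   %rdi, %rsi, %rdx, %rcx, %r8, %r9  (8 bytes each)
       offset  48 .. 175   %xmm0 .. %xmm7                   (16 bytes each)

   The SSE stores are skipped at run time when %al is zero.              */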
7605 static void
7606 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7607 {
7608 alias_set_type set = get_varargs_alias_set ();
7609 int i;
7610
7611   /* Reset to zero, as a sysv va_arg may have been used
7612      before.  */
7613 ix86_varargs_gpr_size = 0;
7614 ix86_varargs_fpr_size = 0;
7615
7616 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7617 {
7618 rtx reg, mem;
7619
7620 mem = gen_rtx_MEM (Pmode,
7621 plus_constant (virtual_incoming_args_rtx,
7622 i * UNITS_PER_WORD));
7623 MEM_NOTRAP_P (mem) = 1;
7624 set_mem_alias_set (mem, set);
7625
7626 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7627 emit_move_insn (mem, reg);
7628 }
7629 }
7630
7631 static void
7632 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7633 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7634 int no_rtl)
7635 {
7636 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7637 CUMULATIVE_ARGS next_cum;
7638 tree fntype;
7639
7640 /* This argument doesn't appear to be used anymore. Which is good,
7641 because the old code here didn't suppress rtl generation. */
7642 gcc_assert (!no_rtl);
7643
7644 if (!TARGET_64BIT)
7645 return;
7646
7647 fntype = TREE_TYPE (current_function_decl);
7648
7649 /* For varargs, we do not want to skip the dummy va_dcl argument.
7650 For stdargs, we do want to skip the last named argument. */
7651 next_cum = *cum;
7652 if (stdarg_p (fntype))
7653 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7654 true);
7655
7656 if (cum->call_abi == MS_ABI)
7657 setup_incoming_varargs_ms_64 (&next_cum);
7658 else
7659 setup_incoming_varargs_64 (&next_cum);
7660 }
7661
7662 /* Checks whether TYPE is a va_list of the plain char * kind.  */
7663
7664 static bool
7665 is_va_list_char_pointer (tree type)
7666 {
7667 tree canonic;
7668
7669 /* For 32-bit it is always true. */
7670 if (!TARGET_64BIT)
7671 return true;
7672 canonic = ix86_canonical_va_list_type (type);
7673 return (canonic == ms_va_list_type_node
7674 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7675 }
7676
7677 /* Implement va_start. */
7678
7679 static void
7680 ix86_va_start (tree valist, rtx nextarg)
7681 {
7682 HOST_WIDE_INT words, n_gpr, n_fpr;
7683 tree f_gpr, f_fpr, f_ovf, f_sav;
7684 tree gpr, fpr, ovf, sav, t;
7685 tree type;
7686 rtx ovf_rtx;
7687
7688 if (flag_split_stack
7689 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7690 {
7691 unsigned int scratch_regno;
7692
7693 /* When we are splitting the stack, we can't refer to the stack
7694 arguments using internal_arg_pointer, because they may be on
7695 the old stack. The split stack prologue will arrange to
7696 leave a pointer to the old stack arguments in a scratch
7697 register, which we here copy to a pseudo-register. The split
7698 stack prologue can't set the pseudo-register directly because
7699 it (the prologue) runs before any registers have been saved. */
7700
7701 scratch_regno = split_stack_prologue_scratch_regno ();
7702 if (scratch_regno != INVALID_REGNUM)
7703 {
7704 rtx reg, seq;
7705
7706 reg = gen_reg_rtx (Pmode);
7707 cfun->machine->split_stack_varargs_pointer = reg;
7708
7709 start_sequence ();
7710 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
7711 seq = get_insns ();
7712 end_sequence ();
7713
7714 push_topmost_sequence ();
7715 emit_insn_after (seq, entry_of_function ());
7716 pop_topmost_sequence ();
7717 }
7718 }
7719
7720   /* Only 64bit targets need something special.  */
7721 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7722 {
7723 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7724 std_expand_builtin_va_start (valist, nextarg);
7725 else
7726 {
7727 rtx va_r, next;
7728
7729 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
7730 next = expand_binop (ptr_mode, add_optab,
7731 cfun->machine->split_stack_varargs_pointer,
7732 crtl->args.arg_offset_rtx,
7733 NULL_RTX, 0, OPTAB_LIB_WIDEN);
7734 convert_move (va_r, next, 0);
7735 }
7736 return;
7737 }
7738
7739 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7740 f_fpr = DECL_CHAIN (f_gpr);
7741 f_ovf = DECL_CHAIN (f_fpr);
7742 f_sav = DECL_CHAIN (f_ovf);
7743
7744 valist = build_simple_mem_ref (valist);
7745 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
7746 /* The following should be folded into the MEM_REF offset. */
7747 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
7748 f_gpr, NULL_TREE);
7749 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
7750 f_fpr, NULL_TREE);
7751 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
7752 f_ovf, NULL_TREE);
7753 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
7754 f_sav, NULL_TREE);
7755
7756 /* Count number of gp and fp argument registers used. */
7757 words = crtl->args.info.words;
7758 n_gpr = crtl->args.info.regno;
7759 n_fpr = crtl->args.info.sse_regno;
7760
7761 if (cfun->va_list_gpr_size)
7762 {
7763 type = TREE_TYPE (gpr);
7764 t = build2 (MODIFY_EXPR, type,
7765 gpr, build_int_cst (type, n_gpr * 8));
7766 TREE_SIDE_EFFECTS (t) = 1;
7767 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7768 }
7769
7770 if (TARGET_SSE && cfun->va_list_fpr_size)
7771 {
7772 type = TREE_TYPE (fpr);
7773 t = build2 (MODIFY_EXPR, type, fpr,
7774 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
7775 TREE_SIDE_EFFECTS (t) = 1;
7776 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7777 }
7778
7779 /* Find the overflow area. */
7780 type = TREE_TYPE (ovf);
7781 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7782 ovf_rtx = crtl->args.internal_arg_pointer;
7783 else
7784 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
7785 t = make_tree (type, ovf_rtx);
7786 if (words != 0)
7787 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
7788 t = build2 (MODIFY_EXPR, type, ovf, t);
7789 TREE_SIDE_EFFECTS (t) = 1;
7790 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7791
7792 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
7793 {
7794       /* Find the register save area.
7795 	 The function prologue saves it right above the stack frame.  */
7796 type = TREE_TYPE (sav);
7797 t = make_tree (type, frame_pointer_rtx);
7798 if (!ix86_varargs_gpr_size)
7799 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
7800 t = build2 (MODIFY_EXPR, type, sav, t);
7801 TREE_SIDE_EFFECTS (t) = 1;
7802 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7803 }
7804 }
7805
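/* For illustration, the initialization done above for a function such as

       void f (int a, double b, ...)

   whose named arguments consume one GPR and one SSE register: gp_offset
   is set to 1*8 = 8, fp_offset to 6*8 + 1*16 = 64, overflow_arg_area to
   the first stack argument, and reg_save_area to the block filled in by
   setup_incoming_varargs_64.                                            */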
7806 /* Implement va_arg. */
7807
7808 static tree
7809 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
7810 gimple_seq *post_p)
7811 {
7812 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
7813 tree f_gpr, f_fpr, f_ovf, f_sav;
7814 tree gpr, fpr, ovf, sav, t;
7815 int size, rsize;
7816 tree lab_false, lab_over = NULL_TREE;
7817 tree addr, t2;
7818 rtx container;
7819 int indirect_p = 0;
7820 tree ptrtype;
7821 enum machine_mode nat_mode;
7822 unsigned int arg_boundary;
7823
7824   /* Only 64bit targets need something special.  */
7825 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7826 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
7827
7828 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7829 f_fpr = DECL_CHAIN (f_gpr);
7830 f_ovf = DECL_CHAIN (f_fpr);
7831 f_sav = DECL_CHAIN (f_ovf);
7832
7833 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
7834 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
7835 valist = build_va_arg_indirect_ref (valist);
7836 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
7837 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
7838 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
7839
7840 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7841 if (indirect_p)
7842 type = build_pointer_type (type);
7843 size = int_size_in_bytes (type);
7844 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7845
7846 nat_mode = type_natural_mode (type, NULL);
7847 switch (nat_mode)
7848 {
7849 case V8SFmode:
7850 case V8SImode:
7851 case V32QImode:
7852 case V16HImode:
7853 case V4DFmode:
7854 case V4DImode:
7855       /* Unnamed 256bit vector mode parameters are passed on the stack.  */
7856 if (!TARGET_64BIT_MS_ABI)
7857 {
7858 container = NULL;
7859 break;
7860 }
7861
7862 default:
7863 container = construct_container (nat_mode, TYPE_MODE (type),
7864 type, 0, X86_64_REGPARM_MAX,
7865 X86_64_SSE_REGPARM_MAX, intreg,
7866 0);
7867 break;
7868 }
7869
7870 /* Pull the value out of the saved registers. */
7871
7872 addr = create_tmp_var (ptr_type_node, "addr");
7873
7874 if (container)
7875 {
7876 int needed_intregs, needed_sseregs;
7877 bool need_temp;
7878 tree int_addr, sse_addr;
7879
7880 lab_false = create_artificial_label (UNKNOWN_LOCATION);
7881 lab_over = create_artificial_label (UNKNOWN_LOCATION);
7882
7883 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
7884
7885 need_temp = (!REG_P (container)
7886 && ((needed_intregs && TYPE_ALIGN (type) > 64)
7887 || TYPE_ALIGN (type) > 128));
7888
7889       /* If we are passing a structure, verify that it is a consecutive block
7890 	 in the register save area.  If not, we need to do moves.  */
7891 if (!need_temp && !REG_P (container))
7892 {
7893 /* Verify that all registers are strictly consecutive */
7894 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
7895 {
7896 int i;
7897
7898 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
7899 {
7900 rtx slot = XVECEXP (container, 0, i);
7901 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
7902 || INTVAL (XEXP (slot, 1)) != i * 16)
7903 need_temp = 1;
7904 }
7905 }
7906 else
7907 {
7908 int i;
7909
7910 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
7911 {
7912 rtx slot = XVECEXP (container, 0, i);
7913 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
7914 || INTVAL (XEXP (slot, 1)) != i * 8)
7915 need_temp = 1;
7916 }
7917 }
7918 }
7919 if (!need_temp)
7920 {
7921 int_addr = addr;
7922 sse_addr = addr;
7923 }
7924 else
7925 {
7926 int_addr = create_tmp_var (ptr_type_node, "int_addr");
7927 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
7928 }
7929
7930 /* First ensure that we fit completely in registers. */
7931 if (needed_intregs)
7932 {
7933 t = build_int_cst (TREE_TYPE (gpr),
7934 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
7935 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
7936 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
7937 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
7938 gimplify_and_add (t, pre_p);
7939 }
7940 if (needed_sseregs)
7941 {
7942 t = build_int_cst (TREE_TYPE (fpr),
7943 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
7944 + X86_64_REGPARM_MAX * 8);
7945 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
7946 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
7947 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
7948 gimplify_and_add (t, pre_p);
7949 }
7950
7951 /* Compute index to start of area used for integer regs. */
7952 if (needed_intregs)
7953 {
7954 /* int_addr = gpr + sav; */
7955 t = fold_build_pointer_plus (sav, gpr);
7956 gimplify_assign (int_addr, t, pre_p);
7957 }
7958 if (needed_sseregs)
7959 {
7960 /* sse_addr = fpr + sav; */
7961 t = fold_build_pointer_plus (sav, fpr);
7962 gimplify_assign (sse_addr, t, pre_p);
7963 }
7964 if (need_temp)
7965 {
7966 int i, prev_size = 0;
7967 tree temp = create_tmp_var (type, "va_arg_tmp");
7968
7969 /* addr = &temp; */
7970 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
7971 gimplify_assign (addr, t, pre_p);
7972
7973 for (i = 0; i < XVECLEN (container, 0); i++)
7974 {
7975 rtx slot = XVECEXP (container, 0, i);
7976 rtx reg = XEXP (slot, 0);
7977 enum machine_mode mode = GET_MODE (reg);
7978 tree piece_type;
7979 tree addr_type;
7980 tree daddr_type;
7981 tree src_addr, src;
7982 int src_offset;
7983 tree dest_addr, dest;
7984 int cur_size = GET_MODE_SIZE (mode);
7985
7986 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
7987 prev_size = INTVAL (XEXP (slot, 1));
7988 if (prev_size + cur_size > size)
7989 {
7990 cur_size = size - prev_size;
7991 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
7992 if (mode == BLKmode)
7993 mode = QImode;
7994 }
7995 piece_type = lang_hooks.types.type_for_mode (mode, 1);
7996 if (mode == GET_MODE (reg))
7997 addr_type = build_pointer_type (piece_type);
7998 else
7999 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8000 true);
8001 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8002 true);
8003
8004 if (SSE_REGNO_P (REGNO (reg)))
8005 {
8006 src_addr = sse_addr;
8007 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8008 }
8009 else
8010 {
8011 src_addr = int_addr;
8012 src_offset = REGNO (reg) * 8;
8013 }
8014 src_addr = fold_convert (addr_type, src_addr);
8015 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8016
8017 dest_addr = fold_convert (daddr_type, addr);
8018 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8019 if (cur_size == GET_MODE_SIZE (mode))
8020 {
8021 src = build_va_arg_indirect_ref (src_addr);
8022 dest = build_va_arg_indirect_ref (dest_addr);
8023
8024 gimplify_assign (dest, src, pre_p);
8025 }
8026 else
8027 {
8028 tree copy
8029 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8030 3, dest_addr, src_addr,
8031 size_int (cur_size));
8032 gimplify_and_add (copy, pre_p);
8033 }
8034 prev_size += cur_size;
8035 }
8036 }
8037
8038 if (needed_intregs)
8039 {
8040 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8041 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8042 gimplify_assign (gpr, t, pre_p);
8043 }
8044
8045 if (needed_sseregs)
8046 {
8047 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8048 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8049 gimplify_assign (fpr, t, pre_p);
8050 }
8051
8052 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8053
8054 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8055 }
8056
8057 /* ... otherwise out of the overflow area. */
8058
8059   /* When we align a parameter on the stack for the caller, if its
8060      alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
8061      aligned at MAX_SUPPORTED_STACK_ALIGNMENT.  Match the callee here
8062      with the caller.  */
8063 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8064 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8065 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8066
8067 /* Care for on-stack alignment if needed. */
8068 if (arg_boundary <= 64 || size == 0)
8069 t = ovf;
8070 else
8071 {
8072 HOST_WIDE_INT align = arg_boundary / 8;
8073 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8074 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8075 build_int_cst (TREE_TYPE (t), -align));
8076 }
8077
8078 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8079 gimplify_assign (addr, t, pre_p);
8080
8081 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8082 gimplify_assign (unshare_expr (ovf), t, pre_p);
8083
8084 if (container)
8085 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8086
8087 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8088 addr = fold_convert (ptrtype, addr);
8089
8090 if (indirect_p)
8091 addr = build_va_arg_indirect_ref (addr);
8092 return build_va_arg_indirect_ref (addr);
8093 }
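
/* For illustration, the gimple built above for va_arg of a plain int
   amounts to roughly:

       if (ap->gp_offset >= 6 * 8)
	 goto overflow;
       addr = ap->reg_save_area + ap->gp_offset;
       ap->gp_offset += 8;
       goto done;
     overflow:
       addr = ap->overflow_arg_area;
       ap->overflow_arg_area = addr + 8;
     done:
       result = *(int *) addr;                                           */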
8094 \f
8095 /* Return true if OPNUM's MEM should be matched
8096 in movabs* patterns. */
8097
8098 bool
8099 ix86_check_movabs (rtx insn, int opnum)
8100 {
8101 rtx set, mem;
8102
8103 set = PATTERN (insn);
8104 if (GET_CODE (set) == PARALLEL)
8105 set = XVECEXP (set, 0, 0);
8106 gcc_assert (GET_CODE (set) == SET);
8107 mem = XEXP (set, opnum);
8108 while (GET_CODE (mem) == SUBREG)
8109 mem = SUBREG_REG (mem);
8110 gcc_assert (MEM_P (mem));
8111 return volatile_ok || !MEM_VOLATILE_P (mem);
8112 }
8113 \f
8114 /* Initialize the table of extra 80387 mathematical constants. */
8115
8116 static void
8117 init_ext_80387_constants (void)
8118 {
8119 static const char * cst[5] =
8120 {
8121 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8122 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8123 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8124 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8125 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8126 };
8127 int i;
8128
8129 for (i = 0; i < 5; i++)
8130 {
8131 real_from_string (&ext_80387_constants_table[i], cst[i]);
8132 /* Ensure each constant is rounded to XFmode precision. */
8133 real_convert (&ext_80387_constants_table[i],
8134 XFmode, &ext_80387_constants_table[i]);
8135 }
8136
8137 ext_80387_constants_init = 1;
8138 }
8139
8140 /* Return non-zero if the constant is something that
8141 can be loaded with a special instruction. */
8142
8143 int
8144 standard_80387_constant_p (rtx x)
8145 {
8146 enum machine_mode mode = GET_MODE (x);
8147
8148 REAL_VALUE_TYPE r;
8149
8150 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8151 return -1;
8152
8153 if (x == CONST0_RTX (mode))
8154 return 1;
8155 if (x == CONST1_RTX (mode))
8156 return 2;
8157
8158 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8159
8160 /* For XFmode constants, try to find a special 80387 instruction when
8161 optimizing for size or on those CPUs that benefit from them. */
8162 if (mode == XFmode
8163 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8164 {
8165 int i;
8166
8167 if (! ext_80387_constants_init)
8168 init_ext_80387_constants ();
8169
8170 for (i = 0; i < 5; i++)
8171 if (real_identical (&r, &ext_80387_constants_table[i]))
8172 return i + 3;
8173 }
8174
8175   /* A load of the constant -0.0 or -1.0 will be split into an
8176      fldz;fchs or fld1;fchs sequence.  */
8177 if (real_isnegzero (&r))
8178 return 8;
8179 if (real_identical (&r, &dconstm1))
8180 return 9;
8181
8182 return 0;
8183 }
8184
8185 /* Return the opcode of the special instruction to be used to load
8186 the constant X. */
8187
8188 const char *
8189 standard_80387_constant_opcode (rtx x)
8190 {
8191 switch (standard_80387_constant_p (x))
8192 {
8193 case 1:
8194 return "fldz";
8195 case 2:
8196 return "fld1";
8197 case 3:
8198 return "fldlg2";
8199 case 4:
8200 return "fldln2";
8201 case 5:
8202 return "fldl2e";
8203 case 6:
8204 return "fldl2t";
8205 case 7:
8206 return "fldpi";
8207 case 8:
8208 case 9:
8209 return "#";
8210 default:
8211 gcc_unreachable ();
8212 }
8213 }
8214
8215 /* Return the CONST_DOUBLE representing the 80387 constant that is
8216 loaded by the specified special instruction. The argument IDX
8217 matches the return value from standard_80387_constant_p. */
8218
8219 rtx
8220 standard_80387_constant_rtx (int idx)
8221 {
8222 int i;
8223
8224 if (! ext_80387_constants_init)
8225 init_ext_80387_constants ();
8226
8227 switch (idx)
8228 {
8229 case 3:
8230 case 4:
8231 case 5:
8232 case 6:
8233 case 7:
8234 i = idx - 3;
8235 break;
8236
8237 default:
8238 gcc_unreachable ();
8239 }
8240
8241 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8242 XFmode);
8243 }
8244
8245 /* Return 1 if X is all 0s and 2 if X is all 1s
8246    in a supported SSE/AVX vector mode.  */
8247
8248 int
8249 standard_sse_constant_p (rtx x)
8250 {
8251 enum machine_mode mode = GET_MODE (x);
8252
8253 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8254 return 1;
8255 if (vector_all_ones_operand (x, mode))
8256 switch (mode)
8257 {
8258 case V16QImode:
8259 case V8HImode:
8260 case V4SImode:
8261 case V2DImode:
8262 if (TARGET_SSE2)
8263 return 2;
8264 case V32QImode:
8265 case V16HImode:
8266 case V8SImode:
8267 case V4DImode:
8268 if (TARGET_AVX2)
8269 return 2;
8270 default:
8271 break;
8272 }
8273
8274 return 0;
8275 }
8276
8277 /* Return the opcode of the special instruction to be used to load
8278 the constant X. */
8279
8280 const char *
8281 standard_sse_constant_opcode (rtx insn, rtx x)
8282 {
8283 switch (standard_sse_constant_p (x))
8284 {
8285 case 1:
8286 switch (get_attr_mode (insn))
8287 {
8288 case MODE_TI:
8289 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8290 return "%vpxor\t%0, %d0";
8291 case MODE_V2DF:
8292 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8293 return "%vxorpd\t%0, %d0";
8294 case MODE_V4SF:
8295 return "%vxorps\t%0, %d0";
8296
8297 case MODE_OI:
8298 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8299 return "vpxor\t%x0, %x0, %x0";
8300 case MODE_V4DF:
8301 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8302 return "vxorpd\t%x0, %x0, %x0";
8303 case MODE_V8SF:
8304 return "vxorps\t%x0, %x0, %x0";
8305
8306 default:
8307 break;
8308 }
8309
8310 case 2:
8311 if (TARGET_AVX)
8312 return "vpcmpeqd\t%0, %0, %0";
8313 else
8314 return "pcmpeqd\t%0, %0";
8315
8316 default:
8317 break;
8318 }
8319 gcc_unreachable ();
8320 }
8321
8322 /* Returns true if OP contains a symbol reference */
8323
8324 bool
8325 symbolic_reference_mentioned_p (rtx op)
8326 {
8327 const char *fmt;
8328 int i;
8329
8330 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8331 return true;
8332
8333 fmt = GET_RTX_FORMAT (GET_CODE (op));
8334 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8335 {
8336 if (fmt[i] == 'E')
8337 {
8338 int j;
8339
8340 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8341 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8342 return true;
8343 }
8344
8345 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8346 return true;
8347 }
8348
8349 return false;
8350 }
8351
8352 /* Return true if it is appropriate to emit `ret' instructions in the
8353 body of a function. Do this only if the epilogue is simple, needing a
8354 couple of insns. Prior to reloading, we can't tell how many registers
8355 must be saved, so return false then. Return false if there is no frame
8356 marker to de-allocate. */
8357
8358 bool
8359 ix86_can_use_return_insn_p (void)
8360 {
8361 struct ix86_frame frame;
8362
8363 if (! reload_completed || frame_pointer_needed)
8364 return 0;
8365
8366 /* Don't allow more than 32k pop, since that's all we can do
8367 with one instruction. */
8368 if (crtl->args.pops_args && crtl->args.size >= 32768)
8369 return 0;
8370
8371 ix86_compute_frame_layout (&frame);
8372 return (frame.stack_pointer_offset == UNITS_PER_WORD
8373 && (frame.nregs + frame.nsseregs) == 0);
8374 }
8375 \f
8376 /* Value should be nonzero if functions must have frame pointers.
8377 Zero means the frame pointer need not be set up (and parms may
8378 be accessed via the stack pointer) in functions that seem suitable. */
8379
8380 static bool
8381 ix86_frame_pointer_required (void)
8382 {
8383 /* If we accessed previous frames, then the generated code expects
8384 to be able to access the saved ebp value in our frame. */
8385 if (cfun->machine->accesses_prev_frame)
8386 return true;
8387
8388   /* Several x86 OSes need a frame pointer for other reasons,
8389      usually pertaining to setjmp.  */
8390 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8391 return true;
8392
8393   /* For older 32-bit runtimes setjmp requires a valid frame pointer.  */
8394 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8395 return true;
8396
8397 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8398 turns off the frame pointer by default. Turn it back on now if
8399 we've not got a leaf function. */
8400 if (TARGET_OMIT_LEAF_FRAME_POINTER
8401 && (!current_function_is_leaf
8402 || ix86_current_function_calls_tls_descriptor))
8403 return true;
8404
8405 if (crtl->profile && !flag_fentry)
8406 return true;
8407
8408 return false;
8409 }
8410
8411 /* Record that the current function accesses previous call frames. */
8412
8413 void
8414 ix86_setup_frame_addresses (void)
8415 {
8416 cfun->machine->accesses_prev_frame = 1;
8417 }
8418 \f
8419 #ifndef USE_HIDDEN_LINKONCE
8420 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8421 # define USE_HIDDEN_LINKONCE 1
8422 # else
8423 # define USE_HIDDEN_LINKONCE 0
8424 # endif
8425 #endif
8426
8427 static int pic_labels_used;
8428
8429 /* Fills in the label name that should be used for a pc thunk for
8430 the given register. */
8431
8432 static void
8433 get_pc_thunk_name (char name[32], unsigned int regno)
8434 {
8435 gcc_assert (!TARGET_64BIT);
8436
8437 if (USE_HIDDEN_LINKONCE)
8438 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8439 else
8440 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8441 }
8442
8443
8444 /* This function generates code for -fpic that loads %ebx with
8445 the return address of the caller and then returns. */
8446
8447 static void
8448 ix86_code_end (void)
8449 {
8450 rtx xops[2];
8451 int regno;
8452
8453 for (regno = AX_REG; regno <= SP_REG; regno++)
8454 {
8455 char name[32];
8456 tree decl;
8457
8458 if (!(pic_labels_used & (1 << regno)))
8459 continue;
8460
8461 get_pc_thunk_name (name, regno);
8462
8463 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8464 get_identifier (name),
8465 build_function_type_list (void_type_node, NULL_TREE));
8466 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8467 NULL_TREE, void_type_node);
8468 TREE_PUBLIC (decl) = 1;
8469 TREE_STATIC (decl) = 1;
8470
8471 #if TARGET_MACHO
8472 if (TARGET_MACHO)
8473 {
8474 switch_to_section (darwin_sections[text_coal_section]);
8475 fputs ("\t.weak_definition\t", asm_out_file);
8476 assemble_name (asm_out_file, name);
8477 fputs ("\n\t.private_extern\t", asm_out_file);
8478 assemble_name (asm_out_file, name);
8479 putc ('\n', asm_out_file);
8480 ASM_OUTPUT_LABEL (asm_out_file, name);
8481 DECL_WEAK (decl) = 1;
8482 }
8483 else
8484 #endif
8485 if (USE_HIDDEN_LINKONCE)
8486 {
8487 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8488
8489 targetm.asm_out.unique_section (decl, 0);
8490 switch_to_section (get_named_section (decl, NULL, 0));
8491
8492 targetm.asm_out.globalize_label (asm_out_file, name);
8493 fputs ("\t.hidden\t", asm_out_file);
8494 assemble_name (asm_out_file, name);
8495 putc ('\n', asm_out_file);
8496 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8497 }
8498 else
8499 {
8500 switch_to_section (text_section);
8501 ASM_OUTPUT_LABEL (asm_out_file, name);
8502 }
8503
8504 DECL_INITIAL (decl) = make_node (BLOCK);
8505 current_function_decl = decl;
8506 init_function_start (decl);
8507 first_function_block_is_cold = false;
8508 /* Make sure unwind info is emitted for the thunk if needed. */
8509 final_start_function (emit_barrier (), asm_out_file, 1);
8510
8511 /* Pad stack IP move with 4 instructions (two NOPs count
8512 as one instruction). */
8513 if (TARGET_PAD_SHORT_FUNCTION)
8514 {
8515 int i = 8;
8516
8517 while (i--)
8518 fputs ("\tnop\n", asm_out_file);
8519 }
8520
8521 xops[0] = gen_rtx_REG (Pmode, regno);
8522 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8523 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8524 fputs ("\tret\n", asm_out_file);
8525 final_end_function ();
8526 init_insn_lengths ();
8527 free_after_compilation (cfun);
8528 set_cfun (NULL);
8529 current_function_decl = NULL;
8530 }
8531
8532 if (flag_split_stack)
8533 file_end_indicate_split_stack ();
8534 }
8535
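/* For illustration, the pc thunk emitted above for %ebx looks roughly
   like this (AT&T syntax, USE_HIDDEN_LINKONCE, no padding):

       __x86.get_pc_thunk.bx:
	       movl	(%esp), %ebx
	       ret                                                       */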
8536 /* Emit code for the SET_GOT patterns. */
8537
8538 const char *
8539 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8540 {
8541 rtx xops[3];
8542
8543 xops[0] = dest;
8544
8545 if (TARGET_VXWORKS_RTP && flag_pic)
8546 {
8547 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8548 xops[2] = gen_rtx_MEM (Pmode,
8549 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8550 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8551
8552 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8553 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8554 an unadorned address. */
8555 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8556 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8557 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8558 return "";
8559 }
8560
8561 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8562
8563 if (!flag_pic)
8564 {
8565 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8566
8567 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8568
8569 #if TARGET_MACHO
8570 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8571 is what will be referenced by the Mach-O PIC subsystem. */
8572 if (!label)
8573 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8574 #endif
8575
8576 targetm.asm_out.internal_label (asm_out_file, "L",
8577 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8578 }
8579 else
8580 {
8581 char name[32];
8582 get_pc_thunk_name (name, REGNO (dest));
8583 pic_labels_used |= 1 << REGNO (dest);
8584
8585 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8586 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8587 output_asm_insn ("call\t%X2", xops);
8588 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8589 is what will be referenced by the Mach-O PIC subsystem. */
8590 #if TARGET_MACHO
8591 if (!label)
8592 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8593 else
8594 targetm.asm_out.internal_label (asm_out_file, "L",
8595 CODE_LABEL_NUMBER (label));
8596 #endif
8597 }
8598
8599 if (!TARGET_MACHO)
8600 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8601
8602 return "";
8603 }
8604
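/* For illustration, with -fpic on 32-bit ELF the SET_GOT pattern above
   expands to roughly (AT&T syntax):

       call	__x86.get_pc_thunk.bx
       addl	$_GLOBAL_OFFSET_TABLE_, %ebx                             */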
8605 /* Generate a "push" pattern for input ARG.  */
8606
8607 static rtx
8608 gen_push (rtx arg)
8609 {
8610 struct machine_function *m = cfun->machine;
8611
8612 if (m->fs.cfa_reg == stack_pointer_rtx)
8613 m->fs.cfa_offset += UNITS_PER_WORD;
8614 m->fs.sp_offset += UNITS_PER_WORD;
8615
8616 return gen_rtx_SET (VOIDmode,
8617 gen_rtx_MEM (Pmode,
8618 gen_rtx_PRE_DEC (Pmode,
8619 stack_pointer_rtx)),
8620 arg);
8621 }
8622
8623 /* Generate a "pop" pattern for input ARG.  */
8624
8625 static rtx
8626 gen_pop (rtx arg)
8627 {
8628 return gen_rtx_SET (VOIDmode,
8629 arg,
8630 gen_rtx_MEM (Pmode,
8631 gen_rtx_POST_INC (Pmode,
8632 stack_pointer_rtx)));
8633 }
8634
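/* For illustration, the push and pop patterns built above correspond to
   RTL of roughly this shape on a 32-bit target:

       push:  (set (mem:SI (pre_dec:SI (reg:SI sp))) (reg:SI bx))
       pop:   (set (reg:SI bx) (mem:SI (post_inc:SI (reg:SI sp))))      */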
8635 /* Return >= 0 if there is an unused call-clobbered register available
8636 for the entire function. */
8637
8638 static unsigned int
8639 ix86_select_alt_pic_regnum (void)
8640 {
8641 if (current_function_is_leaf
8642 && !crtl->profile
8643 && !ix86_current_function_calls_tls_descriptor)
8644 {
8645 int i, drap;
8646 /* Can't use the same register for both PIC and DRAP. */
8647 if (crtl->drap_reg)
8648 drap = REGNO (crtl->drap_reg);
8649 else
8650 drap = -1;
8651 for (i = 2; i >= 0; --i)
8652 if (i != drap && !df_regs_ever_live_p (i))
8653 return i;
8654 }
8655
8656 return INVALID_REGNUM;
8657 }
8658
8659 /* Return TRUE if we need to save REGNO. */
8660
8661 static bool
8662 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8663 {
8664 if (pic_offset_table_rtx
8665 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8666 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8667 || crtl->profile
8668 || crtl->calls_eh_return
8669 || crtl->uses_const_pool))
8670 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8671
8672 if (crtl->calls_eh_return && maybe_eh_return)
8673 {
8674 unsigned i;
8675 for (i = 0; ; i++)
8676 {
8677 unsigned test = EH_RETURN_DATA_REGNO (i);
8678 if (test == INVALID_REGNUM)
8679 break;
8680 if (test == regno)
8681 return true;
8682 }
8683 }
8684
8685 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
8686 return true;
8687
8688 return (df_regs_ever_live_p (regno)
8689 && !call_used_regs[regno]
8690 && !fixed_regs[regno]
8691 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
8692 }
8693
8694 /* Return the number of saved general purpose registers.  */
8695
8696 static int
8697 ix86_nsaved_regs (void)
8698 {
8699 int nregs = 0;
8700 int regno;
8701
8702 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8703 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8704 nregs ++;
8705 return nregs;
8706 }
8707
8708 /* Return the number of saved SSE registers.  */
8709
8710 static int
8711 ix86_nsaved_sseregs (void)
8712 {
8713 int nregs = 0;
8714 int regno;
8715
8716 if (!TARGET_64BIT_MS_ABI)
8717 return 0;
8718 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8719 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8720 nregs ++;
8721 return nregs;
8722 }
8723
8724 /* Given FROM and TO register numbers, say whether this elimination is
8725 allowed. If stack alignment is needed, we can only replace argument
8726 pointer with hard frame pointer, or replace frame pointer with stack
8727 pointer. Otherwise, frame pointer elimination is automatically
8728 handled and all other eliminations are valid. */
8729
8730 static bool
8731 ix86_can_eliminate (const int from, const int to)
8732 {
8733 if (stack_realign_fp)
8734 return ((from == ARG_POINTER_REGNUM
8735 && to == HARD_FRAME_POINTER_REGNUM)
8736 || (from == FRAME_POINTER_REGNUM
8737 && to == STACK_POINTER_REGNUM));
8738 else
8739 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
8740 }
8741
8742 /* Return the offset between two registers, one to be eliminated, and the other
8743 its replacement, at the start of a routine. */
8744
8745 HOST_WIDE_INT
8746 ix86_initial_elimination_offset (int from, int to)
8747 {
8748 struct ix86_frame frame;
8749 ix86_compute_frame_layout (&frame);
8750
8751 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
8752 return frame.hard_frame_pointer_offset;
8753 else if (from == FRAME_POINTER_REGNUM
8754 && to == HARD_FRAME_POINTER_REGNUM)
8755 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
8756 else
8757 {
8758 gcc_assert (to == STACK_POINTER_REGNUM);
8759
8760 if (from == ARG_POINTER_REGNUM)
8761 return frame.stack_pointer_offset;
8762
8763 gcc_assert (from == FRAME_POINTER_REGNUM);
8764 return frame.stack_pointer_offset - frame.frame_pointer_offset;
8765 }
8766 }
8767
8768 /* In a dynamically-aligned function, we can't know the offset from
8769 stack pointer to frame pointer, so we must ensure that setjmp
8770 eliminates fp against the hard fp (%ebp) rather than trying to
8771 index from %esp up to the top of the frame across a gap that is
8772 of unknown (at compile-time) size. */
8773 static rtx
8774 ix86_builtin_setjmp_frame_value (void)
8775 {
8776 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
8777 }
8778
8779 /* When using -fsplit-stack, the allocation routines set a field in
8780 the TCB to the bottom of the stack plus this much space, measured
8781 in bytes. */
8782
8783 #define SPLIT_STACK_AVAILABLE 256
8784
8785 /* Fill the ix86_frame structure describing the frame of the currently compiled function. */
8786
8787 static void
8788 ix86_compute_frame_layout (struct ix86_frame *frame)
8789 {
8790 unsigned int stack_alignment_needed;
8791 HOST_WIDE_INT offset;
8792 unsigned int preferred_alignment;
8793 HOST_WIDE_INT size = get_frame_size ();
8794 HOST_WIDE_INT to_allocate;
8795
8796 frame->nregs = ix86_nsaved_regs ();
8797 frame->nsseregs = ix86_nsaved_sseregs ();
8798
8799 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
8800 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
8801
8802 /* The 64-bit MS ABI seems to require stack alignment to always be 16,
8803 except for function prologues and leaf functions. */
8804 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
8805 && (!current_function_is_leaf || cfun->calls_alloca != 0
8806 || ix86_current_function_calls_tls_descriptor))
8807 {
8808 preferred_alignment = 16;
8809 stack_alignment_needed = 16;
8810 crtl->preferred_stack_boundary = 128;
8811 crtl->stack_alignment_needed = 128;
8812 }
8813
8814 gcc_assert (!size || stack_alignment_needed);
8815 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
8816 gcc_assert (preferred_alignment <= stack_alignment_needed);
8817
8818 /* For SEH we have to limit the amount of code movement into the prologue.
8819 At present we do this via a BLOCKAGE, at which point there's very little
8820 scheduling that can be done, which means that there's very little point
8821 in doing anything except PUSHs. */
8822 if (TARGET_SEH)
8823 cfun->machine->use_fast_prologue_epilogue = false;
8824
8825 /* During reload iteration the number of registers saved can change.
8826 Recompute the value as needed. Do not recompute when the number of
8827 registers didn't change, as reload makes multiple calls to this function
8828 and does not expect the decision to change within a single iteration. */
8829 else if (!optimize_function_for_size_p (cfun)
8830 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
8831 {
8832 int count = frame->nregs;
8833 struct cgraph_node *node = cgraph_get_node (current_function_decl);
8834
8835 cfun->machine->use_fast_prologue_epilogue_nregs = count;
8836
8837 /* The fast prologue uses move instead of push to save registers. This
8838 is significantly longer, but also executes faster as modern hardware
8839 can execute the moves in parallel, but can't do that for push/pop.
8840
8841 Be careful about choosing which prologue to emit: when the function
8842 takes many instructions to execute we may use the slow version, as well
8843 as when the function is known to be outside a hot spot (this is known
8844 with feedback only). Weight the size of the function by the number of
8845 registers to save, as it is cheap to use one or two push instructions
8846 but very slow to use many of them. */
8847 if (count)
8848 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
8849 if (node->frequency < NODE_FREQUENCY_NORMAL
8850 || (flag_branch_probabilities
8851 && node->frequency < NODE_FREQUENCY_HOT))
8852 cfun->machine->use_fast_prologue_epilogue = false;
8853 else
8854 cfun->machine->use_fast_prologue_epilogue
8855 = !expensive_function_p (count);
8856 }
8857
8858 frame->save_regs_using_mov
8859 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
8860 /* If static stack checking is enabled and done with probes,
8861 the registers need to be saved before allocating the frame. */
8862 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
8863
8864 /* Skip return address. */
8865 offset = UNITS_PER_WORD;
8866
8867 /* Skip pushed static chain. */
8868 if (ix86_static_chain_on_stack)
8869 offset += UNITS_PER_WORD;
8870
8871 /* Skip saved base pointer. */
8872 if (frame_pointer_needed)
8873 offset += UNITS_PER_WORD;
8874 frame->hfp_save_offset = offset;
8875
8876 /* The traditional frame pointer location is at the top of the frame. */
8877 frame->hard_frame_pointer_offset = offset;
8878
8879 /* Register save area */
8880 offset += frame->nregs * UNITS_PER_WORD;
8881 frame->reg_save_offset = offset;
8882
8883 /* Align and set SSE register save area. */
8884 if (frame->nsseregs)
8885 {
8886 /* The only ABI that has saved SSE registers (Win64) also has a
8887 16-byte aligned default stack, and thus we don't need to be
8888 within the re-aligned local stack frame to save them. */
8889 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
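 /* (offset + 15) & -16 rounds OFFSET up to the next multiple of 16,
 e.g. 40 becomes 48 while 48 stays 48, so the SSE save area always
 starts on a 16-byte boundary. */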
8890 offset = (offset + 16 - 1) & -16;
8891 offset += frame->nsseregs * 16;
8892 }
8893 frame->sse_reg_save_offset = offset;
8894
8895 /* The re-aligned stack starts here. Values before this point are not
8896 directly comparable with values below this point. In order to make
8897 sure that no value happens to be the same before and after, force
8898 the alignment computation below to add a non-zero value. */
8899 if (stack_realign_fp)
8900 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
8901
8902 /* Va-arg area */
8903 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
8904 offset += frame->va_arg_size;
8905
8906 /* Align start of frame for local function. */
8907 if (stack_realign_fp
8908 || offset != frame->sse_reg_save_offset
8909 || size != 0
8910 || !current_function_is_leaf
8911 || cfun->calls_alloca
8912 || ix86_current_function_calls_tls_descriptor)
8913 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
8914
8915 /* Frame pointer points here. */
8916 frame->frame_pointer_offset = offset;
8917
8918 offset += size;
8919
8920 /* Add outgoing arguments area. It can be skipped if we eliminated
8921 all the function calls as dead code.
8922 Skipping is however impossible when the function calls alloca. The
8923 alloca expander assumes that the last crtl->outgoing_args_size bytes
8924 of the stack frame are unused. */
8925 if (ACCUMULATE_OUTGOING_ARGS
8926 && (!current_function_is_leaf || cfun->calls_alloca
8927 || ix86_current_function_calls_tls_descriptor))
8928 {
8929 offset += crtl->outgoing_args_size;
8930 frame->outgoing_arguments_size = crtl->outgoing_args_size;
8931 }
8932 else
8933 frame->outgoing_arguments_size = 0;
8934
8935 /* Align stack boundary. Only needed if we're calling another function
8936 or using alloca. */
8937 if (!current_function_is_leaf || cfun->calls_alloca
8938 || ix86_current_function_calls_tls_descriptor)
8939 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
8940
8941 /* We've reached end of stack frame. */
8942 frame->stack_pointer_offset = offset;
8943
8944 /* Size prologue needs to allocate. */
8945 to_allocate = offset - frame->sse_reg_save_offset;
8946
8947 if ((!to_allocate && frame->nregs <= 1)
8948 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
8949 frame->save_regs_using_mov = false;
8950
8951 if (ix86_using_red_zone ()
8952 && current_function_sp_is_unchanging
8953 && current_function_is_leaf
8954 && !ix86_current_function_calls_tls_descriptor)
8955 {
8956 frame->red_zone_size = to_allocate;
8957 if (frame->save_regs_using_mov)
8958 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
8959 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
8960 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
8961 }
8962 else
8963 frame->red_zone_size = 0;
8964 frame->stack_pointer_offset -= frame->red_zone_size;
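
 /* For example, assuming the usual 128-byte red zone with a 16-byte
 reserve, a leaf function whose locals and moved register saves fit in
 112 bytes can end up with no explicit stack allocation at all: the
 whole frame lives in the red zone below the incoming stack pointer. */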
8965
8966 /* The SEH frame pointer location is near the bottom of the frame.
8967 This is enforced by the fact that the difference between the
8968 stack pointer and the frame pointer is limited to 240 bytes in
8969 the unwind data structure. */
8970 if (TARGET_SEH)
8971 {
8972 HOST_WIDE_INT diff;
8973
8974 /* If we can leave the frame pointer where it is, do so. */
8975 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
8976 if (diff > 240 || (diff & 15) != 0)
8977 {
8978 /* Ideally we'd determine what portion of the local stack frame
8979 (within the constraint of the lowest 240) is most heavily used.
8980 But without that complication, simply bias the frame pointer
8981 by 128 bytes so as to maximize the amount of the local stack
8982 frame that is addressable with 8-bit offsets. */
8983 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
8984 }
8985 }
8986 }
8987
8988 /* This is semi-inlined memory_address_length, but simplified
8989 since we know that we're always dealing with reg+offset, and
8990 to avoid having to create and discard all that rtl. */
8991
8992 static inline int
8993 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
8994 {
8995 int len = 4;
8996
8997 if (offset == 0)
8998 {
8999 /* EBP and R13 cannot be encoded without an offset. */
9000 len = (regno == BP_REG || regno == R13_REG);
9001 }
9002 else if (IN_RANGE (offset, -128, 127))
9003 len = 1;
9004
9005 /* ESP and R12 must be encoded with a SIB byte. */
9006 if (regno == SP_REG || regno == R12_REG)
9007 len++;
9008
9009 return len;
9010 }
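
/* A few illustrative values, assuming the usual register assignments:
 (%rax) costs 0 extra bytes, (%rbp) and (%r13) cost 1 because they force
 a disp8, 8(%rax) costs 1, -200(%rax) costs 4, and (%rsp) and 16(%r12)
 pay one more byte for the mandatory SIB, giving 1 and 2 respectively. */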
9011
9012 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9013 The valid base registers are taken from CFUN->MACHINE->FS. */
9014
9015 static rtx
9016 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9017 {
9018 const struct machine_function *m = cfun->machine;
9019 rtx base_reg = NULL;
9020 HOST_WIDE_INT base_offset = 0;
9021
9022 if (m->use_fast_prologue_epilogue)
9023 {
9024 /* Choose the base register most likely to allow the most scheduling
9025 opportunities. Generally FP is valid throughout the function,
9026 while DRAP must be reloaded within the epilogue. But choose either
9027 over the SP due to increased encoding size. */
9028
9029 if (m->fs.fp_valid)
9030 {
9031 base_reg = hard_frame_pointer_rtx;
9032 base_offset = m->fs.fp_offset - cfa_offset;
9033 }
9034 else if (m->fs.drap_valid)
9035 {
9036 base_reg = crtl->drap_reg;
9037 base_offset = 0 - cfa_offset;
9038 }
9039 else if (m->fs.sp_valid)
9040 {
9041 base_reg = stack_pointer_rtx;
9042 base_offset = m->fs.sp_offset - cfa_offset;
9043 }
9044 }
9045 else
9046 {
9047 HOST_WIDE_INT toffset;
9048 int len = 16, tlen;
9049
9050 /* Choose the base register with the smallest address encoding.
9051 With a tie, choose FP > DRAP > SP. */
9052 if (m->fs.sp_valid)
9053 {
9054 base_reg = stack_pointer_rtx;
9055 base_offset = m->fs.sp_offset - cfa_offset;
9056 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9057 }
9058 if (m->fs.drap_valid)
9059 {
9060 toffset = 0 - cfa_offset;
9061 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9062 if (tlen <= len)
9063 {
9064 base_reg = crtl->drap_reg;
9065 base_offset = toffset;
9066 len = tlen;
9067 }
9068 }
9069 if (m->fs.fp_valid)
9070 {
9071 toffset = m->fs.fp_offset - cfa_offset;
9072 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9073 if (tlen <= len)
9074 {
9075 base_reg = hard_frame_pointer_rtx;
9076 base_offset = toffset;
9077 len = tlen;
9078 }
9079 }
9080 }
9081 gcc_assert (base_reg != NULL);
9082
9083 return plus_constant (base_reg, base_offset);
9084 }
9085
9086 /* Emit code to save registers in the prologue. */
9087
9088 static void
9089 ix86_emit_save_regs (void)
9090 {
9091 unsigned int regno;
9092 rtx insn;
9093
9094 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9095 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9096 {
9097 insn = emit_insn (gen_push (gen_rtx_REG (Pmode, regno)));
9098 RTX_FRAME_RELATED_P (insn) = 1;
9099 }
9100 }
9101
9102 /* Emit a single register save at CFA - CFA_OFFSET. */
9103
9104 static void
9105 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9106 HOST_WIDE_INT cfa_offset)
9107 {
9108 struct machine_function *m = cfun->machine;
9109 rtx reg = gen_rtx_REG (mode, regno);
9110 rtx mem, addr, base, insn;
9111
9112 addr = choose_baseaddr (cfa_offset);
9113 mem = gen_frame_mem (mode, addr);
9114
9115 /* For SSE saves, we need to indicate the 128-bit alignment. */
9116 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9117
9118 insn = emit_move_insn (mem, reg);
9119 RTX_FRAME_RELATED_P (insn) = 1;
9120
9121 base = addr;
9122 if (GET_CODE (base) == PLUS)
9123 base = XEXP (base, 0);
9124 gcc_checking_assert (REG_P (base));
9125
9126 /* When saving registers into a re-aligned local stack frame, avoid
9127 any tricky guessing by dwarf2out. */
9128 if (m->fs.realigned)
9129 {
9130 gcc_checking_assert (stack_realign_drap);
9131
9132 if (regno == REGNO (crtl->drap_reg))
9133 {
9134 /* A bit of a hack. We force the DRAP register to be saved in
9135 the re-aligned stack frame, which provides us with a copy
9136 of the CFA that will last past the prologue. Install it. */
9137 gcc_checking_assert (cfun->machine->fs.fp_valid);
9138 addr = plus_constant (hard_frame_pointer_rtx,
9139 cfun->machine->fs.fp_offset - cfa_offset);
9140 mem = gen_rtx_MEM (mode, addr);
9141 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9142 }
9143 else
9144 {
9145 /* The frame pointer is a stable reference within the
9146 aligned frame. Use it. */
9147 gcc_checking_assert (cfun->machine->fs.fp_valid);
9148 addr = plus_constant (hard_frame_pointer_rtx,
9149 cfun->machine->fs.fp_offset - cfa_offset);
9150 mem = gen_rtx_MEM (mode, addr);
9151 add_reg_note (insn, REG_CFA_EXPRESSION,
9152 gen_rtx_SET (VOIDmode, mem, reg));
9153 }
9154 }
9155
9156 /* The memory may not be relative to the current CFA register,
9157 which means that we may need to generate a new pattern for
9158 use by the unwind info. */
9159 else if (base != m->fs.cfa_reg)
9160 {
9161 addr = plus_constant (m->fs.cfa_reg, m->fs.cfa_offset - cfa_offset);
9162 mem = gen_rtx_MEM (mode, addr);
9163 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9164 }
9165 }
9166
9167 /* Emit code to save registers using MOV insns.
9168 First register is stored at CFA - CFA_OFFSET. */
9169 static void
9170 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9171 {
9172 unsigned int regno;
9173
9174 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9175 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9176 {
9177 ix86_emit_save_reg_using_mov (Pmode, regno, cfa_offset);
9178 cfa_offset -= UNITS_PER_WORD;
9179 }
9180 }
9181
9182 /* Emit code to save SSE registers using MOV insns.
9183 First register is stored at CFA - CFA_OFFSET. */
9184 static void
9185 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9186 {
9187 unsigned int regno;
9188
9189 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9190 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9191 {
9192 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9193 cfa_offset -= 16;
9194 }
9195 }
9196
9197 static GTY(()) rtx queued_cfa_restores;
9198
9199 /* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack
9200 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9201 Don't add the note if the previously saved value will be left untouched
9202 within the stack red-zone until return, as unwinders can find the same value
9203 in the register and on the stack. */
9204
9205 static void
9206 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9207 {
9208 if (!crtl->shrink_wrapped
9209 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9210 return;
9211
9212 if (insn)
9213 {
9214 add_reg_note (insn, REG_CFA_RESTORE, reg);
9215 RTX_FRAME_RELATED_P (insn) = 1;
9216 }
9217 else
9218 queued_cfa_restores
9219 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9220 }
9221
9222 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9223
9224 static void
9225 ix86_add_queued_cfa_restore_notes (rtx insn)
9226 {
9227 rtx last;
9228 if (!queued_cfa_restores)
9229 return;
9230 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9231 ;
9232 XEXP (last, 1) = REG_NOTES (insn);
9233 REG_NOTES (insn) = queued_cfa_restores;
9234 queued_cfa_restores = NULL_RTX;
9235 RTX_FRAME_RELATED_P (insn) = 1;
9236 }
9237
9238 /* Expand prologue or epilogue stack adjustment.
9239 The pattern exists to put a dependency on all ebp-based memory accesses.
9240 STYLE should be negative if instructions should be marked as frame related,
9241 zero if the %r11 register is live and cannot be freely used, and positive
9242 otherwise. */
9243
9244 static void
9245 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9246 int style, bool set_cfa)
9247 {
9248 struct machine_function *m = cfun->machine;
9249 rtx insn;
9250 bool add_frame_related_expr = false;
9251
9252 if (! TARGET_64BIT)
9253 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9254 else if (x86_64_immediate_operand (offset, DImode))
9255 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9256 else
9257 {
9258 rtx tmp;
9259 /* r11 is used by indirect sibcall return as well, set before the
9260 epilogue and used after the epilogue. */
9261 if (style)
9262 tmp = gen_rtx_REG (DImode, R11_REG);
9263 else
9264 {
9265 gcc_assert (src != hard_frame_pointer_rtx
9266 && dest != hard_frame_pointer_rtx);
9267 tmp = hard_frame_pointer_rtx;
9268 }
9269 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9270 if (style < 0)
9271 add_frame_related_expr = true;
9272
9273 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9274 }
9275
9276 insn = emit_insn (insn);
9277 if (style >= 0)
9278 ix86_add_queued_cfa_restore_notes (insn);
9279
9280 if (set_cfa)
9281 {
9282 rtx r;
9283
9284 gcc_assert (m->fs.cfa_reg == src);
9285 m->fs.cfa_offset += INTVAL (offset);
9286 m->fs.cfa_reg = dest;
9287
9288 r = gen_rtx_PLUS (Pmode, src, offset);
9289 r = gen_rtx_SET (VOIDmode, dest, r);
9290 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9291 RTX_FRAME_RELATED_P (insn) = 1;
9292 }
9293 else if (style < 0)
9294 {
9295 RTX_FRAME_RELATED_P (insn) = 1;
9296 if (add_frame_related_expr)
9297 {
9298 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9299 r = gen_rtx_SET (VOIDmode, dest, r);
9300 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9301 }
9302 }
9303
9304 if (dest == stack_pointer_rtx)
9305 {
9306 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9307 bool valid = m->fs.sp_valid;
9308
9309 if (src == hard_frame_pointer_rtx)
9310 {
9311 valid = m->fs.fp_valid;
9312 ooffset = m->fs.fp_offset;
9313 }
9314 else if (src == crtl->drap_reg)
9315 {
9316 valid = m->fs.drap_valid;
9317 ooffset = 0;
9318 }
9319 else
9320 {
9321 /* Else there are two possibilities: SP itself, which we set
9322 up as the default above. Or EH_RETURN_STACKADJ_RTX, which is
9323 taken care of by hand along the eh_return path. */
9324 gcc_checking_assert (src == stack_pointer_rtx
9325 || offset == const0_rtx);
9326 }
9327
9328 m->fs.sp_offset = ooffset - INTVAL (offset);
9329 m->fs.sp_valid = valid;
9330 }
9331 }
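
/* A typical prologue use, as seen in ix86_expand_prologue below, is

 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
 GEN_INT (-allocate), -1,
 m->fs.cfa_reg == stack_pointer_rtx);

 which emits a single addition of the (negative) offset to the stack
 pointer and, when the stack pointer is still the CFA register, the
 matching REG_CFA_ADJUST_CFA note. */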
9332
9333 /* Find an available register to be used as the dynamic realign argument
9334 pointer register. Such a register will be written in the prologue and
9335 used at the beginning of the body, so it must not be
9336 1. parameter passing register.
9337 2. GOT pointer.
9338 We reuse static-chain register if it is available. Otherwise, we
9339 use DI for i386 and R13 for x86-64. We chose R13 since it has
9340 a shorter encoding.
9341
9342 Return: the regno of chosen register. */
9343
9344 static unsigned int
9345 find_drap_reg (void)
9346 {
9347 tree decl = cfun->decl;
9348
9349 if (TARGET_64BIT)
9350 {
9351 /* Use R13 for a nested function or a function that needs a static chain.
9352 Since a function with a tail call may use any caller-saved
9353 register in the epilogue, DRAP must not use a caller-saved
9354 register in that case. */
9355 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9356 return R13_REG;
9357
9358 return R10_REG;
9359 }
9360 else
9361 {
9362 /* Use DI for a nested function or a function that needs a static chain.
9363 Since a function with a tail call may use any caller-saved
9364 register in the epilogue, DRAP must not use a caller-saved
9365 register in that case. */
9366 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9367 return DI_REG;
9368
9369 /* Reuse static chain register if it isn't used for parameter
9370 passing. */
9371 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9372 {
9373 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9374 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9375 return CX_REG;
9376 }
9377 return DI_REG;
9378 }
9379 }
9380
9381 /* Return minimum incoming stack alignment. */
9382
9383 static unsigned int
9384 ix86_minimum_incoming_stack_boundary (bool sibcall)
9385 {
9386 unsigned int incoming_stack_boundary;
9387
9388 /* Prefer the one specified at command line. */
9389 if (ix86_user_incoming_stack_boundary)
9390 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9391 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9392 if -mstackrealign is used, this isn't a sibcall check, and the
9393 estimated stack alignment is 128 bits. */
9394 else if (!sibcall
9395 && !TARGET_64BIT
9396 && ix86_force_align_arg_pointer
9397 && crtl->stack_alignment_estimated == 128)
9398 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9399 else
9400 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9401
9402 /* Incoming stack alignment can be changed on individual functions
9403 via force_align_arg_pointer attribute. We use the smallest
9404 incoming stack boundary. */
9405 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9406 && lookup_attribute (ix86_force_align_arg_pointer_string,
9407 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9408 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9409
9410 /* The incoming stack frame has to be aligned at least at
9411 parm_stack_boundary. */
9412 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9413 incoming_stack_boundary = crtl->parm_stack_boundary;
9414
9415 /* The stack at the entry of main is aligned by the runtime. We use the
9416 smallest incoming stack boundary. */
9417 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9418 && DECL_NAME (current_function_decl)
9419 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9420 && DECL_FILE_SCOPE_P (current_function_decl))
9421 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9422
9423 return incoming_stack_boundary;
9424 }
9425
9426 /* Update incoming stack boundary and estimated stack alignment. */
9427
9428 static void
9429 ix86_update_stack_boundary (void)
9430 {
9431 ix86_incoming_stack_boundary
9432 = ix86_minimum_incoming_stack_boundary (false);
9433
9434 /* x86_64 vararg needs 16byte stack alignment for register save
9435 area. */
9436 if (TARGET_64BIT
9437 && cfun->stdarg
9438 && crtl->stack_alignment_estimated < 128)
9439 crtl->stack_alignment_estimated = 128;
9440 }
9441
9442 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9443 needed or an rtx for DRAP otherwise. */
9444
9445 static rtx
9446 ix86_get_drap_rtx (void)
9447 {
9448 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9449 crtl->need_drap = true;
9450
9451 if (stack_realign_drap)
9452 {
9453 /* Assign DRAP to vDRAP and return vDRAP. */
9454 unsigned int regno = find_drap_reg ();
9455 rtx drap_vreg;
9456 rtx arg_ptr;
9457 rtx seq, insn;
9458
9459 arg_ptr = gen_rtx_REG (Pmode, regno);
9460 crtl->drap_reg = arg_ptr;
9461
9462 start_sequence ();
9463 drap_vreg = copy_to_reg (arg_ptr);
9464 seq = get_insns ();
9465 end_sequence ();
9466
9467 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9468 if (!optimize)
9469 {
9470 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9471 RTX_FRAME_RELATED_P (insn) = 1;
9472 }
9473 return drap_vreg;
9474 }
9475 else
9476 return NULL;
9477 }
9478
9479 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9480
9481 static rtx
9482 ix86_internal_arg_pointer (void)
9483 {
9484 return virtual_incoming_args_rtx;
9485 }
9486
9487 struct scratch_reg {
9488 rtx reg;
9489 bool saved;
9490 };
9491
9492 /* Return a short-lived scratch register for use on function entry.
9493 In 32-bit mode, it is valid only after the registers are saved
9494 in the prologue. This register must be released by means of
9495 release_scratch_register_on_entry once it is dead. */
9496
9497 static void
9498 get_scratch_register_on_entry (struct scratch_reg *sr)
9499 {
9500 int regno;
9501
9502 sr->saved = false;
9503
9504 if (TARGET_64BIT)
9505 {
9506 /* We always use R11 in 64-bit mode. */
9507 regno = R11_REG;
9508 }
9509 else
9510 {
9511 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9512 bool fastcall_p
9513 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9514 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9515 int regparm = ix86_function_regparm (fntype, decl);
9516 int drap_regno
9517 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9518
9519 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9520 for the static chain register. */
9521 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9522 && drap_regno != AX_REG)
9523 regno = AX_REG;
9524 else if (regparm < 2 && drap_regno != DX_REG)
9525 regno = DX_REG;
9526 /* ecx is the static chain register. */
9527 else if (regparm < 3 && !fastcall_p && !static_chain_p
9528 && drap_regno != CX_REG)
9529 regno = CX_REG;
9530 else if (ix86_save_reg (BX_REG, true))
9531 regno = BX_REG;
9532 /* esi is the static chain register. */
9533 else if (!(regparm == 3 && static_chain_p)
9534 && ix86_save_reg (SI_REG, true))
9535 regno = SI_REG;
9536 else if (ix86_save_reg (DI_REG, true))
9537 regno = DI_REG;
9538 else
9539 {
9540 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9541 sr->saved = true;
9542 }
9543 }
9544
9545 sr->reg = gen_rtx_REG (Pmode, regno);
9546 if (sr->saved)
9547 {
9548 rtx insn = emit_insn (gen_push (sr->reg));
9549 RTX_FRAME_RELATED_P (insn) = 1;
9550 }
9551 }
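
/* For instance, a plain 32-bit function with no register parameters, no
 fastcall attribute and no DRAP normally gets %eax here without any
 save/restore, whereas a regparm(3) function with a static chain may have
 every candidate live on entry, in which case the chosen register is
 pushed and must later be popped via release_scratch_register_on_entry. */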
9552
9553 /* Release a scratch register obtained from the preceding function. */
9554
9555 static void
9556 release_scratch_register_on_entry (struct scratch_reg *sr)
9557 {
9558 if (sr->saved)
9559 {
9560 rtx x, insn = emit_insn (gen_pop (sr->reg));
9561
9562 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9563 RTX_FRAME_RELATED_P (insn) = 1;
9564 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9565 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9566 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9567 }
9568 }
9569
9570 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
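
/* With the default STACK_CHECK_PROBE_INTERVAL_EXP of 12 this is 4096
 bytes, i.e. one probe per page. */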
9571
9572 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9573
9574 static void
9575 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9576 {
9577 /* We skip the probe for the first interval + a small dope of 4 words and
9578 probe that many bytes past the specified size to maintain a protection
9579 area at the bottom of the stack. */
9580 const int dope = 4 * UNITS_PER_WORD;
9581 rtx size_rtx = GEN_INT (size), last;
9582
9583 /* See if we have a constant small number of probes to generate. If so,
9584 that's the easy case. The run-time loop is made up of 11 insns in the
9585 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9586 for n # of intervals. */
9587 if (size <= 5 * PROBE_INTERVAL)
9588 {
9589 HOST_WIDE_INT i, adjust;
9590 bool first_probe = true;
9591
9592 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9593 values of N from 1 until it exceeds SIZE. If only one probe is
9594 needed, this will not generate any code. Then adjust and probe
9595 to PROBE_INTERVAL + SIZE. */
9596 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9597 {
9598 if (first_probe)
9599 {
9600 adjust = 2 * PROBE_INTERVAL + dope;
9601 first_probe = false;
9602 }
9603 else
9604 adjust = PROBE_INTERVAL;
9605
9606 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9607 plus_constant (stack_pointer_rtx, -adjust)));
9608 emit_stack_probe (stack_pointer_rtx);
9609 }
9610
9611 if (first_probe)
9612 adjust = size + PROBE_INTERVAL + dope;
9613 else
9614 adjust = size + PROBE_INTERVAL - i;
9615
9616 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9617 plus_constant (stack_pointer_rtx, -adjust)));
9618 emit_stack_probe (stack_pointer_rtx);
9619
9620 /* Adjust back to account for the additional first interval. */
9621 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9622 plus_constant (stack_pointer_rtx,
9623 PROBE_INTERVAL + dope)));
9624 }
9625
9626 /* Otherwise, do the same as above, but in a loop. Note that we must be
9627 extra careful with variables wrapping around because we might be at
9628 the very top (or the very bottom) of the address space and we have
9629 to be able to handle this case properly; in particular, we use an
9630 equality test for the loop condition. */
9631 else
9632 {
9633 HOST_WIDE_INT rounded_size;
9634 struct scratch_reg sr;
9635
9636 get_scratch_register_on_entry (&sr);
9637
9638
9639 /* Step 1: round SIZE to the previous multiple of the interval. */
9640
9641 rounded_size = size & -PROBE_INTERVAL;
9642
9643
9644 /* Step 2: compute initial and final value of the loop counter. */
9645
9646 /* SP = SP_0 + PROBE_INTERVAL. */
9647 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9648 plus_constant (stack_pointer_rtx,
9649 - (PROBE_INTERVAL + dope))));
9650
9651 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
9652 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9653 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9654 gen_rtx_PLUS (Pmode, sr.reg,
9655 stack_pointer_rtx)));
9656
9657
9658 /* Step 3: the loop
9659
9660 while (SP != LAST_ADDR)
9661 {
9662 SP = SP + PROBE_INTERVAL
9663 probe at SP
9664 }
9665
9666 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
9667 values of N from 1 until it is equal to ROUNDED_SIZE. */
9668
9669 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
9670
9671
9672 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
9673 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
9674
9675 if (size != rounded_size)
9676 {
9677 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9678 plus_constant (stack_pointer_rtx,
9679 rounded_size - size)));
9680 emit_stack_probe (stack_pointer_rtx);
9681 }
9682
9683 /* Adjust back to account for the additional first interval. */
9684 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9685 plus_constant (stack_pointer_rtx,
9686 PROBE_INTERVAL + dope)));
9687
9688 release_scratch_register_on_entry (&sr);
9689 }
9690
9691 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
9692
9693 /* Even if the stack pointer isn't the CFA register, we need to correctly
9694 describe the adjustments made to it, in particular differentiate the
9695 frame-related ones from the frame-unrelated ones. */
9696 if (size > 0)
9697 {
9698 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
9699 XVECEXP (expr, 0, 0)
9700 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9701 plus_constant (stack_pointer_rtx, -size));
9702 XVECEXP (expr, 0, 1)
9703 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9704 plus_constant (stack_pointer_rtx,
9705 PROBE_INTERVAL + dope + size));
9706 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
9707 RTX_FRAME_RELATED_P (last) = 1;
9708
9709 cfun->machine->fs.sp_offset += size;
9710 }
9711
9712 /* Make sure nothing is scheduled before we are done. */
9713 emit_insn (gen_blockage ());
9714 }
9715
9716 /* Adjust the stack pointer up to REG while probing it. */
9717
9718 const char *
9719 output_adjust_stack_and_probe (rtx reg)
9720 {
9721 static int labelno = 0;
9722 char loop_lab[32], end_lab[32];
9723 rtx xops[2];
9724
9725 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9726 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9727
9728 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9729
9730 /* Jump to END_LAB if SP == LAST_ADDR. */
9731 xops[0] = stack_pointer_rtx;
9732 xops[1] = reg;
9733 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9734 fputs ("\tje\t", asm_out_file);
9735 assemble_name_raw (asm_out_file, end_lab);
9736 fputc ('\n', asm_out_file);
9737
9738 /* SP = SP + PROBE_INTERVAL. */
9739 xops[1] = GEN_INT (PROBE_INTERVAL);
9740 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9741
9742 /* Probe at SP. */
9743 xops[1] = const0_rtx;
9744 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
9745
9746 fprintf (asm_out_file, "\tjmp\t");
9747 assemble_name_raw (asm_out_file, loop_lab);
9748 fputc ('\n', asm_out_file);
9749
9750 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9751
9752 return "";
9753 }
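
/* For a 32-bit target the emitted loop looks roughly like

 .LPSRL0: cmpl %eax, %esp
 je .LPSRE0
 subl $4096, %esp
 orl $0, (%esp)
 jmp .LPSRL0
 .LPSRE0:

 where the register, label numbers and 4096-byte probe interval are
 illustrative. */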
9754
9755 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
9756 inclusive. These are offsets from the current stack pointer. */
9757
9758 static void
9759 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
9760 {
9761 /* See if we have a constant small number of probes to generate. If so,
9762 that's the easy case. The run-time loop is made up of 7 insns in the
9763 generic case while the compile-time loop is made up of n insns for n #
9764 of intervals. */
9765 if (size <= 7 * PROBE_INTERVAL)
9766 {
9767 HOST_WIDE_INT i;
9768
9769 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
9770 it exceeds SIZE. If only one probe is needed, this will not
9771 generate any code. Then probe at FIRST + SIZE. */
9772 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9773 emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + i)));
9774
9775 emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + size)));
9776 }
9777
9778 /* Otherwise, do the same as above, but in a loop. Note that we must be
9779 extra careful with variables wrapping around because we might be at
9780 the very top (or the very bottom) of the address space and we have
9781 to be able to handle this case properly; in particular, we use an
9782 equality test for the loop condition. */
9783 else
9784 {
9785 HOST_WIDE_INT rounded_size, last;
9786 struct scratch_reg sr;
9787
9788 get_scratch_register_on_entry (&sr);
9789
9790
9791 /* Step 1: round SIZE to the previous multiple of the interval. */
9792
9793 rounded_size = size & -PROBE_INTERVAL;
9794
9795
9796 /* Step 2: compute initial and final value of the loop counter. */
9797
9798 /* TEST_OFFSET = FIRST. */
9799 emit_move_insn (sr.reg, GEN_INT (-first));
9800
9801 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
9802 last = first + rounded_size;
9803
9804
9805 /* Step 3: the loop
9806
9807 while (TEST_ADDR != LAST_ADDR)
9808 {
9809 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
9810 probe at TEST_ADDR
9811 }
9812
9813 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
9814 until it is equal to ROUNDED_SIZE. */
9815
9816 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
9817
9818
9819 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
9820 that SIZE is equal to ROUNDED_SIZE. */
9821
9822 if (size != rounded_size)
9823 emit_stack_probe (plus_constant (gen_rtx_PLUS (Pmode,
9824 stack_pointer_rtx,
9825 sr.reg),
9826 rounded_size - size));
9827
9828 release_scratch_register_on_entry (&sr);
9829 }
9830
9831 /* Make sure nothing is scheduled before we are done. */
9832 emit_insn (gen_blockage ());
9833 }
9834
9835 /* Probe a range of stack addresses from REG to END, inclusive. These are
9836 offsets from the current stack pointer. */
9837
9838 const char *
9839 output_probe_stack_range (rtx reg, rtx end)
9840 {
9841 static int labelno = 0;
9842 char loop_lab[32], end_lab[32];
9843 rtx xops[3];
9844
9845 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9846 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9847
9848 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9849
9850 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
9851 xops[0] = reg;
9852 xops[1] = end;
9853 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9854 fputs ("\tje\t", asm_out_file);
9855 assemble_name_raw (asm_out_file, end_lab);
9856 fputc ('\n', asm_out_file);
9857
9858 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
9859 xops[1] = GEN_INT (PROBE_INTERVAL);
9860 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9861
9862 /* Probe at TEST_ADDR. */
9863 xops[0] = stack_pointer_rtx;
9864 xops[1] = reg;
9865 xops[2] = const0_rtx;
9866 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
9867
9868 fprintf (asm_out_file, "\tjmp\t");
9869 assemble_name_raw (asm_out_file, loop_lab);
9870 fputc ('\n', asm_out_file);
9871
9872 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9873
9874 return "";
9875 }
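
/* Unlike output_adjust_stack_and_probe above, this loop leaves the stack
 pointer untouched: only the scratch register REG is decremented by
 PROBE_INTERVAL each iteration, and each probe touches the address
 %esp + REG (REG holds a negative offset) until REG reaches END. */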
9876
9877 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
9878 to be generated in correct form. */
9879 static void
9880 ix86_finalize_stack_realign_flags (void)
9881 {
9882 /* Check if stack realignment is really needed after reload, and
9883 store the result in cfun. */
9884 unsigned int incoming_stack_boundary
9885 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
9886 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
9887 unsigned int stack_realign = (incoming_stack_boundary
9888 < (current_function_is_leaf
9889 ? crtl->max_used_stack_slot_alignment
9890 : crtl->stack_alignment_needed));
9891
9892 if (crtl->stack_realign_finalized)
9893 {
9894 /* After stack_realign_needed is finalized, we can no longer
9895 change it. */
9896 gcc_assert (crtl->stack_realign_needed == stack_realign);
9897 }
9898 else
9899 {
9900 crtl->stack_realign_needed = stack_realign;
9901 crtl->stack_realign_finalized = true;
9902 }
9903 }
9904
9905 /* Expand the prologue into a bunch of separate insns. */
9906
9907 void
9908 ix86_expand_prologue (void)
9909 {
9910 struct machine_function *m = cfun->machine;
9911 rtx insn, t;
9912 bool pic_reg_used;
9913 struct ix86_frame frame;
9914 HOST_WIDE_INT allocate;
9915 bool int_registers_saved;
9916
9917 ix86_finalize_stack_realign_flags ();
9918
9919 /* DRAP should not coexist with stack_realign_fp */
9920 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
9921
9922 memset (&m->fs, 0, sizeof (m->fs));
9923
9924 /* Initialize CFA state for before the prologue. */
9925 m->fs.cfa_reg = stack_pointer_rtx;
9926 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
9927
9928 /* Track SP offset to the CFA. We continue tracking this after we've
9929 swapped the CFA register away from SP. In the case of re-alignment
9930 this is fudged; we're interested in offsets within the local frame. */
9931 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
9932 m->fs.sp_valid = true;
9933
9934 ix86_compute_frame_layout (&frame);
9935
9936 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
9937 {
9938 /* We should have already generated an error for any use of
9939 ms_hook on a nested function. */
9940 gcc_checking_assert (!ix86_static_chain_on_stack);
9941
9942 /* Check if profiling is active and we shall use the profiling-before-
9943 prologue variant. If so, issue a sorry. */
9944 if (crtl->profile && flag_fentry != 0)
9945 sorry ("ms_hook_prologue attribute isn%'t compatible "
9946 "with -mfentry for 32-bit");
9947
9948 /* In ix86_asm_output_function_label we emitted:
9949 8b ff movl.s %edi,%edi
9950 55 push %ebp
9951 8b ec movl.s %esp,%ebp
9952
9953 This matches the hookable function prologue in Win32 API
9954 functions in Microsoft Windows XP Service Pack 2 and newer.
9955 Wine uses this to enable Windows apps to hook the Win32 API
9956 functions provided by Wine.
9957
9958 What that means is that we've already set up the frame pointer. */
9959
9960 if (frame_pointer_needed
9961 && !(crtl->drap_reg && crtl->stack_realign_needed))
9962 {
9963 rtx push, mov;
9964
9965 /* We've decided to use the frame pointer already set up.
9966 Describe this to the unwinder by pretending that both
9967 push and mov insns happen right here.
9968
9969 Putting the unwind info here at the end of the ms_hook
9970 is done so that we can make absolutely certain we get
9971 the required byte sequence at the start of the function,
9972 rather than relying on an assembler that can produce
9973 the exact encoding required.
9974
9975 However it does mean (in the unpatched case) that we have
9976 a 1 insn window where the asynchronous unwind info is
9977 incorrect. However, if we placed the unwind info at
9978 its correct location we would have incorrect unwind info
9979 in the patched case. Which is probably all moot since
9980 I don't expect Wine generates dwarf2 unwind info for the
9981 system libraries that use this feature. */
9982
9983 insn = emit_insn (gen_blockage ());
9984
9985 push = gen_push (hard_frame_pointer_rtx);
9986 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
9987 stack_pointer_rtx);
9988 RTX_FRAME_RELATED_P (push) = 1;
9989 RTX_FRAME_RELATED_P (mov) = 1;
9990
9991 RTX_FRAME_RELATED_P (insn) = 1;
9992 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
9993 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
9994
9995 /* Note that gen_push incremented m->fs.cfa_offset, even
9996 though we didn't emit the push insn here. */
9997 m->fs.cfa_reg = hard_frame_pointer_rtx;
9998 m->fs.fp_offset = m->fs.cfa_offset;
9999 m->fs.fp_valid = true;
10000 }
10001 else
10002 {
10003 /* The frame pointer is not needed so pop %ebp again.
10004 This leaves us with a pristine state. */
10005 emit_insn (gen_pop (hard_frame_pointer_rtx));
10006 }
10007 }
10008
10009 /* The first insn of a function that accepts its static chain on the
10010 stack is to push the register that would be filled in by a direct
10011 call. This insn will be skipped by the trampoline. */
10012 else if (ix86_static_chain_on_stack)
10013 {
10014 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10015 emit_insn (gen_blockage ());
10016
10017 /* We don't want to interpret this push insn as a register save,
10018 only as a stack adjustment. The real copy of the register as
10019 a save will be done later, if needed. */
10020 t = plus_constant (stack_pointer_rtx, -UNITS_PER_WORD);
10021 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10022 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10023 RTX_FRAME_RELATED_P (insn) = 1;
10024 }
10025
10026 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
10027 DRAP is needed and stack realignment is really needed after reload. */
10028 if (stack_realign_drap)
10029 {
10030 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10031
10032 /* Only need to push the parameter pointer reg if it is call saved. */
10033 if (!call_used_regs[REGNO (crtl->drap_reg)])
10034 {
10035 /* Push arg pointer reg */
10036 insn = emit_insn (gen_push (crtl->drap_reg));
10037 RTX_FRAME_RELATED_P (insn) = 1;
10038 }
10039
10040 /* Grab the argument pointer. */
10041 t = plus_constant (stack_pointer_rtx, m->fs.sp_offset);
10042 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10043 RTX_FRAME_RELATED_P (insn) = 1;
10044 m->fs.cfa_reg = crtl->drap_reg;
10045 m->fs.cfa_offset = 0;
10046
10047 /* Align the stack. */
10048 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10049 stack_pointer_rtx,
10050 GEN_INT (-align_bytes)));
10051 RTX_FRAME_RELATED_P (insn) = 1;
10052
10053 /* Replicate the return address on the stack so that return
10054 address can be reached via (argp - 1) slot. This is needed
10055 to implement macro RETURN_ADDR_RTX and intrinsic function
10056 expand_builtin_return_addr etc. */
10057 t = plus_constant (crtl->drap_reg, -UNITS_PER_WORD);
10058 t = gen_frame_mem (Pmode, t);
10059 insn = emit_insn (gen_push (t));
10060 RTX_FRAME_RELATED_P (insn) = 1;
10061
10062 /* For the purposes of frame and register save area addressing,
10063 we've started over with a new frame. */
10064 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10065 m->fs.realigned = true;
10066 }
10067
10068 if (frame_pointer_needed && !m->fs.fp_valid)
10069 {
10070 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10071 slower on all targets. Also sdb doesn't like it. */
10072 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10073 RTX_FRAME_RELATED_P (insn) = 1;
10074
10075 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10076 {
10077 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10078 RTX_FRAME_RELATED_P (insn) = 1;
10079
10080 if (m->fs.cfa_reg == stack_pointer_rtx)
10081 m->fs.cfa_reg = hard_frame_pointer_rtx;
10082 m->fs.fp_offset = m->fs.sp_offset;
10083 m->fs.fp_valid = true;
10084 }
10085 }
10086
10087 int_registers_saved = (frame.nregs == 0);
10088
10089 if (!int_registers_saved)
10090 {
10091 /* If saving registers via PUSH, do so now. */
10092 if (!frame.save_regs_using_mov)
10093 {
10094 ix86_emit_save_regs ();
10095 int_registers_saved = true;
10096 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10097 }
10098
10099 /* When using red zone we may start register saving before allocating
10100 the stack frame saving one cycle of the prologue. However, avoid
10101 doing this if we have to probe the stack; at least on x86_64 the
10102 stack probe can turn into a call that clobbers a red zone location. */
10103 else if (ix86_using_red_zone ()
10104 && (! TARGET_STACK_PROBE
10105 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10106 {
10107 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10108 int_registers_saved = true;
10109 }
10110 }
10111
10112 if (stack_realign_fp)
10113 {
10114 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10115 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10116
10117 /* The computation of the size of the re-aligned stack frame means
10118 that we must allocate the size of the register save area before
10119 performing the actual alignment. Otherwise we cannot guarantee
10120 that there's enough storage above the realignment point. */
10121 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10122 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10123 GEN_INT (m->fs.sp_offset
10124 - frame.sse_reg_save_offset),
10125 -1, false);
10126
10127 /* Align the stack. */
10128 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10129 stack_pointer_rtx,
10130 GEN_INT (-align_bytes)));
10131
10132 /* For the purposes of register save area addressing, the stack
10133 pointer is no longer valid. As for the value of sp_offset,
10134 see ix86_compute_frame_layout, which we need to match in order
10135 to pass verification of stack_pointer_offset at the end. */
10136 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10137 m->fs.sp_valid = false;
10138 }
10139
10140 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10141
10142 if (flag_stack_usage_info)
10143 {
10144 /* We start to count from ARG_POINTER. */
10145 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10146
10147 /* If it was realigned, take into account the fake frame. */
10148 if (stack_realign_drap)
10149 {
10150 if (ix86_static_chain_on_stack)
10151 stack_size += UNITS_PER_WORD;
10152
10153 if (!call_used_regs[REGNO (crtl->drap_reg)])
10154 stack_size += UNITS_PER_WORD;
10155
10156 /* This over-estimates by 1 minimal-stack-alignment-unit but
10157 mitigates that by counting in the new return address slot. */
10158 current_function_dynamic_stack_size
10159 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10160 }
10161
10162 current_function_static_stack_size = stack_size;
10163 }
10164
10165 /* The stack has already been decremented by the instruction calling us
10166 so probe if the size is non-negative to preserve the protection area. */
10167 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10168 {
10169 /* We expect the registers to be saved when probes are used. */
10170 gcc_assert (int_registers_saved);
10171
10172 if (STACK_CHECK_MOVING_SP)
10173 {
10174 ix86_adjust_stack_and_probe (allocate);
10175 allocate = 0;
10176 }
10177 else
10178 {
10179 HOST_WIDE_INT size = allocate;
10180
10181 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10182 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10183
10184 if (TARGET_STACK_PROBE)
10185 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10186 else
10187 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10188 }
10189 }
10190
10191 if (allocate == 0)
10192 ;
10193 else if (!ix86_target_stack_probe ()
10194 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10195 {
10196 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10197 GEN_INT (-allocate), -1,
10198 m->fs.cfa_reg == stack_pointer_rtx);
10199 }
10200 else
10201 {
10202 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10203 rtx r10 = NULL;
10204 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10205
10206 bool eax_live = false;
10207 bool r10_live = false;
10208
10209 if (TARGET_64BIT)
10210 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10211 if (!TARGET_64BIT_MS_ABI)
10212 eax_live = ix86_eax_live_at_start_p ();
10213
10214 if (eax_live)
10215 {
10216 emit_insn (gen_push (eax));
10217 allocate -= UNITS_PER_WORD;
10218 }
10219 if (r10_live)
10220 {
10221 r10 = gen_rtx_REG (Pmode, R10_REG);
10222 emit_insn (gen_push (r10));
10223 allocate -= UNITS_PER_WORD;
10224 }
10225
10226 emit_move_insn (eax, GEN_INT (allocate));
10227 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10228
10229 /* Use the fact that AX still contains ALLOCATE. */
10230 adjust_stack_insn = (TARGET_64BIT
10231 ? gen_pro_epilogue_adjust_stack_di_sub
10232 : gen_pro_epilogue_adjust_stack_si_sub);
10233
10234 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10235 stack_pointer_rtx, eax));
10236
10237 /* Note that SEH directives need to continue tracking the stack
10238 pointer even after the frame pointer has been set up. */
10239 if (m->fs.cfa_reg == stack_pointer_rtx || TARGET_SEH)
10240 {
10241 if (m->fs.cfa_reg == stack_pointer_rtx)
10242 m->fs.cfa_offset += allocate;
10243
10244 RTX_FRAME_RELATED_P (insn) = 1;
10245 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10246 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10247 plus_constant (stack_pointer_rtx,
10248 -allocate)));
10249 }
10250 m->fs.sp_offset += allocate;
10251
10252 if (r10_live && eax_live)
10253 {
10254 t = choose_baseaddr (m->fs.sp_offset - allocate);
10255 emit_move_insn (r10, gen_frame_mem (Pmode, t));
10256 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10257 emit_move_insn (eax, gen_frame_mem (Pmode, t));
10258 }
10259 else if (eax_live || r10_live)
10260 {
10261 t = choose_baseaddr (m->fs.sp_offset - allocate);
10262 emit_move_insn ((eax_live ? eax : r10), gen_frame_mem (Pmode, t));
10263 }
10264 }
10265 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10266
10267 /* If we haven't already set up the frame pointer, do so now. */
10268 if (frame_pointer_needed && !m->fs.fp_valid)
10269 {
10270 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10271 GEN_INT (frame.stack_pointer_offset
10272 - frame.hard_frame_pointer_offset));
10273 insn = emit_insn (insn);
10274 RTX_FRAME_RELATED_P (insn) = 1;
10275 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10276
10277 if (m->fs.cfa_reg == stack_pointer_rtx)
10278 m->fs.cfa_reg = hard_frame_pointer_rtx;
10279 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10280 m->fs.fp_valid = true;
10281 }
10282
10283 if (!int_registers_saved)
10284 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10285 if (frame.nsseregs)
10286 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10287
10288 pic_reg_used = false;
10289 if (pic_offset_table_rtx
10290 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10291 || crtl->profile))
10292 {
10293 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10294
10295 if (alt_pic_reg_used != INVALID_REGNUM)
10296 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10297
10298 pic_reg_used = true;
10299 }
10300
10301 if (pic_reg_used)
10302 {
10303 if (TARGET_64BIT)
10304 {
10305 if (ix86_cmodel == CM_LARGE_PIC)
10306 {
10307 rtx tmp_reg = gen_rtx_REG (DImode, R11_REG);
10308 rtx label = gen_label_rtx ();
10309 emit_label (label);
10310 LABEL_PRESERVE_P (label) = 1;
10311 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10312 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx, label));
10313 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10314 insn = emit_insn (gen_adddi3 (pic_offset_table_rtx,
10315 pic_offset_table_rtx, tmp_reg));
10316 }
10317 else
10318 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10319 }
10320 else
10321 {
10322 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10323 RTX_FRAME_RELATED_P (insn) = 1;
10324 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10325 }
10326 }
10327
10328 /* In the pic_reg_used case, make sure that the got load isn't deleted
10329 when mcount needs it. Blockage to avoid call movement across mcount
10330 call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10331 note. */
10332 if (crtl->profile && !flag_fentry && pic_reg_used)
10333 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10334
10335 if (crtl->drap_reg && !crtl->stack_realign_needed)
10336 {
10337 /* vDRAP is set up, but after reload it turns out stack realignment
10338 isn't necessary; here we emit prologue code to set up DRAP
10339 without the stack realignment adjustment. */
10340 t = choose_baseaddr (0);
10341 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10342 }
10343
10344 /* Prevent instructions from being scheduled into register save push
10345 sequence when access to the redzone area is done through frame pointer.
10346 The offset between the frame pointer and the stack pointer is calculated
10347 relative to the value of the stack pointer at the end of the function
10348 prologue, and moving instructions that access redzone area via frame
10349 pointer inside push sequence violates this assumption. */
10350 if (frame_pointer_needed && frame.red_zone_size)
10351 emit_insn (gen_memory_blockage ());
10352
10353 /* Emit cld instruction if stringops are used in the function. */
10354 if (TARGET_CLD && ix86_current_function_needs_cld)
10355 emit_insn (gen_cld ());
10356
10357 /* SEH requires that the prologue end within 256 bytes of the start of
10358 the function. Prevent instruction schedules that would extend that.
10359 Further, prevent alloca modifications to the stack pointer from being
10360 combined with prologue modifications. */
10361 if (TARGET_SEH)
10362 emit_insn (gen_prologue_use (stack_pointer_rtx));
10363 }
10364
10365 /* Emit code to restore REG using a POP insn. */
10366
10367 static void
10368 ix86_emit_restore_reg_using_pop (rtx reg)
10369 {
10370 struct machine_function *m = cfun->machine;
10371 rtx insn = emit_insn (gen_pop (reg));
10372
10373 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10374 m->fs.sp_offset -= UNITS_PER_WORD;
10375
10376 if (m->fs.cfa_reg == crtl->drap_reg
10377 && REGNO (reg) == REGNO (crtl->drap_reg))
10378 {
10379 /* Previously we'd represented the CFA as an expression
10380 like *(%ebp - 8). We've just popped that value from
10381 the stack, which means we need to reset the CFA to
10382 the drap register. This will remain until we restore
10383 the stack pointer. */
10384 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10385 RTX_FRAME_RELATED_P (insn) = 1;
10386
10387 /* This means that the DRAP register is valid for addressing too. */
10388 m->fs.drap_valid = true;
10389 return;
10390 }
10391
10392 if (m->fs.cfa_reg == stack_pointer_rtx)
10393 {
10394 rtx x = plus_constant (stack_pointer_rtx, UNITS_PER_WORD);
10395 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10396 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10397 RTX_FRAME_RELATED_P (insn) = 1;
10398
10399 m->fs.cfa_offset -= UNITS_PER_WORD;
10400 }
10401
10402 /* When the frame pointer is the CFA, and we pop it, we are
10403 swapping back to the stack pointer as the CFA. This happens
10404 for stack frames that don't allocate other data, so we assume
10405 the stack pointer is now pointing at the return address, i.e.
10406 the function entry state, which makes the offset one word. */
10407 if (reg == hard_frame_pointer_rtx)
10408 {
10409 m->fs.fp_valid = false;
10410 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10411 {
10412 m->fs.cfa_reg = stack_pointer_rtx;
10413 m->fs.cfa_offset -= UNITS_PER_WORD;
10414
10415 add_reg_note (insn, REG_CFA_DEF_CFA,
10416 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10417 GEN_INT (m->fs.cfa_offset)));
10418 RTX_FRAME_RELATED_P (insn) = 1;
10419 }
10420 }
10421 }
10422
10423 /* Emit code to restore saved registers using POP insns. */
10424
10425 static void
10426 ix86_emit_restore_regs_using_pop (void)
10427 {
10428 unsigned int regno;
10429
10430 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10431 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10432 ix86_emit_restore_reg_using_pop (gen_rtx_REG (Pmode, regno));
10433 }
10434
10435 /* Emit code and notes for the LEAVE instruction. */
10436
10437 static void
10438 ix86_emit_leave (void)
10439 {
10440 struct machine_function *m = cfun->machine;
10441 rtx insn = emit_insn (ix86_gen_leave ());
10442
10443 ix86_add_queued_cfa_restore_notes (insn);
10444
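/* LEAVE restores %esp from the frame pointer and pops the saved frame
pointer, so the stack pointer becomes valid one word closer to the CFA
than the frame pointer was, and the frame pointer is no longer valid. */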
10445 gcc_assert (m->fs.fp_valid);
10446 m->fs.sp_valid = true;
10447 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10448 m->fs.fp_valid = false;
10449
10450 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10451 {
10452 m->fs.cfa_reg = stack_pointer_rtx;
10453 m->fs.cfa_offset = m->fs.sp_offset;
10454
10455 add_reg_note (insn, REG_CFA_DEF_CFA,
10456 plus_constant (stack_pointer_rtx, m->fs.sp_offset));
10457 RTX_FRAME_RELATED_P (insn) = 1;
10458 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10459 m->fs.fp_offset);
10460 }
10461 }
10462
10463 /* Emit code to restore saved registers using MOV insns.
10464 First register is restored from CFA - CFA_OFFSET. */
10465 static void
10466 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10467 bool maybe_eh_return)
10468 {
10469 struct machine_function *m = cfun->machine;
10470 unsigned int regno;
10471
10472 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10473 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10474 {
10475 rtx reg = gen_rtx_REG (Pmode, regno);
10476 rtx insn, mem;
10477
10478 mem = choose_baseaddr (cfa_offset);
10479 mem = gen_frame_mem (Pmode, mem);
10480 insn = emit_move_insn (reg, mem);
10481
10482 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10483 {
10484 /* Previously we'd represented the CFA as an expression
10485 like *(%ebp - 8). We've just reloaded that value from
10486 the stack, which means we need to reset the CFA to
10487 the drap register. This will remain until we restore
10488 the stack pointer. */
10489 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10490 RTX_FRAME_RELATED_P (insn) = 1;
10491
10492 /* This means that the DRAP register is valid for addressing. */
10493 m->fs.drap_valid = true;
10494 }
10495 else
10496 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10497
10498 cfa_offset -= UNITS_PER_WORD;
10499 }
10500 }
10501
10502 /* Emit code to restore saved SSE registers using MOV insns.
10503 First register is restored from CFA - CFA_OFFSET. */
10504 static void
10505 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10506 bool maybe_eh_return)
10507 {
10508 unsigned int regno;
10509
10510 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10511 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10512 {
10513 rtx reg = gen_rtx_REG (V4SFmode, regno);
10514 rtx mem;
10515
10516 mem = choose_baseaddr (cfa_offset);
10517 mem = gen_rtx_MEM (V4SFmode, mem);
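/* Record that the slot is 128-bit (16-byte) aligned so that an aligned
SSE load can be used for the restore. */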
10518 set_mem_align (mem, 128);
10519 emit_move_insn (reg, mem);
10520
10521 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10522
10523 cfa_offset -= 16;
10524 }
10525 }
10526
10527 /* Restore function stack, frame, and registers. */
10528
10529 void
10530 ix86_expand_epilogue (int style)
10531 {
10532 struct machine_function *m = cfun->machine;
10533 struct machine_frame_state frame_state_save = m->fs;
10534 struct ix86_frame frame;
10535 bool restore_regs_via_mov;
10536 bool using_drap;
10537
10538 ix86_finalize_stack_realign_flags ();
10539 ix86_compute_frame_layout (&frame);
10540
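/* The stack pointer offset is known exactly when there is no frame
pointer, or when there is one but the stack pointer has not been
modified since the prologue and stack realignment via the frame
pointer was not used. */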
10541 m->fs.sp_valid = (!frame_pointer_needed
10542 || (current_function_sp_is_unchanging
10543 && !stack_realign_fp));
10544 gcc_assert (!m->fs.sp_valid
10545 || m->fs.sp_offset == frame.stack_pointer_offset);
10546
10547 /* The FP must be valid if the frame pointer is present. */
10548 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
10549 gcc_assert (!m->fs.fp_valid
10550 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
10551
10552 /* We must have *some* valid pointer to the stack frame. */
10553 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
10554
10555 /* The DRAP is never valid at this point. */
10556 gcc_assert (!m->fs.drap_valid);
10557
10558 /* See the comment about red zone and frame
10559 pointer usage in ix86_expand_prologue. */
10560 if (frame_pointer_needed && frame.red_zone_size)
10561 emit_insn (gen_memory_blockage ());
10562
10563 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
10564 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
10565
10566 /* Determine the CFA offset of the end of the red-zone. */
10567 m->fs.red_zone_offset = 0;
10568 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
10569 {
10570 /* The red-zone begins below the return address. */
10571 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
10572
10573 /* When the register save area is in the aligned portion of
10574 the stack, determine the maximum runtime displacement that
10575 matches up with the aligned frame. */
10576 if (stack_realign_drap)
10577 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
10578 + UNITS_PER_WORD);
10579 }
10580
10581 /* Special care must be taken for the normal return case of a function
10582 using eh_return: the eax and edx registers are marked as saved, but
10583 not restored along this path. Adjust the save location to match. */
10584 if (crtl->calls_eh_return && style != 2)
10585 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
10586
10587 /* EH_RETURN requires the use of moves to function properly. */
10588 if (crtl->calls_eh_return)
10589 restore_regs_via_mov = true;
10590 /* SEH requires the use of pops to identify the epilogue. */
10591 else if (TARGET_SEH)
10592 restore_regs_via_mov = false;
10593 /* If we're only restoring one register and sp is not valid then
10594 use a move instruction to restore the register, since it's
10595 less work than reloading sp and popping the register. */
10596 else if (!m->fs.sp_valid && frame.nregs <= 1)
10597 restore_regs_via_mov = true;
10598 else if (TARGET_EPILOGUE_USING_MOVE
10599 && cfun->machine->use_fast_prologue_epilogue
10600 && (frame.nregs > 1
10601 || m->fs.sp_offset != frame.reg_save_offset))
10602 restore_regs_via_mov = true;
10603 else if (frame_pointer_needed
10604 && !frame.nregs
10605 && m->fs.sp_offset != frame.reg_save_offset)
10606 restore_regs_via_mov = true;
10607 else if (frame_pointer_needed
10608 && TARGET_USE_LEAVE
10609 && cfun->machine->use_fast_prologue_epilogue
10610 && frame.nregs == 1)
10611 restore_regs_via_mov = true;
10612 else
10613 restore_regs_via_mov = false;
10614
10615 if (restore_regs_via_mov || frame.nsseregs)
10616 {
10617 /* Ensure that the entire register save area is addressable via
10618 the stack pointer, if we will restore via sp. */
10619 if (TARGET_64BIT
10620 && m->fs.sp_offset > 0x7fffffff
10621 && !(m->fs.fp_valid || m->fs.drap_valid)
10622 && (frame.nsseregs + frame.nregs) != 0)
10623 {
10624 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10625 GEN_INT (m->fs.sp_offset
10626 - frame.sse_reg_save_offset),
10627 style,
10628 m->fs.cfa_reg == stack_pointer_rtx);
10629 }
10630 }
10631
10632 /* If there are any SSE registers to restore, then we have to do it
10633 via moves, since there's obviously no pop for SSE regs. */
10634 if (frame.nsseregs)
10635 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
10636 style == 2);
10637
10638 if (restore_regs_via_mov)
10639 {
10640 rtx t;
10641
10642 if (frame.nregs)
10643 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
10644
10645 /* eh_return epilogues need %ecx added to the stack pointer. */
10646 if (style == 2)
10647 {
10648 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
10649
10650 /* Stack align doesn't work with eh_return. */
10651 gcc_assert (!stack_realign_drap);
10652 /* Neither do regparm nested functions. */
10653 gcc_assert (!ix86_static_chain_on_stack);
10654
10655 if (frame_pointer_needed)
10656 {
10657 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
10658 t = plus_constant (t, m->fs.fp_offset - UNITS_PER_WORD);
10659 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
10660
10661 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
10662 insn = emit_move_insn (hard_frame_pointer_rtx, t);
10663
10664 /* Note that we use SA as a temporary CFA, as the return
10665 address is at the proper place relative to it. We
10666 pretend this happens at the FP restore insn because
10667 prior to this insn the FP would be stored at the wrong
10668 offset relative to SA, and after this insn we have no
10669 other reasonable register to use for the CFA. We don't
10670 bother resetting the CFA to the SP for the duration of
10671 the return insn. */
10672 add_reg_note (insn, REG_CFA_DEF_CFA,
10673 plus_constant (sa, UNITS_PER_WORD));
10674 ix86_add_queued_cfa_restore_notes (insn);
10675 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
10676 RTX_FRAME_RELATED_P (insn) = 1;
10677
10678 m->fs.cfa_reg = sa;
10679 m->fs.cfa_offset = UNITS_PER_WORD;
10680 m->fs.fp_valid = false;
10681
10682 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
10683 const0_rtx, style, false);
10684 }
10685 else
10686 {
10687 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
10688 t = plus_constant (t, m->fs.sp_offset - UNITS_PER_WORD);
10689 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
10690 ix86_add_queued_cfa_restore_notes (insn);
10691
10692 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
10693 if (m->fs.cfa_offset != UNITS_PER_WORD)
10694 {
10695 m->fs.cfa_offset = UNITS_PER_WORD;
10696 add_reg_note (insn, REG_CFA_DEF_CFA,
10697 plus_constant (stack_pointer_rtx,
10698 UNITS_PER_WORD));
10699 RTX_FRAME_RELATED_P (insn) = 1;
10700 }
10701 }
10702 m->fs.sp_offset = UNITS_PER_WORD;
10703 m->fs.sp_valid = true;
10704 }
10705 }
10706 else
10707 {
10708 /* SEH requires that the function end with (1) a stack adjustment
10709 if necessary, (2) a sequence of pops, and (3) a return or
10710 jump instruction. Prevent insns from the function body from
10711 being scheduled into this sequence. */
10712 if (TARGET_SEH)
10713 {
10714 /* Prevent a catch region from being adjacent to the standard
10715 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
10716 several other flags that would be interesting to test are
10717 set up yet. */
10718 if (flag_non_call_exceptions)
10719 emit_insn (gen_nops (const1_rtx));
10720 else
10721 emit_insn (gen_blockage ());
10722 }
10723
10724 /* First step is to deallocate the stack frame so that we can
10725 pop the registers. */
10726 if (!m->fs.sp_valid)
10727 {
10728 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
10729 GEN_INT (m->fs.fp_offset
10730 - frame.reg_save_offset),
10731 style, false);
10732 }
10733 else if (m->fs.sp_offset != frame.reg_save_offset)
10734 {
10735 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10736 GEN_INT (m->fs.sp_offset
10737 - frame.reg_save_offset),
10738 style,
10739 m->fs.cfa_reg == stack_pointer_rtx);
10740 }
10741
10742 ix86_emit_restore_regs_using_pop ();
10743 }
10744
10745 /* If we used a frame pointer and haven't already got rid of it,
10746 then do so now. */
10747 if (m->fs.fp_valid)
10748 {
10749 /* If the stack pointer is valid and pointing at the frame
10750 pointer store address, then we only need a pop. */
10751 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
10752 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10753 /* Leave results in shorter dependency chains on CPUs that are
10754 able to grok it fast. */
10755 else if (TARGET_USE_LEAVE
10756 || optimize_function_for_size_p (cfun)
10757 || !cfun->machine->use_fast_prologue_epilogue)
10758 ix86_emit_leave ();
10759 else
10760 {
10761 pro_epilogue_adjust_stack (stack_pointer_rtx,
10762 hard_frame_pointer_rtx,
10763 const0_rtx, style, !using_drap);
10764 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10765 }
10766 }
10767
10768 if (using_drap)
10769 {
10770 int param_ptr_offset = UNITS_PER_WORD;
10771 rtx insn;
10772
10773 gcc_assert (stack_realign_drap);
10774
10775 if (ix86_static_chain_on_stack)
10776 param_ptr_offset += UNITS_PER_WORD;
10777 if (!call_used_regs[REGNO (crtl->drap_reg)])
10778 param_ptr_offset += UNITS_PER_WORD;
10779
10780 insn = emit_insn (gen_rtx_SET
10781 (VOIDmode, stack_pointer_rtx,
10782 gen_rtx_PLUS (Pmode,
10783 crtl->drap_reg,
10784 GEN_INT (-param_ptr_offset))));
10785 m->fs.cfa_reg = stack_pointer_rtx;
10786 m->fs.cfa_offset = param_ptr_offset;
10787 m->fs.sp_offset = param_ptr_offset;
10788 m->fs.realigned = false;
10789
10790 add_reg_note (insn, REG_CFA_DEF_CFA,
10791 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10792 GEN_INT (param_ptr_offset)));
10793 RTX_FRAME_RELATED_P (insn) = 1;
10794
10795 if (!call_used_regs[REGNO (crtl->drap_reg)])
10796 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
10797 }
10798
10799 /* At this point the stack pointer must be valid, and we must have
10800 restored all of the registers. We may not have deallocated the
10801 entire stack frame. We've delayed this until now because it may
10802 be possible to merge the local stack deallocation with the
10803 deallocation forced by ix86_static_chain_on_stack. */
10804 gcc_assert (m->fs.sp_valid);
10805 gcc_assert (!m->fs.fp_valid);
10806 gcc_assert (!m->fs.realigned);
10807 if (m->fs.sp_offset != UNITS_PER_WORD)
10808 {
10809 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10810 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
10811 style, true);
10812 }
10813 else
10814 ix86_add_queued_cfa_restore_notes (get_last_insn ());
10815
10816 /* Sibcall epilogues don't want a return instruction. */
10817 if (style == 0)
10818 {
10819 m->fs = frame_state_save;
10820 return;
10821 }
10822
10823 /* Emit vzeroupper if needed. */
10824 if (TARGET_VZEROUPPER
10825 && !TREE_THIS_VOLATILE (cfun->decl)
10826 && !cfun->machine->caller_return_avx256_p)
10827 emit_insn (gen_avx_vzeroupper (GEN_INT (call_no_avx256)));
10828
10829 if (crtl->args.pops_args && crtl->args.size)
10830 {
10831 rtx popc = GEN_INT (crtl->args.pops_args);
10832
10833 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
10834 address, do an explicit add, and jump indirectly to the caller. */
10835
10836 if (crtl->args.pops_args >= 65536)
10837 {
10838 rtx ecx = gen_rtx_REG (SImode, CX_REG);
10839 rtx insn;
10840
10841 /* There is no "pascal" calling convention in any 64bit ABI. */
10842 gcc_assert (!TARGET_64BIT);
10843
10844 insn = emit_insn (gen_pop (ecx));
10845 m->fs.cfa_offset -= UNITS_PER_WORD;
10846 m->fs.sp_offset -= UNITS_PER_WORD;
10847
10848 add_reg_note (insn, REG_CFA_ADJUST_CFA,
10849 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
10850 add_reg_note (insn, REG_CFA_REGISTER,
10851 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
10852 RTX_FRAME_RELATED_P (insn) = 1;
10853
10854 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10855 popc, -1, true);
10856 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
10857 }
10858 else
10859 emit_jump_insn (gen_simple_return_pop_internal (popc));
10860 }
10861 else
10862 emit_jump_insn (gen_simple_return_internal ());
10863
10864 /* Restore the state back to the state from the prologue,
10865 so that it's correct for the next epilogue. */
10866 m->fs = frame_state_save;
10867 }
10868
10869 /* Reset from the function's potential modifications. */
10870
10871 static void
10872 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
10873 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
10874 {
10875 if (pic_offset_table_rtx)
10876 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
10877 #if TARGET_MACHO
10878 /* Mach-O doesn't support labels at the end of objects, so if
10879 it looks like we might want one, insert a NOP. */
10880 {
10881 rtx insn = get_last_insn ();
10882 while (insn
10883 && NOTE_P (insn)
10884 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
10885 insn = PREV_INSN (insn);
10886 if (insn
10887 && (LABEL_P (insn)
10888 || (NOTE_P (insn)
10889 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
10890 fputs ("\tnop\n", file);
10891 }
10892 #endif
10893
10894 }
10895
10896 /* Return a scratch register to use in the split stack prologue. The
10897 split stack prologue is used for -fsplit-stack. It consists of the first
10898 instructions in the function, even before the regular prologue.
10899 The scratch register can be any caller-saved register which is not
10900 used for parameters or for the static chain. */
10901
10902 static unsigned int
10903 split_stack_prologue_scratch_regno (void)
10904 {
10905 if (TARGET_64BIT)
10906 return R11_REG;
10907 else
10908 {
10909 bool is_fastcall;
10910 int regparm;
10911
10912 is_fastcall = (lookup_attribute ("fastcall",
10913 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
10914 != NULL);
10915 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
10916
10917 if (is_fastcall)
10918 {
10919 if (DECL_STATIC_CHAIN (cfun->decl))
10920 {
10921 sorry ("-fsplit-stack does not support fastcall with "
10922 "nested function");
10923 return INVALID_REGNUM;
10924 }
10925 return AX_REG;
10926 }
10927 else if (regparm < 3)
10928 {
10929 if (!DECL_STATIC_CHAIN (cfun->decl))
10930 return CX_REG;
10931 else
10932 {
10933 if (regparm >= 2)
10934 {
10935 sorry ("-fsplit-stack does not support 2 register "
10936 " parameters for a nested function");
10937 return INVALID_REGNUM;
10938 }
10939 return DX_REG;
10940 }
10941 }
10942 else
10943 {
10944 /* FIXME: We could make this work by pushing a register
10945 around the addition and comparison. */
10946 sorry ("-fsplit-stack does not support 3 register parameters");
10947 return INVALID_REGNUM;
10948 }
10949 }
10950 }
10951
10952 /* A SYMBOL_REF for the function which allocates new stackspace for
10953 -fsplit-stack. */
10954
10955 static GTY(()) rtx split_stack_fn;
10956
10957 /* A SYMBOL_REF for the more stack function when using the large
10958 model. */
10959
10960 static GTY(()) rtx split_stack_fn_large;
10961
10962 /* Handle -fsplit-stack. These are the first instructions in the
10963 function, even before the regular prologue. */
10964
10965 void
10966 ix86_expand_split_stack_prologue (void)
10967 {
10968 struct ix86_frame frame;
10969 HOST_WIDE_INT allocate;
10970 unsigned HOST_WIDE_INT args_size;
10971 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
10972 rtx scratch_reg = NULL_RTX;
10973 rtx varargs_label = NULL_RTX;
10974 rtx fn;
10975
10976 gcc_assert (flag_split_stack && reload_completed);
10977
10978 ix86_finalize_stack_realign_flags ();
10979 ix86_compute_frame_layout (&frame);
10980 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
10981
10982 /* This is the label we will branch to if we have enough stack
10983 space. We expect the basic block reordering pass to reverse this
10984 branch if optimizing, so that we branch in the unlikely case. */
10985 label = gen_label_rtx ();
10986
10987 /* We need to compare the stack pointer minus the frame size with
10988 the stack boundary in the TCB. The stack boundary always gives
10989 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
10990 can compare directly. Otherwise we need to do an addition. */
10991
10992 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
10993 UNSPEC_STACK_CHECK);
10994 limit = gen_rtx_CONST (Pmode, limit);
10995 limit = gen_rtx_MEM (Pmode, limit);
10996 if (allocate < SPLIT_STACK_AVAILABLE)
10997 current = stack_pointer_rtx;
10998 else
10999 {
11000 unsigned int scratch_regno;
11001 rtx offset;
11002
11003 /* We need a scratch register to hold the stack pointer minus
11004 the required frame size. Since this is the very start of the
11005 function, the scratch register can be any caller-saved
11006 register which is not used for parameters. */
11007 offset = GEN_INT (- allocate);
11008 scratch_regno = split_stack_prologue_scratch_regno ();
11009 if (scratch_regno == INVALID_REGNUM)
11010 return;
11011 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11012 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11013 {
11014 /* We don't use ix86_gen_add3 in this case because it will
11015 want to split to lea, but when not optimizing the insn
11016 will not be split after this point. */
11017 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11018 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11019 offset)));
11020 }
11021 else
11022 {
11023 emit_move_insn (scratch_reg, offset);
11024 emit_insn (gen_adddi3 (scratch_reg, scratch_reg,
11025 stack_pointer_rtx));
11026 }
11027 current = scratch_reg;
11028 }
11029
11030 ix86_expand_branch (GEU, current, limit, label);
11031 jump_insn = get_last_insn ();
11032 JUMP_LABEL (jump_insn) = label;
11033
11034 /* Mark the jump as very likely to be taken. */
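/* REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100 is 99% of REG_BR_PROB_BASE,
i.e. a 99% probability of taking the branch. */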
11035 add_reg_note (jump_insn, REG_BR_PROB,
11036 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
11037
11038 if (split_stack_fn == NULL_RTX)
11039 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11040 fn = split_stack_fn;
11041
11042 /* Get more stack space. We pass in the desired stack space and the
11043 size of the arguments to copy to the new stack. In 32-bit mode
11044 we push the parameters; __morestack will return on a new stack
11045 anyhow. In 64-bit mode we pass the parameters in r10 and
11046 r11. */
11047 allocate_rtx = GEN_INT (allocate);
11048 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11049 call_fusage = NULL_RTX;
11050 if (TARGET_64BIT)
11051 {
11052 rtx reg10, reg11;
11053
11054 reg10 = gen_rtx_REG (Pmode, R10_REG);
11055 reg11 = gen_rtx_REG (Pmode, R11_REG);
11056
11057 /* If this function uses a static chain, it will be in %r10.
11058 Preserve it across the call to __morestack. */
11059 if (DECL_STATIC_CHAIN (cfun->decl))
11060 {
11061 rtx rax;
11062
11063 rax = gen_rtx_REG (Pmode, AX_REG);
11064 emit_move_insn (rax, reg10);
11065 use_reg (&call_fusage, rax);
11066 }
11067
11068 if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11069 {
11070 HOST_WIDE_INT argval;
11071
11072 /* When using the large model we need to load the address
11073 into a register, and we've run out of registers. So we
11074 switch to a different calling convention, and we call a
11075 different function: __morestack_large. We pass the
11076 argument size in the upper 32 bits of r10 and pass the
11077 frame size in the lower 32 bits. */
11078 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11079 gcc_assert ((args_size & 0xffffffff) == args_size);
11080
11081 if (split_stack_fn_large == NULL_RTX)
11082 split_stack_fn_large =
11083 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11084
11085 if (ix86_cmodel == CM_LARGE_PIC)
11086 {
11087 rtx label, x;
11088
11089 label = gen_label_rtx ();
11090 emit_label (label);
11091 LABEL_PRESERVE_P (label) = 1;
11092 emit_insn (gen_set_rip_rex64 (reg10, label));
11093 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11094 emit_insn (gen_adddi3 (reg10, reg10, reg11));
11095 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11096 UNSPEC_GOT);
11097 x = gen_rtx_CONST (Pmode, x);
11098 emit_move_insn (reg11, x);
11099 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11100 x = gen_const_mem (Pmode, x);
11101 emit_move_insn (reg11, x);
11102 }
11103 else
11104 emit_move_insn (reg11, split_stack_fn_large);
11105
11106 fn = reg11;
11107
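/* Pack ARGS_SIZE into the upper 32 bits and ALLOCATE into the lower 32
bits of r10, as described above. The shift is written as two 16-bit
shifts, presumably to avoid an undefined shift count of 32 on hosts
where HOST_WIDE_INT is narrower than 64 bits. */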
11108 argval = ((args_size << 16) << 16) + allocate;
11109 emit_move_insn (reg10, GEN_INT (argval));
11110 }
11111 else
11112 {
11113 emit_move_insn (reg10, allocate_rtx);
11114 emit_move_insn (reg11, GEN_INT (args_size));
11115 use_reg (&call_fusage, reg11);
11116 }
11117
11118 use_reg (&call_fusage, reg10);
11119 }
11120 else
11121 {
11122 emit_insn (gen_push (GEN_INT (args_size)));
11123 emit_insn (gen_push (allocate_rtx));
11124 }
11125 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11126 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11127 NULL_RTX, false);
11128 add_function_usage_to (call_insn, call_fusage);
11129
11130 /* In order to make call/return prediction work right, we now need
11131 to execute a return instruction. See
11132 libgcc/config/i386/morestack.S for the details on how this works.
11133
11134 For flow purposes gcc must not see this as a return
11135 instruction--we need control flow to continue at the subsequent
11136 label. Therefore, we use an unspec. */
11137 gcc_assert (crtl->args.pops_args < 65536);
11138 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11139
11140 /* If we are in 64-bit mode and this function uses a static chain,
11141 we saved %r10 in %rax before calling __morestack. */
11142 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11143 emit_move_insn (gen_rtx_REG (Pmode, R10_REG),
11144 gen_rtx_REG (Pmode, AX_REG));
11145
11146 /* If this function calls va_start, we need to store a pointer to
11147 the arguments on the old stack, because they may not all have been
11148 copied to the new stack. At this point the old stack can be
11149 found at the frame pointer value used by __morestack, because
11150 __morestack has set that up before calling back to us. Here we
11151 store that pointer in a scratch register, and in
11152 ix86_expand_prologue we store the scratch register in a stack
11153 slot. */
11154 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11155 {
11156 unsigned int scratch_regno;
11157 rtx frame_reg;
11158 int words;
11159
11160 scratch_regno = split_stack_prologue_scratch_regno ();
11161 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11162 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11163
11164 /* 64-bit:
11165 fp -> old fp value
11166 return address within this function
11167 return address of caller of this function
11168 stack arguments
11169 So we add three words to get to the stack arguments.
11170
11171 32-bit:
11172 fp -> old fp value
11173 return address within this function
11174 first argument to __morestack
11175 second argument to __morestack
11176 return address of caller of this function
11177 stack arguments
11178 So we add five words to get to the stack arguments.
11179 */
11180 words = TARGET_64BIT ? 3 : 5;
11181 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11182 gen_rtx_PLUS (Pmode, frame_reg,
11183 GEN_INT (words * UNITS_PER_WORD))));
11184
11185 varargs_label = gen_label_rtx ();
11186 emit_jump_insn (gen_jump (varargs_label));
11187 JUMP_LABEL (get_last_insn ()) = varargs_label;
11188
11189 emit_barrier ();
11190 }
11191
11192 emit_label (label);
11193 LABEL_NUSES (label) = 1;
11194
11195 /* If this function calls va_start, we now have to set the scratch
11196 register for the case where we do not call __morestack. In this
11197 case we need to set it based on the stack pointer. */
11198 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11199 {
11200 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11201 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11202 GEN_INT (UNITS_PER_WORD))));
11203
11204 emit_label (varargs_label);
11205 LABEL_NUSES (varargs_label) = 1;
11206 }
11207 }
11208
11209 /* We may have to tell the dataflow pass that the split stack prologue
11210 is initializing a scratch register. */
11211
11212 static void
11213 ix86_live_on_entry (bitmap regs)
11214 {
11215 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11216 {
11217 gcc_assert (flag_split_stack);
11218 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11219 }
11220 }
11221 \f
11222 /* Determine if OP is a suitable SUBREG RTX for an address. */
11223
11224 static bool
11225 ix86_address_subreg_operand (rtx op)
11226 {
11227 enum machine_mode mode;
11228
11229 if (!REG_P (op))
11230 return false;
11231
11232 mode = GET_MODE (op);
11233
11234 if (GET_MODE_CLASS (mode) != MODE_INT)
11235 return false;
11236
11237 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11238 failures when the register is one word out of a two word structure. */
11239 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11240 return false;
11241
11242 /* Allow only SUBREGs of non-eliminable hard registers. */
11243 return register_no_elim_operand (op, mode);
11244 }
11245
11246 /* Extract the parts of an RTL expression that is a valid memory address
11247 for an instruction. Return 0 if the structure of the address is
11248 grossly off. Return -1 if the address contains ASHIFT, so it is not
11249 strictly valid, but is still used for computing the length of the lea instruction. */
11250
11251 int
11252 ix86_decompose_address (rtx addr, struct ix86_address *out)
11253 {
11254 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11255 rtx base_reg, index_reg;
11256 HOST_WIDE_INT scale = 1;
11257 rtx scale_rtx = NULL_RTX;
11258 rtx tmp;
11259 int retval = 1;
11260 enum ix86_address_seg seg = SEG_DEFAULT;
11261
11262 /* Allow zero-extended SImode addresses;
11263 they will be emitted with the addr32 prefix. */
11264 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11265 {
11266 if (GET_CODE (addr) == ZERO_EXTEND
11267 && GET_MODE (XEXP (addr, 0)) == SImode)
11268 addr = XEXP (addr, 0);
11269 else if (GET_CODE (addr) == AND
11270 && const_32bit_mask (XEXP (addr, 1), DImode))
11271 {
11272 addr = XEXP (addr, 0);
11273
11274 /* Strip subreg. */
11275 if (GET_CODE (addr) == SUBREG
11276 && GET_MODE (SUBREG_REG (addr)) == SImode)
11277 addr = SUBREG_REG (addr);
11278 }
11279 }
11280
11281 if (REG_P (addr))
11282 base = addr;
11283 else if (GET_CODE (addr) == SUBREG)
11284 {
11285 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11286 base = addr;
11287 else
11288 return 0;
11289 }
11290 else if (GET_CODE (addr) == PLUS)
11291 {
11292 rtx addends[4], op;
11293 int n = 0, i;
11294
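/* Walk down a possibly nested PLUS chain, e.g.
(plus (plus (mult index scale) base) disp),
collecting at most four addends. */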
11295 op = addr;
11296 do
11297 {
11298 if (n >= 4)
11299 return 0;
11300 addends[n++] = XEXP (op, 1);
11301 op = XEXP (op, 0);
11302 }
11303 while (GET_CODE (op) == PLUS);
11304 if (n >= 4)
11305 return 0;
11306 addends[n] = op;
11307
11308 for (i = n; i >= 0; --i)
11309 {
11310 op = addends[i];
11311 switch (GET_CODE (op))
11312 {
11313 case MULT:
11314 if (index)
11315 return 0;
11316 index = XEXP (op, 0);
11317 scale_rtx = XEXP (op, 1);
11318 break;
11319
11320 case ASHIFT:
11321 if (index)
11322 return 0;
11323 index = XEXP (op, 0);
11324 tmp = XEXP (op, 1);
11325 if (!CONST_INT_P (tmp))
11326 return 0;
11327 scale = INTVAL (tmp);
11328 if ((unsigned HOST_WIDE_INT) scale > 3)
11329 return 0;
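/* A shift count of 0..3 corresponds to an index scale of 1, 2, 4 or 8. */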
11330 scale = 1 << scale;
11331 break;
11332
11333 case UNSPEC:
11334 if (XINT (op, 1) == UNSPEC_TP
11335 && TARGET_TLS_DIRECT_SEG_REFS
11336 && seg == SEG_DEFAULT)
11337 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11338 else
11339 return 0;
11340 break;
11341
11342 case SUBREG:
11343 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11344 return 0;
11345 /* FALLTHRU */
11346
11347 case REG:
11348 if (!base)
11349 base = op;
11350 else if (!index)
11351 index = op;
11352 else
11353 return 0;
11354 break;
11355
11356 case CONST:
11357 case CONST_INT:
11358 case SYMBOL_REF:
11359 case LABEL_REF:
11360 if (disp)
11361 return 0;
11362 disp = op;
11363 break;
11364
11365 default:
11366 return 0;
11367 }
11368 }
11369 }
11370 else if (GET_CODE (addr) == MULT)
11371 {
11372 index = XEXP (addr, 0); /* index*scale */
11373 scale_rtx = XEXP (addr, 1);
11374 }
11375 else if (GET_CODE (addr) == ASHIFT)
11376 {
11377 /* We're called for lea too, which implements ashift on occasion. */
11378 index = XEXP (addr, 0);
11379 tmp = XEXP (addr, 1);
11380 if (!CONST_INT_P (tmp))
11381 return 0;
11382 scale = INTVAL (tmp);
11383 if ((unsigned HOST_WIDE_INT) scale > 3)
11384 return 0;
11385 scale = 1 << scale;
11386 retval = -1;
11387 }
11388 else
11389 disp = addr; /* displacement */
11390
11391 if (index)
11392 {
11393 if (REG_P (index))
11394 ;
11395 else if (GET_CODE (index) == SUBREG
11396 && ix86_address_subreg_operand (SUBREG_REG (index)))
11397 ;
11398 else
11399 return 0;
11400 }
11401
11402 /* Extract the integral value of scale. */
11403 if (scale_rtx)
11404 {
11405 if (!CONST_INT_P (scale_rtx))
11406 return 0;
11407 scale = INTVAL (scale_rtx);
11408 }
11409
11410 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11411 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11412
11413 /* Avoid useless 0 displacement. */
11414 if (disp == const0_rtx && (base || index))
11415 disp = NULL_RTX;
11416
11417 /* Allow the arg pointer and stack pointer as the index if there is no scaling. */
11418 if (base_reg && index_reg && scale == 1
11419 && (index_reg == arg_pointer_rtx
11420 || index_reg == frame_pointer_rtx
11421 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11422 {
11423 rtx tmp;
11424 tmp = base, base = index, index = tmp;
11425 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11426 }
11427
11428 /* Special case: %ebp cannot be encoded as a base without a displacement.
11429 Similarly %r13. */
11430 if (!disp
11431 && base_reg
11432 && (base_reg == hard_frame_pointer_rtx
11433 || base_reg == frame_pointer_rtx
11434 || base_reg == arg_pointer_rtx
11435 || (REG_P (base_reg)
11436 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11437 || REGNO (base_reg) == R13_REG))))
11438 disp = const0_rtx;
11439
11440 /* Special case: on K6, [%esi] forces the instruction to be vector decoded.
11441 Avoid this by transforming to [%esi+0].
11442 Reload calls address legitimization without cfun defined, so we need
11443 to test cfun for being non-NULL. */
11444 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11445 && base_reg && !index_reg && !disp
11446 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11447 disp = const0_rtx;
11448
11449 /* Special case: encode reg+reg instead of reg*2. */
11450 if (!base && index && scale == 2)
11451 base = index, base_reg = index_reg, scale = 1;
11452
11453 /* Special case: scaling cannot be encoded without base or displacement. */
11454 if (!base && !disp && index && scale != 1)
11455 disp = const0_rtx;
11456
11457 out->base = base;
11458 out->index = index;
11459 out->disp = disp;
11460 out->scale = scale;
11461 out->seg = seg;
11462
11463 return retval;
11464 }
11465 \f
11466 /* Return the cost of the memory address X.
11467 For i386, it is better to use a complex address than let gcc copy
11468 the address into a reg and make a new pseudo. But not if the address
11469 requires two regs - that would mean more pseudos with longer
11470 lifetimes. */
11471 static int
11472 ix86_address_cost (rtx x, bool speed ATTRIBUTE_UNUSED)
11473 {
11474 struct ix86_address parts;
11475 int cost = 1;
11476 int ok = ix86_decompose_address (x, &parts);
11477
11478 gcc_assert (ok);
11479
11480 if (parts.base && GET_CODE (parts.base) == SUBREG)
11481 parts.base = SUBREG_REG (parts.base);
11482 if (parts.index && GET_CODE (parts.index) == SUBREG)
11483 parts.index = SUBREG_REG (parts.index);
11484
11485 /* Attempt to minimize number of registers in the address. */
11486 if ((parts.base
11487 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
11488 || (parts.index
11489 && (!REG_P (parts.index)
11490 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
11491 cost++;
11492
11493 if (parts.base
11494 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
11495 && parts.index
11496 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
11497 && parts.base != parts.index)
11498 cost++;
11499
11500 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
11501 since its predecode logic can't detect the length of instructions
11502 and they degenerate to vector decoding. Increase the cost of such
11503 addresses here. The penalty is at least 2 cycles. It may be worthwhile
11504 to split such addresses or even refuse them entirely.
11505
11506 Following addressing modes are affected:
11507 [base+scale*index]
11508 [scale*index+disp]
11509 [base+index]
11510
11511 The first and last cases may be avoidable by explicitly coding the zero in
11512 the memory address, but I don't have an AMD-K6 machine handy to check this
11513 theory. */
11514
11515 if (TARGET_K6
11516 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
11517 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
11518 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
11519 cost += 10;
11520
11521 return cost;
11522 }
11523 \f
11524 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O, as
11525 this is used to form addresses to local data when -fPIC is in
11526 use. */
11527
11528 static bool
11529 darwin_local_data_pic (rtx disp)
11530 {
11531 return (GET_CODE (disp) == UNSPEC
11532 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
11533 }
11534
11535 /* Determine if a given RTX is a valid constant. We already know this
11536 satisfies CONSTANT_P. */
11537
11538 static bool
11539 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
11540 {
11541 switch (GET_CODE (x))
11542 {
11543 case CONST:
11544 x = XEXP (x, 0);
11545
11546 if (GET_CODE (x) == PLUS)
11547 {
11548 if (!CONST_INT_P (XEXP (x, 1)))
11549 return false;
11550 x = XEXP (x, 0);
11551 }
11552
11553 if (TARGET_MACHO && darwin_local_data_pic (x))
11554 return true;
11555
11556 /* Only some unspecs are valid as "constants". */
11557 if (GET_CODE (x) == UNSPEC)
11558 switch (XINT (x, 1))
11559 {
11560 case UNSPEC_GOT:
11561 case UNSPEC_GOTOFF:
11562 case UNSPEC_PLTOFF:
11563 return TARGET_64BIT;
11564 case UNSPEC_TPOFF:
11565 case UNSPEC_NTPOFF:
11566 x = XVECEXP (x, 0, 0);
11567 return (GET_CODE (x) == SYMBOL_REF
11568 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11569 case UNSPEC_DTPOFF:
11570 x = XVECEXP (x, 0, 0);
11571 return (GET_CODE (x) == SYMBOL_REF
11572 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
11573 default:
11574 return false;
11575 }
11576
11577 /* We must have drilled down to a symbol. */
11578 if (GET_CODE (x) == LABEL_REF)
11579 return true;
11580 if (GET_CODE (x) != SYMBOL_REF)
11581 return false;
11582 /* FALLTHRU */
11583
11584 case SYMBOL_REF:
11585 /* TLS symbols are never valid. */
11586 if (SYMBOL_REF_TLS_MODEL (x))
11587 return false;
11588
11589 /* DLLIMPORT symbols are never valid. */
11590 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
11591 && SYMBOL_REF_DLLIMPORT_P (x))
11592 return false;
11593
11594 #if TARGET_MACHO
11595 /* mdynamic-no-pic */
11596 if (MACHO_DYNAMIC_NO_PIC_P)
11597 return machopic_symbol_defined_p (x);
11598 #endif
11599 break;
11600
11601 case CONST_DOUBLE:
11602 if (GET_MODE (x) == TImode
11603 && x != CONST0_RTX (TImode)
11604 && !TARGET_64BIT)
11605 return false;
11606 break;
11607
11608 case CONST_VECTOR:
11609 if (!standard_sse_constant_p (x))
11610 return false;
11611
11612 default:
11613 break;
11614 }
11615
11616 /* Otherwise we handle everything else in the move patterns. */
11617 return true;
11618 }
11619
11620 /* Determine if it's legal to put X into the constant pool. This
11621 is not possible for the address of thread-local symbols, which
11622 is checked above. */
11623
11624 static bool
11625 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
11626 {
11627 /* We can always put integral constants and vectors in memory. */
11628 switch (GET_CODE (x))
11629 {
11630 case CONST_INT:
11631 case CONST_DOUBLE:
11632 case CONST_VECTOR:
11633 return false;
11634
11635 default:
11636 break;
11637 }
11638 return !ix86_legitimate_constant_p (mode, x);
11639 }
11640
11641
11642 /* Nonzero if the constant value X is a legitimate general operand
11643 when generating PIC code. It is given that flag_pic is on and
11644 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
11645
11646 bool
11647 legitimate_pic_operand_p (rtx x)
11648 {
11649 rtx inner;
11650
11651 switch (GET_CODE (x))
11652 {
11653 case CONST:
11654 inner = XEXP (x, 0);
11655 if (GET_CODE (inner) == PLUS
11656 && CONST_INT_P (XEXP (inner, 1)))
11657 inner = XEXP (inner, 0);
11658
11659 /* Only some unspecs are valid as "constants". */
11660 if (GET_CODE (inner) == UNSPEC)
11661 switch (XINT (inner, 1))
11662 {
11663 case UNSPEC_GOT:
11664 case UNSPEC_GOTOFF:
11665 case UNSPEC_PLTOFF:
11666 return TARGET_64BIT;
11667 case UNSPEC_TPOFF:
11668 x = XVECEXP (inner, 0, 0);
11669 return (GET_CODE (x) == SYMBOL_REF
11670 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11671 case UNSPEC_MACHOPIC_OFFSET:
11672 return legitimate_pic_address_disp_p (x);
11673 default:
11674 return false;
11675 }
11676 /* FALLTHRU */
11677
11678 case SYMBOL_REF:
11679 case LABEL_REF:
11680 return legitimate_pic_address_disp_p (x);
11681
11682 default:
11683 return true;
11684 }
11685 }
11686
11687 /* Determine if a given CONST RTX is a valid memory displacement
11688 in PIC mode. */
11689
11690 bool
11691 legitimate_pic_address_disp_p (rtx disp)
11692 {
11693 bool saw_plus;
11694
11695 /* In 64bit mode we can allow direct addresses of symbols and labels
11696 when they are not dynamic symbols. */
11697 if (TARGET_64BIT)
11698 {
11699 rtx op0 = disp, op1;
11700
11701 switch (GET_CODE (disp))
11702 {
11703 case LABEL_REF:
11704 return true;
11705
11706 case CONST:
11707 if (GET_CODE (XEXP (disp, 0)) != PLUS)
11708 break;
11709 op0 = XEXP (XEXP (disp, 0), 0);
11710 op1 = XEXP (XEXP (disp, 0), 1);
11711 if (!CONST_INT_P (op1)
11712 || INTVAL (op1) >= 16*1024*1024
11713 || INTVAL (op1) < -16*1024*1024)
11714 break;
11715 if (GET_CODE (op0) == LABEL_REF)
11716 return true;
11717 if (GET_CODE (op0) != SYMBOL_REF)
11718 break;
11719 /* FALLTHRU */
11720
11721 case SYMBOL_REF:
11722 /* TLS references should always be enclosed in UNSPEC. */
11723 if (SYMBOL_REF_TLS_MODEL (op0))
11724 return false;
11725 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
11726 && ix86_cmodel != CM_LARGE_PIC)
11727 return true;
11728 break;
11729
11730 default:
11731 break;
11732 }
11733 }
11734 if (GET_CODE (disp) != CONST)
11735 return false;
11736 disp = XEXP (disp, 0);
11737
11738 if (TARGET_64BIT)
11739 {
11740 /* It is unsafe to allow PLUS expressions. This limits the allowed distance
11741 of GOT tables. We should not need these anyway. */
11742 if (GET_CODE (disp) != UNSPEC
11743 || (XINT (disp, 1) != UNSPEC_GOTPCREL
11744 && XINT (disp, 1) != UNSPEC_GOTOFF
11745 && XINT (disp, 1) != UNSPEC_PCREL
11746 && XINT (disp, 1) != UNSPEC_PLTOFF))
11747 return false;
11748
11749 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
11750 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
11751 return false;
11752 return true;
11753 }
11754
11755 saw_plus = false;
11756 if (GET_CODE (disp) == PLUS)
11757 {
11758 if (!CONST_INT_P (XEXP (disp, 1)))
11759 return false;
11760 disp = XEXP (disp, 0);
11761 saw_plus = true;
11762 }
11763
11764 if (TARGET_MACHO && darwin_local_data_pic (disp))
11765 return true;
11766
11767 if (GET_CODE (disp) != UNSPEC)
11768 return false;
11769
11770 switch (XINT (disp, 1))
11771 {
11772 case UNSPEC_GOT:
11773 if (saw_plus)
11774 return false;
11775 /* We need to check for both symbols and labels because VxWorks loads
11776 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
11777 details. */
11778 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
11779 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
11780 case UNSPEC_GOTOFF:
11781 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
11782 While the ABI also specifies a 32bit relocation, we don't produce it in
11783 the small PIC model at all. */
11784 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
11785 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
11786 && !TARGET_64BIT)
11787 return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
11788 return false;
11789 case UNSPEC_GOTTPOFF:
11790 case UNSPEC_GOTNTPOFF:
11791 case UNSPEC_INDNTPOFF:
11792 if (saw_plus)
11793 return false;
11794 disp = XVECEXP (disp, 0, 0);
11795 return (GET_CODE (disp) == SYMBOL_REF
11796 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
11797 case UNSPEC_NTPOFF:
11798 disp = XVECEXP (disp, 0, 0);
11799 return (GET_CODE (disp) == SYMBOL_REF
11800 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
11801 case UNSPEC_DTPOFF:
11802 disp = XVECEXP (disp, 0, 0);
11803 return (GET_CODE (disp) == SYMBOL_REF
11804 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
11805 }
11806
11807 return false;
11808 }
11809
11810 /* Recognizes RTL expressions that are valid memory addresses for an
11811 instruction. The MODE argument is the machine mode for the MEM
11812 expression that wants to use this address.
11813
11814 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
11815 convert common non-canonical forms to canonical form so that they will
11816 be recognized. */
11817
11818 static bool
11819 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
11820 rtx addr, bool strict)
11821 {
11822 struct ix86_address parts;
11823 rtx base, index, disp;
11824 HOST_WIDE_INT scale;
11825
11826 if (ix86_decompose_address (addr, &parts) <= 0)
11827 /* Decomposition failed. */
11828 return false;
11829
11830 base = parts.base;
11831 index = parts.index;
11832 disp = parts.disp;
11833 scale = parts.scale;
11834
11835 /* Validate base register. */
11836 if (base)
11837 {
11838 rtx reg;
11839
11840 if (REG_P (base))
11841 reg = base;
11842 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
11843 reg = SUBREG_REG (base);
11844 else
11845 /* Base is not a register. */
11846 return false;
11847
11848 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
11849 return false;
11850
11851 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
11852 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
11853 /* Base is not valid. */
11854 return false;
11855 }
11856
11857 /* Validate index register. */
11858 if (index)
11859 {
11860 rtx reg;
11861
11862 if (REG_P (index))
11863 reg = index;
11864 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
11865 reg = SUBREG_REG (index);
11866 else
11867 /* Index is not a register. */
11868 return false;
11869
11870 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
11871 return false;
11872
11873 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
11874 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
11875 /* Index is not valid. */
11876 return false;
11877 }
11878
11879 /* Index and base should have the same mode. */
11880 if (base && index
11881 && GET_MODE (base) != GET_MODE (index))
11882 return false;
11883
11884 /* Validate scale factor. */
11885 if (scale != 1)
11886 {
11887 if (!index)
11888 /* Scale without index. */
11889 return false;
11890
11891 if (scale != 2 && scale != 4 && scale != 8)
11892 /* Scale is not a valid multiplier. */
11893 return false;
11894 }
11895
11896 /* Validate displacement. */
11897 if (disp)
11898 {
11899 if (GET_CODE (disp) == CONST
11900 && GET_CODE (XEXP (disp, 0)) == UNSPEC
11901 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
11902 switch (XINT (XEXP (disp, 0), 1))
11903 {
11904 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
11905 used. While the ABI also specifies 32bit relocations, we don't produce
11906 them at all and use IP-relative addressing instead. */
11907 case UNSPEC_GOT:
11908 case UNSPEC_GOTOFF:
11909 gcc_assert (flag_pic);
11910 if (!TARGET_64BIT)
11911 goto is_legitimate_pic;
11912
11913 /* 64bit address unspec. */
11914 return false;
11915
11916 case UNSPEC_GOTPCREL:
11917 case UNSPEC_PCREL:
11918 gcc_assert (flag_pic);
11919 goto is_legitimate_pic;
11920
11921 case UNSPEC_GOTTPOFF:
11922 case UNSPEC_GOTNTPOFF:
11923 case UNSPEC_INDNTPOFF:
11924 case UNSPEC_NTPOFF:
11925 case UNSPEC_DTPOFF:
11926 break;
11927
11928 case UNSPEC_STACK_CHECK:
11929 gcc_assert (flag_split_stack);
11930 break;
11931
11932 default:
11933 /* Invalid address unspec. */
11934 return false;
11935 }
11936
11937 else if (SYMBOLIC_CONST (disp)
11938 && (flag_pic
11939 || (TARGET_MACHO
11940 #if TARGET_MACHO
11941 && MACHOPIC_INDIRECT
11942 && !machopic_operand_p (disp)
11943 #endif
11944 )))
11945 {
11946
11947 is_legitimate_pic:
11948 if (TARGET_64BIT && (index || base))
11949 {
11950 /* foo@dtpoff(%rX) is ok. */
11951 if (GET_CODE (disp) != CONST
11952 || GET_CODE (XEXP (disp, 0)) != PLUS
11953 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
11954 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
11955 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
11956 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
11957 /* Non-constant pic memory reference. */
11958 return false;
11959 }
11960 else if ((!TARGET_MACHO || flag_pic)
11961 && ! legitimate_pic_address_disp_p (disp))
11962 /* Displacement is an invalid pic construct. */
11963 return false;
11964 #if TARGET_MACHO
11965 else if (MACHO_DYNAMIC_NO_PIC_P
11966 && !ix86_legitimate_constant_p (Pmode, disp))
11967 /* Displacement must be referenced via a non_lazy_pointer. */
11968 return false;
11969 #endif
11970
11971 /* This code used to verify that a symbolic pic displacement
11972 includes the pic_offset_table_rtx register.
11973
11974 While this is a good idea, unfortunately these constructs may
11975 be created by the "adds using lea" optimization for incorrect
11976 code like:
11977
11978 int a;
11979 int foo(int i)
11980 {
11981 return *(&a+i);
11982 }
11983
11984 This code is nonsensical, but results in addressing the
11985 GOT table with a pic_offset_table_rtx base. We can't
11986 just refuse it easily, since it gets matched by the
11987 "addsi3" pattern, which later gets split to lea in
11988 case the output register differs from the input. While this
11989 could be handled by a separate addsi pattern for this case
11990 that never results in lea, disabling this test seems to be
11991 the easier and correct fix for the crash. */
11992 }
11993 else if (GET_CODE (disp) != LABEL_REF
11994 && !CONST_INT_P (disp)
11995 && (GET_CODE (disp) != CONST
11996 || !ix86_legitimate_constant_p (Pmode, disp))
11997 && (GET_CODE (disp) != SYMBOL_REF
11998 || !ix86_legitimate_constant_p (Pmode, disp)))
11999 /* Displacement is not constant. */
12000 return false;
12001 else if (TARGET_64BIT
12002 && !x86_64_immediate_operand (disp, VOIDmode))
12003 /* Displacement is out of range. */
12004 return false;
12005 }
12006
12007 /* Everything looks valid. */
12008 return true;
12009 }
12010
12011 /* Determine if a given RTX is a valid constant address. */
12012
12013 bool
12014 constant_address_p (rtx x)
12015 {
12016 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12017 }
12018 \f
12019 /* Return a unique alias set for the GOT. */
12020
12021 static alias_set_type
12022 ix86_GOT_alias_set (void)
12023 {
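/* Allocate the alias set lazily, the first time it is needed. */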
12024 static alias_set_type set = -1;
12025 if (set == -1)
12026 set = new_alias_set ();
12027 return set;
12028 }
12029
12030 /* Return a legitimate reference for ORIG (an address) using the
12031 register REG. If REG is 0, a new pseudo is generated.
12032
12033 There are two types of references that must be handled:
12034
12035 1. Global data references must load the address from the GOT, via
12036 the PIC reg. An insn is emitted to do this load, and the reg is
12037 returned.
12038
12039 2. Static data references, constant pool addresses, and code labels
12040 compute the address as an offset from the GOT, whose base is in
12041 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12042 differentiate them from global data objects. The returned
12043 address is the PIC reg + an unspec constant.
12044
12045 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12046 reg also appears in the address. */
12047
12048 static rtx
12049 legitimize_pic_address (rtx orig, rtx reg)
12050 {
12051 rtx addr = orig;
12052 rtx new_rtx = orig;
12053 rtx base;
12054
12055 #if TARGET_MACHO
12056 if (TARGET_MACHO && !TARGET_64BIT)
12057 {
12058 if (reg == 0)
12059 reg = gen_reg_rtx (Pmode);
12060 /* Use the generic Mach-O PIC machinery. */
12061 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12062 }
12063 #endif
12064
12065 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12066 new_rtx = addr;
12067 else if (TARGET_64BIT
12068 && ix86_cmodel != CM_SMALL_PIC
12069 && gotoff_operand (addr, Pmode))
12070 {
12071 rtx tmpreg;
12072 /* This symbol may be referenced via a displacement from the PIC
12073 base address (@GOTOFF). */
12074
12075 if (reload_in_progress)
12076 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12077 if (GET_CODE (addr) == CONST)
12078 addr = XEXP (addr, 0);
12079 if (GET_CODE (addr) == PLUS)
12080 {
12081 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12082 UNSPEC_GOTOFF);
12083 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12084 }
12085 else
12086 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12087 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12088 if (!reg)
12089 tmpreg = gen_reg_rtx (Pmode);
12090 else
12091 tmpreg = reg;
12092 emit_move_insn (tmpreg, new_rtx);
12093
12094 if (reg != 0)
12095 {
12096 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12097 tmpreg, 1, OPTAB_DIRECT);
12098 new_rtx = reg;
12099 }
12100 else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12101 }
12102 else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
12103 {
12104 /* This symbol may be referenced via a displacement from the PIC
12105 base address (@GOTOFF). */
12106
12107 if (reload_in_progress)
12108 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12109 if (GET_CODE (addr) == CONST)
12110 addr = XEXP (addr, 0);
12111 if (GET_CODE (addr) == PLUS)
12112 {
12113 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12114 UNSPEC_GOTOFF);
12115 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12116 }
12117 else
12118 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12119 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12120 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12121
12122 if (reg != 0)
12123 {
12124 emit_move_insn (reg, new_rtx);
12125 new_rtx = reg;
12126 }
12127 }
12128 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12129 /* We can't use @GOTOFF for text labels on VxWorks;
12130 see gotoff_operand. */
12131 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12132 {
12133 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12134 {
12135 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
12136 return legitimize_dllimport_symbol (addr, true);
12137 if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
12138 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
12139 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
12140 {
12141 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
12142 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
12143 }
12144 }
12145
12146 /* For x64 PE-COFF there is no GOT table, so we use the address
12147 directly. */
12148 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12149 {
12150 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12151 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12152
12153 if (reg == 0)
12154 reg = gen_reg_rtx (Pmode);
12155 emit_move_insn (reg, new_rtx);
12156 new_rtx = reg;
12157 }
12158 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12159 {
12160 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12161 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12162 new_rtx = gen_const_mem (Pmode, new_rtx);
12163 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12164
12165 if (reg == 0)
12166 reg = gen_reg_rtx (Pmode);
12167 /* Use gen_movsi directly; otherwise the address is loaded
12168 into a register for CSE. We don't want to CSE these addresses;
12169 instead we CSE addresses from the GOT table, so skip this. */
12170 emit_insn (gen_movsi (reg, new_rtx));
12171 new_rtx = reg;
12172 }
12173 else
12174 {
12175 /* This symbol must be referenced via a load from the
12176 Global Offset Table (@GOT). */
12177
12178 if (reload_in_progress)
12179 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12180 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12181 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12182 if (TARGET_64BIT)
12183 new_rtx = force_reg (Pmode, new_rtx);
12184 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12185 new_rtx = gen_const_mem (Pmode, new_rtx);
12186 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12187
12188 if (reg == 0)
12189 reg = gen_reg_rtx (Pmode);
12190 emit_move_insn (reg, new_rtx);
12191 new_rtx = reg;
12192 }
12193 }
12194 else
12195 {
12196 if (CONST_INT_P (addr)
12197 && !x86_64_immediate_operand (addr, VOIDmode))
12198 {
12199 if (reg)
12200 {
12201 emit_move_insn (reg, addr);
12202 new_rtx = reg;
12203 }
12204 else
12205 new_rtx = force_reg (Pmode, addr);
12206 }
12207 else if (GET_CODE (addr) == CONST)
12208 {
12209 addr = XEXP (addr, 0);
12210
12211 /* We must match the stuff we generated before. Assume the only
12212 unspecs that can get here are ours; not that we could do
12213 anything with them anyway.... */
12214 if (GET_CODE (addr) == UNSPEC
12215 || (GET_CODE (addr) == PLUS
12216 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12217 return orig;
12218 gcc_assert (GET_CODE (addr) == PLUS);
12219 }
12220 if (GET_CODE (addr) == PLUS)
12221 {
12222 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12223
12224 /* Check first to see if this is a constant offset from a @GOTOFF
12225 symbol reference. */
12226 if (gotoff_operand (op0, Pmode)
12227 && CONST_INT_P (op1))
12228 {
12229 if (!TARGET_64BIT)
12230 {
12231 if (reload_in_progress)
12232 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12233 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12234 UNSPEC_GOTOFF);
12235 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12236 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12237 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12238
12239 if (reg != 0)
12240 {
12241 emit_move_insn (reg, new_rtx);
12242 new_rtx = reg;
12243 }
12244 }
12245 else
12246 {
12247 if (INTVAL (op1) < -16*1024*1024
12248 || INTVAL (op1) >= 16*1024*1024)
12249 {
12250 if (!x86_64_immediate_operand (op1, Pmode))
12251 op1 = force_reg (Pmode, op1);
12252 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12253 }
12254 }
12255 }
12256 else
12257 {
12258 base = legitimize_pic_address (XEXP (addr, 0), reg);
12259 new_rtx = legitimize_pic_address (XEXP (addr, 1),
12260 base == reg ? NULL_RTX : reg);
12261
12262 if (CONST_INT_P (new_rtx))
12263 new_rtx = plus_constant (base, INTVAL (new_rtx));
12264 else
12265 {
12266 if (GET_CODE (new_rtx) == PLUS && CONSTANT_P (XEXP (new_rtx, 1)))
12267 {
12268 base = gen_rtx_PLUS (Pmode, base, XEXP (new_rtx, 0));
12269 new_rtx = XEXP (new_rtx, 1);
12270 }
12271 new_rtx = gen_rtx_PLUS (Pmode, base, new_rtx);
12272 }
12273 }
12274 }
12275 }
12276 return new_rtx;
12277 }
12278 \f
12279 /* Load the thread pointer. If TO_REG is true, force it into a register. */
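/* Illustrative note, assuming a GNU/Linux target: the UNSPEC_TP access
   below typically corresponds to "%fs:0" in 64-bit mode or "%gs:0" in
   32-bit mode, so the result can be used directly in an address or,
   with TO_REG, as a plain register.  */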
12280
12281 static rtx
12282 get_thread_pointer (bool to_reg)
12283 {
12284 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12285
12286 if (GET_MODE (tp) != Pmode)
12287 tp = convert_to_mode (Pmode, tp, 1);
12288
12289 if (to_reg)
12290 tp = copy_addr_to_reg (tp);
12291
12292 return tp;
12293 }
12294
12295 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12296
12297 static GTY(()) rtx ix86_tls_symbol;
12298
12299 static rtx
12300 ix86_tls_get_addr (void)
12301 {
12302 if (!ix86_tls_symbol)
12303 {
12304 const char *sym
12305 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12306 ? "___tls_get_addr" : "__tls_get_addr");
12307
12308 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12309 }
12310
12311 return ix86_tls_symbol;
12312 }
12313
12314 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12315
12316 static GTY(()) rtx ix86_tls_module_base_symbol;
12317
12318 rtx
12319 ix86_tls_module_base (void)
12320 {
12321 if (!ix86_tls_module_base_symbol)
12322 {
12323 ix86_tls_module_base_symbol
12324 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12325
12326 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12327 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12328 }
12329
12330 return ix86_tls_module_base_symbol;
12331 }
12332
12333 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12334 false if we expect this to be used for a memory address and true if
12335 we expect to load the address into a register. */
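/* Rough sketch of the kinds of sequences produced below, assuming a
   GNU/Linux x86-64 target (exact forms depend on the TLS dialect and on
   linker relaxations):

     global-dynamic:  lea  x@tlsgd(%rip), %rdi
                      call __tls_get_addr
     initial-exec:    mov  x@gottpoff(%rip), %reg
                      mov  %fs:(%reg), ...
     local-exec:      mov  %fs:x@tpoff, ...  */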
12336
12337 static rtx
12338 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12339 {
12340 rtx dest, base, off;
12341 rtx pic = NULL_RTX, tp = NULL_RTX;
12342 int type;
12343
12344 switch (model)
12345 {
12346 case TLS_MODEL_GLOBAL_DYNAMIC:
12347 dest = gen_reg_rtx (Pmode);
12348
12349 if (!TARGET_64BIT)
12350 {
12351 if (flag_pic)
12352 pic = pic_offset_table_rtx;
12353 else
12354 {
12355 pic = gen_reg_rtx (Pmode);
12356 emit_insn (gen_set_got (pic));
12357 }
12358 }
12359
12360 if (TARGET_GNU2_TLS)
12361 {
12362 if (TARGET_64BIT)
12363 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
12364 else
12365 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
12366
12367 tp = get_thread_pointer (true);
12368 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
12369
12370 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12371 }
12372 else
12373 {
12374 rtx caddr = ix86_tls_get_addr ();
12375
12376 if (TARGET_64BIT)
12377 {
12378 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns;
12379
12380 start_sequence ();
12381 emit_call_insn (gen_tls_global_dynamic_64 (rax, x, caddr));
12382 insns = get_insns ();
12383 end_sequence ();
12384
12385 RTL_CONST_CALL_P (insns) = 1;
12386 emit_libcall_block (insns, dest, rax, x);
12387 }
12388 else
12389 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
12390 }
12391 break;
12392
12393 case TLS_MODEL_LOCAL_DYNAMIC:
12394 base = gen_reg_rtx (Pmode);
12395
12396 if (!TARGET_64BIT)
12397 {
12398 if (flag_pic)
12399 pic = pic_offset_table_rtx;
12400 else
12401 {
12402 pic = gen_reg_rtx (Pmode);
12403 emit_insn (gen_set_got (pic));
12404 }
12405 }
12406
12407 if (TARGET_GNU2_TLS)
12408 {
12409 rtx tmp = ix86_tls_module_base ();
12410
12411 if (TARGET_64BIT)
12412 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
12413 else
12414 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
12415
12416 tp = get_thread_pointer (true);
12417 set_unique_reg_note (get_last_insn (), REG_EQUAL,
12418 gen_rtx_MINUS (Pmode, tmp, tp));
12419 }
12420 else
12421 {
12422 rtx caddr = ix86_tls_get_addr ();
12423
12424 if (TARGET_64BIT)
12425 {
12426 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns, eqv;
12427
12428 start_sequence ();
12429 emit_call_insn (gen_tls_local_dynamic_base_64 (rax, caddr));
12430 insns = get_insns ();
12431 end_sequence ();
12432
12433 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
12434 share the LD_BASE result with other LD model accesses. */
12435 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
12436 UNSPEC_TLS_LD_BASE);
12437
12438 RTL_CONST_CALL_P (insns) = 1;
12439 emit_libcall_block (insns, base, rax, eqv);
12440 }
12441 else
12442 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
12443 }
12444
12445 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
12446 off = gen_rtx_CONST (Pmode, off);
12447
12448 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
12449
12450 if (TARGET_GNU2_TLS)
12451 {
12452 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
12453
12454 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12455 }
12456 break;
12457
12458 case TLS_MODEL_INITIAL_EXEC:
12459 if (TARGET_64BIT)
12460 {
12461 if (TARGET_SUN_TLS)
12462 {
12463 /* The Sun linker took the AMD64 TLS spec literally
12464 and can only handle %rax as the destination of the
12465 initial-exec code sequence. */
12466
12467 dest = gen_reg_rtx (Pmode);
12468 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
12469 return dest;
12470 }
12471
12472 pic = NULL;
12473 type = UNSPEC_GOTNTPOFF;
12474 }
12475 else if (flag_pic)
12476 {
12477 if (reload_in_progress)
12478 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12479 pic = pic_offset_table_rtx;
12480 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
12481 }
12482 else if (!TARGET_ANY_GNU_TLS)
12483 {
12484 pic = gen_reg_rtx (Pmode);
12485 emit_insn (gen_set_got (pic));
12486 type = UNSPEC_GOTTPOFF;
12487 }
12488 else
12489 {
12490 pic = NULL;
12491 type = UNSPEC_INDNTPOFF;
12492 }
12493
12494 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), type);
12495 off = gen_rtx_CONST (Pmode, off);
12496 if (pic)
12497 off = gen_rtx_PLUS (Pmode, pic, off);
12498 off = gen_const_mem (Pmode, off);
12499 set_mem_alias_set (off, ix86_GOT_alias_set ());
12500
12501 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12502 {
12503 base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12504 off = force_reg (Pmode, off);
12505 return gen_rtx_PLUS (Pmode, base, off);
12506 }
12507 else
12508 {
12509 base = get_thread_pointer (true);
12510 dest = gen_reg_rtx (Pmode);
12511 emit_insn (gen_subsi3 (dest, base, off));
12512 }
12513 break;
12514
12515 case TLS_MODEL_LOCAL_EXEC:
12516 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
12517 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12518 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
12519 off = gen_rtx_CONST (Pmode, off);
12520
12521 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12522 {
12523 base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12524 return gen_rtx_PLUS (Pmode, base, off);
12525 }
12526 else
12527 {
12528 base = get_thread_pointer (true);
12529 dest = gen_reg_rtx (Pmode);
12530 emit_insn (gen_subsi3 (dest, base, off));
12531 }
12532 break;
12533
12534 default:
12535 gcc_unreachable ();
12536 }
12537
12538 return dest;
12539 }
12540
12541 /* Create or return the unique __imp_DECL dllimport symbol corresponding
12542 to symbol DECL. */
12543
12544 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
12545 htab_t dllimport_map;
12546
12547 static tree
12548 get_dllimport_decl (tree decl)
12549 {
12550 struct tree_map *h, in;
12551 void **loc;
12552 const char *name;
12553 const char *prefix;
12554 size_t namelen, prefixlen;
12555 char *imp_name;
12556 tree to;
12557 rtx rtl;
12558
12559 if (!dllimport_map)
12560 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
12561
12562 in.hash = htab_hash_pointer (decl);
12563 in.base.from = decl;
12564 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
12565 h = (struct tree_map *) *loc;
12566 if (h)
12567 return h->to;
12568
12569 *loc = h = ggc_alloc_tree_map ();
12570 h->hash = in.hash;
12571 h->base.from = decl;
12572 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
12573 VAR_DECL, NULL, ptr_type_node);
12574 DECL_ARTIFICIAL (to) = 1;
12575 DECL_IGNORED_P (to) = 1;
12576 DECL_EXTERNAL (to) = 1;
12577 TREE_READONLY (to) = 1;
12578
12579 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
12580 name = targetm.strip_name_encoding (name);
12581 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
12582 ? "*__imp_" : "*__imp__";
12583 namelen = strlen (name);
12584 prefixlen = strlen (prefix);
12585 imp_name = (char *) alloca (namelen + prefixlen + 1);
12586 memcpy (imp_name, prefix, prefixlen);
12587 memcpy (imp_name + prefixlen, name, namelen + 1);
12588
12589 name = ggc_alloc_string (imp_name, namelen + prefixlen);
12590 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
12591 SET_SYMBOL_REF_DECL (rtl, to);
12592 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
12593
12594 rtl = gen_const_mem (Pmode, rtl);
12595 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
12596
12597 SET_DECL_RTL (to, rtl);
12598 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
12599
12600 return to;
12601 }
12602
12603 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
12604 true if we require the result to be a register. */
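/* Illustrative example: a reference to a dllimport'ed symbol "foo"
   becomes a load through its import-table slot, roughly
   (mem (symbol_ref "*__imp_foo")), which get_dllimport_decl above wraps
   in an artificial VAR_DECL.  */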
12605
12606 static rtx
12607 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
12608 {
12609 tree imp_decl;
12610 rtx x;
12611
12612 gcc_assert (SYMBOL_REF_DECL (symbol));
12613 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
12614
12615 x = DECL_RTL (imp_decl);
12616 if (want_reg)
12617 x = force_reg (Pmode, x);
12618 return x;
12619 }
12620
12621 /* Try machine-dependent ways of modifying an illegitimate address
12622 to be legitimate. If we find one, return the new, valid address.
12623 This macro is used in only one place: `memory_address' in explow.c.
12624
12625 OLDX is the address as it was before break_out_memory_refs was called.
12626 In some cases it is useful to look at this to decide what needs to be done.
12627
12628 It is always safe for this macro to do nothing. It exists to recognize
12629 opportunities to optimize the output.
12630
12631 For the 80386, we handle X+REG by loading X into a register R and
12632 using R+REG. R will go in a general reg and indexing will be used.
12633 However, if REG is a broken-out memory address or multiplication,
12634 nothing needs to be done because REG can certainly go in a general reg.
12635
12636 When -fpic is used, special handling is needed for symbolic references.
12637 See comments by legitimize_pic_address in i386.c for details. */
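/* Example of the canonicalizations done below (illustrative only): an
   address such as (plus (ashift reg 2) reg2) is rewritten as
   (plus (mult reg 4) reg2), which matches the base + index*scale form
   accepted by ix86_legitimate_address_p.  */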
12638
12639 static rtx
12640 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
12641 enum machine_mode mode)
12642 {
12643 int changed = 0;
12644 unsigned log;
12645
12646 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
12647 if (log)
12648 return legitimize_tls_address (x, (enum tls_model) log, false);
12649 if (GET_CODE (x) == CONST
12650 && GET_CODE (XEXP (x, 0)) == PLUS
12651 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12652 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
12653 {
12654 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
12655 (enum tls_model) log, false);
12656 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
12657 }
12658
12659 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12660 {
12661 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
12662 return legitimize_dllimport_symbol (x, true);
12663 if (GET_CODE (x) == CONST
12664 && GET_CODE (XEXP (x, 0)) == PLUS
12665 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12666 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
12667 {
12668 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
12669 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
12670 }
12671 }
12672
12673 if (flag_pic && SYMBOLIC_CONST (x))
12674 return legitimize_pic_address (x, 0);
12675
12676 #if TARGET_MACHO
12677 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
12678 return machopic_indirect_data_reference (x, 0);
12679 #endif
12680
12681 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
12682 if (GET_CODE (x) == ASHIFT
12683 && CONST_INT_P (XEXP (x, 1))
12684 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
12685 {
12686 changed = 1;
12687 log = INTVAL (XEXP (x, 1));
12688 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
12689 GEN_INT (1 << log));
12690 }
12691
12692 if (GET_CODE (x) == PLUS)
12693 {
12694 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
12695
12696 if (GET_CODE (XEXP (x, 0)) == ASHIFT
12697 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
12698 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
12699 {
12700 changed = 1;
12701 log = INTVAL (XEXP (XEXP (x, 0), 1));
12702 XEXP (x, 0) = gen_rtx_MULT (Pmode,
12703 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
12704 GEN_INT (1 << log));
12705 }
12706
12707 if (GET_CODE (XEXP (x, 1)) == ASHIFT
12708 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
12709 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
12710 {
12711 changed = 1;
12712 log = INTVAL (XEXP (XEXP (x, 1), 1));
12713 XEXP (x, 1) = gen_rtx_MULT (Pmode,
12714 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
12715 GEN_INT (1 << log));
12716 }
12717
12718 /* Put multiply first if it isn't already. */
12719 if (GET_CODE (XEXP (x, 1)) == MULT)
12720 {
12721 rtx tmp = XEXP (x, 0);
12722 XEXP (x, 0) = XEXP (x, 1);
12723 XEXP (x, 1) = tmp;
12724 changed = 1;
12725 }
12726
12727 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
12728 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
12729 created by virtual register instantiation, register elimination, and
12730 similar optimizations. */
12731 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
12732 {
12733 changed = 1;
12734 x = gen_rtx_PLUS (Pmode,
12735 gen_rtx_PLUS (Pmode, XEXP (x, 0),
12736 XEXP (XEXP (x, 1), 0)),
12737 XEXP (XEXP (x, 1), 1));
12738 }
12739
12740 /* Canonicalize
12741 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
12742 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
12743 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
12744 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
12745 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
12746 && CONSTANT_P (XEXP (x, 1)))
12747 {
12748 rtx constant;
12749 rtx other = NULL_RTX;
12750
12751 if (CONST_INT_P (XEXP (x, 1)))
12752 {
12753 constant = XEXP (x, 1);
12754 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
12755 }
12756 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
12757 {
12758 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
12759 other = XEXP (x, 1);
12760 }
12761 else
12762 constant = 0;
12763
12764 if (constant)
12765 {
12766 changed = 1;
12767 x = gen_rtx_PLUS (Pmode,
12768 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
12769 XEXP (XEXP (XEXP (x, 0), 1), 0)),
12770 plus_constant (other, INTVAL (constant)));
12771 }
12772 }
12773
12774 if (changed && ix86_legitimate_address_p (mode, x, false))
12775 return x;
12776
12777 if (GET_CODE (XEXP (x, 0)) == MULT)
12778 {
12779 changed = 1;
12780 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
12781 }
12782
12783 if (GET_CODE (XEXP (x, 1)) == MULT)
12784 {
12785 changed = 1;
12786 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
12787 }
12788
12789 if (changed
12790 && REG_P (XEXP (x, 1))
12791 && REG_P (XEXP (x, 0)))
12792 return x;
12793
12794 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
12795 {
12796 changed = 1;
12797 x = legitimize_pic_address (x, 0);
12798 }
12799
12800 if (changed && ix86_legitimate_address_p (mode, x, false))
12801 return x;
12802
12803 if (REG_P (XEXP (x, 0)))
12804 {
12805 rtx temp = gen_reg_rtx (Pmode);
12806 rtx val = force_operand (XEXP (x, 1), temp);
12807 if (val != temp)
12808 {
12809 if (GET_MODE (val) != Pmode)
12810 val = convert_to_mode (Pmode, val, 1);
12811 emit_move_insn (temp, val);
12812 }
12813
12814 XEXP (x, 1) = temp;
12815 return x;
12816 }
12817
12818 else if (REG_P (XEXP (x, 1)))
12819 {
12820 rtx temp = gen_reg_rtx (Pmode);
12821 rtx val = force_operand (XEXP (x, 0), temp);
12822 if (val != temp)
12823 {
12824 if (GET_MODE (val) != Pmode)
12825 val = convert_to_mode (Pmode, val, 1);
12826 emit_move_insn (temp, val);
12827 }
12828
12829 XEXP (x, 0) = temp;
12830 return x;
12831 }
12832 }
12833
12834 return x;
12835 }
12836 \f
12837 /* Print an integer constant expression in assembler syntax. Addition
12838 and subtraction are the only arithmetic that may appear in these
12839 expressions. FILE is the stdio stream to write to, X is the rtx, and
12840 CODE is the operand print code from the output string. */
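/* For instance (illustrative): (const (unspec [foo] UNSPEC_GOTOFF)) is
   printed as "foo@GOTOFF", and (unspec [bar] UNSPEC_GOTPCREL) as
   "bar@GOTPCREL(%rip)" in AT&T syntax, per the UNSPEC case below.  */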
12841
12842 static void
12843 output_pic_addr_const (FILE *file, rtx x, int code)
12844 {
12845 char buf[256];
12846
12847 switch (GET_CODE (x))
12848 {
12849 case PC:
12850 gcc_assert (flag_pic);
12851 putc ('.', file);
12852 break;
12853
12854 case SYMBOL_REF:
12855 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
12856 output_addr_const (file, x);
12857 else
12858 {
12859 const char *name = XSTR (x, 0);
12860
12861 /* Mark the decl as referenced so that cgraph will
12862 output the function. */
12863 if (SYMBOL_REF_DECL (x))
12864 mark_decl_referenced (SYMBOL_REF_DECL (x));
12865
12866 #if TARGET_MACHO
12867 if (MACHOPIC_INDIRECT
12868 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
12869 name = machopic_indirection_name (x, /*stub_p=*/true);
12870 #endif
12871 assemble_name (file, name);
12872 }
12873 if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12874 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
12875 fputs ("@PLT", file);
12876 break;
12877
12878 case LABEL_REF:
12879 x = XEXP (x, 0);
12880 /* FALLTHRU */
12881 case CODE_LABEL:
12882 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
12883 assemble_name (asm_out_file, buf);
12884 break;
12885
12886 case CONST_INT:
12887 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
12888 break;
12889
12890 case CONST:
12891 /* This used to output parentheses around the expression,
12892 but that does not work on the 386 (either ATT or BSD assembler). */
12893 output_pic_addr_const (file, XEXP (x, 0), code);
12894 break;
12895
12896 case CONST_DOUBLE:
12897 if (GET_MODE (x) == VOIDmode)
12898 {
12899 /* We can use %d if the number is <32 bits and positive. */
12900 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
12901 fprintf (file, "0x%lx%08lx",
12902 (unsigned long) CONST_DOUBLE_HIGH (x),
12903 (unsigned long) CONST_DOUBLE_LOW (x));
12904 else
12905 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
12906 }
12907 else
12908 /* We can't handle floating point constants;
12909 TARGET_PRINT_OPERAND must handle them. */
12910 output_operand_lossage ("floating constant misused");
12911 break;
12912
12913 case PLUS:
12914 /* Some assemblers need integer constants to appear first. */
12915 if (CONST_INT_P (XEXP (x, 0)))
12916 {
12917 output_pic_addr_const (file, XEXP (x, 0), code);
12918 putc ('+', file);
12919 output_pic_addr_const (file, XEXP (x, 1), code);
12920 }
12921 else
12922 {
12923 gcc_assert (CONST_INT_P (XEXP (x, 1)));
12924 output_pic_addr_const (file, XEXP (x, 1), code);
12925 putc ('+', file);
12926 output_pic_addr_const (file, XEXP (x, 0), code);
12927 }
12928 break;
12929
12930 case MINUS:
12931 if (!TARGET_MACHO)
12932 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
12933 output_pic_addr_const (file, XEXP (x, 0), code);
12934 putc ('-', file);
12935 output_pic_addr_const (file, XEXP (x, 1), code);
12936 if (!TARGET_MACHO)
12937 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
12938 break;
12939
12940 case UNSPEC:
12941 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
12942 {
12943 bool f = i386_asm_output_addr_const_extra (file, x);
12944 gcc_assert (f);
12945 break;
12946 }
12947
12948 gcc_assert (XVECLEN (x, 0) == 1);
12949 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
12950 switch (XINT (x, 1))
12951 {
12952 case UNSPEC_GOT:
12953 fputs ("@GOT", file);
12954 break;
12955 case UNSPEC_GOTOFF:
12956 fputs ("@GOTOFF", file);
12957 break;
12958 case UNSPEC_PLTOFF:
12959 fputs ("@PLTOFF", file);
12960 break;
12961 case UNSPEC_PCREL:
12962 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
12963 "(%rip)" : "[rip]", file);
12964 break;
12965 case UNSPEC_GOTPCREL:
12966 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
12967 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
12968 break;
12969 case UNSPEC_GOTTPOFF:
12970 /* FIXME: This might be @TPOFF in Sun ld too. */
12971 fputs ("@gottpoff", file);
12972 break;
12973 case UNSPEC_TPOFF:
12974 fputs ("@tpoff", file);
12975 break;
12976 case UNSPEC_NTPOFF:
12977 if (TARGET_64BIT)
12978 fputs ("@tpoff", file);
12979 else
12980 fputs ("@ntpoff", file);
12981 break;
12982 case UNSPEC_DTPOFF:
12983 fputs ("@dtpoff", file);
12984 break;
12985 case UNSPEC_GOTNTPOFF:
12986 if (TARGET_64BIT)
12987 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
12988 "@gottpoff(%rip)": "@gottpoff[rip]", file);
12989 else
12990 fputs ("@gotntpoff", file);
12991 break;
12992 case UNSPEC_INDNTPOFF:
12993 fputs ("@indntpoff", file);
12994 break;
12995 #if TARGET_MACHO
12996 case UNSPEC_MACHOPIC_OFFSET:
12997 putc ('-', file);
12998 machopic_output_function_base_name (file);
12999 break;
13000 #endif
13001 default:
13002 output_operand_lossage ("invalid UNSPEC as operand");
13003 break;
13004 }
13005 break;
13006
13007 default:
13008 output_operand_lossage ("invalid expression as operand");
13009 }
13010 }
13011
13012 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13013 We need to emit DTP-relative relocations. */
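/* Illustratively, for a 4-byte request this emits something like
   ".long foo@dtpoff"; for an 8-byte request ", 0" is appended to
   zero-extend the value (the exact directive comes from ASM_LONG).  */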
13014
13015 static void ATTRIBUTE_UNUSED
13016 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13017 {
13018 fputs (ASM_LONG, file);
13019 output_addr_const (file, x);
13020 fputs ("@dtpoff", file);
13021 switch (size)
13022 {
13023 case 4:
13024 break;
13025 case 8:
13026 fputs (", 0", file);
13027 break;
13028 default:
13029 gcc_unreachable ();
13030 }
13031 }
13032
13033 /* Return true if X is a representation of the PIC register. This copes
13034 with calls from ix86_find_base_term, where the register might have
13035 been replaced by a cselib value. */
13036
13037 static bool
13038 ix86_pic_register_p (rtx x)
13039 {
13040 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13041 return (pic_offset_table_rtx
13042 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13043 else
13044 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13045 }
13046
13047 /* Helper function for ix86_delegitimize_address.
13048 Attempt to delegitimize TLS local-exec accesses. */
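/* E.g. (illustrative): a segment-based access such as %fs:x@tpoff on
   x86-64, i.e. an address whose displacement contains UNSPEC_NTPOFF, is
   rewritten back into a reference to the symbol "x", with any base,
   index and constant offset re-applied around it.  */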
13049
13050 static rtx
13051 ix86_delegitimize_tls_address (rtx orig_x)
13052 {
13053 rtx x = orig_x, unspec;
13054 struct ix86_address addr;
13055
13056 if (!TARGET_TLS_DIRECT_SEG_REFS)
13057 return orig_x;
13058 if (MEM_P (x))
13059 x = XEXP (x, 0);
13060 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13061 return orig_x;
13062 if (ix86_decompose_address (x, &addr) == 0
13063 || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13064 || addr.disp == NULL_RTX
13065 || GET_CODE (addr.disp) != CONST)
13066 return orig_x;
13067 unspec = XEXP (addr.disp, 0);
13068 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13069 unspec = XEXP (unspec, 0);
13070 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13071 return orig_x;
13072 x = XVECEXP (unspec, 0, 0);
13073 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13074 if (unspec != XEXP (addr.disp, 0))
13075 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13076 if (addr.index)
13077 {
13078 rtx idx = addr.index;
13079 if (addr.scale != 1)
13080 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13081 x = gen_rtx_PLUS (Pmode, idx, x);
13082 }
13083 if (addr.base)
13084 x = gen_rtx_PLUS (Pmode, addr.base, x);
13085 if (MEM_P (orig_x))
13086 x = replace_equiv_address_nv (orig_x, x);
13087 return x;
13088 }
13089
13090 /* In the name of slightly smaller debug output, and to cater to
13091 general assembler lossage, recognize PIC+GOTOFF and turn it back
13092 into a direct symbol reference.
13093
13094 On Darwin, this is necessary to avoid a crash, because Darwin
13095 has a different PIC label for each routine but the DWARF debugging
13096 information is not associated with any particular routine, so it's
13097 necessary to remove references to the PIC label from RTL stored by
13098 the DWARF output code. */
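/* As an illustration: on ia32 an address of the form
   (plus pic_register (const (unspec [foo] UNSPEC_GOTOFF)))
   is turned back into plain "foo", possibly with a constant offset or a
   register addend re-applied.  */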
13099
13100 static rtx
13101 ix86_delegitimize_address (rtx x)
13102 {
13103 rtx orig_x = delegitimize_mem_from_attrs (x);
13104 /* addend is NULL or some rtx if x is something+GOTOFF where
13105 something doesn't include the PIC register. */
13106 rtx addend = NULL_RTX;
13107 /* reg_addend is NULL or a multiple of some register. */
13108 rtx reg_addend = NULL_RTX;
13109 /* const_addend is NULL or a const_int. */
13110 rtx const_addend = NULL_RTX;
13111 /* This is the result, or NULL. */
13112 rtx result = NULL_RTX;
13113
13114 x = orig_x;
13115
13116 if (MEM_P (x))
13117 x = XEXP (x, 0);
13118
13119 if (TARGET_64BIT)
13120 {
13121 if (GET_CODE (x) != CONST
13122 || GET_CODE (XEXP (x, 0)) != UNSPEC
13123 || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
13124 && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
13125 || !MEM_P (orig_x))
13126 return ix86_delegitimize_tls_address (orig_x);
13127 x = XVECEXP (XEXP (x, 0), 0, 0);
13128 if (GET_MODE (orig_x) != GET_MODE (x))
13129 {
13130 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13131 GET_MODE (x), 0);
13132 if (x == NULL_RTX)
13133 return orig_x;
13134 }
13135 return x;
13136 }
13137
13138 if (GET_CODE (x) != PLUS
13139 || GET_CODE (XEXP (x, 1)) != CONST)
13140 return ix86_delegitimize_tls_address (orig_x);
13141
13142 if (ix86_pic_register_p (XEXP (x, 0)))
13143 /* %ebx + GOT/GOTOFF */
13144 ;
13145 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13146 {
13147 /* %ebx + %reg * scale + GOT/GOTOFF */
13148 reg_addend = XEXP (x, 0);
13149 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13150 reg_addend = XEXP (reg_addend, 1);
13151 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13152 reg_addend = XEXP (reg_addend, 0);
13153 else
13154 {
13155 reg_addend = NULL_RTX;
13156 addend = XEXP (x, 0);
13157 }
13158 }
13159 else
13160 addend = XEXP (x, 0);
13161
13162 x = XEXP (XEXP (x, 1), 0);
13163 if (GET_CODE (x) == PLUS
13164 && CONST_INT_P (XEXP (x, 1)))
13165 {
13166 const_addend = XEXP (x, 1);
13167 x = XEXP (x, 0);
13168 }
13169
13170 if (GET_CODE (x) == UNSPEC
13171 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13172 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
13173 result = XVECEXP (x, 0, 0);
13174
13175 if (TARGET_MACHO && darwin_local_data_pic (x)
13176 && !MEM_P (orig_x))
13177 result = XVECEXP (x, 0, 0);
13178
13179 if (! result)
13180 return ix86_delegitimize_tls_address (orig_x);
13181
13182 if (const_addend)
13183 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13184 if (reg_addend)
13185 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13186 if (addend)
13187 {
13188 /* If the rest of original X doesn't involve the PIC register, add
13189 addend and subtract pic_offset_table_rtx. This can happen e.g.
13190 for code like:
13191 leal (%ebx, %ecx, 4), %ecx
13192 ...
13193 movl foo@GOTOFF(%ecx), %edx
13194 in which case we return (%ecx - %ebx) + foo. */
13195 if (pic_offset_table_rtx)
13196 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13197 pic_offset_table_rtx),
13198 result);
13199 else
13200 return orig_x;
13201 }
13202 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13203 {
13204 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13205 if (result == NULL_RTX)
13206 return orig_x;
13207 }
13208 return result;
13209 }
13210
13211 /* If X is a machine specific address (i.e. a symbol or label being
13212 referenced as a displacement from the GOT implemented using an
13213 UNSPEC), then return the base term. Otherwise return X. */
13214
13215 rtx
13216 ix86_find_base_term (rtx x)
13217 {
13218 rtx term;
13219
13220 if (TARGET_64BIT)
13221 {
13222 if (GET_CODE (x) != CONST)
13223 return x;
13224 term = XEXP (x, 0);
13225 if (GET_CODE (term) == PLUS
13226 && (CONST_INT_P (XEXP (term, 1))
13227 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13228 term = XEXP (term, 0);
13229 if (GET_CODE (term) != UNSPEC
13230 || (XINT (term, 1) != UNSPEC_GOTPCREL
13231 && XINT (term, 1) != UNSPEC_PCREL))
13232 return x;
13233
13234 return XVECEXP (term, 0, 0);
13235 }
13236
13237 return ix86_delegitimize_address (x);
13238 }
13239 \f
13240 static void
13241 put_condition_code (enum rtx_code code, enum machine_mode mode, int reverse,
13242 int fp, FILE *file)
13243 {
13244 const char *suffix;
13245
13246 if (mode == CCFPmode || mode == CCFPUmode)
13247 {
13248 code = ix86_fp_compare_code_to_integer (code);
13249 mode = CCmode;
13250 }
13251 if (reverse)
13252 code = reverse_condition (code);
13253
13254 switch (code)
13255 {
13256 case EQ:
13257 switch (mode)
13258 {
13259 case CCAmode:
13260 suffix = "a";
13261 break;
13262
13263 case CCCmode:
13264 suffix = "c";
13265 break;
13266
13267 case CCOmode:
13268 suffix = "o";
13269 break;
13270
13271 case CCSmode:
13272 suffix = "s";
13273 break;
13274
13275 default:
13276 suffix = "e";
13277 }
13278 break;
13279 case NE:
13280 switch (mode)
13281 {
13282 case CCAmode:
13283 suffix = "na";
13284 break;
13285
13286 case CCCmode:
13287 suffix = "nc";
13288 break;
13289
13290 case CCOmode:
13291 suffix = "no";
13292 break;
13293
13294 case CCSmode:
13295 suffix = "ns";
13296 break;
13297
13298 default:
13299 suffix = "ne";
13300 }
13301 break;
13302 case GT:
13303 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13304 suffix = "g";
13305 break;
13306 case GTU:
13307 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13308 Those same assemblers have the same but opposite lossage on cmov. */
13309 if (mode == CCmode)
13310 suffix = fp ? "nbe" : "a";
13311 else if (mode == CCCmode)
13312 suffix = "b";
13313 else
13314 gcc_unreachable ();
13315 break;
13316 case LT:
13317 switch (mode)
13318 {
13319 case CCNOmode:
13320 case CCGOCmode:
13321 suffix = "s";
13322 break;
13323
13324 case CCmode:
13325 case CCGCmode:
13326 suffix = "l";
13327 break;
13328
13329 default:
13330 gcc_unreachable ();
13331 }
13332 break;
13333 case LTU:
13334 gcc_assert (mode == CCmode || mode == CCCmode);
13335 suffix = "b";
13336 break;
13337 case GE:
13338 switch (mode)
13339 {
13340 case CCNOmode:
13341 case CCGOCmode:
13342 suffix = "ns";
13343 break;
13344
13345 case CCmode:
13346 case CCGCmode:
13347 suffix = "ge";
13348 break;
13349
13350 default:
13351 gcc_unreachable ();
13352 }
13353 break;
13354 case GEU:
13355 /* ??? As above. */
13356 gcc_assert (mode == CCmode || mode == CCCmode);
13357 suffix = fp ? "nb" : "ae";
13358 break;
13359 case LE:
13360 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
13361 suffix = "le";
13362 break;
13363 case LEU:
13364 /* ??? As above. */
13365 if (mode == CCmode)
13366 suffix = "be";
13367 else if (mode == CCCmode)
13368 suffix = fp ? "nb" : "ae";
13369 else
13370 gcc_unreachable ();
13371 break;
13372 case UNORDERED:
13373 suffix = fp ? "u" : "p";
13374 break;
13375 case ORDERED:
13376 suffix = fp ? "nu" : "np";
13377 break;
13378 default:
13379 gcc_unreachable ();
13380 }
13381 fputs (suffix, file);
13382 }
13383
13384 /* Print the name of register X to FILE based on its machine mode and number.
13385 If CODE is 'w', pretend the mode is HImode.
13386 If CODE is 'b', pretend the mode is QImode.
13387 If CODE is 'k', pretend the mode is SImode.
13388 If CODE is 'q', pretend the mode is DImode.
13389 If CODE is 'x', pretend the mode is V4SFmode.
13390 If CODE is 't', pretend the mode is V8SFmode.
13391 If CODE is 'h', pretend the reg is the 'high' byte register.
13392 If CODE is 'y', print "st(0)" instead of "st" if the reg is a stack op.
13393 If CODE is 'd', duplicate the operand for an AVX instruction.
13394 */
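/* For example (illustrative): given the DImode register rax, CODE 'k'
   selects the "eax" name, 'w' selects "ax", 'b' selects "al" and 'h'
   selects "ah"; for the extended registers, 'k' on r10 prints "r10d",
   as handled below.  */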
13395
13396 void
13397 print_reg (rtx x, int code, FILE *file)
13398 {
13399 const char *reg;
13400 bool duplicated = code == 'd' && TARGET_AVX;
13401
13402 gcc_assert (x == pc_rtx
13403 || (REGNO (x) != ARG_POINTER_REGNUM
13404 && REGNO (x) != FRAME_POINTER_REGNUM
13405 && REGNO (x) != FLAGS_REG
13406 && REGNO (x) != FPSR_REG
13407 && REGNO (x) != FPCR_REG));
13408
13409 if (ASSEMBLER_DIALECT == ASM_ATT)
13410 putc ('%', file);
13411
13412 if (x == pc_rtx)
13413 {
13414 gcc_assert (TARGET_64BIT);
13415 fputs ("rip", file);
13416 return;
13417 }
13418
13419 if (code == 'w' || MMX_REG_P (x))
13420 code = 2;
13421 else if (code == 'b')
13422 code = 1;
13423 else if (code == 'k')
13424 code = 4;
13425 else if (code == 'q')
13426 code = 8;
13427 else if (code == 'y')
13428 code = 3;
13429 else if (code == 'h')
13430 code = 0;
13431 else if (code == 'x')
13432 code = 16;
13433 else if (code == 't')
13434 code = 32;
13435 else
13436 code = GET_MODE_SIZE (GET_MODE (x));
13437
13438 /* Irritatingly, AMD extended registers use a different naming convention
13439 from the normal registers. */
13440 if (REX_INT_REG_P (x))
13441 {
13442 gcc_assert (TARGET_64BIT);
13443 switch (code)
13444 {
13445 case 0:
13446 error ("extended registers have no high halves");
13447 break;
13448 case 1:
13449 fprintf (file, "r%ib", REGNO (x) - FIRST_REX_INT_REG + 8);
13450 break;
13451 case 2:
13452 fprintf (file, "r%iw", REGNO (x) - FIRST_REX_INT_REG + 8);
13453 break;
13454 case 4:
13455 fprintf (file, "r%id", REGNO (x) - FIRST_REX_INT_REG + 8);
13456 break;
13457 case 8:
13458 fprintf (file, "r%i", REGNO (x) - FIRST_REX_INT_REG + 8);
13459 break;
13460 default:
13461 error ("unsupported operand size for extended register");
13462 break;
13463 }
13464 return;
13465 }
13466
13467 reg = NULL;
13468 switch (code)
13469 {
13470 case 3:
13471 if (STACK_TOP_P (x))
13472 {
13473 reg = "st(0)";
13474 break;
13475 }
13476 /* FALLTHRU */
13477 case 8:
13478 case 4:
13479 case 12:
13480 if (! ANY_FP_REG_P (x))
13481 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
13482 /* FALLTHRU */
13483 case 16:
13484 case 2:
13485 normal:
13486 reg = hi_reg_name[REGNO (x)];
13487 break;
13488 case 1:
13489 if (REGNO (x) >= ARRAY_SIZE (qi_reg_name))
13490 goto normal;
13491 reg = qi_reg_name[REGNO (x)];
13492 break;
13493 case 0:
13494 if (REGNO (x) >= ARRAY_SIZE (qi_high_reg_name))
13495 goto normal;
13496 reg = qi_high_reg_name[REGNO (x)];
13497 break;
13498 case 32:
13499 if (SSE_REG_P (x))
13500 {
13501 gcc_assert (!duplicated);
13502 putc ('y', file);
13503 fputs (hi_reg_name[REGNO (x)] + 1, file);
13504 return;
13505 }
13506 break;
13507 default:
13508 gcc_unreachable ();
13509 }
13510
13511 fputs (reg, file);
13512 if (duplicated)
13513 {
13514 if (ASSEMBLER_DIALECT == ASM_ATT)
13515 fprintf (file, ", %%%s", reg);
13516 else
13517 fprintf (file, ", %s", reg);
13518 }
13519 }
13520
13521 /* Locate some local-dynamic symbol still in use by this function
13522 so that we can print its name in some tls_local_dynamic_base
13523 pattern. */
13524
13525 static int
13526 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
13527 {
13528 rtx x = *px;
13529
13530 if (GET_CODE (x) == SYMBOL_REF
13531 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
13532 {
13533 cfun->machine->some_ld_name = XSTR (x, 0);
13534 return 1;
13535 }
13536
13537 return 0;
13538 }
13539
13540 static const char *
13541 get_some_local_dynamic_name (void)
13542 {
13543 rtx insn;
13544
13545 if (cfun->machine->some_ld_name)
13546 return cfun->machine->some_ld_name;
13547
13548 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
13549 if (NONDEBUG_INSN_P (insn)
13550 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
13551 return cfun->machine->some_ld_name;
13552
13553 return NULL;
13554 }
13555
13556 /* Meaning of CODE:
13557 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
13558 C -- print opcode suffix for set/cmov insn.
13559 c -- like C, but print reversed condition
13560 F,f -- likewise, but for floating-point.
13561 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
13562 otherwise nothing
13563 R -- print the prefix for register names.
13564 z -- print the opcode suffix for the size of the current operand.
13565 Z -- likewise, with special suffixes for x87 instructions.
13566 * -- print a star (in certain assembler syntax)
13567 A -- print an absolute memory reference.
13568 w -- print the operand as if it's a "word" (HImode) even if it isn't.
13569 s -- print a shift double count, followed by the assembler's argument
13570 delimiter.
13571 b -- print the QImode name of the register for the indicated operand.
13572 %b0 would print %al if operands[0] is reg 0.
13573 w -- likewise, print the HImode name of the register.
13574 k -- likewise, print the SImode name of the register.
13575 q -- likewise, print the DImode name of the register.
13576 x -- likewise, print the V4SFmode name of the register.
13577 t -- likewise, print the V8SFmode name of the register.
13578 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
13579 y -- print "st(0)" instead of "st" as a register.
13580 d -- print duplicated register operand for AVX instruction.
13581 D -- print condition for SSE cmp instruction.
13582 P -- if PIC, print an @PLT suffix.
13583 p -- print raw symbol name.
13584 X -- don't print any sort of PIC '@' suffix for a symbol.
13585 & -- print some in-use local-dynamic symbol name.
13586 H -- print a memory address offset by 8; used for sse high-parts
13587 Y -- print condition for XOP pcom* instruction.
13588 + -- print a branch hint as 'cs' or 'ds' prefix
13589 ; -- print a semicolon (after prefixes due to bug in older gas).
13590 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
13591 @ -- print a segment register of thread base pointer load
13592 */
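/* For instance (illustrative, not an actual pattern from i386.md): in an
   output template such as "mov%z0\t{%1, %b0|%b0, %1}", "%z0" picks the
   size suffix from operand 0's mode and "%b0" prints the QImode name of
   that register, so a QImode value in %eax comes out as "movb ..., %al"
   in AT&T syntax.  */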
13593
13594 void
13595 ix86_print_operand (FILE *file, rtx x, int code)
13596 {
13597 if (code)
13598 {
13599 switch (code)
13600 {
13601 case '*':
13602 if (ASSEMBLER_DIALECT == ASM_ATT)
13603 putc ('*', file);
13604 return;
13605
13606 case '&':
13607 {
13608 const char *name = get_some_local_dynamic_name ();
13609 if (name == NULL)
13610 output_operand_lossage ("'%%&' used without any "
13611 "local dynamic TLS references");
13612 else
13613 assemble_name (file, name);
13614 return;
13615 }
13616
13617 case 'A':
13618 switch (ASSEMBLER_DIALECT)
13619 {
13620 case ASM_ATT:
13621 putc ('*', file);
13622 break;
13623
13624 case ASM_INTEL:
13625 /* Intel syntax. For absolute addresses, registers should not
13626 be surrounded by brackets. */
13627 if (!REG_P (x))
13628 {
13629 putc ('[', file);
13630 ix86_print_operand (file, x, 0);
13631 putc (']', file);
13632 return;
13633 }
13634 break;
13635
13636 default:
13637 gcc_unreachable ();
13638 }
13639
13640 ix86_print_operand (file, x, 0);
13641 return;
13642
13643
13644 case 'L':
13645 if (ASSEMBLER_DIALECT == ASM_ATT)
13646 putc ('l', file);
13647 return;
13648
13649 case 'W':
13650 if (ASSEMBLER_DIALECT == ASM_ATT)
13651 putc ('w', file);
13652 return;
13653
13654 case 'B':
13655 if (ASSEMBLER_DIALECT == ASM_ATT)
13656 putc ('b', file);
13657 return;
13658
13659 case 'Q':
13660 if (ASSEMBLER_DIALECT == ASM_ATT)
13661 putc ('l', file);
13662 return;
13663
13664 case 'S':
13665 if (ASSEMBLER_DIALECT == ASM_ATT)
13666 putc ('s', file);
13667 return;
13668
13669 case 'T':
13670 if (ASSEMBLER_DIALECT == ASM_ATT)
13671 putc ('t', file);
13672 return;
13673
13674 case 'z':
13675 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
13676 {
13677 /* Opcodes don't get size suffixes if using Intel syntax. */
13678 if (ASSEMBLER_DIALECT == ASM_INTEL)
13679 return;
13680
13681 switch (GET_MODE_SIZE (GET_MODE (x)))
13682 {
13683 case 1:
13684 putc ('b', file);
13685 return;
13686
13687 case 2:
13688 putc ('w', file);
13689 return;
13690
13691 case 4:
13692 putc ('l', file);
13693 return;
13694
13695 case 8:
13696 putc ('q', file);
13697 return;
13698
13699 default:
13700 output_operand_lossage
13701 ("invalid operand size for operand code '%c'", code);
13702 return;
13703 }
13704 }
13705
13706 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
13707 warning
13708 (0, "non-integer operand used with operand code '%c'", code);
13709 /* FALLTHRU */
13710
13711 case 'Z':
13712 /* 387 opcodes don't get size suffixes if using Intel syntax. */
13713 if (ASSEMBLER_DIALECT == ASM_INTEL)
13714 return;
13715
13716 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
13717 {
13718 switch (GET_MODE_SIZE (GET_MODE (x)))
13719 {
13720 case 2:
13721 #ifdef HAVE_AS_IX86_FILDS
13722 putc ('s', file);
13723 #endif
13724 return;
13725
13726 case 4:
13727 putc ('l', file);
13728 return;
13729
13730 case 8:
13731 #ifdef HAVE_AS_IX86_FILDQ
13732 putc ('q', file);
13733 #else
13734 fputs ("ll", file);
13735 #endif
13736 return;
13737
13738 default:
13739 break;
13740 }
13741 }
13742 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
13743 {
13744 /* 387 opcodes don't get size suffixes
13745 if the operands are registers. */
13746 if (STACK_REG_P (x))
13747 return;
13748
13749 switch (GET_MODE_SIZE (GET_MODE (x)))
13750 {
13751 case 4:
13752 putc ('s', file);
13753 return;
13754
13755 case 8:
13756 putc ('l', file);
13757 return;
13758
13759 case 12:
13760 case 16:
13761 putc ('t', file);
13762 return;
13763
13764 default:
13765 break;
13766 }
13767 }
13768 else
13769 {
13770 output_operand_lossage
13771 ("invalid operand type used with operand code '%c'", code);
13772 return;
13773 }
13774
13775 output_operand_lossage
13776 ("invalid operand size for operand code '%c'", code);
13777 return;
13778
13779 case 'd':
13780 case 'b':
13781 case 'w':
13782 case 'k':
13783 case 'q':
13784 case 'h':
13785 case 't':
13786 case 'y':
13787 case 'x':
13788 case 'X':
13789 case 'P':
13790 case 'p':
13791 break;
13792
13793 case 's':
13794 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
13795 {
13796 ix86_print_operand (file, x, 0);
13797 fputs (", ", file);
13798 }
13799 return;
13800
13801 case 'D':
13802 /* A little bit of braindamage here. The SSE compare instructions
13803 use completely different names for the comparisons than the
13804 fp conditional moves do. */
13805 if (TARGET_AVX)
13806 {
13807 switch (GET_CODE (x))
13808 {
13809 case EQ:
13810 fputs ("eq", file);
13811 break;
13812 case UNEQ:
13813 fputs ("eq_us", file);
13814 break;
13815 case LT:
13816 fputs ("lt", file);
13817 break;
13818 case UNLT:
13819 fputs ("nge", file);
13820 break;
13821 case LE:
13822 fputs ("le", file);
13823 break;
13824 case UNLE:
13825 fputs ("ngt", file);
13826 break;
13827 case UNORDERED:
13828 fputs ("unord", file);
13829 break;
13830 case NE:
13831 fputs ("neq", file);
13832 break;
13833 case LTGT:
13834 fputs ("neq_oq", file);
13835 break;
13836 case GE:
13837 fputs ("ge", file);
13838 break;
13839 case UNGE:
13840 fputs ("nlt", file);
13841 break;
13842 case GT:
13843 fputs ("gt", file);
13844 break;
13845 case UNGT:
13846 fputs ("nle", file);
13847 break;
13848 case ORDERED:
13849 fputs ("ord", file);
13850 break;
13851 default:
13852 output_operand_lossage ("operand is not a condition code, "
13853 "invalid operand code 'D'");
13854 return;
13855 }
13856 }
13857 else
13858 {
13859 switch (GET_CODE (x))
13860 {
13861 case EQ:
13862 case UNEQ:
13863 fputs ("eq", file);
13864 break;
13865 case LT:
13866 case UNLT:
13867 fputs ("lt", file);
13868 break;
13869 case LE:
13870 case UNLE:
13871 fputs ("le", file);
13872 break;
13873 case UNORDERED:
13874 fputs ("unord", file);
13875 break;
13876 case NE:
13877 case LTGT:
13878 fputs ("neq", file);
13879 break;
13880 case UNGE:
13881 case GE:
13882 fputs ("nlt", file);
13883 break;
13884 case UNGT:
13885 case GT:
13886 fputs ("nle", file);
13887 break;
13888 case ORDERED:
13889 fputs ("ord", file);
13890 break;
13891 default:
13892 output_operand_lossage ("operand is not a condition code, "
13893 "invalid operand code 'D'");
13894 return;
13895 }
13896 }
13897 return;
13898 case 'O':
13899 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
13900 if (ASSEMBLER_DIALECT == ASM_ATT)
13901 {
13902 switch (GET_MODE (x))
13903 {
13904 case HImode: putc ('w', file); break;
13905 case SImode:
13906 case SFmode: putc ('l', file); break;
13907 case DImode:
13908 case DFmode: putc ('q', file); break;
13909 default: gcc_unreachable ();
13910 }
13911 putc ('.', file);
13912 }
13913 #endif
13914 return;
13915 case 'C':
13916 if (!COMPARISON_P (x))
13917 {
13918 output_operand_lossage ("operand is neither a constant nor a "
13919 "condition code, invalid operand code "
13920 "'C'");
13921 return;
13922 }
13923 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 0, file);
13924 return;
13925 case 'F':
13926 if (!COMPARISON_P (x))
13927 {
13928 output_operand_lossage ("operand is neither a constant nor a "
13929 "condition code, invalid operand code "
13930 "'F'");
13931 return;
13932 }
13933 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
13934 if (ASSEMBLER_DIALECT == ASM_ATT)
13935 putc ('.', file);
13936 #endif
13937 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 1, file);
13938 return;
13939
13940 /* Like above, but reverse condition */
13941 case 'c':
13942 /* Check to see if argument to %c is really a constant
13943 and not a condition code which needs to be reversed. */
13944 if (!COMPARISON_P (x))
13945 {
13946 output_operand_lossage ("operand is neither a constant nor a "
13947 "condition code, invalid operand "
13948 "code 'c'");
13949 return;
13950 }
13951 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 0, file);
13952 return;
13953 case 'f':
13954 if (!COMPARISON_P (x))
13955 {
13956 output_operand_lossage ("operand is neither a constant nor a "
13957 "condition code, invalid operand "
13958 "code 'f'");
13959 return;
13960 }
13961 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
13962 if (ASSEMBLER_DIALECT == ASM_ATT)
13963 putc ('.', file);
13964 #endif
13965 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 1, file);
13966 return;
13967
13968 case 'H':
13969 /* It doesn't actually matter what mode we use here, as we're
13970 only going to use this for printing. */
13971 x = adjust_address_nv (x, DImode, 8);
13972 break;
13973
13974 case '+':
13975 {
13976 rtx x;
13977
13978 if (!optimize
13979 || optimize_function_for_size_p (cfun) || !TARGET_BRANCH_PREDICTION_HINTS)
13980 return;
13981
13982 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
13983 if (x)
13984 {
13985 int pred_val = INTVAL (XEXP (x, 0));
13986
13987 if (pred_val < REG_BR_PROB_BASE * 45 / 100
13988 || pred_val > REG_BR_PROB_BASE * 55 / 100)
13989 {
13990 int taken = pred_val > REG_BR_PROB_BASE / 2;
13991 int cputaken = final_forward_branch_p (current_output_insn) == 0;
13992
13993 /* Emit hints only in the case where the default branch prediction
13994 heuristics would fail. */
13995 if (taken != cputaken)
13996 {
13997 /* We use the 3e (DS) prefix for taken branches and
13998 the 2e (CS) prefix for not-taken branches. */
13999 if (taken)
14000 fputs ("ds ; ", file);
14001 else
14002 fputs ("cs ; ", file);
14003 }
14004 }
14005 }
14006 return;
14007 }
14008
14009 case 'Y':
14010 switch (GET_CODE (x))
14011 {
14012 case NE:
14013 fputs ("neq", file);
14014 break;
14015 case EQ:
14016 fputs ("eq", file);
14017 break;
14018 case GE:
14019 case GEU:
14020 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14021 break;
14022 case GT:
14023 case GTU:
14024 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14025 break;
14026 case LE:
14027 case LEU:
14028 fputs ("le", file);
14029 break;
14030 case LT:
14031 case LTU:
14032 fputs ("lt", file);
14033 break;
14034 case UNORDERED:
14035 fputs ("unord", file);
14036 break;
14037 case ORDERED:
14038 fputs ("ord", file);
14039 break;
14040 case UNEQ:
14041 fputs ("ueq", file);
14042 break;
14043 case UNGE:
14044 fputs ("nlt", file);
14045 break;
14046 case UNGT:
14047 fputs ("nle", file);
14048 break;
14049 case UNLE:
14050 fputs ("ule", file);
14051 break;
14052 case UNLT:
14053 fputs ("ult", file);
14054 break;
14055 case LTGT:
14056 fputs ("une", file);
14057 break;
14058 default:
14059 output_operand_lossage ("operand is not a condition code, "
14060 "invalid operand code 'Y'");
14061 return;
14062 }
14063 return;
14064
14065 case ';':
14066 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14067 putc (';', file);
14068 #endif
14069 return;
14070
14071 case '@':
14072 if (ASSEMBLER_DIALECT == ASM_ATT)
14073 putc ('%', file);
14074
14075 /* The kernel uses a different segment register for performance
14076 reasons, so that a system call does not have to trash the userspace
14077 segment register, which would be expensive. */
14078 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14079 fputs ("fs", file);
14080 else
14081 fputs ("gs", file);
14082 return;
14083
14084 case '~':
14085 putc (TARGET_AVX2 ? 'i' : 'f', file);
14086 return;
14087
14088 default:
14089 output_operand_lossage ("invalid operand code '%c'", code);
14090 }
14091 }
14092
14093 if (REG_P (x))
14094 print_reg (x, code, file);
14095
14096 else if (MEM_P (x))
14097 {
14098 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14099 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14100 && GET_MODE (x) != BLKmode)
14101 {
14102 const char * size;
14103 switch (GET_MODE_SIZE (GET_MODE (x)))
14104 {
14105 case 1: size = "BYTE"; break;
14106 case 2: size = "WORD"; break;
14107 case 4: size = "DWORD"; break;
14108 case 8: size = "QWORD"; break;
14109 case 12: size = "TBYTE"; break;
14110 case 16:
14111 if (GET_MODE (x) == XFmode)
14112 size = "TBYTE";
14113 else
14114 size = "XMMWORD";
14115 break;
14116 case 32: size = "YMMWORD"; break;
14117 default:
14118 gcc_unreachable ();
14119 }
14120
14121 /* Check for explicit size override (codes 'b', 'w', 'k',
14122 'q' and 'x') */
14123 if (code == 'b')
14124 size = "BYTE";
14125 else if (code == 'w')
14126 size = "WORD";
14127 else if (code == 'k')
14128 size = "DWORD";
14129 else if (code == 'q')
14130 size = "QWORD";
14131 else if (code == 'x')
14132 size = "XMMWORD";
14133
14134 fputs (size, file);
14135 fputs (" PTR ", file);
14136 }
14137
14138 x = XEXP (x, 0);
14139 /* Avoid (%rip) for call operands. */
14140 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14141 && !CONST_INT_P (x))
14142 output_addr_const (file, x);
14143 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14144 output_operand_lossage ("invalid constraints for operand");
14145 else
14146 output_address (x);
14147 }
14148
14149 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14150 {
14151 REAL_VALUE_TYPE r;
14152 long l;
14153
14154 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14155 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14156
14157 if (ASSEMBLER_DIALECT == ASM_ATT)
14158 putc ('$', file);
14159 /* Sign extend 32bit SFmode immediate to 8 bytes. */
14160 if (code == 'q')
14161 fprintf (file, "0x%08llx", (unsigned long long) (int) l);
14162 else
14163 fprintf (file, "0x%08x", (unsigned int) l);
14164 }
14165
14166 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14167 {
14168 REAL_VALUE_TYPE r;
14169 long l[2];
14170
14171 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14172 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14173
14174 if (ASSEMBLER_DIALECT == ASM_ATT)
14175 putc ('$', file);
14176 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14177 }
14178
14179 /* These float cases don't actually occur as immediate operands. */
14180 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14181 {
14182 char dstr[30];
14183
14184 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14185 fputs (dstr, file);
14186 }
14187
14188 else
14189 {
14190 /* We have patterns that allow zero sets of memory, for instance.
14191 In 64-bit mode, we should probably support all 8-byte vectors,
14192 since we can in fact encode that into an immediate. */
14193 if (GET_CODE (x) == CONST_VECTOR)
14194 {
14195 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14196 x = const0_rtx;
14197 }
14198
14199 if (code != 'P' && code != 'p')
14200 {
14201 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14202 {
14203 if (ASSEMBLER_DIALECT == ASM_ATT)
14204 putc ('$', file);
14205 }
14206 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14207 || GET_CODE (x) == LABEL_REF)
14208 {
14209 if (ASSEMBLER_DIALECT == ASM_ATT)
14210 putc ('$', file);
14211 else
14212 fputs ("OFFSET FLAT:", file);
14213 }
14214 }
14215 if (CONST_INT_P (x))
14216 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14217 else if (flag_pic || MACHOPIC_INDIRECT)
14218 output_pic_addr_const (file, x, code);
14219 else
14220 output_addr_const (file, x);
14221 }
14222 }
14223
14224 static bool
14225 ix86_print_operand_punct_valid_p (unsigned char code)
14226 {
14227 return (code == '@' || code == '*' || code == '+'
14228 || code == '&' || code == ';' || code == '~');
14229 }
14230 \f
14231 /* Print a memory operand whose address is ADDR. */
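/* Illustrative example, assuming an ia32 target: an address with base
   %ebx, index %ecx, scale 4 and displacement 16 is printed as
   "16(%ebx,%ecx,4)" in AT&T syntax, while the Intel branch below prints
   it as "[ebx+16+ecx*4]".  */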
14232
14233 static void
14234 ix86_print_operand_address (FILE *file, rtx addr)
14235 {
14236 struct ix86_address parts;
14237 rtx base, index, disp;
14238 int scale;
14239 int ok;
14240 bool vsib = false;
14241
14242 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14243 {
14244 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14245 gcc_assert (parts.index == NULL_RTX);
14246 parts.index = XVECEXP (addr, 0, 1);
14247 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14248 addr = XVECEXP (addr, 0, 0);
14249 vsib = true;
14250 }
14251 else
14252 ok = ix86_decompose_address (addr, &parts);
14253
14254 gcc_assert (ok);
14255
14256 if (parts.base && GET_CODE (parts.base) == SUBREG)
14257 {
14258 rtx tmp = SUBREG_REG (parts.base);
14259 parts.base = simplify_subreg (GET_MODE (parts.base),
14260 tmp, GET_MODE (tmp), 0);
14261 }
14262
14263 if (parts.index && GET_CODE (parts.index) == SUBREG)
14264 {
14265 rtx tmp = SUBREG_REG (parts.index);
14266 parts.index = simplify_subreg (GET_MODE (parts.index),
14267 tmp, GET_MODE (tmp), 0);
14268 }
14269
14270 base = parts.base;
14271 index = parts.index;
14272 disp = parts.disp;
14273 scale = parts.scale;
14274
14275 switch (parts.seg)
14276 {
14277 case SEG_DEFAULT:
14278 break;
14279 case SEG_FS:
14280 case SEG_GS:
14281 if (ASSEMBLER_DIALECT == ASM_ATT)
14282 putc ('%', file);
14283 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14284 break;
14285 default:
14286 gcc_unreachable ();
14287 }
14288
14289 /* Use one byte shorter RIP relative addressing for 64bit mode. */
14290 if (TARGET_64BIT && !base && !index)
14291 {
14292 rtx symbol = disp;
14293
14294 if (GET_CODE (disp) == CONST
14295 && GET_CODE (XEXP (disp, 0)) == PLUS
14296 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14297 symbol = XEXP (XEXP (disp, 0), 0);
14298
14299 if (GET_CODE (symbol) == LABEL_REF
14300 || (GET_CODE (symbol) == SYMBOL_REF
14301 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14302 base = pc_rtx;
14303 }
14304 if (!base && !index)
14305 {
14306 /* Displacement-only addresses require special attention. */
14307
14308 if (CONST_INT_P (disp))
14309 {
14310 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14311 fputs ("ds:", file);
14312 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14313 }
14314 else if (flag_pic)
14315 output_pic_addr_const (file, disp, 0);
14316 else
14317 output_addr_const (file, disp);
14318 }
14319 else
14320 {
14321 int code = 0;
14322
14323 /* Print SImode registers for zero-extended addresses to force
14324 addr32 prefix. Otherwise print DImode registers to avoid it. */
14325 if (TARGET_64BIT)
14326 code = ((GET_CODE (addr) == ZERO_EXTEND
14327 || GET_CODE (addr) == AND)
14328 ? 'l'
14329 : 'q');
14330
14331 if (ASSEMBLER_DIALECT == ASM_ATT)
14332 {
14333 if (disp)
14334 {
14335 if (flag_pic)
14336 output_pic_addr_const (file, disp, 0);
14337 else if (GET_CODE (disp) == LABEL_REF)
14338 output_asm_label (disp);
14339 else
14340 output_addr_const (file, disp);
14341 }
14342
14343 putc ('(', file);
14344 if (base)
14345 print_reg (base, code, file);
14346 if (index)
14347 {
14348 putc (',', file);
14349 print_reg (index, vsib ? 0 : code, file);
14350 if (scale != 1 || vsib)
14351 fprintf (file, ",%d", scale);
14352 }
14353 putc (')', file);
14354 }
14355 else
14356 {
14357 rtx offset = NULL_RTX;
14358
14359 if (disp)
14360 {
14361 /* Pull out the offset of a symbol; print any symbol itself. */
14362 if (GET_CODE (disp) == CONST
14363 && GET_CODE (XEXP (disp, 0)) == PLUS
14364 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14365 {
14366 offset = XEXP (XEXP (disp, 0), 1);
14367 disp = gen_rtx_CONST (VOIDmode,
14368 XEXP (XEXP (disp, 0), 0));
14369 }
14370
14371 if (flag_pic)
14372 output_pic_addr_const (file, disp, 0);
14373 else if (GET_CODE (disp) == LABEL_REF)
14374 output_asm_label (disp);
14375 else if (CONST_INT_P (disp))
14376 offset = disp;
14377 else
14378 output_addr_const (file, disp);
14379 }
14380
14381 putc ('[', file);
14382 if (base)
14383 {
14384 print_reg (base, code, file);
14385 if (offset)
14386 {
14387 if (INTVAL (offset) >= 0)
14388 putc ('+', file);
14389 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14390 }
14391 }
14392 else if (offset)
14393 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14394 else
14395 putc ('0', file);
14396
14397 if (index)
14398 {
14399 putc ('+', file);
14400 print_reg (index, vsib ? 0 : code, file);
14401 if (scale != 1 || vsib)
14402 fprintf (file, "*%d", scale);
14403 }
14404 putc (']', file);
14405 }
14406 }
14407 }
14408
14409 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
14410
14411 static bool
14412 i386_asm_output_addr_const_extra (FILE *file, rtx x)
14413 {
14414 rtx op;
14415
14416 if (GET_CODE (x) != UNSPEC)
14417 return false;
14418
14419 op = XVECEXP (x, 0, 0);
14420 switch (XINT (x, 1))
14421 {
14422 case UNSPEC_GOTTPOFF:
14423 output_addr_const (file, op);
14424 /* FIXME: This might be @TPOFF in Sun ld. */
14425 fputs ("@gottpoff", file);
14426 break;
14427 case UNSPEC_TPOFF:
14428 output_addr_const (file, op);
14429 fputs ("@tpoff", file);
14430 break;
14431 case UNSPEC_NTPOFF:
14432 output_addr_const (file, op);
14433 if (TARGET_64BIT)
14434 fputs ("@tpoff", file);
14435 else
14436 fputs ("@ntpoff", file);
14437 break;
14438 case UNSPEC_DTPOFF:
14439 output_addr_const (file, op);
14440 fputs ("@dtpoff", file);
14441 break;
14442 case UNSPEC_GOTNTPOFF:
14443 output_addr_const (file, op);
14444 if (TARGET_64BIT)
14445 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
14446 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
14447 else
14448 fputs ("@gotntpoff", file);
14449 break;
14450 case UNSPEC_INDNTPOFF:
14451 output_addr_const (file, op);
14452 fputs ("@indntpoff", file);
14453 break;
14454 #if TARGET_MACHO
14455 case UNSPEC_MACHOPIC_OFFSET:
14456 output_addr_const (file, op);
14457 putc ('-', file);
14458 machopic_output_function_base_name (file);
14459 break;
14460 #endif
14461
14462 case UNSPEC_STACK_CHECK:
14463 {
14464 int offset;
14465
14466 gcc_assert (flag_split_stack);
14467
14468 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14469 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14470 #else
14471 gcc_unreachable ();
14472 #endif
14473
14474 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
14475 }
14476 break;
14477
14478 default:
14479 return false;
14480 }
14481
14482 return true;
14483 }
14484 \f
14485 /* Split one or more double-mode RTL references into pairs of half-mode
14486 references. The RTL can be REG, offsettable MEM, integer constant, or
14487 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
14488 split and "num" is its length. lo_half and hi_half are output arrays
14489 that parallel "operands". */
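/* For example, in 32-bit mode a DImode pseudo is split into two SImode
   subregs, lo_half at byte offset 0 and hi_half at byte offset 4; an
   offsettable TImode MEM is split into two DImode MEMs at offsets 0 and 8.  */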
14490
14491 void
14492 split_double_mode (enum machine_mode mode, rtx operands[],
14493 int num, rtx lo_half[], rtx hi_half[])
14494 {
14495 enum machine_mode half_mode;
14496 unsigned int byte;
14497
14498 switch (mode)
14499 {
14500 case TImode:
14501 half_mode = DImode;
14502 break;
14503 case DImode:
14504 half_mode = SImode;
14505 break;
14506 default:
14507 gcc_unreachable ();
14508 }
14509
14510 byte = GET_MODE_SIZE (half_mode);
14511
14512 while (num--)
14513 {
14514 rtx op = operands[num];
14515
14516 /* simplify_subreg refuses to split volatile memory addresses,
14517 but we still have to handle them. */
14518 if (MEM_P (op))
14519 {
14520 lo_half[num] = adjust_address (op, half_mode, 0);
14521 hi_half[num] = adjust_address (op, half_mode, byte);
14522 }
14523 else
14524 {
14525 lo_half[num] = simplify_gen_subreg (half_mode, op,
14526 GET_MODE (op) == VOIDmode
14527 ? mode : GET_MODE (op), 0);
14528 hi_half[num] = simplify_gen_subreg (half_mode, op,
14529 GET_MODE (op) == VOIDmode
14530 ? mode : GET_MODE (op), byte);
14531 }
14532 }
14533 }
14534 \f
14535 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
14536 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
14537 is the expression of the binary operation. The output may either be
14538 emitted here, or returned to the caller, like all output_* functions.
14539
14540 There is no guarantee that the operands are the same mode, as they
14541 might be within FLOAT or FLOAT_EXTEND expressions. */
14542
14543 #ifndef SYSV386_COMPAT
14544 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
14545 wants to fix the assemblers because that causes incompatibility
14546 with gcc. No-one wants to fix gcc because that causes
14547 incompatibility with assemblers... You can use the option of
14548 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
14549 #define SYSV386_COMPAT 1
14550 #endif
14551
14552 const char *
14553 output_387_binary_op (rtx insn, rtx *operands)
14554 {
14555 static char buf[40];
14556 const char *p;
14557 const char *ssep;
14558 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
14559
14560 #ifdef ENABLE_CHECKING
14561 /* Even if we do not want to check the inputs, this documents the input
14562 constraints, which helps in understanding the following code. */
14563 if (STACK_REG_P (operands[0])
14564 && ((REG_P (operands[1])
14565 && REGNO (operands[0]) == REGNO (operands[1])
14566 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
14567 || (REG_P (operands[2])
14568 && REGNO (operands[0]) == REGNO (operands[2])
14569 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
14570 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
14571 ; /* ok */
14572 else
14573 gcc_assert (is_sse);
14574 #endif
14575
14576 switch (GET_CODE (operands[3]))
14577 {
14578 case PLUS:
14579 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14580 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14581 p = "fiadd";
14582 else
14583 p = "fadd";
14584 ssep = "vadd";
14585 break;
14586
14587 case MINUS:
14588 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14589 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14590 p = "fisub";
14591 else
14592 p = "fsub";
14593 ssep = "vsub";
14594 break;
14595
14596 case MULT:
14597 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14598 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14599 p = "fimul";
14600 else
14601 p = "fmul";
14602 ssep = "vmul";
14603 break;
14604
14605 case DIV:
14606 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14607 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14608 p = "fidiv";
14609 else
14610 p = "fdiv";
14611 ssep = "vdiv";
14612 break;
14613
14614 default:
14615 gcc_unreachable ();
14616 }
14617
14618 if (is_sse)
14619 {
14620 if (TARGET_AVX)
14621 {
14622 strcpy (buf, ssep);
14623 if (GET_MODE (operands[0]) == SFmode)
14624 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
14625 else
14626 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
14627 }
14628 else
14629 {
14630 strcpy (buf, ssep + 1);
14631 if (GET_MODE (operands[0]) == SFmode)
14632 strcat (buf, "ss\t{%2, %0|%0, %2}");
14633 else
14634 strcat (buf, "sd\t{%2, %0|%0, %2}");
14635 }
14636 return buf;
14637 }
14638 strcpy (buf, p);
14639
14640 switch (GET_CODE (operands[3]))
14641 {
14642 case MULT:
14643 case PLUS:
14644 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
14645 {
14646 rtx temp = operands[2];
14647 operands[2] = operands[1];
14648 operands[1] = temp;
14649 }
14650
14651 /* We know operands[0] == operands[1]. */
14652
14653 if (MEM_P (operands[2]))
14654 {
14655 p = "%Z2\t%2";
14656 break;
14657 }
14658
14659 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
14660 {
14661 if (STACK_TOP_P (operands[0]))
14662 /* How is it that we are storing to a dead operand[2]?
14663 Well, presumably operands[1] is dead too. We can't
14664 store the result to st(0) as st(0) gets popped on this
14665 instruction. Instead store to operands[2] (which I
14666 think has to be st(1)). st(1) will be popped later.
14667 gcc <= 2.8.1 didn't have this check and generated
14668 assembly code that the Unixware assembler rejected. */
14669 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
14670 else
14671 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
14672 break;
14673 }
14674
14675 if (STACK_TOP_P (operands[0]))
14676 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
14677 else
14678 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
14679 break;
14680
14681 case MINUS:
14682 case DIV:
14683 if (MEM_P (operands[1]))
14684 {
14685 p = "r%Z1\t%1";
14686 break;
14687 }
14688
14689 if (MEM_P (operands[2]))
14690 {
14691 p = "%Z2\t%2";
14692 break;
14693 }
14694
14695 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
14696 {
14697 #if SYSV386_COMPAT
14698 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
14699 derived assemblers, confusingly reverse the direction of
14700 the operation for fsub{r} and fdiv{r} when the
14701 destination register is not st(0). The Intel assembler
14702 doesn't have this brain damage. Read !SYSV386_COMPAT to
14703 figure out what the hardware really does. */
14704 if (STACK_TOP_P (operands[0]))
14705 p = "{p\t%0, %2|rp\t%2, %0}";
14706 else
14707 p = "{rp\t%2, %0|p\t%0, %2}";
14708 #else
14709 if (STACK_TOP_P (operands[0]))
14710 /* As above for fmul/fadd, we can't store to st(0). */
14711 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
14712 else
14713 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
14714 #endif
14715 break;
14716 }
14717
14718 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
14719 {
14720 #if SYSV386_COMPAT
14721 if (STACK_TOP_P (operands[0]))
14722 p = "{rp\t%0, %1|p\t%1, %0}";
14723 else
14724 p = "{p\t%1, %0|rp\t%0, %1}";
14725 #else
14726 if (STACK_TOP_P (operands[0]))
14727 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
14728 else
14729 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
14730 #endif
14731 break;
14732 }
14733
14734 if (STACK_TOP_P (operands[0]))
14735 {
14736 if (STACK_TOP_P (operands[1]))
14737 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
14738 else
14739 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
14740 break;
14741 }
14742 else if (STACK_TOP_P (operands[1]))
14743 {
14744 #if SYSV386_COMPAT
14745 p = "{\t%1, %0|r\t%0, %1}";
14746 #else
14747 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
14748 #endif
14749 }
14750 else
14751 {
14752 #if SYSV386_COMPAT
14753 p = "{r\t%2, %0|\t%0, %2}";
14754 #else
14755 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
14756 #endif
14757 }
14758 break;
14759
14760 default:
14761 gcc_unreachable ();
14762 }
14763
14764 strcat (buf, p);
14765 return buf;
14766 }
14767
14768 /* Return needed mode for entity in optimize_mode_switching pass. */
14769
14770 int
14771 ix86_mode_needed (int entity, rtx insn)
14772 {
14773 enum attr_i387_cw mode;
14774
14775 /* The mode UNINITIALIZED is used to store the control word after a
14776 function call or ASM pattern. The mode ANY specifies that the function
14777 has no requirements on the control word and makes no changes to the
14778 bits we are interested in. */
14779
14780 if (CALL_P (insn)
14781 || (NONJUMP_INSN_P (insn)
14782 && (asm_noperands (PATTERN (insn)) >= 0
14783 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
14784 return I387_CW_UNINITIALIZED;
14785
14786 if (recog_memoized (insn) < 0)
14787 return I387_CW_ANY;
14788
14789 mode = get_attr_i387_cw (insn);
14790
14791 switch (entity)
14792 {
14793 case I387_TRUNC:
14794 if (mode == I387_CW_TRUNC)
14795 return mode;
14796 break;
14797
14798 case I387_FLOOR:
14799 if (mode == I387_CW_FLOOR)
14800 return mode;
14801 break;
14802
14803 case I387_CEIL:
14804 if (mode == I387_CW_CEIL)
14805 return mode;
14806 break;
14807
14808 case I387_MASK_PM:
14809 if (mode == I387_CW_MASK_PM)
14810 return mode;
14811 break;
14812
14813 default:
14814 gcc_unreachable ();
14815 }
14816
14817 return I387_CW_ANY;
14818 }
14819
14820 /* Output code to initialize the control word copies used by trunc?f?i and
14821 rounding patterns. MODE selects the I387_CW_* variant: the current control
14822 word is saved and a modified copy is stored in the stack slot for MODE. */
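/* The constants used below follow the x87 control word layout: bits 10-11
   hold the rounding control (00 = to nearest, 01 = down, 10 = up,
   11 = truncate, hence 0x0400, 0x0800 and 0x0c00), and bit 5 (0x0020)
   masks the precision exception.  */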
14823
14824 void
14825 emit_i387_cw_initialization (int mode)
14826 {
14827 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
14828 rtx new_mode;
14829
14830 enum ix86_stack_slot slot;
14831
14832 rtx reg = gen_reg_rtx (HImode);
14833
14834 emit_insn (gen_x86_fnstcw_1 (stored_mode));
14835 emit_move_insn (reg, copy_rtx (stored_mode));
14836
14837 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
14838 || optimize_function_for_size_p (cfun))
14839 {
14840 switch (mode)
14841 {
14842 case I387_CW_TRUNC:
14843 /* round toward zero (truncate) */
14844 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
14845 slot = SLOT_CW_TRUNC;
14846 break;
14847
14848 case I387_CW_FLOOR:
14849 /* round down toward -oo */
14850 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
14851 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
14852 slot = SLOT_CW_FLOOR;
14853 break;
14854
14855 case I387_CW_CEIL:
14856 /* round up toward +oo */
14857 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
14858 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
14859 slot = SLOT_CW_CEIL;
14860 break;
14861
14862 case I387_CW_MASK_PM:
14863 /* mask precision exception for nearbyint() */
14864 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
14865 slot = SLOT_CW_MASK_PM;
14866 break;
14867
14868 default:
14869 gcc_unreachable ();
14870 }
14871 }
14872 else
14873 {
14874 switch (mode)
14875 {
14876 case I387_CW_TRUNC:
14877 /* round toward zero (truncate) */
14878 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
14879 slot = SLOT_CW_TRUNC;
14880 break;
14881
14882 case I387_CW_FLOOR:
14883 /* round down toward -oo */
14884 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
14885 slot = SLOT_CW_FLOOR;
14886 break;
14887
14888 case I387_CW_CEIL:
14889 /* round up toward +oo */
14890 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
14891 slot = SLOT_CW_CEIL;
14892 break;
14893
14894 case I387_CW_MASK_PM:
14895 /* mask precision exception for nearbyint() */
14896 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
14897 slot = SLOT_CW_MASK_PM;
14898 break;
14899
14900 default:
14901 gcc_unreachable ();
14902 }
14903 }
14904
14905 gcc_assert (slot < MAX_386_STACK_LOCALS);
14906
14907 new_mode = assign_386_stack_local (HImode, slot);
14908 emit_move_insn (new_mode, reg);
14909 }
14910
14911 /* Output code for INSN to convert a float to a signed int. OPERANDS
14912 are the insn operands. The output may be [HSD]Imode and the input
14913 operand may be [SDX]Fmode. */
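/* fist/fistp round according to the current x87 control word, so unless the
   SSE3 fisttp instruction (which always truncates) is used, the control word
   is temporarily switched: operand 3 holds the prepared rounding-mode copy
   and operand 2 the original control word.  */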
14914
14915 const char *
14916 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
14917 {
14918 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
14919 int dimode_p = GET_MODE (operands[0]) == DImode;
14920 int round_mode = get_attr_i387_cw (insn);
14921
14922 /* Jump through a hoop or two for DImode, since the hardware has no
14923 non-popping instruction. We used to do this a different way, but
14924 that was somewhat fragile and broke with post-reload splitters. */
14925 if ((dimode_p || fisttp) && !stack_top_dies)
14926 output_asm_insn ("fld\t%y1", operands);
14927
14928 gcc_assert (STACK_TOP_P (operands[1]));
14929 gcc_assert (MEM_P (operands[0]));
14930 gcc_assert (GET_MODE (operands[1]) != TFmode);
14931
14932 if (fisttp)
14933 output_asm_insn ("fisttp%Z0\t%0", operands);
14934 else
14935 {
14936 if (round_mode != I387_CW_ANY)
14937 output_asm_insn ("fldcw\t%3", operands);
14938 if (stack_top_dies || dimode_p)
14939 output_asm_insn ("fistp%Z0\t%0", operands);
14940 else
14941 output_asm_insn ("fist%Z0\t%0", operands);
14942 if (round_mode != I387_CW_ANY)
14943 output_asm_insn ("fldcw\t%2", operands);
14944 }
14945
14946 return "";
14947 }
14948
14949 /* Output code for x87 ffreep insn. The OPNO argument, which may only
14950 have the values zero or one, indicates the ffreep insn's operand
14951 from the OPERANDS array. */
14952
14953 static const char *
14954 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
14955 {
14956 if (TARGET_USE_FFREEP)
14957 #ifdef HAVE_AS_IX86_FFREEP
14958 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
14959 #else
14960 {
14961 static char retval[32];
14962 int regno = REGNO (operands[opno]);
14963
14964 gcc_assert (FP_REGNO_P (regno));
14965
14966 regno -= FIRST_STACK_REG;
14967
14968 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
14969 return retval;
14970 }
14971 #endif
14972
14973 return opno ? "fstp\t%y1" : "fstp\t%y0";
14974 }
14975
14976
14977 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
14978 should be used. UNORDERED_P is true when fucom should be used. */
14979
14980 const char *
14981 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
14982 {
14983 int stack_top_dies;
14984 rtx cmp_op0, cmp_op1;
14985 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
14986
14987 if (eflags_p)
14988 {
14989 cmp_op0 = operands[0];
14990 cmp_op1 = operands[1];
14991 }
14992 else
14993 {
14994 cmp_op0 = operands[1];
14995 cmp_op1 = operands[2];
14996 }
14997
14998 if (is_sse)
14999 {
15000 if (GET_MODE (operands[0]) == SFmode)
15001 if (unordered_p)
15002 return "%vucomiss\t{%1, %0|%0, %1}";
15003 else
15004 return "%vcomiss\t{%1, %0|%0, %1}";
15005 else
15006 if (unordered_p)
15007 return "%vucomisd\t{%1, %0|%0, %1}";
15008 else
15009 return "%vcomisd\t{%1, %0|%0, %1}";
15010 }
15011
15012 gcc_assert (STACK_TOP_P (cmp_op0));
15013
15014 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15015
15016 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15017 {
15018 if (stack_top_dies)
15019 {
15020 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15021 return output_387_ffreep (operands, 1);
15022 }
15023 else
15024 return "ftst\n\tfnstsw\t%0";
15025 }
15026
15027 if (STACK_REG_P (cmp_op1)
15028 && stack_top_dies
15029 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15030 && REGNO (cmp_op1) != FIRST_STACK_REG)
15031 {
15032 /* If the top of the 387 stack dies, and the other operand
15033 is also a stack register that dies, then this must be an
15034 `fcompp' float compare. */
15035
15036 if (eflags_p)
15037 {
15038 /* There is no double popping fcomi variant. Fortunately,
15039 eflags is immune from the fstp's cc clobbering. */
15040 if (unordered_p)
15041 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15042 else
15043 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15044 return output_387_ffreep (operands, 0);
15045 }
15046 else
15047 {
15048 if (unordered_p)
15049 return "fucompp\n\tfnstsw\t%0";
15050 else
15051 return "fcompp\n\tfnstsw\t%0";
15052 }
15053 }
15054 else
15055 {
15056 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
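	 /* For example, mask 10 (eflags_p and unordered_p set) selects "fucomi",
	    and mask 3 (unordered_p and stack_top_dies set) selects "fucomp".  */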
15057
15058 static const char * const alt[16] =
15059 {
15060 "fcom%Z2\t%y2\n\tfnstsw\t%0",
15061 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
15062 "fucom%Z2\t%y2\n\tfnstsw\t%0",
15063 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
15064
15065 "ficom%Z2\t%y2\n\tfnstsw\t%0",
15066 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
15067 NULL,
15068 NULL,
15069
15070 "fcomi\t{%y1, %0|%0, %y1}",
15071 "fcomip\t{%y1, %0|%0, %y1}",
15072 "fucomi\t{%y1, %0|%0, %y1}",
15073 "fucomip\t{%y1, %0|%0, %y1}",
15074
15075 NULL,
15076 NULL,
15077 NULL,
15078 NULL
15079 };
15080
15081 int mask;
15082 const char *ret;
15083
15084 mask = eflags_p << 3;
15085 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15086 mask |= unordered_p << 1;
15087 mask |= stack_top_dies;
15088
15089 gcc_assert (mask < 16);
15090 ret = alt[mask];
15091 gcc_assert (ret);
15092
15093 return ret;
15094 }
15095 }
15096
15097 void
15098 ix86_output_addr_vec_elt (FILE *file, int value)
15099 {
15100 const char *directive = ASM_LONG;
15101
15102 #ifdef ASM_QUAD
15103 if (TARGET_LP64)
15104 directive = ASM_QUAD;
15105 #else
15106 gcc_assert (!TARGET_64BIT);
15107 #endif
15108
15109 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15110 }
15111
15112 void
15113 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15114 {
15115 const char *directive = ASM_LONG;
15116
15117 #ifdef ASM_QUAD
15118 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15119 directive = ASM_QUAD;
15120 #else
15121 gcc_assert (!TARGET_64BIT);
15122 #endif
15123 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
15124 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
15125 fprintf (file, "%s%s%d-%s%d\n",
15126 directive, LPREFIX, value, LPREFIX, rel);
15127 else if (HAVE_AS_GOTOFF_IN_DATA)
15128 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
15129 #if TARGET_MACHO
15130 else if (TARGET_MACHO)
15131 {
15132 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
15133 machopic_output_function_base_name (file);
15134 putc ('\n', file);
15135 }
15136 #endif
15137 else
15138 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
15139 GOT_SYMBOL_NAME, LPREFIX, value);
15140 }
15141 \f
15142 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
15143 for the target. */
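/* "xor reg, reg" has a shorter encoding than "mov $0, reg" (2 bytes vs. 5
   for a 32-bit register) but clobbers the flags, which is why the SET is
   wrapped in a PARALLEL with a FLAGS_REG clobber to match the
   movsi_xor/movdi_xor_rex64 patterns.  */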
15144
15145 void
15146 ix86_expand_clear (rtx dest)
15147 {
15148 rtx tmp;
15149
15150 /* We play register width games, which are only valid after reload. */
15151 gcc_assert (reload_completed);
15152
15153 /* Avoid HImode and its attendant prefix byte. */
15154 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
15155 dest = gen_rtx_REG (SImode, REGNO (dest));
15156 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
15157
15158 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
15159 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
15160 {
15161 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15162 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
15163 }
15164
15165 emit_insn (tmp);
15166 }
15167
15168 /* X is an unchanging MEM. If it is a constant pool reference, return
15169 the constant pool rtx, else NULL. */
15170
15171 rtx
15172 maybe_get_pool_constant (rtx x)
15173 {
15174 x = ix86_delegitimize_address (XEXP (x, 0));
15175
15176 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
15177 return get_pool_constant (x);
15178
15179 return NULL_RTX;
15180 }
15181
15182 void
15183 ix86_expand_move (enum machine_mode mode, rtx operands[])
15184 {
15185 rtx op0, op1;
15186 enum tls_model model;
15187
15188 op0 = operands[0];
15189 op1 = operands[1];
15190
15191 if (GET_CODE (op1) == SYMBOL_REF)
15192 {
15193 model = SYMBOL_REF_TLS_MODEL (op1);
15194 if (model)
15195 {
15196 op1 = legitimize_tls_address (op1, model, true);
15197 op1 = force_operand (op1, op0);
15198 if (op1 == op0)
15199 return;
15200 if (GET_MODE (op1) != mode)
15201 op1 = convert_to_mode (mode, op1, 1);
15202 }
15203 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15204 && SYMBOL_REF_DLLIMPORT_P (op1))
15205 op1 = legitimize_dllimport_symbol (op1, false);
15206 }
15207 else if (GET_CODE (op1) == CONST
15208 && GET_CODE (XEXP (op1, 0)) == PLUS
15209 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
15210 {
15211 rtx addend = XEXP (XEXP (op1, 0), 1);
15212 rtx symbol = XEXP (XEXP (op1, 0), 0);
15213 rtx tmp = NULL;
15214
15215 model = SYMBOL_REF_TLS_MODEL (symbol);
15216 if (model)
15217 tmp = legitimize_tls_address (symbol, model, true);
15218 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15219 && SYMBOL_REF_DLLIMPORT_P (symbol))
15220 tmp = legitimize_dllimport_symbol (symbol, true);
15221
15222 if (tmp)
15223 {
15224 tmp = force_operand (tmp, NULL);
15225 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
15226 op0, 1, OPTAB_DIRECT);
15227 if (tmp == op0)
15228 return;
15229 if (GET_MODE (tmp) != mode)
15230 op1 = convert_to_mode (mode, tmp, 1);
15231 }
15232 }
15233
15234 if ((flag_pic || MACHOPIC_INDIRECT)
15235 && symbolic_operand (op1, mode))
15236 {
15237 if (TARGET_MACHO && !TARGET_64BIT)
15238 {
15239 #if TARGET_MACHO
15240 /* dynamic-no-pic */
15241 if (MACHOPIC_INDIRECT)
15242 {
15243 rtx temp = ((reload_in_progress
15244 || ((op0 && REG_P (op0))
15245 && mode == Pmode))
15246 ? op0 : gen_reg_rtx (Pmode));
15247 op1 = machopic_indirect_data_reference (op1, temp);
15248 if (MACHOPIC_PURE)
15249 op1 = machopic_legitimize_pic_address (op1, mode,
15250 temp == op1 ? 0 : temp);
15251 }
15252 if (op0 != op1 && GET_CODE (op0) != MEM)
15253 {
15254 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
15255 emit_insn (insn);
15256 return;
15257 }
15258 if (GET_CODE (op0) == MEM)
15259 op1 = force_reg (Pmode, op1);
15260 else
15261 {
15262 rtx temp = op0;
15263 if (GET_CODE (temp) != REG)
15264 temp = gen_reg_rtx (Pmode);
15265 temp = legitimize_pic_address (op1, temp);
15266 if (temp == op0)
15267 return;
15268 op1 = temp;
15269 }
15270 /* dynamic-no-pic */
15271 #endif
15272 }
15273 else
15274 {
15275 if (MEM_P (op0))
15276 op1 = force_reg (mode, op1);
15277 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
15278 {
15279 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
15280 op1 = legitimize_pic_address (op1, reg);
15281 if (op0 == op1)
15282 return;
15283 if (GET_MODE (op1) != mode)
15284 op1 = convert_to_mode (mode, op1, 1);
15285 }
15286 }
15287 }
15288 else
15289 {
15290 if (MEM_P (op0)
15291 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
15292 || !push_operand (op0, mode))
15293 && MEM_P (op1))
15294 op1 = force_reg (mode, op1);
15295
15296 if (push_operand (op0, mode)
15297 && ! general_no_elim_operand (op1, mode))
15298 op1 = copy_to_mode_reg (mode, op1);
15299
15300 /* Force large constants in 64bit compilation into a register
15301 to get them CSEed. */
15302 if (can_create_pseudo_p ()
15303 && (mode == DImode) && TARGET_64BIT
15304 && immediate_operand (op1, mode)
15305 && !x86_64_zext_immediate_operand (op1, VOIDmode)
15306 && !register_operand (op0, mode)
15307 && optimize)
15308 op1 = copy_to_mode_reg (mode, op1);
15309
15310 if (can_create_pseudo_p ()
15311 && FLOAT_MODE_P (mode)
15312 && GET_CODE (op1) == CONST_DOUBLE)
15313 {
15314 /* If we are loading a floating point constant to a register,
15315 force the value to memory now, since we'll get better code
15316 out the back end. */
15317
15318 op1 = validize_mem (force_const_mem (mode, op1));
15319 if (!register_operand (op0, mode))
15320 {
15321 rtx temp = gen_reg_rtx (mode);
15322 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
15323 emit_move_insn (op0, temp);
15324 return;
15325 }
15326 }
15327 }
15328
15329 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15330 }
15331
15332 void
15333 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
15334 {
15335 rtx op0 = operands[0], op1 = operands[1];
15336 unsigned int align = GET_MODE_ALIGNMENT (mode);
15337
15338 /* Force constants other than zero into memory. We do not know how
15339 the instructions used to build constants modify the upper 64 bits
15340 of the register; once we have that information, we may be able
15341 to handle some of them more efficiently. */
15342 if (can_create_pseudo_p ()
15343 && register_operand (op0, mode)
15344 && (CONSTANT_P (op1)
15345 || (GET_CODE (op1) == SUBREG
15346 && CONSTANT_P (SUBREG_REG (op1))))
15347 && !standard_sse_constant_p (op1))
15348 op1 = validize_mem (force_const_mem (mode, op1));
15349
15350 /* We need to check memory alignment for SSE mode since attributes
15351 can make operands unaligned. */
15352 if (can_create_pseudo_p ()
15353 && SSE_REG_MODE_P (mode)
15354 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
15355 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
15356 {
15357 rtx tmp[2];
15358
15359 /* ix86_expand_vector_move_misalign() does not like constants ... */
15360 if (CONSTANT_P (op1)
15361 || (GET_CODE (op1) == SUBREG
15362 && CONSTANT_P (SUBREG_REG (op1))))
15363 op1 = validize_mem (force_const_mem (mode, op1));
15364
15365 /* ... nor both arguments in memory. */
15366 if (!register_operand (op0, mode)
15367 && !register_operand (op1, mode))
15368 op1 = force_reg (mode, op1);
15369
15370 tmp[0] = op0; tmp[1] = op1;
15371 ix86_expand_vector_move_misalign (mode, tmp);
15372 return;
15373 }
15374
15375 /* If neither operand is a register, force operand 1 into one. */
15376 if (can_create_pseudo_p ()
15377 && !register_operand (op0, mode)
15378 && !register_operand (op1, mode))
15379 {
15380 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
15381 return;
15382 }
15383
15384 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15385 }
15386
15387 /* Split 32-byte AVX unaligned load and store if needed. */
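/* Under TARGET_AVX256_SPLIT_UNALIGNED_LOAD a 32-byte load is emitted as a
   16-byte load of the low half followed by a VEC_CONCAT with the high half
   (typically a vinsertf128); under TARGET_AVX256_SPLIT_UNALIGNED_STORE the
   two halves are written out with vextractf128.  Otherwise a single 32-byte
   unaligned move is used.  */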
15388
15389 static void
15390 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
15391 {
15392 rtx m;
15393 rtx (*extract) (rtx, rtx, rtx);
15394 rtx (*move_unaligned) (rtx, rtx);
15395 enum machine_mode mode;
15396
15397 switch (GET_MODE (op0))
15398 {
15399 default:
15400 gcc_unreachable ();
15401 case V32QImode:
15402 extract = gen_avx_vextractf128v32qi;
15403 move_unaligned = gen_avx_movdqu256;
15404 mode = V16QImode;
15405 break;
15406 case V8SFmode:
15407 extract = gen_avx_vextractf128v8sf;
15408 move_unaligned = gen_avx_movups256;
15409 mode = V4SFmode;
15410 break;
15411 case V4DFmode:
15412 extract = gen_avx_vextractf128v4df;
15413 move_unaligned = gen_avx_movupd256;
15414 mode = V2DFmode;
15415 break;
15416 }
15417
15418 if (MEM_P (op1) && TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
15419 {
15420 rtx r = gen_reg_rtx (mode);
15421 m = adjust_address (op1, mode, 0);
15422 emit_move_insn (r, m);
15423 m = adjust_address (op1, mode, 16);
15424 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
15425 emit_move_insn (op0, r);
15426 }
15427 else if (MEM_P (op0) && TARGET_AVX256_SPLIT_UNALIGNED_STORE)
15428 {
15429 m = adjust_address (op0, mode, 0);
15430 emit_insn (extract (m, op1, const0_rtx));
15431 m = adjust_address (op0, mode, 16);
15432 emit_insn (extract (m, op1, const1_rtx));
15433 }
15434 else
15435 emit_insn (move_unaligned (op0, op1));
15436 }
15437
15438 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
15439 straight to ix86_expand_vector_move. */
15440 /* Code generation for scalar reg-reg moves of single and double precision data:
15441 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
15442 movaps reg, reg
15443 else
15444 movss reg, reg
15445 if (x86_sse_partial_reg_dependency == true)
15446 movapd reg, reg
15447 else
15448 movsd reg, reg
15449
15450 Code generation for scalar loads of double precision data:
15451 if (x86_sse_split_regs == true)
15452 movlpd mem, reg (gas syntax)
15453 else
15454 movsd mem, reg
15455
15456 Code generation for unaligned packed loads of single precision data
15457 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
15458 if (x86_sse_unaligned_move_optimal)
15459 movups mem, reg
15460
15461 if (x86_sse_partial_reg_dependency == true)
15462 {
15463 xorps reg, reg
15464 movlps mem, reg
15465 movhps mem+8, reg
15466 }
15467 else
15468 {
15469 movlps mem, reg
15470 movhps mem+8, reg
15471 }
15472
15473 Code generation for unaligned packed loads of double precision data
15474 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
15475 if (x86_sse_unaligned_move_optimal)
15476 movupd mem, reg
15477
15478 if (x86_sse_split_regs == true)
15479 {
15480 movlpd mem, reg
15481 movhpd mem+8, reg
15482 }
15483 else
15484 {
15485 movsd mem, reg
15486 movhpd mem+8, reg
15487 }
15488 */
15489
15490 void
15491 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
15492 {
15493 rtx op0, op1, m;
15494
15495 op0 = operands[0];
15496 op1 = operands[1];
15497
15498 if (TARGET_AVX)
15499 {
15500 switch (GET_MODE_CLASS (mode))
15501 {
15502 case MODE_VECTOR_INT:
15503 case MODE_INT:
15504 switch (GET_MODE_SIZE (mode))
15505 {
15506 case 16:
15507 /* If we're optimizing for size, movups is the smallest. */
15508 if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15509 {
15510 op0 = gen_lowpart (V4SFmode, op0);
15511 op1 = gen_lowpart (V4SFmode, op1);
15512 emit_insn (gen_sse_movups (op0, op1));
15513 return;
15514 }
15515 op0 = gen_lowpart (V16QImode, op0);
15516 op1 = gen_lowpart (V16QImode, op1);
15517 emit_insn (gen_sse2_movdqu (op0, op1));
15518 break;
15519 case 32:
15520 op0 = gen_lowpart (V32QImode, op0);
15521 op1 = gen_lowpart (V32QImode, op1);
15522 ix86_avx256_split_vector_move_misalign (op0, op1);
15523 break;
15524 default:
15525 gcc_unreachable ();
15526 }
15527 break;
15528 case MODE_VECTOR_FLOAT:
15529 op0 = gen_lowpart (mode, op0);
15530 op1 = gen_lowpart (mode, op1);
15531
15532 switch (mode)
15533 {
15534 case V4SFmode:
15535 emit_insn (gen_sse_movups (op0, op1));
15536 break;
15537 case V8SFmode:
15538 ix86_avx256_split_vector_move_misalign (op0, op1);
15539 break;
15540 case V2DFmode:
15541 if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15542 {
15543 op0 = gen_lowpart (V4SFmode, op0);
15544 op1 = gen_lowpart (V4SFmode, op1);
15545 emit_insn (gen_sse_movups (op0, op1));
15546 return;
15547 }
15548 emit_insn (gen_sse2_movupd (op0, op1));
15549 break;
15550 case V4DFmode:
15551 ix86_avx256_split_vector_move_misalign (op0, op1);
15552 break;
15553 default:
15554 gcc_unreachable ();
15555 }
15556 break;
15557
15558 default:
15559 gcc_unreachable ();
15560 }
15561
15562 return;
15563 }
15564
15565 if (MEM_P (op1))
15566 {
15567 /* If we're optimizing for size, movups is the smallest. */
15568 if (optimize_insn_for_size_p ()
15569 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15570 {
15571 op0 = gen_lowpart (V4SFmode, op0);
15572 op1 = gen_lowpart (V4SFmode, op1);
15573 emit_insn (gen_sse_movups (op0, op1));
15574 return;
15575 }
15576
15577 /* ??? If we have typed data, then it would appear that using
15578 movdqu is the only way to get unaligned data loaded with
15579 integer type. */
15580 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15581 {
15582 op0 = gen_lowpart (V16QImode, op0);
15583 op1 = gen_lowpart (V16QImode, op1);
15584 emit_insn (gen_sse2_movdqu (op0, op1));
15585 return;
15586 }
15587
15588 if (TARGET_SSE2 && mode == V2DFmode)
15589 {
15590 rtx zero;
15591
15592 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
15593 {
15594 op0 = gen_lowpart (V2DFmode, op0);
15595 op1 = gen_lowpart (V2DFmode, op1);
15596 emit_insn (gen_sse2_movupd (op0, op1));
15597 return;
15598 }
15599
15600 /* When SSE registers are split into halves, we can avoid
15601 writing to the top half twice. */
15602 if (TARGET_SSE_SPLIT_REGS)
15603 {
15604 emit_clobber (op0);
15605 zero = op0;
15606 }
15607 else
15608 {
15609 /* ??? Not sure about the best option for the Intel chips.
15610 The following would seem to satisfy; the register is
15611 entirely cleared, breaking the dependency chain. We
15612 then store to the upper half, with a dependency depth
15613 of one. A rumor has it that Intel recommends two movsd
15614 followed by an unpacklpd, but this is unconfirmed. And
15615 given that the dependency depth of the unpacklpd would
15616 still be one, I'm not sure why this would be better. */
15617 zero = CONST0_RTX (V2DFmode);
15618 }
15619
15620 m = adjust_address (op1, DFmode, 0);
15621 emit_insn (gen_sse2_loadlpd (op0, zero, m));
15622 m = adjust_address (op1, DFmode, 8);
15623 emit_insn (gen_sse2_loadhpd (op0, op0, m));
15624 }
15625 else
15626 {
15627 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
15628 {
15629 op0 = gen_lowpart (V4SFmode, op0);
15630 op1 = gen_lowpart (V4SFmode, op1);
15631 emit_insn (gen_sse_movups (op0, op1));
15632 return;
15633 }
15634
15635 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
15636 emit_move_insn (op0, CONST0_RTX (mode));
15637 else
15638 emit_clobber (op0);
15639
15640 if (mode != V4SFmode)
15641 op0 = gen_lowpart (V4SFmode, op0);
15642 m = adjust_address (op1, V2SFmode, 0);
15643 emit_insn (gen_sse_loadlps (op0, op0, m));
15644 m = adjust_address (op1, V2SFmode, 8);
15645 emit_insn (gen_sse_loadhps (op0, op0, m));
15646 }
15647 }
15648 else if (MEM_P (op0))
15649 {
15650 /* If we're optimizing for size, movups is the smallest. */
15651 if (optimize_insn_for_size_p ()
15652 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15653 {
15654 op0 = gen_lowpart (V4SFmode, op0);
15655 op1 = gen_lowpart (V4SFmode, op1);
15656 emit_insn (gen_sse_movups (op0, op1));
15657 return;
15658 }
15659
15660 /* ??? Similar to above, only less clear because of quote
15661 typeless stores unquote. */
15662 if (TARGET_SSE2 && !TARGET_SSE_TYPELESS_STORES
15663 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15664 {
15665 op0 = gen_lowpart (V16QImode, op0);
15666 op1 = gen_lowpart (V16QImode, op1);
15667 emit_insn (gen_sse2_movdqu (op0, op1));
15668 return;
15669 }
15670
15671 if (TARGET_SSE2 && mode == V2DFmode)
15672 {
15673 if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
15674 {
15675 op0 = gen_lowpart (V2DFmode, op0);
15676 op1 = gen_lowpart (V2DFmode, op1);
15677 emit_insn (gen_sse2_movupd (op0, op1));
15678 }
15679 else
15680 {
15681 m = adjust_address (op0, DFmode, 0);
15682 emit_insn (gen_sse2_storelpd (m, op1));
15683 m = adjust_address (op0, DFmode, 8);
15684 emit_insn (gen_sse2_storehpd (m, op1));
15685 }
15686 }
15687 else
15688 {
15689 if (mode != V4SFmode)
15690 op1 = gen_lowpart (V4SFmode, op1);
15691
15692 if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
15693 {
15694 op0 = gen_lowpart (V4SFmode, op0);
15695 emit_insn (gen_sse_movups (op0, op1));
15696 }
15697 else
15698 {
15699 m = adjust_address (op0, V2SFmode, 0);
15700 emit_insn (gen_sse_storelps (m, op1));
15701 m = adjust_address (op0, V2SFmode, 8);
15702 emit_insn (gen_sse_storehps (m, op1));
15703 }
15704 }
15705 }
15706 else
15707 gcc_unreachable ();
15708 }
15709
15710 /* Expand a push in MODE. This is some mode for which we do not support
15711 proper push instructions, at least from the registers that we expect
15712 the value to live in. */
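/* The expansion below is simply "sub $GET_MODE_SIZE (MODE), %sp" followed by
   a MODE-sized store to the new top of stack; it does not rely on a real
   push instruction.  */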
15713
15714 void
15715 ix86_expand_push (enum machine_mode mode, rtx x)
15716 {
15717 rtx tmp;
15718
15719 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
15720 GEN_INT (-GET_MODE_SIZE (mode)),
15721 stack_pointer_rtx, 1, OPTAB_DIRECT);
15722 if (tmp != stack_pointer_rtx)
15723 emit_move_insn (stack_pointer_rtx, tmp);
15724
15725 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
15726
15727 /* When we push an operand onto the stack, it has to be aligned at least
15728 at the function argument boundary. However, since we don't have
15729 the argument type, we can't determine the actual argument
15730 boundary. */
15731 emit_move_insn (tmp, x);
15732 }
15733
15734 /* Helper function of ix86_fixup_binary_operands to canonicalize
15735 operand order. Returns true if the operands should be swapped. */
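/* For example, for (plus:SI (mem) (reg)) with the destination equal to the
   register operand, swapping makes the register src1 (matching the
   destination) and the memory operand src2.  */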
15736
15737 static bool
15738 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
15739 rtx operands[])
15740 {
15741 rtx dst = operands[0];
15742 rtx src1 = operands[1];
15743 rtx src2 = operands[2];
15744
15745 /* If the operation is not commutative, we can't do anything. */
15746 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
15747 return false;
15748
15749 /* Highest priority is that src1 should match dst. */
15750 if (rtx_equal_p (dst, src1))
15751 return false;
15752 if (rtx_equal_p (dst, src2))
15753 return true;
15754
15755 /* Next highest priority is that immediate constants come second. */
15756 if (immediate_operand (src2, mode))
15757 return false;
15758 if (immediate_operand (src1, mode))
15759 return true;
15760
15761 /* Lowest priority is that memory references should come second. */
15762 if (MEM_P (src2))
15763 return false;
15764 if (MEM_P (src1))
15765 return true;
15766
15767 return false;
15768 }
15769
15770
15771 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
15772 destination to use for the operation. If different from the true
15773 destination in operands[0], a copy operation will be required. */
15774
15775 rtx
15776 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
15777 rtx operands[])
15778 {
15779 rtx dst = operands[0];
15780 rtx src1 = operands[1];
15781 rtx src2 = operands[2];
15782
15783 /* Canonicalize operand order. */
15784 if (ix86_swap_binary_operands_p (code, mode, operands))
15785 {
15786 rtx temp;
15787
15788 /* It is invalid to swap operands of different modes. */
15789 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
15790
15791 temp = src1;
15792 src1 = src2;
15793 src2 = temp;
15794 }
15795
15796 /* Both source operands cannot be in memory. */
15797 if (MEM_P (src1) && MEM_P (src2))
15798 {
15799 /* Optimization: Only read from memory once. */
15800 if (rtx_equal_p (src1, src2))
15801 {
15802 src2 = force_reg (mode, src2);
15803 src1 = src2;
15804 }
15805 else
15806 src2 = force_reg (mode, src2);
15807 }
15808
15809 /* If the destination is memory, and we do not have matching source
15810 operands, do things in registers. */
15811 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
15812 dst = gen_reg_rtx (mode);
15813
15814 /* Source 1 cannot be a constant. */
15815 if (CONSTANT_P (src1))
15816 src1 = force_reg (mode, src1);
15817
15818 /* Source 1 cannot be a non-matching memory. */
15819 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
15820 src1 = force_reg (mode, src1);
15821
15822 /* Improve address combine. */
15823 if (code == PLUS
15824 && GET_MODE_CLASS (mode) == MODE_INT
15825 && MEM_P (src2))
15826 src2 = force_reg (mode, src2);
15827
15828 operands[1] = src1;
15829 operands[2] = src2;
15830 return dst;
15831 }
15832
15833 /* Similarly, but assume that the destination has already been
15834 set up properly. */
15835
15836 void
15837 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
15838 enum machine_mode mode, rtx operands[])
15839 {
15840 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
15841 gcc_assert (dst == operands[0]);
15842 }
15843
15844 /* Attempt to expand a binary operator. Make the expansion closer to the
15845 actual machine than just general_operand, which would allow 3 separate
15846 memory references (one output, two inputs) in a single insn. */
15847
15848 void
15849 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
15850 rtx operands[])
15851 {
15852 rtx src1, src2, dst, op, clob;
15853
15854 dst = ix86_fixup_binary_operands (code, mode, operands);
15855 src1 = operands[1];
15856 src2 = operands[2];
15857
15858 /* Emit the instruction. */
15859
15860 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
15861 if (reload_in_progress)
15862 {
15863 /* Reload doesn't know about the flags register, and doesn't know that
15864 it doesn't want to clobber it. We can only do this with PLUS. */
15865 gcc_assert (code == PLUS);
15866 emit_insn (op);
15867 }
15868 else if (reload_completed
15869 && code == PLUS
15870 && !rtx_equal_p (dst, src1))
15871 {
15872 /* This is going to be an LEA; avoid splitting it later. */
15873 emit_insn (op);
15874 }
15875 else
15876 {
15877 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15878 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
15879 }
15880
15881 /* Fix up the destination if needed. */
15882 if (dst != operands[0])
15883 emit_move_insn (operands[0], dst);
15884 }
15885
15886 /* Return TRUE or FALSE depending on whether the binary operator meets the
15887 appropriate constraints. */
15888
15889 bool
15890 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
15891 rtx operands[3])
15892 {
15893 rtx dst = operands[0];
15894 rtx src1 = operands[1];
15895 rtx src2 = operands[2];
15896
15897 /* Both source operands cannot be in memory. */
15898 if (MEM_P (src1) && MEM_P (src2))
15899 return false;
15900
15901 /* Canonicalize operand order for commutative operators. */
15902 if (ix86_swap_binary_operands_p (code, mode, operands))
15903 {
15904 rtx temp = src1;
15905 src1 = src2;
15906 src2 = temp;
15907 }
15908
15909 /* If the destination is memory, we must have a matching source operand. */
15910 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
15911 return false;
15912
15913 /* Source 1 cannot be a constant. */
15914 if (CONSTANT_P (src1))
15915 return false;
15916
15917 /* Source 1 cannot be a non-matching memory. */
15918 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
15919 /* Support "andhi/andsi/anddi" as a zero-extending move. */
15920 return (code == AND
15921 && (mode == HImode
15922 || mode == SImode
15923 || (TARGET_64BIT && mode == DImode))
15924 && satisfies_constraint_L (src2));
15925
15926 return true;
15927 }
15928
15929 /* Attempt to expand a unary operator. Make the expansion closer to the
15930 actual machine than just general_operand, which would allow 2 separate
15931 memory references (one output, one input) in a single insn. */
15932
15933 void
15934 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
15935 rtx operands[])
15936 {
15937 int matching_memory;
15938 rtx src, dst, op, clob;
15939
15940 dst = operands[0];
15941 src = operands[1];
15942
15943 /* If the destination is memory, and we do not have matching source
15944 operands, do things in registers. */
15945 matching_memory = 0;
15946 if (MEM_P (dst))
15947 {
15948 if (rtx_equal_p (dst, src))
15949 matching_memory = 1;
15950 else
15951 dst = gen_reg_rtx (mode);
15952 }
15953
15954 /* When source operand is memory, destination must match. */
15955 if (MEM_P (src) && !matching_memory)
15956 src = force_reg (mode, src);
15957
15958 /* Emit the instruction. */
15959
15960 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
15961 if (reload_in_progress || code == NOT)
15962 {
15963 /* Reload doesn't know about the flags register, and doesn't know that
15964 it doesn't want to clobber it. */
15965 gcc_assert (code == NOT);
15966 emit_insn (op);
15967 }
15968 else
15969 {
15970 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15971 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
15972 }
15973
15974 /* Fix up the destination if needed. */
15975 if (dst != operands[0])
15976 emit_move_insn (operands[0], dst);
15977 }
15978
15979 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
15980 divisor are within the range [0-255]. */
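/* A rough sketch of the emitted sequence (label names are illustrative):

       mov     dividend, scratch
       or      divisor, scratch
       test    $-0x100, scratch
       je      .Lqimode                ; both values fit in 8 bits
       <full signed/unsigned divmod>
       jmp     .Lend
   .Lqimode:
       <8-bit unsigned divide; quotient in AL, remainder in AH>
   .Lend:                                                              */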
15981
15982 void
15983 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
15984 bool signed_p)
15985 {
15986 rtx end_label, qimode_label;
15987 rtx insn, div, mod;
15988 rtx scratch, tmp0, tmp1, tmp2;
15989 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
15990 rtx (*gen_zero_extend) (rtx, rtx);
15991 rtx (*gen_test_ccno_1) (rtx, rtx);
15992
15993 switch (mode)
15994 {
15995 case SImode:
15996 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
15997 gen_test_ccno_1 = gen_testsi_ccno_1;
15998 gen_zero_extend = gen_zero_extendqisi2;
15999 break;
16000 case DImode:
16001 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16002 gen_test_ccno_1 = gen_testdi_ccno_1;
16003 gen_zero_extend = gen_zero_extendqidi2;
16004 break;
16005 default:
16006 gcc_unreachable ();
16007 }
16008
16009 end_label = gen_label_rtx ();
16010 qimode_label = gen_label_rtx ();
16011
16012 scratch = gen_reg_rtx (mode);
16013
16014 /* Use 8bit unsigned divmod if dividend and divisor are within
16015 the range [0-255]. */
16016 emit_move_insn (scratch, operands[2]);
16017 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16018 scratch, 1, OPTAB_DIRECT);
16019 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
16020 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16021 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16022 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16023 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16024 pc_rtx);
16025 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16026 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16027 JUMP_LABEL (insn) = qimode_label;
16028
16029 /* Generate the original signed/unsigned divmod. */
16030 div = gen_divmod4_1 (operands[0], operands[1],
16031 operands[2], operands[3]);
16032 emit_insn (div);
16033
16034 /* Branch to the end. */
16035 emit_jump_insn (gen_jump (end_label));
16036 emit_barrier ();
16037
16038 /* Generate 8bit unsigned divide. */
16039 emit_label (qimode_label);
16040 /* Don't use operands[0] for result of 8bit divide since not all
16041 registers support QImode ZERO_EXTRACT. */
16042 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16043 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16044 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16045 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
16046
16047 if (signed_p)
16048 {
16049 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
16050 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
16051 }
16052 else
16053 {
16054 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
16055 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
16056 }
16057
16058 /* Extract remainder from AH. */
16059 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16060 if (REG_P (operands[1]))
16061 insn = emit_move_insn (operands[1], tmp1);
16062 else
16063 {
16064 /* Need a new scratch register since the old one has result
16065 of 8bit divide. */
16066 scratch = gen_reg_rtx (mode);
16067 emit_move_insn (scratch, tmp1);
16068 insn = emit_move_insn (operands[1], scratch);
16069 }
16070 set_unique_reg_note (insn, REG_EQUAL, mod);
16071
16072 /* Zero extend quotient from AL. */
16073 tmp1 = gen_lowpart (QImode, tmp0);
16074 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16075 set_unique_reg_note (insn, REG_EQUAL, div);
16076
16077 emit_label (end_label);
16078 }
16079
16080 #define LEA_MAX_STALL (3)
16081 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
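/* Distances below are measured in half-cycles (see increase_distance), so a
   search window of LEA_MAX_STALL cycles corresponds to a threshold of
   LEA_MAX_STALL << 1.  */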
16082
16083 /* Increase the given DISTANCE in half-cycles according to
16084 dependencies between the PREV and NEXT instructions.
16085 Add 1 half-cycle if there is no dependency and
16086 go to the next cycle if there is some dependency. */
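/* In other words, an independent PREV/NEXT pair advances the count by one
   half-cycle, while a dependent pair rounds the count up to the next full
   cycle boundary and then adds one more cycle.  */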
16087
16088 static unsigned int
16089 increase_distance (rtx prev, rtx next, unsigned int distance)
16090 {
16091 df_ref *use_rec;
16092 df_ref *def_rec;
16093
16094 if (!prev || !next)
16095 return distance + (distance & 1) + 2;
16096
16097 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
16098 return distance + 1;
16099
16100 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
16101 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
16102 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
16103 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
16104 return distance + (distance & 1) + 2;
16105
16106 return distance + 1;
16107 }
16108
16109 /* Function checks if instruction INSN defines register number
16110 REGNO1 or REGNO2. */
16111
16112 static bool
16113 insn_defines_reg (unsigned int regno1, unsigned int regno2,
16114 rtx insn)
16115 {
16116 df_ref *def_rec;
16117
16118 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
16119 if (DF_REF_REG_DEF_P (*def_rec)
16120 && !DF_REF_IS_ARTIFICIAL (*def_rec)
16121 && (regno1 == DF_REF_REGNO (*def_rec)
16122 || regno2 == DF_REF_REGNO (*def_rec)))
16123 {
16124 return true;
16125 }
16126
16127 return false;
16128 }
16129
16130 /* Function checks if instruction INSN uses register number
16131 REGNO as a part of address expression. */
16132
16133 static bool
16134 insn_uses_reg_mem (unsigned int regno, rtx insn)
16135 {
16136 df_ref *use_rec;
16137
16138 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
16139 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
16140 return true;
16141
16142 return false;
16143 }
16144
16145 /* Search backward for a non-AGU definition of register number REGNO1
16146 or register number REGNO2 in the basic block, starting from instruction
16147 START and going up to the head of the basic block or instruction INSN.
16148
16149 Set *FOUND to true if a definition was found
16150 and to false otherwise.
16151
16152 The distance in half-cycles between START and the found instruction or
16153 the head of the BB is added to DISTANCE and returned. */
16154
16155 static int
16156 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
16157 rtx insn, int distance,
16158 rtx start, bool *found)
16159 {
16160 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16161 rtx prev = start;
16162 rtx next = NULL;
16163 enum attr_type insn_type;
16164
16165 *found = false;
16166
16167 while (prev
16168 && prev != insn
16169 && distance < LEA_SEARCH_THRESHOLD)
16170 {
16171 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
16172 {
16173 distance = increase_distance (prev, next, distance);
16174 if (insn_defines_reg (regno1, regno2, prev))
16175 {
16176 insn_type = get_attr_type (prev);
16177 if (insn_type != TYPE_LEA)
16178 {
16179 *found = true;
16180 return distance;
16181 }
16182 }
16183
16184 next = prev;
16185 }
16186 if (prev == BB_HEAD (bb))
16187 break;
16188
16189 prev = PREV_INSN (prev);
16190 }
16191
16192 return distance;
16193 }
16194
16195 /* Search backward for a non-AGU definition of register number REGNO1
16196 or register number REGNO2 in INSN's basic block until we
16197 1. pass LEA_SEARCH_THRESHOLD instructions, or
16198 2. reach a neighbouring BB's boundary, or
16199 3. reach an AGU definition.
16200 Return the distance between the non-AGU definition point and INSN.
16201 If there is no definition point, return -1. */
16202
16203 static int
16204 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
16205 rtx insn)
16206 {
16207 basic_block bb = BLOCK_FOR_INSN (insn);
16208 int distance = 0;
16209 bool found = false;
16210
16211 if (insn != BB_HEAD (bb))
16212 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
16213 distance, PREV_INSN (insn),
16214 &found);
16215
16216 if (!found && distance < LEA_SEARCH_THRESHOLD)
16217 {
16218 edge e;
16219 edge_iterator ei;
16220 bool simple_loop = false;
16221
16222 FOR_EACH_EDGE (e, ei, bb->preds)
16223 if (e->src == bb)
16224 {
16225 simple_loop = true;
16226 break;
16227 }
16228
16229 if (simple_loop)
16230 distance = distance_non_agu_define_in_bb (regno1, regno2,
16231 insn, distance,
16232 BB_END (bb), &found);
16233 else
16234 {
16235 int shortest_dist = -1;
16236 bool found_in_bb = false;
16237
16238 FOR_EACH_EDGE (e, ei, bb->preds)
16239 {
16240 int bb_dist
16241 = distance_non_agu_define_in_bb (regno1, regno2,
16242 insn, distance,
16243 BB_END (e->src),
16244 &found_in_bb);
16245 if (found_in_bb)
16246 {
16247 if (shortest_dist < 0)
16248 shortest_dist = bb_dist;
16249 else if (bb_dist > 0)
16250 shortest_dist = MIN (bb_dist, shortest_dist);
16251
16252 found = true;
16253 }
16254 }
16255
16256 distance = shortest_dist;
16257 }
16258 }
16259
16260 /* get_attr_type may modify recog data. We want to make sure
16261 that recog data is valid for instruction INSN, on which
16262 distance_non_agu_define is called. INSN is unchanged here. */
16263 extract_insn_cached (insn);
16264
16265 if (!found)
16266 return -1;
16267
16268 return distance >> 1;
16269 }
16270
16271 /* Return the distance in half-cycles, added to DISTANCE, between
16272 INSN and the next insn that uses register number REGNO in a
16273 memory address. Return -1 if REGNO is set.
16274
16275 Put true value into *FOUND if register usage was found and
16276 false otherwise.
16277 Put true value into *REDEFINED if register redefinition was
16278 found and false otherwise. */
16279
16280 static int
16281 distance_agu_use_in_bb (unsigned int regno,
16282 rtx insn, int distance, rtx start,
16283 bool *found, bool *redefined)
16284 {
16285 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16286 rtx next = start;
16287 rtx prev = NULL;
16288
16289 *found = false;
16290 *redefined = false;
16291
16292 while (next
16293 && next != insn
16294 && distance < LEA_SEARCH_THRESHOLD)
16295 {
16296 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
16297 {
16298 distance = increase_distance(prev, next, distance);
16299 if (insn_uses_reg_mem (regno, next))
16300 {
16301 /* Return DISTANCE if OP0 is used in memory
16302 address in NEXT. */
16303 *found = true;
16304 return distance;
16305 }
16306
16307 if (insn_defines_reg (regno, INVALID_REGNUM, next))
16308 {
16309 /* Return -1 if OP0 is set in NEXT. */
16310 *redefined = true;
16311 return -1;
16312 }
16313
16314 prev = next;
16315 }
16316
16317 if (next == BB_END (bb))
16318 break;
16319
16320 next = NEXT_INSN (next);
16321 }
16322
16323 return distance;
16324 }
16325
16326 /* Return the distance between INSN and the next insn that uses
16327 register number REGNO0 in a memory address. Return -1 if no such
16328 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
16329
16330 static int
16331 distance_agu_use (unsigned int regno0, rtx insn)
16332 {
16333 basic_block bb = BLOCK_FOR_INSN (insn);
16334 int distance = 0;
16335 bool found = false;
16336 bool redefined = false;
16337
16338 if (insn != BB_END (bb))
16339 distance = distance_agu_use_in_bb (regno0, insn, distance,
16340 NEXT_INSN (insn),
16341 &found, &redefined);
16342
16343 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
16344 {
16345 edge e;
16346 edge_iterator ei;
16347 bool simple_loop = false;
16348
16349 FOR_EACH_EDGE (e, ei, bb->succs)
16350 if (e->dest == bb)
16351 {
16352 simple_loop = true;
16353 break;
16354 }
16355
16356 if (simple_loop)
16357 distance = distance_agu_use_in_bb (regno0, insn,
16358 distance, BB_HEAD (bb),
16359 &found, &redefined);
16360 else
16361 {
16362 int shortest_dist = -1;
16363 bool found_in_bb = false;
16364 bool redefined_in_bb = false;
16365
16366 FOR_EACH_EDGE (e, ei, bb->succs)
16367 {
16368 int bb_dist
16369 = distance_agu_use_in_bb (regno0, insn,
16370 distance, BB_HEAD (e->dest),
16371 &found_in_bb, &redefined_in_bb);
16372 if (found_in_bb)
16373 {
16374 if (shortest_dist < 0)
16375 shortest_dist = bb_dist;
16376 else if (bb_dist > 0)
16377 shortest_dist = MIN (bb_dist, shortest_dist);
16378
16379 found = true;
16380 }
16381 }
16382
16383 distance = shortest_dist;
16384 }
16385 }
16386
16387 if (!found || redefined)
16388 return -1;
16389
16390 return distance >> 1;
16391 }
16392
16393 /* Define this macro to tune LEA priority vs ADD; it takes effect when
16394 there is a choice between LEA and ADD.
16395 Negative value: ADD is preferred over LEA
16396 Zero: Neutral
16397 Positive value: LEA is preferred over ADD */
16398 #define IX86_LEA_PRIORITY 0
16399
16400 /* Return true if using lea INSN has a performance advantage over
16401 the equivalent sequence of instructions. The instruction sequence
16402 has SPLIT_COST cycles higher latency than the lea itself. */
16403
16404 bool
16405 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
16406 unsigned int regno2, unsigned int split_cost)
16407 {
16408 int dist_define, dist_use;
16409
16410 dist_define = distance_non_agu_define (regno1, regno2, insn);
16411 dist_use = distance_agu_use (regno0, insn);
16412
16413 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
16414 {
16415 /* If there is no non-AGU operand definition, no AGU
16416 operand usage and the split cost is 0, then both the lea
16417 and non-lea variants have the same priority. Currently
16418 we prefer the lea for 64-bit code and the non-lea on 32-bit
16419 code. */
16420 if (dist_use < 0 && split_cost == 0)
16421 return TARGET_64BIT || IX86_LEA_PRIORITY;
16422 else
16423 return true;
16424 }
16425
16426 /* With a longer distance to the definition, lea is preferable.
16427 Adjust the distance to take the splitting cost and
16428 lea priority into account. */
16429 dist_define += split_cost + IX86_LEA_PRIORITY;
16430
16431 /* If there is no use in a memory address then we just check
16432 that the split cost does not exceed the AGU stall. */
16433 if (dist_use < 0)
16434 return dist_define >= LEA_MAX_STALL;
16435
16436 /* If this insn has both a backward non-AGU dependence and a forward
16437 AGU dependence, the one with the shorter distance takes effect. */
16438 return dist_define >= dist_use;
16439 }
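
/* Illustrative example (arbitrary numbers, assuming neither early-exit
   branch above is taken): with dist_define == 2, dist_use == 5 and
   split_cost == 1, we compare 2 + 1 + IX86_LEA_PRIORITY against 5;
   the test fails, so the lea is split into ALU instructions.  With
   dist_use == 2 instead, 3 >= 2 holds and the lea is kept.  */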
16440
16441 /* Return true if it is legal to clobber flags by INSN and
16442 false otherwise. */
16443
16444 static bool
16445 ix86_ok_to_clobber_flags (rtx insn)
16446 {
16447 basic_block bb = BLOCK_FOR_INSN (insn);
16448 df_ref *use;
16449 bitmap live;
16450
16451 while (insn)
16452 {
16453 if (NONDEBUG_INSN_P (insn))
16454 {
16455 for (use = DF_INSN_USES (insn); *use; use++)
16456 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
16457 return false;
16458
16459 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
16460 return true;
16461 }
16462
16463 if (insn == BB_END (bb))
16464 break;
16465
16466 insn = NEXT_INSN (insn);
16467 }
16468
16469 live = df_get_live_out(bb);
16470 return !REGNO_REG_SET_P (live, FLAGS_REG);
16471 }
16472
16473 /* Return true if we need to split op0 = op1 + op2 into a sequence of
16474 move and add to avoid AGU stalls. */
16475
16476 bool
16477 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
16478 {
16479 unsigned int regno0 = true_regnum (operands[0]);
16480 unsigned int regno1 = true_regnum (operands[1]);
16481 unsigned int regno2 = true_regnum (operands[2]);
16482
16483 /* Check if we need to optimize. */
16484 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16485 return false;
16486
16487 /* Check it is correct to split here. */
16488 if (!ix86_ok_to_clobber_flags(insn))
16489 return false;
16490
16491 /* We only need to split adds with a non-destructive
16492 destination operand. */
16493 if (regno0 == regno1 || regno0 == regno2)
16494 return false;
16495 else
16496 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
16497 }
16498
16499 /* Return true if we need to split lea into a sequence of
16500 instructions to avoid AGU stalls. */
16501
16502 bool
16503 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
16504 {
16505 unsigned int regno0 = true_regnum (operands[0]) ;
16506 unsigned int regno1 = -1;
16507 unsigned int regno2 = -1;
16508 unsigned int split_cost = 0;
16509 struct ix86_address parts;
16510 int ok;
16511
16512 /* Check we need to optimize. */
16513 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16514 return false;
16515
16516 /* Check it is correct to split here. */
16517 if (!ix86_ok_to_clobber_flags(insn))
16518 return false;
16519
16520 ok = ix86_decompose_address (operands[1], &parts);
16521 gcc_assert (ok);
16522
16523 /* We should not split into an add if a non-legitimate PIC
16524 operand is used as the displacement. */
16525 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
16526 return false;
16527
16528 if (parts.base)
16529 regno1 = true_regnum (parts.base);
16530 if (parts.index)
16531 regno2 = true_regnum (parts.index);
16532
16533 /* Compute how many cycles we will add to the execution time
16534 if we split the lea into a sequence of instructions. */
16535 if (parts.base || parts.index)
16536 {
16537 /* Have to use a mov instruction if the non-destructive
16538 destination form is used. */
16539 if (regno1 != regno0 && regno2 != regno0)
16540 split_cost += 1;
16541
16542 /* Have to add index to base if both exist. */
16543 if (parts.base && parts.index)
16544 split_cost += 1;
16545
16546 /* Have to use shift and adds if scale is 2 or greater. */
16547 if (parts.scale > 1)
16548 {
16549 if (regno0 != regno1)
16550 split_cost += 1;
16551 else if (regno2 == regno0)
16552 split_cost += 4;
16553 else
16554 split_cost += parts.scale;
16555 }
16556
16557 /* Have to use an add instruction with an immediate if
16558 disp is nonzero. */
16559 if (parts.disp && parts.disp != const0_rtx)
16560 split_cost += 1;
16561
16562 /* Subtract the price of lea. */
16563 split_cost -= 1;
16564 }
16565
16566 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
16567 }
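
/* As a worked example (illustrative only): for an address of the form
   base + index*2 + disp with a destination distinct from both base and
   index, the cost above is 1 (mov) + 1 (shift, since regno0 != regno1)
   + 1 (add of base and index) + 1 (add of disp) - 1 (the lea itself),
   giving split_cost == 3.  */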
16568
16569 /* Emit x86 binary operand CODE in mode MODE, where the first operand
16570 matches destination. RTX includes clobber of FLAGS_REG. */
16571
16572 static void
16573 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
16574 rtx dst, rtx src)
16575 {
16576 rtx op, clob;
16577
16578 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
16579 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16580
16581 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16582 }
16583
16584 /* Split a lea instruction into a sequence of instructions
16585 which are executed on the ALU to avoid AGU stalls.
16586 It is assumed that the flags register may be clobbered
16587 at the lea's position. */
16588
16589 extern void
16590 ix86_split_lea_for_addr (rtx operands[], enum machine_mode mode)
16591 {
16592 unsigned int regno0 = true_regnum (operands[0]) ;
16593 unsigned int regno1 = INVALID_REGNUM;
16594 unsigned int regno2 = INVALID_REGNUM;
16595 struct ix86_address parts;
16596 rtx tmp;
16597 int ok, adds;
16598
16599 ok = ix86_decompose_address (operands[1], &parts);
16600 gcc_assert (ok);
16601
16602 if (parts.base)
16603 {
16604 if (GET_MODE (parts.base) != mode)
16605 parts.base = gen_rtx_SUBREG (mode, parts.base, 0);
16606 regno1 = true_regnum (parts.base);
16607 }
16608
16609 if (parts.index)
16610 {
16611 if (GET_MODE (parts.index) != mode)
16612 parts.index = gen_rtx_SUBREG (mode, parts.index, 0);
16613 regno2 = true_regnum (parts.index);
16614 }
16615
16616 if (parts.scale > 1)
16617 {
16618 /* Case r1 = r1 + ... */
16619 if (regno1 == regno0)
16620 {
16621 /* A case like r1 = r1 + C * r1 would require a
16622 multiplication, which is very
16623 expensive. Assume the cost model is wrong if we
16624 end up with such a case here. */
16625 gcc_assert (regno2 != regno0);
16626
16627 for (adds = parts.scale; adds > 0; adds--)
16628 ix86_emit_binop (PLUS, mode, operands[0], parts.index);
16629 }
16630 else
16631 {
16632 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
16633 if (regno0 != regno2)
16634 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
16635
16636 /* Use shift for scaling. */
16637 ix86_emit_binop (ASHIFT, mode, operands[0],
16638 GEN_INT (exact_log2 (parts.scale)));
16639
16640 if (parts.base)
16641 ix86_emit_binop (PLUS, mode, operands[0], parts.base);
16642
16643 if (parts.disp && parts.disp != const0_rtx)
16644 ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
16645 }
16646 }
16647 else if (!parts.base && !parts.index)
16648 {
16649 gcc_assert(parts.disp);
16650 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.disp));
16651 }
16652 else
16653 {
16654 if (!parts.base)
16655 {
16656 if (regno0 != regno2)
16657 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
16658 }
16659 else if (!parts.index)
16660 {
16661 if (regno0 != regno1)
16662 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
16663 }
16664 else
16665 {
16666 if (regno0 == regno1)
16667 tmp = parts.index;
16668 else if (regno0 == regno2)
16669 tmp = parts.base;
16670 else
16671 {
16672 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
16673 tmp = parts.index;
16674 }
16675
16676 ix86_emit_binop (PLUS, mode, operands[0], tmp);
16677 }
16678
16679 if (parts.disp && parts.disp != const0_rtx)
16680 ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
16681 }
16682 }
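
/* For instance (illustrative, 32-bit AT&T syntax):
       lea    0x4(%ebx,%ecx,2), %eax
   is replaced by the ALU sequence
       mov    %ecx, %eax
       shl    $1, %eax
       add    %ebx, %eax
       add    $0x4, %eax
   which keeps the address computation off the AGU at the cost of
   clobbering the flags register.  */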
16683
16684 /* Return true if it is ok to optimize an ADD operation to an LEA
16685 operation to avoid flag register consumption. For most processors,
16686 ADD is faster than LEA. For processors like ATOM, if the
16687 destination register of the LEA holds an actual address which will be
16688 used soon, LEA is better; otherwise ADD is better. */
16689
16690 bool
16691 ix86_lea_for_add_ok (rtx insn, rtx operands[])
16692 {
16693 unsigned int regno0 = true_regnum (operands[0]);
16694 unsigned int regno1 = true_regnum (operands[1]);
16695 unsigned int regno2 = true_regnum (operands[2]);
16696
16697 /* If a = b + c, (a!=b && a!=c), must use lea form. */
16698 if (regno0 != regno1 && regno0 != regno2)
16699 return true;
16700
16701 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16702 return false;
16703
16704 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
16705 }
16706
16707 /* Return true if destination reg of SET_BODY is shift count of
16708 USE_BODY. */
16709
16710 static bool
16711 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
16712 {
16713 rtx set_dest;
16714 rtx shift_rtx;
16715 int i;
16716
16717 /* Retrieve destination of SET_BODY. */
16718 switch (GET_CODE (set_body))
16719 {
16720 case SET:
16721 set_dest = SET_DEST (set_body);
16722 if (!set_dest || !REG_P (set_dest))
16723 return false;
16724 break;
16725 case PARALLEL:
16726 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
16727 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
16728 use_body))
16729 return true;
16730 default:
16731 return false;
16732 break;
16733 }
16734
16735 /* Retrieve shift count of USE_BODY. */
16736 switch (GET_CODE (use_body))
16737 {
16738 case SET:
16739 shift_rtx = XEXP (use_body, 1);
16740 break;
16741 case PARALLEL:
16742 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
16743 if (ix86_dep_by_shift_count_body (set_body,
16744 XVECEXP (use_body, 0, i)))
16745 return true;
16746 default:
16747 return false;
16748 break;
16749 }
16750
16751 if (shift_rtx
16752 && (GET_CODE (shift_rtx) == ASHIFT
16753 || GET_CODE (shift_rtx) == LSHIFTRT
16754 || GET_CODE (shift_rtx) == ASHIFTRT
16755 || GET_CODE (shift_rtx) == ROTATE
16756 || GET_CODE (shift_rtx) == ROTATERT))
16757 {
16758 rtx shift_count = XEXP (shift_rtx, 1);
16759
16760 /* Return true if shift count is dest of SET_BODY. */
16761 if (REG_P (shift_count)
16762 && true_regnum (set_dest) == true_regnum (shift_count))
16763 return true;
16764 }
16765
16766 return false;
16767 }
16768
16769 /* Return true if destination reg of SET_INSN is shift count of
16770 USE_INSN. */
16771
16772 bool
16773 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
16774 {
16775 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
16776 PATTERN (use_insn));
16777 }
16778
16779 /* Return TRUE or FALSE depending on whether the unary operator meets the
16780 appropriate constraints. */
16781
16782 bool
16783 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
16784 enum machine_mode mode ATTRIBUTE_UNUSED,
16785 rtx operands[2] ATTRIBUTE_UNUSED)
16786 {
16787 /* If one of operands is memory, source and destination must match. */
16788 if ((MEM_P (operands[0])
16789 || MEM_P (operands[1]))
16790 && ! rtx_equal_p (operands[0], operands[1]))
16791 return false;
16792 return true;
16793 }
16794
16795 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
16796 are ok, keeping in mind the possible movddup alternative. */
16797
16798 bool
16799 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
16800 {
16801 if (MEM_P (operands[0]))
16802 return rtx_equal_p (operands[0], operands[1 + high]);
16803 if (MEM_P (operands[1]) && MEM_P (operands[2]))
16804 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
16805 return true;
16806 }
16807
16808 /* Post-reload splitter for converting an SF or DFmode value in an
16809 SSE register into an unsigned SImode. */
16810
16811 void
16812 ix86_split_convert_uns_si_sse (rtx operands[])
16813 {
16814 enum machine_mode vecmode;
16815 rtx value, large, zero_or_two31, input, two31, x;
16816
16817 large = operands[1];
16818 zero_or_two31 = operands[2];
16819 input = operands[3];
16820 two31 = operands[4];
16821 vecmode = GET_MODE (large);
16822 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
16823
16824 /* Load up the value into the low element. We must ensure that the other
16825 elements are valid floats -- zero is the easiest such value. */
16826 if (MEM_P (input))
16827 {
16828 if (vecmode == V4SFmode)
16829 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
16830 else
16831 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
16832 }
16833 else
16834 {
16835 input = gen_rtx_REG (vecmode, REGNO (input));
16836 emit_move_insn (value, CONST0_RTX (vecmode));
16837 if (vecmode == V4SFmode)
16838 emit_insn (gen_sse_movss (value, value, input));
16839 else
16840 emit_insn (gen_sse2_movsd (value, value, input));
16841 }
16842
16843 emit_move_insn (large, two31);
16844 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
16845
16846 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
16847 emit_insn (gen_rtx_SET (VOIDmode, large, x));
16848
16849 x = gen_rtx_AND (vecmode, zero_or_two31, large);
16850 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
16851
16852 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
16853 emit_insn (gen_rtx_SET (VOIDmode, value, x));
16854
16855 large = gen_rtx_REG (V4SImode, REGNO (large));
16856 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
16857
16858 x = gen_rtx_REG (V4SImode, REGNO (value));
16859 if (vecmode == V4SFmode)
16860 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
16861 else
16862 emit_insn (gen_sse2_cvttpd2dq (x, value));
16863 value = x;
16864
16865 emit_insn (gen_xorv4si3 (value, value, large));
16866 }
16867
16868 /* Convert an unsigned DImode value into a DFmode, using only SSE.
16869 Expects the 64-bit DImode to be supplied in a pair of integral
16870 registers. Requires SSE2; will use SSE3 if available. For x86_32,
16871 -mfpmath=sse, !optimize_size only. */
16872
16873 void
16874 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
16875 {
16876 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
16877 rtx int_xmm, fp_xmm;
16878 rtx biases, exponents;
16879 rtx x;
16880
16881 int_xmm = gen_reg_rtx (V4SImode);
16882 if (TARGET_INTER_UNIT_MOVES)
16883 emit_insn (gen_movdi_to_sse (int_xmm, input));
16884 else if (TARGET_SSE_SPLIT_REGS)
16885 {
16886 emit_clobber (int_xmm);
16887 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
16888 }
16889 else
16890 {
16891 x = gen_reg_rtx (V2DImode);
16892 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
16893 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
16894 }
16895
16896 x = gen_rtx_CONST_VECTOR (V4SImode,
16897 gen_rtvec (4, GEN_INT (0x43300000UL),
16898 GEN_INT (0x45300000UL),
16899 const0_rtx, const0_rtx));
16900 exponents = validize_mem (force_const_mem (V4SImode, x));
16901
16902 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
16903 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
16904
16905 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
16906 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
16907 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
16908 (0x1.0p84 + double(fp_value_hi_xmm)).
16909 Note these exponents differ by 32. */
16910
16911 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
16912
16913 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
16914 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
16915 real_ldexp (&bias_lo_rvt, &dconst1, 52);
16916 real_ldexp (&bias_hi_rvt, &dconst1, 84);
16917 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
16918 x = const_double_from_real_value (bias_hi_rvt, DFmode);
16919 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
16920 biases = validize_mem (force_const_mem (V2DFmode, biases));
16921 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
16922
16923 /* Add the upper and lower DFmode values together. */
16924 if (TARGET_SSE3)
16925 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
16926 else
16927 {
16928 x = copy_to_mode_reg (V2DFmode, fp_xmm);
16929 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
16930 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
16931 }
16932
16933 ix86_expand_vector_extract (false, target, fp_xmm, 0);
16934 }
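
/* Worked example (illustrative): for the 64-bit input 0x0000000300000002
   the interleave above produces the two doubles 0x1.0p52 + 2 and
   0x1.0p84 + 3 * 2^32; subtracting the 0x1.0p52 / 0x1.0p84 biases and
   adding the halves yields 3 * 2^32 + 2, the exact DFmode value of the
   input.  */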
16935
16936 /* Not used, but eases macroization of patterns. */
16937 void
16938 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
16939 rtx input ATTRIBUTE_UNUSED)
16940 {
16941 gcc_unreachable ();
16942 }
16943
16944 /* Convert an unsigned SImode value into a DFmode. Only currently used
16945 for SSE, but applicable anywhere. */
16946
16947 void
16948 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
16949 {
16950 REAL_VALUE_TYPE TWO31r;
16951 rtx x, fp;
16952
16953 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
16954 NULL, 1, OPTAB_DIRECT);
16955
16956 fp = gen_reg_rtx (DFmode);
16957 emit_insn (gen_floatsidf2 (fp, x));
16958
16959 real_ldexp (&TWO31r, &dconst1, 31);
16960 x = const_double_from_real_value (TWO31r, DFmode);
16961
16962 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
16963 if (x != target)
16964 emit_move_insn (target, x);
16965 }
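
/* For example (illustrative): the unsigned input 0x80000005 is biased
   by -2^31 to the signed value 5, converted exactly to 5.0, and then
   2^31.0 is added back, giving 2147483653.0.  */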
16966
16967 /* Convert a signed DImode value into a DFmode. Only used for SSE in
16968 32-bit mode; otherwise we have a direct convert instruction. */
16969
16970 void
16971 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
16972 {
16973 REAL_VALUE_TYPE TWO32r;
16974 rtx fp_lo, fp_hi, x;
16975
16976 fp_lo = gen_reg_rtx (DFmode);
16977 fp_hi = gen_reg_rtx (DFmode);
16978
16979 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
16980
16981 real_ldexp (&TWO32r, &dconst1, 32);
16982 x = const_double_from_real_value (TWO32r, DFmode);
16983 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
16984
16985 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
16986
16987 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
16988 0, OPTAB_DIRECT);
16989 if (x != target)
16990 emit_move_insn (target, x);
16991 }
16992
16993 /* Convert an unsigned SImode value into an SFmode value, using only SSE.
16994 For x86_32, -mfpmath=sse, !optimize_size only. */
16995 void
16996 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
16997 {
16998 REAL_VALUE_TYPE ONE16r;
16999 rtx fp_hi, fp_lo, int_hi, int_lo, x;
17000
17001 real_ldexp (&ONE16r, &dconst1, 16);
17002 x = const_double_from_real_value (ONE16r, SFmode);
17003 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
17004 NULL, 0, OPTAB_DIRECT);
17005 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
17006 NULL, 0, OPTAB_DIRECT);
17007 fp_hi = gen_reg_rtx (SFmode);
17008 fp_lo = gen_reg_rtx (SFmode);
17009 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
17010 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
17011 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
17012 0, OPTAB_DIRECT);
17013 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
17014 0, OPTAB_DIRECT);
17015 if (!rtx_equal_p (target, fp_hi))
17016 emit_move_insn (target, fp_hi);
17017 }
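
/* For example (illustrative): the input 0x12345678 is split into
   hi = 0x1234 and lo = 0x5678; both halves convert exactly to SFmode,
   and the result is computed as hi * 65536.0 + lo with a single final
   rounding step.  */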
17018
17019 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
17020 a vector of unsigned ints VAL to vector of floats TARGET. */
17021
17022 void
17023 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
17024 {
17025 rtx tmp[8];
17026 REAL_VALUE_TYPE TWO16r;
17027 enum machine_mode intmode = GET_MODE (val);
17028 enum machine_mode fltmode = GET_MODE (target);
17029 rtx (*cvt) (rtx, rtx);
17030
17031 if (intmode == V4SImode)
17032 cvt = gen_floatv4siv4sf2;
17033 else
17034 cvt = gen_floatv8siv8sf2;
17035 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
17036 tmp[0] = force_reg (intmode, tmp[0]);
17037 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
17038 OPTAB_DIRECT);
17039 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
17040 NULL_RTX, 1, OPTAB_DIRECT);
17041 tmp[3] = gen_reg_rtx (fltmode);
17042 emit_insn (cvt (tmp[3], tmp[1]));
17043 tmp[4] = gen_reg_rtx (fltmode);
17044 emit_insn (cvt (tmp[4], tmp[2]));
17045 real_ldexp (&TWO16r, &dconst1, 16);
17046 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
17047 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
17048 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
17049 OPTAB_DIRECT);
17050 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
17051 OPTAB_DIRECT);
17052 if (tmp[7] != target)
17053 emit_move_insn (target, tmp[7]);
17054 }
17055
17056 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
17057 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
17058 This is done by doing just signed conversion if < 0x1p31, and otherwise by
17059 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
17060
17061 rtx
17062 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
17063 {
17064 REAL_VALUE_TYPE TWO31r;
17065 rtx two31r, tmp[4];
17066 enum machine_mode mode = GET_MODE (val);
17067 enum machine_mode scalarmode = GET_MODE_INNER (mode);
17068 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
17069 rtx (*cmp) (rtx, rtx, rtx, rtx);
17070 int i;
17071
17072 for (i = 0; i < 3; i++)
17073 tmp[i] = gen_reg_rtx (mode);
17074 real_ldexp (&TWO31r, &dconst1, 31);
17075 two31r = const_double_from_real_value (TWO31r, scalarmode);
17076 two31r = ix86_build_const_vector (mode, 1, two31r);
17077 two31r = force_reg (mode, two31r);
17078 switch (mode)
17079 {
17080 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
17081 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
17082 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
17083 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
17084 default: gcc_unreachable ();
17085 }
17086 tmp[3] = gen_rtx_LE (mode, two31r, val);
17087 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
17088 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
17089 0, OPTAB_DIRECT);
17090 if (intmode == V4SImode || TARGET_AVX2)
17091 *xorp = expand_simple_binop (intmode, ASHIFT,
17092 gen_lowpart (intmode, tmp[0]),
17093 GEN_INT (31), NULL_RTX, 0,
17094 OPTAB_DIRECT);
17095 else
17096 {
17097 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
17098 two31 = ix86_build_const_vector (intmode, 1, two31);
17099 *xorp = expand_simple_binop (intmode, AND,
17100 gen_lowpart (intmode, tmp[0]),
17101 two31, NULL_RTX, 0,
17102 OPTAB_DIRECT);
17103 }
17104 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
17105 0, OPTAB_DIRECT);
17106 }
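
/* Numeric example (illustrative): for a lane holding 3000000000.0, the
   compare above selects the 2^31 bias, so the later signed truncation
   sees 852516352.0 (0x32D05E00) and the corresponding *XORP lane holds
   0x80000000; xoring that back in afterwards restores 0xB2D05E00,
   i.e. 3000000000.  */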
17107
17108 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
17109 then replicate the value for all elements of the vector
17110 register. */
17111
17112 rtx
17113 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
17114 {
17115 int i, n_elt;
17116 rtvec v;
17117 enum machine_mode scalar_mode;
17118
17119 switch (mode)
17120 {
17121 case V32QImode:
17122 case V16QImode:
17123 case V16HImode:
17124 case V8HImode:
17125 case V8SImode:
17126 case V4SImode:
17127 case V4DImode:
17128 case V2DImode:
17129 gcc_assert (vect);
17130 case V8SFmode:
17131 case V4SFmode:
17132 case V4DFmode:
17133 case V2DFmode:
17134 n_elt = GET_MODE_NUNITS (mode);
17135 v = rtvec_alloc (n_elt);
17136 scalar_mode = GET_MODE_INNER (mode);
17137
17138 RTVEC_ELT (v, 0) = value;
17139
17140 for (i = 1; i < n_elt; ++i)
17141 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
17142
17143 return gen_rtx_CONST_VECTOR (mode, v);
17144
17145 default:
17146 gcc_unreachable ();
17147 }
17148 }
17149
17150 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
17151 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
17152 for an SSE register. If VECT is true, then replicate the mask for
17153 all elements of the vector register. If INVERT is true, then create
17154 a mask excluding the sign bit. */
17155
17156 rtx
17157 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
17158 {
17159 enum machine_mode vec_mode, imode;
17160 HOST_WIDE_INT hi, lo;
17161 int shift = 63;
17162 rtx v;
17163 rtx mask;
17164
17165 /* Find the sign bit, sign extended to 2*HWI. */
17166 switch (mode)
17167 {
17168 case V8SImode:
17169 case V4SImode:
17170 case V8SFmode:
17171 case V4SFmode:
17172 vec_mode = mode;
17173 mode = GET_MODE_INNER (mode);
17174 imode = SImode;
17175 lo = 0x80000000, hi = lo < 0;
17176 break;
17177
17178 case V4DImode:
17179 case V2DImode:
17180 case V4DFmode:
17181 case V2DFmode:
17182 vec_mode = mode;
17183 mode = GET_MODE_INNER (mode);
17184 imode = DImode;
17185 if (HOST_BITS_PER_WIDE_INT >= 64)
17186 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
17187 else
17188 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17189 break;
17190
17191 case TImode:
17192 case TFmode:
17193 vec_mode = VOIDmode;
17194 if (HOST_BITS_PER_WIDE_INT >= 64)
17195 {
17196 imode = TImode;
17197 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
17198 }
17199 else
17200 {
17201 rtvec vec;
17202
17203 imode = DImode;
17204 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17205
17206 if (invert)
17207 {
17208 lo = ~lo, hi = ~hi;
17209 v = constm1_rtx;
17210 }
17211 else
17212 v = const0_rtx;
17213
17214 mask = immed_double_const (lo, hi, imode);
17215
17216 vec = gen_rtvec (2, v, mask);
17217 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
17218 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
17219
17220 return v;
17221 }
17222 break;
17223
17224 default:
17225 gcc_unreachable ();
17226 }
17227
17228 if (invert)
17229 lo = ~lo, hi = ~hi;
17230
17231 /* Force this value into the low part of a fp vector constant. */
17232 mask = immed_double_const (lo, hi, imode);
17233 mask = gen_lowpart (mode, mask);
17234
17235 if (vec_mode == VOIDmode)
17236 return force_reg (mode, mask);
17237
17238 v = ix86_build_const_vector (vec_mode, vect, mask);
17239 return force_reg (vec_mode, v);
17240 }
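
/* As an example (illustrative): for V4SFmode with VECT true this yields
   the packed constant { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }
   reinterpreted as floats, and with INVERT also true each element is
   0x7fffffff instead.  */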
17241
17242 /* Generate code for floating point ABS or NEG. */
17243
17244 void
17245 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
17246 rtx operands[])
17247 {
17248 rtx mask, set, dst, src;
17249 bool use_sse = false;
17250 bool vector_mode = VECTOR_MODE_P (mode);
17251 enum machine_mode vmode = mode;
17252
17253 if (vector_mode)
17254 use_sse = true;
17255 else if (mode == TFmode)
17256 use_sse = true;
17257 else if (TARGET_SSE_MATH)
17258 {
17259 use_sse = SSE_FLOAT_MODE_P (mode);
17260 if (mode == SFmode)
17261 vmode = V4SFmode;
17262 else if (mode == DFmode)
17263 vmode = V2DFmode;
17264 }
17265
17266 /* NEG and ABS performed with SSE use bitwise mask operations.
17267 Create the appropriate mask now. */
17268 if (use_sse)
17269 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
17270 else
17271 mask = NULL_RTX;
17272
17273 dst = operands[0];
17274 src = operands[1];
17275
17276 set = gen_rtx_fmt_e (code, mode, src);
17277 set = gen_rtx_SET (VOIDmode, dst, set);
17278
17279 if (mask)
17280 {
17281 rtx use, clob;
17282 rtvec par;
17283
17284 use = gen_rtx_USE (VOIDmode, mask);
17285 if (vector_mode)
17286 par = gen_rtvec (2, set, use);
17287 else
17288 {
17289 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17290 par = gen_rtvec (3, set, use, clob);
17291 }
17292 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
17293 }
17294 else
17295 emit_insn (set);
17296 }
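
/* Illustrative note: with SSE, NEG is eventually performed as an XOR
   with the sign-bit mask built above and ABS as an AND with its
   complement; e.g. for DFmode the element mask is 0x8000000000000000
   for NEG and 0x7fffffffffffffff for ABS.  */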
17297
17298 /* Expand a copysign operation. Special case operand 0 being a constant. */
17299
17300 void
17301 ix86_expand_copysign (rtx operands[])
17302 {
17303 enum machine_mode mode, vmode;
17304 rtx dest, op0, op1, mask, nmask;
17305
17306 dest = operands[0];
17307 op0 = operands[1];
17308 op1 = operands[2];
17309
17310 mode = GET_MODE (dest);
17311
17312 if (mode == SFmode)
17313 vmode = V4SFmode;
17314 else if (mode == DFmode)
17315 vmode = V2DFmode;
17316 else
17317 vmode = mode;
17318
17319 if (GET_CODE (op0) == CONST_DOUBLE)
17320 {
17321 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
17322
17323 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
17324 op0 = simplify_unary_operation (ABS, mode, op0, mode);
17325
17326 if (mode == SFmode || mode == DFmode)
17327 {
17328 if (op0 == CONST0_RTX (mode))
17329 op0 = CONST0_RTX (vmode);
17330 else
17331 {
17332 rtx v = ix86_build_const_vector (vmode, false, op0);
17333
17334 op0 = force_reg (vmode, v);
17335 }
17336 }
17337 else if (op0 != CONST0_RTX (mode))
17338 op0 = force_reg (mode, op0);
17339
17340 mask = ix86_build_signbit_mask (vmode, 0, 0);
17341
17342 if (mode == SFmode)
17343 copysign_insn = gen_copysignsf3_const;
17344 else if (mode == DFmode)
17345 copysign_insn = gen_copysigndf3_const;
17346 else
17347 copysign_insn = gen_copysigntf3_const;
17348
17349 emit_insn (copysign_insn (dest, op0, op1, mask));
17350 }
17351 else
17352 {
17353 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
17354
17355 nmask = ix86_build_signbit_mask (vmode, 0, 1);
17356 mask = ix86_build_signbit_mask (vmode, 0, 0);
17357
17358 if (mode == SFmode)
17359 copysign_insn = gen_copysignsf3_var;
17360 else if (mode == DFmode)
17361 copysign_insn = gen_copysigndf3_var;
17362 else
17363 copysign_insn = gen_copysigntf3_var;
17364
17365 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
17366 }
17367 }
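
/* In bit terms (illustrative): copysign (x, y) is computed as
   (x & ~signbit) | (y & signbit), i.e. the magnitude bits of X combined
   with the sign bit of Y, using the mask/nmask constants built above.  */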
17368
17369 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
17370 be a constant, and so has already been expanded into a vector constant. */
17371
17372 void
17373 ix86_split_copysign_const (rtx operands[])
17374 {
17375 enum machine_mode mode, vmode;
17376 rtx dest, op0, mask, x;
17377
17378 dest = operands[0];
17379 op0 = operands[1];
17380 mask = operands[3];
17381
17382 mode = GET_MODE (dest);
17383 vmode = GET_MODE (mask);
17384
17385 dest = simplify_gen_subreg (vmode, dest, mode, 0);
17386 x = gen_rtx_AND (vmode, dest, mask);
17387 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17388
17389 if (op0 != CONST0_RTX (vmode))
17390 {
17391 x = gen_rtx_IOR (vmode, dest, op0);
17392 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17393 }
17394 }
17395
17396 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
17397 so we have to do two masks. */
17398
17399 void
17400 ix86_split_copysign_var (rtx operands[])
17401 {
17402 enum machine_mode mode, vmode;
17403 rtx dest, scratch, op0, op1, mask, nmask, x;
17404
17405 dest = operands[0];
17406 scratch = operands[1];
17407 op0 = operands[2];
17408 op1 = operands[3];
17409 nmask = operands[4];
17410 mask = operands[5];
17411
17412 mode = GET_MODE (dest);
17413 vmode = GET_MODE (mask);
17414
17415 if (rtx_equal_p (op0, op1))
17416 {
17417 /* Shouldn't happen often (it's useless, obviously), but when it does
17418 we'd generate incorrect code if we continue below. */
17419 emit_move_insn (dest, op0);
17420 return;
17421 }
17422
17423 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
17424 {
17425 gcc_assert (REGNO (op1) == REGNO (scratch));
17426
17427 x = gen_rtx_AND (vmode, scratch, mask);
17428 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17429
17430 dest = mask;
17431 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17432 x = gen_rtx_NOT (vmode, dest);
17433 x = gen_rtx_AND (vmode, x, op0);
17434 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17435 }
17436 else
17437 {
17438 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
17439 {
17440 x = gen_rtx_AND (vmode, scratch, mask);
17441 }
17442 else /* alternative 2,4 */
17443 {
17444 gcc_assert (REGNO (mask) == REGNO (scratch));
17445 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
17446 x = gen_rtx_AND (vmode, scratch, op1);
17447 }
17448 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17449
17450 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
17451 {
17452 dest = simplify_gen_subreg (vmode, op0, mode, 0);
17453 x = gen_rtx_AND (vmode, dest, nmask);
17454 }
17455 else /* alternative 3,4 */
17456 {
17457 gcc_assert (REGNO (nmask) == REGNO (dest));
17458 dest = nmask;
17459 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17460 x = gen_rtx_AND (vmode, dest, op0);
17461 }
17462 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17463 }
17464
17465 x = gen_rtx_IOR (vmode, dest, scratch);
17466 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17467 }
17468
17469 /* Return TRUE or FALSE depending on whether the first SET in INSN
17470 has source and destination with matching CC modes, and that the
17471 CC mode is at least as constrained as REQ_MODE. */
17472
17473 bool
17474 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
17475 {
17476 rtx set;
17477 enum machine_mode set_mode;
17478
17479 set = PATTERN (insn);
17480 if (GET_CODE (set) == PARALLEL)
17481 set = XVECEXP (set, 0, 0);
17482 gcc_assert (GET_CODE (set) == SET);
17483 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
17484
17485 set_mode = GET_MODE (SET_DEST (set));
17486 switch (set_mode)
17487 {
17488 case CCNOmode:
17489 if (req_mode != CCNOmode
17490 && (req_mode != CCmode
17491 || XEXP (SET_SRC (set), 1) != const0_rtx))
17492 return false;
17493 break;
17494 case CCmode:
17495 if (req_mode == CCGCmode)
17496 return false;
17497 /* FALLTHRU */
17498 case CCGCmode:
17499 if (req_mode == CCGOCmode || req_mode == CCNOmode)
17500 return false;
17501 /* FALLTHRU */
17502 case CCGOCmode:
17503 if (req_mode == CCZmode)
17504 return false;
17505 /* FALLTHRU */
17506 case CCZmode:
17507 break;
17508
17509 case CCAmode:
17510 case CCCmode:
17511 case CCOmode:
17512 case CCSmode:
17513 if (set_mode != req_mode)
17514 return false;
17515 break;
17516
17517 default:
17518 gcc_unreachable ();
17519 }
17520
17521 return GET_MODE (SET_SRC (set)) == set_mode;
17522 }
17523
17524 /* Generate insn patterns to do an integer compare of OPERANDS. */
17525
17526 static rtx
17527 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
17528 {
17529 enum machine_mode cmpmode;
17530 rtx tmp, flags;
17531
17532 cmpmode = SELECT_CC_MODE (code, op0, op1);
17533 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
17534
17535 /* This is very simple, but making the interface the same as in the
17536 FP case makes the rest of the code easier. */
17537 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
17538 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
17539
17540 /* Return the test that should be put into the flags user, i.e.
17541 the bcc, scc, or cmov instruction. */
17542 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
17543 }
17544
17545 /* Figure out whether to use ordered or unordered fp comparisons.
17546 Return the appropriate mode to use. */
17547
17548 enum machine_mode
17549 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
17550 {
17551 /* ??? In order to make all comparisons reversible, we do all comparisons
17552 non-trapping when compiling for IEEE. Once gcc is able to distinguish
17553 between all forms of trapping and nontrapping comparisons, we can make
17554 inequality comparisons trapping again, since it results in better code
17555 when using FCOM based compares. */
17556 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
17557 }
17558
17559 enum machine_mode
17560 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
17561 {
17562 enum machine_mode mode = GET_MODE (op0);
17563
17564 if (SCALAR_FLOAT_MODE_P (mode))
17565 {
17566 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
17567 return ix86_fp_compare_mode (code);
17568 }
17569
17570 switch (code)
17571 {
17572 /* Only zero flag is needed. */
17573 case EQ: /* ZF=0 */
17574 case NE: /* ZF!=0 */
17575 return CCZmode;
17576 /* Codes needing carry flag. */
17577 case GEU: /* CF=0 */
17578 case LTU: /* CF=1 */
17579 /* Detect overflow checks. They need just the carry flag. */
17580 if (GET_CODE (op0) == PLUS
17581 && rtx_equal_p (op1, XEXP (op0, 0)))
17582 return CCCmode;
17583 else
17584 return CCmode;
17585 case GTU: /* CF=0 & ZF=0 */
17586 case LEU: /* CF=1 | ZF=1 */
17587 /* Detect overflow checks. They need just the carry flag. */
17588 if (GET_CODE (op0) == MINUS
17589 && rtx_equal_p (op1, XEXP (op0, 0)))
17590 return CCCmode;
17591 else
17592 return CCmode;
17593 /* Codes possibly doable only with sign flag when
17594 comparing against zero. */
17595 case GE: /* SF=OF or SF=0 */
17596 case LT: /* SF<>OF or SF=1 */
17597 if (op1 == const0_rtx)
17598 return CCGOCmode;
17599 else
17600 /* For other cases Carry flag is not required. */
17601 return CCGCmode;
17602 /* Codes doable only with the sign flag when comparing
17603 against zero, but for which we lack a jump instruction,
17604 so we need to use relational tests against overflow,
17605 which therefore must be zero. */
17606 case GT: /* ZF=0 & SF=OF */
17607 case LE: /* ZF=1 | SF<>OF */
17608 if (op1 == const0_rtx)
17609 return CCNOmode;
17610 else
17611 return CCGCmode;
17612 /* The strcmp pattern does (use flags) and combine may ask us for a
17613 proper mode. */
17614 case USE:
17615 return CCmode;
17616 default:
17617 gcc_unreachable ();
17618 }
17619 }
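
/* For example (illustrative): a comparison x == y only needs the zero
   flag and gets CCZmode; x < y (signed) against a nonzero y gets
   CCGCmode, while x < 0 gets CCGOCmode as per the cases above.  */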
17620
17621 /* Return the fixed registers used for condition codes. */
17622
17623 static bool
17624 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
17625 {
17626 *p1 = FLAGS_REG;
17627 *p2 = FPSR_REG;
17628 return true;
17629 }
17630
17631 /* If two condition code modes are compatible, return a condition code
17632 mode which is compatible with both. Otherwise, return
17633 VOIDmode. */
17634
17635 static enum machine_mode
17636 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
17637 {
17638 if (m1 == m2)
17639 return m1;
17640
17641 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
17642 return VOIDmode;
17643
17644 if ((m1 == CCGCmode && m2 == CCGOCmode)
17645 || (m1 == CCGOCmode && m2 == CCGCmode))
17646 return CCGCmode;
17647
17648 switch (m1)
17649 {
17650 default:
17651 gcc_unreachable ();
17652
17653 case CCmode:
17654 case CCGCmode:
17655 case CCGOCmode:
17656 case CCNOmode:
17657 case CCAmode:
17658 case CCCmode:
17659 case CCOmode:
17660 case CCSmode:
17661 case CCZmode:
17662 switch (m2)
17663 {
17664 default:
17665 return VOIDmode;
17666
17667 case CCmode:
17668 case CCGCmode:
17669 case CCGOCmode:
17670 case CCNOmode:
17671 case CCAmode:
17672 case CCCmode:
17673 case CCOmode:
17674 case CCSmode:
17675 case CCZmode:
17676 return CCmode;
17677 }
17678
17679 case CCFPmode:
17680 case CCFPUmode:
17681 /* These are only compatible with themselves, which we already
17682 checked above. */
17683 return VOIDmode;
17684 }
17685 }
17686
17687
17688 /* Return a comparison we can do that is equivalent to
17689 swap_condition (code), apart possibly from orderedness.
17690 Never change orderedness if TARGET_IEEE_FP, returning
17691 UNKNOWN in that case if necessary. */
17692
17693 static enum rtx_code
17694 ix86_fp_swap_condition (enum rtx_code code)
17695 {
17696 switch (code)
17697 {
17698 case GT: /* GTU - CF=0 & ZF=0 */
17699 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
17700 case GE: /* GEU - CF=0 */
17701 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
17702 case UNLT: /* LTU - CF=1 */
17703 return TARGET_IEEE_FP ? UNKNOWN : GT;
17704 case UNLE: /* LEU - CF=1 | ZF=1 */
17705 return TARGET_IEEE_FP ? UNKNOWN : GE;
17706 default:
17707 return swap_condition (code);
17708 }
17709 }
17710
17711 /* Return the cost of comparison CODE using the best strategy for performance.
17712 All the following functions use the number of instructions as a cost metric.
17713 In the future this should be tweaked to compute bytes for optimize_size and
17714 to take into account the performance of various instructions on various CPUs. */
17715
17716 static int
17717 ix86_fp_comparison_cost (enum rtx_code code)
17718 {
17719 int arith_cost;
17720
17721 /* The cost of code using bit-twiddling on %ah. */
17722 switch (code)
17723 {
17724 case UNLE:
17725 case UNLT:
17726 case LTGT:
17727 case GT:
17728 case GE:
17729 case UNORDERED:
17730 case ORDERED:
17731 case UNEQ:
17732 arith_cost = 4;
17733 break;
17734 case LT:
17735 case NE:
17736 case EQ:
17737 case UNGE:
17738 arith_cost = TARGET_IEEE_FP ? 5 : 4;
17739 break;
17740 case LE:
17741 case UNGT:
17742 arith_cost = TARGET_IEEE_FP ? 6 : 4;
17743 break;
17744 default:
17745 gcc_unreachable ();
17746 }
17747
17748 switch (ix86_fp_comparison_strategy (code))
17749 {
17750 case IX86_FPCMP_COMI:
17751 return arith_cost > 4 ? 3 : 2;
17752 case IX86_FPCMP_SAHF:
17753 return arith_cost > 4 ? 4 : 3;
17754 default:
17755 return arith_cost;
17756 }
17757 }
17758
17759 /* Return the strategy to use for a floating-point comparison. We assume that
17760 fcomi is always preferable where available, since that is also true when
17761 looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
17762
17763 enum ix86_fpcmp_strategy
17764 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
17765 {
17766 /* Do fcomi/sahf based test when profitable. */
17767
17768 if (TARGET_CMOVE)
17769 return IX86_FPCMP_COMI;
17770
17771 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
17772 return IX86_FPCMP_SAHF;
17773
17774 return IX86_FPCMP_ARITH;
17775 }
17776
17777 /* Swap, force into registers, or otherwise massage the two operands
17778 to a fp comparison. The operands are updated in place; the new
17779 comparison code is returned. */
17780
17781 static enum rtx_code
17782 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
17783 {
17784 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
17785 rtx op0 = *pop0, op1 = *pop1;
17786 enum machine_mode op_mode = GET_MODE (op0);
17787 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
17788
17789 /* All of the unordered compare instructions only work on registers.
17790 The same is true of the fcomi compare instructions. The XFmode
17791 compare instructions require registers except when comparing
17792 against zero or when converting operand 1 from fixed point to
17793 floating point. */
17794
17795 if (!is_sse
17796 && (fpcmp_mode == CCFPUmode
17797 || (op_mode == XFmode
17798 && ! (standard_80387_constant_p (op0) == 1
17799 || standard_80387_constant_p (op1) == 1)
17800 && GET_CODE (op1) != FLOAT)
17801 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
17802 {
17803 op0 = force_reg (op_mode, op0);
17804 op1 = force_reg (op_mode, op1);
17805 }
17806 else
17807 {
17808 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
17809 things around if they appear profitable, otherwise force op0
17810 into a register. */
17811
17812 if (standard_80387_constant_p (op0) == 0
17813 || (MEM_P (op0)
17814 && ! (standard_80387_constant_p (op1) == 0
17815 || MEM_P (op1))))
17816 {
17817 enum rtx_code new_code = ix86_fp_swap_condition (code);
17818 if (new_code != UNKNOWN)
17819 {
17820 rtx tmp;
17821 tmp = op0, op0 = op1, op1 = tmp;
17822 code = new_code;
17823 }
17824 }
17825
17826 if (!REG_P (op0))
17827 op0 = force_reg (op_mode, op0);
17828
17829 if (CONSTANT_P (op1))
17830 {
17831 int tmp = standard_80387_constant_p (op1);
17832 if (tmp == 0)
17833 op1 = validize_mem (force_const_mem (op_mode, op1));
17834 else if (tmp == 1)
17835 {
17836 if (TARGET_CMOVE)
17837 op1 = force_reg (op_mode, op1);
17838 }
17839 else
17840 op1 = force_reg (op_mode, op1);
17841 }
17842 }
17843
17844 /* Try to rearrange the comparison to make it cheaper. */
17845 if (ix86_fp_comparison_cost (code)
17846 > ix86_fp_comparison_cost (swap_condition (code))
17847 && (REG_P (op1) || can_create_pseudo_p ()))
17848 {
17849 rtx tmp;
17850 tmp = op0, op0 = op1, op1 = tmp;
17851 code = swap_condition (code);
17852 if (!REG_P (op0))
17853 op0 = force_reg (op_mode, op0);
17854 }
17855
17856 *pop0 = op0;
17857 *pop1 = op1;
17858 return code;
17859 }
17860
17861 /* Convert a comparison code used to represent an FP comparison into the
17862 integer code that will result in a proper branch. Return UNKNOWN if no
17863 such code is available. */
17864
17865 enum rtx_code
17866 ix86_fp_compare_code_to_integer (enum rtx_code code)
17867 {
17868 switch (code)
17869 {
17870 case GT:
17871 return GTU;
17872 case GE:
17873 return GEU;
17874 case ORDERED:
17875 case UNORDERED:
17876 return code;
17877 break;
17878 case UNEQ:
17879 return EQ;
17880 break;
17881 case UNLT:
17882 return LTU;
17883 break;
17884 case UNLE:
17885 return LEU;
17886 break;
17887 case LTGT:
17888 return NE;
17889 break;
17890 default:
17891 return UNKNOWN;
17892 }
17893 }
17894
17895 /* Generate insn patterns to do a floating point compare of OPERANDS. */
17896
17897 static rtx
17898 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
17899 {
17900 enum machine_mode fpcmp_mode, intcmp_mode;
17901 rtx tmp, tmp2;
17902
17903 fpcmp_mode = ix86_fp_compare_mode (code);
17904 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
17905
17906 /* Do fcomi/sahf based test when profitable. */
17907 switch (ix86_fp_comparison_strategy (code))
17908 {
17909 case IX86_FPCMP_COMI:
17910 intcmp_mode = fpcmp_mode;
17911 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
17912 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
17913 tmp);
17914 emit_insn (tmp);
17915 break;
17916
17917 case IX86_FPCMP_SAHF:
17918 intcmp_mode = fpcmp_mode;
17919 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
17920 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
17921 tmp);
17922
17923 if (!scratch)
17924 scratch = gen_reg_rtx (HImode);
17925 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
17926 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
17927 break;
17928
17929 case IX86_FPCMP_ARITH:
17930 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
17931 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
17932 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
17933 if (!scratch)
17934 scratch = gen_reg_rtx (HImode);
17935 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
17936
17937 /* In the unordered case, we have to check C2 for NaN's, which
17938 doesn't happen to work out to anything nice combination-wise.
17939 So do some bit twiddling on the value we've got in AH to come
17940 up with an appropriate set of condition codes. */
17941
17942 intcmp_mode = CCNOmode;
17943 switch (code)
17944 {
17945 case GT:
17946 case UNGT:
17947 if (code == GT || !TARGET_IEEE_FP)
17948 {
17949 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
17950 code = EQ;
17951 }
17952 else
17953 {
17954 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
17955 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
17956 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
17957 intcmp_mode = CCmode;
17958 code = GEU;
17959 }
17960 break;
17961 case LT:
17962 case UNLT:
17963 if (code == LT && TARGET_IEEE_FP)
17964 {
17965 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
17966 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
17967 intcmp_mode = CCmode;
17968 code = EQ;
17969 }
17970 else
17971 {
17972 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
17973 code = NE;
17974 }
17975 break;
17976 case GE:
17977 case UNGE:
17978 if (code == GE || !TARGET_IEEE_FP)
17979 {
17980 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
17981 code = EQ;
17982 }
17983 else
17984 {
17985 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
17986 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
17987 code = NE;
17988 }
17989 break;
17990 case LE:
17991 case UNLE:
17992 if (code == LE && TARGET_IEEE_FP)
17993 {
17994 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
17995 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
17996 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
17997 intcmp_mode = CCmode;
17998 code = LTU;
17999 }
18000 else
18001 {
18002 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18003 code = NE;
18004 }
18005 break;
18006 case EQ:
18007 case UNEQ:
18008 if (code == EQ && TARGET_IEEE_FP)
18009 {
18010 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18011 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18012 intcmp_mode = CCmode;
18013 code = EQ;
18014 }
18015 else
18016 {
18017 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18018 code = NE;
18019 }
18020 break;
18021 case NE:
18022 case LTGT:
18023 if (code == NE && TARGET_IEEE_FP)
18024 {
18025 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18026 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
18027 GEN_INT (0x40)));
18028 code = NE;
18029 }
18030 else
18031 {
18032 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18033 code = EQ;
18034 }
18035 break;
18036
18037 case UNORDERED:
18038 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18039 code = NE;
18040 break;
18041 case ORDERED:
18042 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18043 code = EQ;
18044 break;
18045
18046 default:
18047 gcc_unreachable ();
18048 }
18049 break;
18050
18051 default:
18052 gcc_unreachable();
18053 }
18054
18055 /* Return the test that should be put into the flags user, i.e.
18056 the bcc, scc, or cmov instruction. */
18057 return gen_rtx_fmt_ee (code, VOIDmode,
18058 gen_rtx_REG (intcmp_mode, FLAGS_REG),
18059 const0_rtx);
18060 }
18061
18062 static rtx
18063 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
18064 {
18065 rtx ret;
18066
18067 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
18068 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
18069
18070 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
18071 {
18072 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
18073 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18074 }
18075 else
18076 ret = ix86_expand_int_compare (code, op0, op1);
18077
18078 return ret;
18079 }
18080
18081 void
18082 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
18083 {
18084 enum machine_mode mode = GET_MODE (op0);
18085 rtx tmp;
18086
18087 switch (mode)
18088 {
18089 case SFmode:
18090 case DFmode:
18091 case XFmode:
18092 case QImode:
18093 case HImode:
18094 case SImode:
18095 simple:
18096 tmp = ix86_expand_compare (code, op0, op1);
18097 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18098 gen_rtx_LABEL_REF (VOIDmode, label),
18099 pc_rtx);
18100 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
18101 return;
18102
18103 case DImode:
18104 if (TARGET_64BIT)
18105 goto simple;
18106 case TImode:
18107 /* Expand DImode branch into multiple compare+branch. */
18108 {
18109 rtx lo[2], hi[2], label2;
18110 enum rtx_code code1, code2, code3;
18111 enum machine_mode submode;
18112
18113 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
18114 {
18115 tmp = op0, op0 = op1, op1 = tmp;
18116 code = swap_condition (code);
18117 }
18118
18119 split_double_mode (mode, &op0, 1, lo+0, hi+0);
18120 split_double_mode (mode, &op1, 1, lo+1, hi+1);
18121
18122 submode = mode == DImode ? SImode : DImode;
18123
18124 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
18125 avoid two branches. This costs one extra insn, so disable when
18126 optimizing for size. */
18127
18128 if ((code == EQ || code == NE)
18129 && (!optimize_insn_for_size_p ()
18130 || hi[1] == const0_rtx || lo[1] == const0_rtx))
18131 {
18132 rtx xor0, xor1;
18133
18134 xor1 = hi[0];
18135 if (hi[1] != const0_rtx)
18136 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
18137 NULL_RTX, 0, OPTAB_WIDEN);
18138
18139 xor0 = lo[0];
18140 if (lo[1] != const0_rtx)
18141 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
18142 NULL_RTX, 0, OPTAB_WIDEN);
18143
18144 tmp = expand_binop (submode, ior_optab, xor1, xor0,
18145 NULL_RTX, 0, OPTAB_WIDEN);
18146
18147 ix86_expand_branch (code, tmp, const0_rtx, label);
18148 return;
18149 }
18150
18151 /* Otherwise, if we are doing a less-than or greater-than-or-equal
18152 comparison, op1 is a constant and the low word is zero, then we can
18153 just examine the high word. Similarly for a low word of -1 and a
18154 less-than-or-equal or greater-than comparison. */
18155
18156 if (CONST_INT_P (hi[1]))
18157 switch (code)
18158 {
18159 case LT: case LTU: case GE: case GEU:
18160 if (lo[1] == const0_rtx)
18161 {
18162 ix86_expand_branch (code, hi[0], hi[1], label);
18163 return;
18164 }
18165 break;
18166 case LE: case LEU: case GT: case GTU:
18167 if (lo[1] == constm1_rtx)
18168 {
18169 ix86_expand_branch (code, hi[0], hi[1], label);
18170 return;
18171 }
18172 break;
18173 default:
18174 break;
18175 }
18176
18177 /* Otherwise, we need two or three jumps. */
18178
18179 label2 = gen_label_rtx ();
18180
18181 code1 = code;
18182 code2 = swap_condition (code);
18183 code3 = unsigned_condition (code);
18184
18185 switch (code)
18186 {
18187 case LT: case GT: case LTU: case GTU:
18188 break;
18189
18190 case LE: code1 = LT; code2 = GT; break;
18191 case GE: code1 = GT; code2 = LT; break;
18192 case LEU: code1 = LTU; code2 = GTU; break;
18193 case GEU: code1 = GTU; code2 = LTU; break;
18194
18195 case EQ: code1 = UNKNOWN; code2 = NE; break;
18196 case NE: code2 = UNKNOWN; break;
18197
18198 default:
18199 gcc_unreachable ();
18200 }
18201
18202 /*
18203 * a < b =>
18204 * if (hi(a) < hi(b)) goto true;
18205 * if (hi(a) > hi(b)) goto false;
18206 * if (lo(a) < lo(b)) goto true;
18207 * false:
18208 */
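 /*
  * E.g. a signed "a <= b" uses code1 = LT, code2 = GT and code3 = LEU
  * from the table above:
  * if (hi(a) < hi(b)) goto true;
  * if (hi(a) > hi(b)) goto false;
  * if (lo(a) <= lo(b)) goto true;   [unsigned compare]
  * false:
  */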
18209
18210 if (code1 != UNKNOWN)
18211 ix86_expand_branch (code1, hi[0], hi[1], label);
18212 if (code2 != UNKNOWN)
18213 ix86_expand_branch (code2, hi[0], hi[1], label2);
18214
18215 ix86_expand_branch (code3, lo[0], lo[1], label);
18216
18217 if (code2 != UNKNOWN)
18218 emit_label (label2);
18219 return;
18220 }
18221
18222 default:
18223 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
18224 goto simple;
18225 }
18226 }
18227
18228 /* Split branch based on floating point condition. */
18229 void
18230 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
18231 rtx target1, rtx target2, rtx tmp, rtx pushed)
18232 {
18233 rtx condition;
18234 rtx i;
18235
18236 if (target2 != pc_rtx)
18237 {
18238 rtx tmp = target2;
18239 code = reverse_condition_maybe_unordered (code);
18240 target2 = target1;
18241 target1 = tmp;
18242 }
18243
18244 condition = ix86_expand_fp_compare (code, op1, op2,
18245 tmp);
18246
18247 /* Remove pushed operand from stack. */
18248 if (pushed)
18249 ix86_free_from_memory (GET_MODE (pushed));
18250
18251 i = emit_jump_insn (gen_rtx_SET
18252 (VOIDmode, pc_rtx,
18253 gen_rtx_IF_THEN_ELSE (VOIDmode,
18254 condition, target1, target2)));
18255 if (split_branch_probability >= 0)
18256 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
18257 }
18258
18259 void
18260 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
18261 {
18262 rtx ret;
18263
18264 gcc_assert (GET_MODE (dest) == QImode);
18265
18266 ret = ix86_expand_compare (code, op0, op1);
18267 PUT_MODE (ret, QImode);
18268 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
18269 }
18270
18271 /* Expand a comparison setting or clearing the carry flag. Return true
18272 when successful and store the comparison in *POP. */
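 /* For instance, an unsigned "a < b" already maps onto the carry flag:
    "cmpl b, a" sets CF exactly when a < b (unsigned), and the returned
    rtx is an LTU test of the flags register against 0.  */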
18273 static bool
18274 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
18275 {
18276 enum machine_mode mode =
18277 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
18278
18279 /* Do not handle double-mode compares that go through special path. */
18280 if (mode == (TARGET_64BIT ? TImode : DImode))
18281 return false;
18282
18283 if (SCALAR_FLOAT_MODE_P (mode))
18284 {
18285 rtx compare_op, compare_seq;
18286
18287 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18288
18289 /* Shortcut: following common codes never translate
18290 into carry flag compares. */
18291 if (code == EQ || code == NE || code == UNEQ || code == LTGT
18292 || code == ORDERED || code == UNORDERED)
18293 return false;
18294
18295 /* These comparisons require the zero flag; swap the operands so they no longer do. */
18296 if ((code == GT || code == UNLE || code == LE || code == UNGT)
18297 && !TARGET_IEEE_FP)
18298 {
18299 rtx tmp = op0;
18300 op0 = op1;
18301 op1 = tmp;
18302 code = swap_condition (code);
18303 }
18304
18305 /* Try to expand the comparison and verify that we end up with
18306 a carry flag based comparison. This fails only when we decide
18307 to expand the comparison using arithmetic, which is not a
18308 common scenario. */
18309 start_sequence ();
18310 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18311 compare_seq = get_insns ();
18312 end_sequence ();
18313
18314 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
18315 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
18316 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
18317 else
18318 code = GET_CODE (compare_op);
18319
18320 if (code != LTU && code != GEU)
18321 return false;
18322
18323 emit_insn (compare_seq);
18324 *pop = compare_op;
18325 return true;
18326 }
18327
18328 if (!INTEGRAL_MODE_P (mode))
18329 return false;
18330
18331 switch (code)
18332 {
18333 case LTU:
18334 case GEU:
18335 break;
18336
18337 /* Convert a==0 into (unsigned)a<1. */
18338 case EQ:
18339 case NE:
18340 if (op1 != const0_rtx)
18341 return false;
18342 op1 = const1_rtx;
18343 code = (code == EQ ? LTU : GEU);
18344 break;
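 /* E.g. "a == 0" becomes "(unsigned) a < 1": "cmpl $1, a" sets the
    carry flag exactly when a is zero, so only CF needs testing.  */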
18345
18346 /* Convert a>b into b<a or a>=b+1. */
18347 case GTU:
18348 case LEU:
18349 if (CONST_INT_P (op1))
18350 {
18351 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
18352 /* Bail out on overflow. We still can swap operands but that
18353 would force loading of the constant into register. */
18354 if (op1 == const0_rtx
18355 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
18356 return false;
18357 code = (code == GTU ? GEU : LTU);
18358 }
18359 else
18360 {
18361 rtx tmp = op1;
18362 op1 = op0;
18363 op0 = tmp;
18364 code = (code == GTU ? LTU : GEU);
18365 }
18366 break;
18367
18368 /* Convert a>=0 into (unsigned)a<0x80000000. */
18369 case LT:
18370 case GE:
18371 if (mode == DImode || op1 != const0_rtx)
18372 return false;
18373 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18374 code = (code == LT ? GEU : LTU);
18375 break;
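 /* E.g. "a >= 0" becomes "(unsigned) a < 0x80000000": comparing against
    the sign bit sets CF exactly when the sign bit of a is clear.  */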
18376 case LE:
18377 case GT:
18378 if (mode == DImode || op1 != constm1_rtx)
18379 return false;
18380 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18381 code = (code == LE ? GEU : LTU);
18382 break;
18383
18384 default:
18385 return false;
18386 }
18387 /* Swapping operands may cause constant to appear as first operand. */
18388 if (!nonimmediate_operand (op0, VOIDmode))
18389 {
18390 if (!can_create_pseudo_p ())
18391 return false;
18392 op0 = force_reg (mode, op0);
18393 }
18394 *pop = ix86_expand_compare (code, op0, op1);
18395 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
18396 return true;
18397 }
18398
18399 bool
18400 ix86_expand_int_movcc (rtx operands[])
18401 {
18402 enum rtx_code code = GET_CODE (operands[1]), compare_code;
18403 rtx compare_seq, compare_op;
18404 enum machine_mode mode = GET_MODE (operands[0]);
18405 bool sign_bit_compare_p = false;
18406 rtx op0 = XEXP (operands[1], 0);
18407 rtx op1 = XEXP (operands[1], 1);
18408
18409 start_sequence ();
18410 compare_op = ix86_expand_compare (code, op0, op1);
18411 compare_seq = get_insns ();
18412 end_sequence ();
18413
18414 compare_code = GET_CODE (compare_op);
18415
18416 if ((op1 == const0_rtx && (code == GE || code == LT))
18417 || (op1 == constm1_rtx && (code == GT || code == LE)))
18418 sign_bit_compare_p = true;
18419
18420 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
18421 HImode insns, we'd be swallowed in word prefix ops. */
18422
18423 if ((mode != HImode || TARGET_FAST_PREFIX)
18424 && (mode != (TARGET_64BIT ? TImode : DImode))
18425 && CONST_INT_P (operands[2])
18426 && CONST_INT_P (operands[3]))
18427 {
18428 rtx out = operands[0];
18429 HOST_WIDE_INT ct = INTVAL (operands[2]);
18430 HOST_WIDE_INT cf = INTVAL (operands[3]);
18431 HOST_WIDE_INT diff;
18432
18433 diff = ct - cf;
18434 /* Sign bit compares are better done using shifts than by using
18435 sbb. */
18436 if (sign_bit_compare_p
18437 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
18438 {
18439 /* Detect overlap between destination and compare sources. */
18440 rtx tmp = out;
18441
18442 if (!sign_bit_compare_p)
18443 {
18444 rtx flags;
18445 bool fpcmp = false;
18446
18447 compare_code = GET_CODE (compare_op);
18448
18449 flags = XEXP (compare_op, 0);
18450
18451 if (GET_MODE (flags) == CCFPmode
18452 || GET_MODE (flags) == CCFPUmode)
18453 {
18454 fpcmp = true;
18455 compare_code
18456 = ix86_fp_compare_code_to_integer (compare_code);
18457 }
18458
18459 /* To simplify rest of code, restrict to the GEU case. */
18460 if (compare_code == LTU)
18461 {
18462 HOST_WIDE_INT tmp = ct;
18463 ct = cf;
18464 cf = tmp;
18465 compare_code = reverse_condition (compare_code);
18466 code = reverse_condition (code);
18467 }
18468 else
18469 {
18470 if (fpcmp)
18471 PUT_CODE (compare_op,
18472 reverse_condition_maybe_unordered
18473 (GET_CODE (compare_op)));
18474 else
18475 PUT_CODE (compare_op,
18476 reverse_condition (GET_CODE (compare_op)));
18477 }
18478 diff = ct - cf;
18479
18480 if (reg_overlap_mentioned_p (out, op0)
18481 || reg_overlap_mentioned_p (out, op1))
18482 tmp = gen_reg_rtx (mode);
18483
18484 if (mode == DImode)
18485 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
18486 else
18487 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
18488 flags, compare_op));
18489 }
18490 else
18491 {
18492 if (code == GT || code == GE)
18493 code = reverse_condition (code);
18494 else
18495 {
18496 HOST_WIDE_INT tmp = ct;
18497 ct = cf;
18498 cf = tmp;
18499 diff = ct - cf;
18500 }
18501 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
18502 }
18503
18504 if (diff == 1)
18505 {
18506 /*
18507 * cmpl op0,op1
18508 * sbbl dest,dest
18509 * [addl dest, ct]
18510 *
18511 * Size 5 - 8.
18512 */
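 /* The "sbbl dest,dest" above computes dest = -CF, i.e. -1 when the
    carry comparison is true and 0 otherwise; adding ct (with
    diff == ct - cf == 1) then yields either cf or ct.  */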
18513 if (ct)
18514 tmp = expand_simple_binop (mode, PLUS,
18515 tmp, GEN_INT (ct),
18516 copy_rtx (tmp), 1, OPTAB_DIRECT);
18517 }
18518 else if (cf == -1)
18519 {
18520 /*
18521 * cmpl op0,op1
18522 * sbbl dest,dest
18523 * orl $ct, dest
18524 *
18525 * Size 8.
18526 */
18527 tmp = expand_simple_binop (mode, IOR,
18528 tmp, GEN_INT (ct),
18529 copy_rtx (tmp), 1, OPTAB_DIRECT);
18530 }
18531 else if (diff == -1 && ct)
18532 {
18533 /*
18534 * cmpl op0,op1
18535 * sbbl dest,dest
18536 * notl dest
18537 * [addl dest, cf]
18538 *
18539 * Size 8 - 11.
18540 */
18541 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
18542 if (cf)
18543 tmp = expand_simple_binop (mode, PLUS,
18544 copy_rtx (tmp), GEN_INT (cf),
18545 copy_rtx (tmp), 1, OPTAB_DIRECT);
18546 }
18547 else
18548 {
18549 /*
18550 * cmpl op0,op1
18551 * sbbl dest,dest
18552 * [notl dest]
18553 * andl cf - ct, dest
18554 * [addl dest, ct]
18555 *
18556 * Size 8 - 11.
18557 */
18558
18559 if (cf == 0)
18560 {
18561 cf = ct;
18562 ct = 0;
18563 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
18564 }
18565
18566 tmp = expand_simple_binop (mode, AND,
18567 copy_rtx (tmp),
18568 gen_int_mode (cf - ct, mode),
18569 copy_rtx (tmp), 1, OPTAB_DIRECT);
18570 if (ct)
18571 tmp = expand_simple_binop (mode, PLUS,
18572 copy_rtx (tmp), GEN_INT (ct),
18573 copy_rtx (tmp), 1, OPTAB_DIRECT);
18574 }
18575
18576 if (!rtx_equal_p (tmp, out))
18577 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
18578
18579 return true;
18580 }
18581
18582 if (diff < 0)
18583 {
18584 enum machine_mode cmp_mode = GET_MODE (op0);
18585
18586 HOST_WIDE_INT tmp;
18587 tmp = ct, ct = cf, cf = tmp;
18588 diff = -diff;
18589
18590 if (SCALAR_FLOAT_MODE_P (cmp_mode))
18591 {
18592 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
18593
18594 /* We may be reversing an unordered compare to a normal compare, which
18595 is not valid in general (we may convert a non-trapping condition
18596 into a trapping one); however, on i386 we currently emit all
18597 comparisons unordered. */
18598 compare_code = reverse_condition_maybe_unordered (compare_code);
18599 code = reverse_condition_maybe_unordered (code);
18600 }
18601 else
18602 {
18603 compare_code = reverse_condition (compare_code);
18604 code = reverse_condition (code);
18605 }
18606 }
18607
18608 compare_code = UNKNOWN;
18609 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
18610 && CONST_INT_P (op1))
18611 {
18612 if (op1 == const0_rtx
18613 && (code == LT || code == GE))
18614 compare_code = code;
18615 else if (op1 == constm1_rtx)
18616 {
18617 if (code == LE)
18618 compare_code = LT;
18619 else if (code == GT)
18620 compare_code = GE;
18621 }
18622 }
18623
18624 /* Optimize dest = (op0 < 0) ? -1 : cf. */
18625 if (compare_code != UNKNOWN
18626 && GET_MODE (op0) == GET_MODE (out)
18627 && (cf == -1 || ct == -1))
18628 {
18629 /* If lea code below could be used, only optimize
18630 if it results in a 2 insn sequence. */
18631
18632 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
18633 || diff == 3 || diff == 5 || diff == 9)
18634 || (compare_code == LT && ct == -1)
18635 || (compare_code == GE && cf == -1))
18636 {
18637 /*
18638 * notl op1 (if necessary)
18639 * sarl $31, op1
18640 * orl cf, op1
18641 */
18642 if (ct != -1)
18643 {
18644 cf = ct;
18645 ct = -1;
18646 code = reverse_condition (code);
18647 }
18648
18649 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
18650
18651 out = expand_simple_binop (mode, IOR,
18652 out, GEN_INT (cf),
18653 out, 1, OPTAB_DIRECT);
18654 if (out != operands[0])
18655 emit_move_insn (operands[0], out);
18656
18657 return true;
18658 }
18659 }
18660
18661
18662 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
18663 || diff == 3 || diff == 5 || diff == 9)
18664 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
18665 && (mode != DImode
18666 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
18667 {
18668 /*
18669 * xorl dest,dest
18670 * cmpl op1,op2
18671 * setcc dest
18672 * lea cf(dest*(ct-cf)),dest
18673 *
18674 * Size 14.
18675 *
18676 * This also catches the degenerate setcc-only case.
18677 */
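 /* E.g. "x = (a < b) ? 5 : 2" has diff == 3, and ends up roughly as
        xorl  %eax, %eax
        cmpl  b, a
        setl  %al
        leal  2(%eax,%eax,2), %eax
    with the lea computing eax * 3 + 2, i.e. 5 or 2.  */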
18678
18679 rtx tmp;
18680 int nops;
18681
18682 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
18683
18684 nops = 0;
18685 /* On x86_64 the lea instruction operates on Pmode, so we need
18686 to get the arithmetic done in the proper mode to match. */
18687 if (diff == 1)
18688 tmp = copy_rtx (out);
18689 else
18690 {
18691 rtx out1;
18692 out1 = copy_rtx (out);
18693 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
18694 nops++;
18695 if (diff & 1)
18696 {
18697 tmp = gen_rtx_PLUS (mode, tmp, out1);
18698 nops++;
18699 }
18700 }
18701 if (cf != 0)
18702 {
18703 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
18704 nops++;
18705 }
18706 if (!rtx_equal_p (tmp, out))
18707 {
18708 if (nops == 1)
18709 out = force_operand (tmp, copy_rtx (out));
18710 else
18711 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
18712 }
18713 if (!rtx_equal_p (out, operands[0]))
18714 emit_move_insn (operands[0], copy_rtx (out));
18715
18716 return true;
18717 }
18718
18719 /*
18720 * General case: Jumpful:
18721 * xorl dest,dest cmpl op1, op2
18722 * cmpl op1, op2 movl ct, dest
18723 * setcc dest jcc 1f
18724 * decl dest movl cf, dest
18725 * andl (cf-ct),dest 1:
18726 * addl ct,dest
18727 *
18728 * Size 20. Size 14.
18729 *
18730 * This is reasonably steep, but branch mispredict costs are
18731 * high on modern cpus, so consider failing only if optimizing
18732 * for space.
18733 */
18734
18735 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
18736 && BRANCH_COST (optimize_insn_for_speed_p (),
18737 false) >= 2)
18738 {
18739 if (cf == 0)
18740 {
18741 enum machine_mode cmp_mode = GET_MODE (op0);
18742
18743 cf = ct;
18744 ct = 0;
18745
18746 if (SCALAR_FLOAT_MODE_P (cmp_mode))
18747 {
18748 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
18749
18750 /* We may be reversing an unordered compare to a normal compare,
18751 which is not valid in general (we may convert a non-trapping
18752 condition into a trapping one); however, on i386 we currently
18753 emit all comparisons unordered. */
18754 code = reverse_condition_maybe_unordered (code);
18755 }
18756 else
18757 {
18758 code = reverse_condition (code);
18759 if (compare_code != UNKNOWN)
18760 compare_code = reverse_condition (compare_code);
18761 }
18762 }
18763
18764 if (compare_code != UNKNOWN)
18765 {
18766 /* notl op1 (if needed)
18767 sarl $31, op1
18768 andl (cf-ct), op1
18769 addl ct, op1
18770
18771 For x < 0 (resp. x <= -1) there will be no notl,
18772 so if possible swap the constants to get rid of the
18773 complement.
18774 True/false will be -1/0 while code below (store flag
18775 followed by decrement) is 0/-1, so the constants need
18776 to be exchanged once more. */
18777
18778 if (compare_code == GE || !cf)
18779 {
18780 code = reverse_condition (code);
18781 compare_code = LT;
18782 }
18783 else
18784 {
18785 HOST_WIDE_INT tmp = cf;
18786 cf = ct;
18787 ct = tmp;
18788 }
18789
18790 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
18791 }
18792 else
18793 {
18794 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
18795
18796 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
18797 constm1_rtx,
18798 copy_rtx (out), 1, OPTAB_DIRECT);
18799 }
18800
18801 out = expand_simple_binop (mode, AND, copy_rtx (out),
18802 gen_int_mode (cf - ct, mode),
18803 copy_rtx (out), 1, OPTAB_DIRECT);
18804 if (ct)
18805 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
18806 copy_rtx (out), 1, OPTAB_DIRECT);
18807 if (!rtx_equal_p (out, operands[0]))
18808 emit_move_insn (operands[0], copy_rtx (out));
18809
18810 return true;
18811 }
18812 }
18813
18814 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
18815 {
18816 /* Try a few things more with specific constants and a variable. */
18817
18818 optab op;
18819 rtx var, orig_out, out, tmp;
18820
18821 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
18822 return false;
18823
18824 /* If one of the two operands is an interesting constant, load a 0/-1
18825 constant via a recursive conditional move and mask the variable in with a logical operation. */
18826
18827 if (CONST_INT_P (operands[2]))
18828 {
18829 var = operands[3];
18830 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
18831 operands[3] = constm1_rtx, op = and_optab;
18832 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
18833 operands[3] = const0_rtx, op = ior_optab;
18834 else
18835 return false;
18836 }
18837 else if (CONST_INT_P (operands[3]))
18838 {
18839 var = operands[2];
18840 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
18841 operands[2] = constm1_rtx, op = and_optab;
18842 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
18843 operands[2] = const0_rtx, op = ior_optab;
18844 else
18845 return false;
18846 }
18847 else
18848 return false;
18849
18850 orig_out = operands[0];
18851 tmp = gen_reg_rtx (mode);
18852 operands[0] = tmp;
18853
18854 /* Recurse to get the constant loaded. */
18855 if (ix86_expand_int_movcc (operands) == 0)
18856 return false;
18857
18858 /* Mask in the interesting variable. */
18859 out = expand_binop (mode, op, var, tmp, orig_out, 0,
18860 OPTAB_WIDEN);
18861 if (!rtx_equal_p (out, orig_out))
18862 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
18863
18864 return true;
18865 }
18866
18867 /*
18868 * For comparison with above,
18869 *
18870 * movl cf,dest
18871 * movl ct,tmp
18872 * cmpl op1,op2
18873 * cmovcc tmp,dest
18874 *
18875 * Size 15.
18876 */
18877
18878 if (! nonimmediate_operand (operands[2], mode))
18879 operands[2] = force_reg (mode, operands[2]);
18880 if (! nonimmediate_operand (operands[3], mode))
18881 operands[3] = force_reg (mode, operands[3]);
18882
18883 if (! register_operand (operands[2], VOIDmode)
18884 && (mode == QImode
18885 || ! register_operand (operands[3], VOIDmode)))
18886 operands[2] = force_reg (mode, operands[2]);
18887
18888 if (mode == QImode
18889 && ! register_operand (operands[3], VOIDmode))
18890 operands[3] = force_reg (mode, operands[3]);
18891
18892 emit_insn (compare_seq);
18893 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
18894 gen_rtx_IF_THEN_ELSE (mode,
18895 compare_op, operands[2],
18896 operands[3])));
18897 return true;
18898 }
18899
18900 /* Swap, force into registers, or otherwise massage the two operands
18901 to an sse comparison with a mask result. Thus we differ a bit from
18902 ix86_prepare_fp_compare_args which expects to produce a flags result.
18903
18904 The DEST operand exists to help determine whether to commute commutative
18905 operators. The POP0/POP1 operands are updated in place. The new
18906 comparison code is returned, or UNKNOWN if not implementable. */
18907
18908 static enum rtx_code
18909 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
18910 rtx *pop0, rtx *pop1)
18911 {
18912 rtx tmp;
18913
18914 switch (code)
18915 {
18916 case LTGT:
18917 case UNEQ:
18918 /* AVX supports all the needed comparisons. */
18919 if (TARGET_AVX)
18920 break;
18921 /* We have no LTGT as an operator. We could implement it with
18922 NE & ORDERED, but this requires an extra temporary. It's
18923 not clear that it's worth it. */
18924 return UNKNOWN;
18925
18926 case LT:
18927 case LE:
18928 case UNGT:
18929 case UNGE:
18930 /* These are supported directly. */
18931 break;
18932
18933 case EQ:
18934 case NE:
18935 case UNORDERED:
18936 case ORDERED:
18937 /* AVX has 3 operand comparisons, no need to swap anything. */
18938 if (TARGET_AVX)
18939 break;
18940 /* For commutative operators, try to canonicalize the destination
18941 operand to be first in the comparison - this helps reload to
18942 avoid extra moves. */
18943 if (!dest || !rtx_equal_p (dest, *pop1))
18944 break;
18945 /* FALLTHRU */
18946
18947 case GE:
18948 case GT:
18949 case UNLE:
18950 case UNLT:
18951 /* These are not supported directly before AVX, and furthermore
18952 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
18953 comparison operands to transform into something that is
18954 supported. */
18955 tmp = *pop0;
18956 *pop0 = *pop1;
18957 *pop1 = tmp;
18958 code = swap_condition (code);
18959 break;
18960
18961 default:
18962 gcc_unreachable ();
18963 }
18964
18965 return code;
18966 }
18967
18968 /* Detect conditional moves that exactly match min/max operational
18969 semantics. Note that this is IEEE safe, as long as we don't
18970 interchange the operands.
18971
18972 Returns FALSE if this conditional move doesn't match a MIN/MAX,
18973 and TRUE if the operation is successful and instructions are emitted. */
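 /* The hardware min/max instructions (minss/minps, maxss/maxps, etc.)
    are not commutative when a NaN or a signed zero is involved (they
    favour one particular operand), which is why the operand order
    established here must be preserved for the result to stay IEEE safe.  */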
18974
18975 static bool
18976 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
18977 rtx cmp_op1, rtx if_true, rtx if_false)
18978 {
18979 enum machine_mode mode;
18980 bool is_min;
18981 rtx tmp;
18982
18983 if (code == LT)
18984 ;
18985 else if (code == UNGE)
18986 {
18987 tmp = if_true;
18988 if_true = if_false;
18989 if_false = tmp;
18990 }
18991 else
18992 return false;
18993
18994 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
18995 is_min = true;
18996 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
18997 is_min = false;
18998 else
18999 return false;
19000
19001 mode = GET_MODE (dest);
19002
19003 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
19004 but MODE may be a vector mode and thus not appropriate. */
19005 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
19006 {
19007 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
19008 rtvec v;
19009
19010 if_true = force_reg (mode, if_true);
19011 v = gen_rtvec (2, if_true, if_false);
19012 tmp = gen_rtx_UNSPEC (mode, v, u);
19013 }
19014 else
19015 {
19016 code = is_min ? SMIN : SMAX;
19017 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
19018 }
19019
19020 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
19021 return true;
19022 }
19023
19024 /* Expand an sse vector comparison. Return the register with the result. */
19025
19026 static rtx
19027 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
19028 rtx op_true, rtx op_false)
19029 {
19030 enum machine_mode mode = GET_MODE (dest);
19031 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
19032 rtx x;
19033
19034 cmp_op0 = force_reg (cmp_mode, cmp_op0);
19035 if (!nonimmediate_operand (cmp_op1, cmp_mode))
19036 cmp_op1 = force_reg (cmp_mode, cmp_op1);
19037
19038 if (optimize
19039 || reg_overlap_mentioned_p (dest, op_true)
19040 || reg_overlap_mentioned_p (dest, op_false))
19041 dest = gen_reg_rtx (mode);
19042
19043 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
19044 if (cmp_mode != mode)
19045 {
19046 x = force_reg (cmp_mode, x);
19047 convert_move (dest, x, false);
19048 }
19049 else
19050 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19051
19052 return dest;
19053 }
19054
19055 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
19056 operations. This is used for both scalar and vector conditional moves. */
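 /* In the absence of blend instructions the generic fallback below
    computes DEST = (CMP & OP_TRUE) | (~CMP & OP_FALSE), relying on CMP
    being an all-ones / all-zeros mask per element.  */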
19057
19058 static void
19059 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
19060 {
19061 enum machine_mode mode = GET_MODE (dest);
19062 rtx t2, t3, x;
19063
19064 if (vector_all_ones_operand (op_true, mode)
19065 && rtx_equal_p (op_false, CONST0_RTX (mode)))
19066 {
19067 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
19068 }
19069 else if (op_false == CONST0_RTX (mode))
19070 {
19071 op_true = force_reg (mode, op_true);
19072 x = gen_rtx_AND (mode, cmp, op_true);
19073 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19074 }
19075 else if (op_true == CONST0_RTX (mode))
19076 {
19077 op_false = force_reg (mode, op_false);
19078 x = gen_rtx_NOT (mode, cmp);
19079 x = gen_rtx_AND (mode, x, op_false);
19080 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19081 }
19082 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
19083 {
19084 op_false = force_reg (mode, op_false);
19085 x = gen_rtx_IOR (mode, cmp, op_false);
19086 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19087 }
19088 else if (TARGET_XOP)
19089 {
19090 op_true = force_reg (mode, op_true);
19091
19092 if (!nonimmediate_operand (op_false, mode))
19093 op_false = force_reg (mode, op_false);
19094
19095 emit_insn (gen_rtx_SET (mode, dest,
19096 gen_rtx_IF_THEN_ELSE (mode, cmp,
19097 op_true,
19098 op_false)));
19099 }
19100 else
19101 {
19102 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
19103
19104 if (!nonimmediate_operand (op_true, mode))
19105 op_true = force_reg (mode, op_true);
19106
19107 op_false = force_reg (mode, op_false);
19108
19109 switch (mode)
19110 {
19111 case V4SFmode:
19112 if (TARGET_SSE4_1)
19113 gen = gen_sse4_1_blendvps;
19114 break;
19115 case V2DFmode:
19116 if (TARGET_SSE4_1)
19117 gen = gen_sse4_1_blendvpd;
19118 break;
19119 case V16QImode:
19120 case V8HImode:
19121 case V4SImode:
19122 case V2DImode:
19123 if (TARGET_SSE4_1)
19124 {
19125 gen = gen_sse4_1_pblendvb;
19126 dest = gen_lowpart (V16QImode, dest);
19127 op_false = gen_lowpart (V16QImode, op_false);
19128 op_true = gen_lowpart (V16QImode, op_true);
19129 cmp = gen_lowpart (V16QImode, cmp);
19130 }
19131 break;
19132 case V8SFmode:
19133 if (TARGET_AVX)
19134 gen = gen_avx_blendvps256;
19135 break;
19136 case V4DFmode:
19137 if (TARGET_AVX)
19138 gen = gen_avx_blendvpd256;
19139 break;
19140 case V32QImode:
19141 case V16HImode:
19142 case V8SImode:
19143 case V4DImode:
19144 if (TARGET_AVX2)
19145 {
19146 gen = gen_avx2_pblendvb;
19147 dest = gen_lowpart (V32QImode, dest);
19148 op_false = gen_lowpart (V32QImode, op_false);
19149 op_true = gen_lowpart (V32QImode, op_true);
19150 cmp = gen_lowpart (V32QImode, cmp);
19151 }
19152 break;
19153 default:
19154 break;
19155 }
19156
19157 if (gen != NULL)
19158 emit_insn (gen (dest, op_false, op_true, cmp));
19159 else
19160 {
19161 op_true = force_reg (mode, op_true);
19162
19163 t2 = gen_reg_rtx (mode);
19164 if (optimize)
19165 t3 = gen_reg_rtx (mode);
19166 else
19167 t3 = dest;
19168
19169 x = gen_rtx_AND (mode, op_true, cmp);
19170 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
19171
19172 x = gen_rtx_NOT (mode, cmp);
19173 x = gen_rtx_AND (mode, x, op_false);
19174 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
19175
19176 x = gen_rtx_IOR (mode, t3, t2);
19177 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19178 }
19179 }
19180 }
19181
19182 /* Expand a floating-point conditional move. Return true if successful. */
19183
19184 bool
19185 ix86_expand_fp_movcc (rtx operands[])
19186 {
19187 enum machine_mode mode = GET_MODE (operands[0]);
19188 enum rtx_code code = GET_CODE (operands[1]);
19189 rtx tmp, compare_op;
19190 rtx op0 = XEXP (operands[1], 0);
19191 rtx op1 = XEXP (operands[1], 1);
19192
19193 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
19194 {
19195 enum machine_mode cmode;
19196
19197 /* Since we've no cmove for sse registers, don't force bad register
19198 allocation just to gain access to it. Deny movcc when the
19199 comparison mode doesn't match the move mode. */
19200 cmode = GET_MODE (op0);
19201 if (cmode == VOIDmode)
19202 cmode = GET_MODE (op1);
19203 if (cmode != mode)
19204 return false;
19205
19206 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
19207 if (code == UNKNOWN)
19208 return false;
19209
19210 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
19211 operands[2], operands[3]))
19212 return true;
19213
19214 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
19215 operands[2], operands[3]);
19216 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
19217 return true;
19218 }
19219
19220 /* The floating point conditional move instructions don't directly
19221 support conditions resulting from a signed integer comparison. */
19222
19223 compare_op = ix86_expand_compare (code, op0, op1);
19224 if (!fcmov_comparison_operator (compare_op, VOIDmode))
19225 {
19226 tmp = gen_reg_rtx (QImode);
19227 ix86_expand_setcc (tmp, code, op0, op1);
19228
19229 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
19230 }
19231
19232 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19233 gen_rtx_IF_THEN_ELSE (mode, compare_op,
19234 operands[2], operands[3])));
19235
19236 return true;
19237 }
19238
19239 /* Expand a floating-point vector conditional move; a vcond operation
19240 rather than a movcc operation. */
19241
19242 bool
19243 ix86_expand_fp_vcond (rtx operands[])
19244 {
19245 enum rtx_code code = GET_CODE (operands[3]);
19246 rtx cmp;
19247
19248 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
19249 &operands[4], &operands[5]);
19250 if (code == UNKNOWN)
19251 {
19252 rtx temp;
19253 switch (GET_CODE (operands[3]))
19254 {
19255 case LTGT:
19256 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
19257 operands[5], operands[0], operands[0]);
19258 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
19259 operands[5], operands[1], operands[2]);
19260 code = AND;
19261 break;
19262 case UNEQ:
19263 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
19264 operands[5], operands[0], operands[0]);
19265 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
19266 operands[5], operands[1], operands[2]);
19267 code = IOR;
19268 break;
19269 default:
19270 gcc_unreachable ();
19271 }
19272 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
19273 OPTAB_DIRECT);
19274 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19275 return true;
19276 }
19277
19278 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
19279 operands[5], operands[1], operands[2]))
19280 return true;
19281
19282 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
19283 operands[1], operands[2]);
19284 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19285 return true;
19286 }
19287
19288 /* Expand a signed/unsigned integral vector conditional move. */
19289
19290 bool
19291 ix86_expand_int_vcond (rtx operands[])
19292 {
19293 enum machine_mode data_mode = GET_MODE (operands[0]);
19294 enum machine_mode mode = GET_MODE (operands[4]);
19295 enum rtx_code code = GET_CODE (operands[3]);
19296 bool negate = false;
19297 rtx x, cop0, cop1;
19298
19299 cop0 = operands[4];
19300 cop1 = operands[5];
19301
19302 /* XOP supports all of the comparisons on all vector int types. */
19303 if (!TARGET_XOP)
19304 {
19305 /* Canonicalize the comparison to EQ, GT, GTU. */
19306 switch (code)
19307 {
19308 case EQ:
19309 case GT:
19310 case GTU:
19311 break;
19312
19313 case NE:
19314 case LE:
19315 case LEU:
19316 code = reverse_condition (code);
19317 negate = true;
19318 break;
19319
19320 case GE:
19321 case GEU:
19322 code = reverse_condition (code);
19323 negate = true;
19324 /* FALLTHRU */
19325
19326 case LT:
19327 case LTU:
19328 code = swap_condition (code);
19329 x = cop0, cop0 = cop1, cop1 = x;
19330 break;
19331
19332 default:
19333 gcc_unreachable ();
19334 }
19335
19336 /* Only SSE4.1/SSE4.2 supports V2DImode. */
19337 if (mode == V2DImode)
19338 {
19339 switch (code)
19340 {
19341 case EQ:
19342 /* SSE4.1 supports EQ. */
19343 if (!TARGET_SSE4_1)
19344 return false;
19345 break;
19346
19347 case GT:
19348 case GTU:
19349 /* SSE4.2 supports GT/GTU. */
19350 if (!TARGET_SSE4_2)
19351 return false;
19352 break;
19353
19354 default:
19355 gcc_unreachable ();
19356 }
19357 }
19358
19359 /* Unsigned parallel compare is not supported by the hardware.
19360 Play some tricks to turn this into a signed comparison
19361 or an equality comparison against 0. */
19362 if (code == GTU)
19363 {
19364 cop0 = force_reg (mode, cop0);
19365
19366 switch (mode)
19367 {
19368 case V8SImode:
19369 case V4DImode:
19370 case V4SImode:
19371 case V2DImode:
19372 {
19373 rtx t1, t2, mask;
19374 rtx (*gen_sub3) (rtx, rtx, rtx);
19375
19376 switch (mode)
19377 {
19378 case V8SImode: gen_sub3 = gen_subv8si3; break;
19379 case V4DImode: gen_sub3 = gen_subv4di3; break;
19380 case V4SImode: gen_sub3 = gen_subv4si3; break;
19381 case V2DImode: gen_sub3 = gen_subv2di3; break;
19382 default:
19383 gcc_unreachable ();
19384 }
19385 /* Subtract (-(INT MAX) - 1) from both operands to make
19386 them signed. */
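 /* Subtracting the sign bit flips it (mod 2^N), so
    a >u b  <==>  (a - 0x80...0) >s (b - 0x80...0).  */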
19387 mask = ix86_build_signbit_mask (mode, true, false);
19388 t1 = gen_reg_rtx (mode);
19389 emit_insn (gen_sub3 (t1, cop0, mask));
19390
19391 t2 = gen_reg_rtx (mode);
19392 emit_insn (gen_sub3 (t2, cop1, mask));
19393
19394 cop0 = t1;
19395 cop1 = t2;
19396 code = GT;
19397 }
19398 break;
19399
19400 case V32QImode:
19401 case V16HImode:
19402 case V16QImode:
19403 case V8HImode:
19404 /* Perform a parallel unsigned saturating subtraction. */
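 /* a >u b  <==>  (a -us b) != 0, i.e. the saturating difference is
    nonzero; so compute EQ against zero and flip NEGATE below.  */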
19405 x = gen_reg_rtx (mode);
19406 emit_insn (gen_rtx_SET (VOIDmode, x,
19407 gen_rtx_US_MINUS (mode, cop0, cop1)));
19408
19409 cop0 = x;
19410 cop1 = CONST0_RTX (mode);
19411 code = EQ;
19412 negate = !negate;
19413 break;
19414
19415 default:
19416 gcc_unreachable ();
19417 }
19418 }
19419 }
19420
19421 /* Allow the comparison to be done in one mode, but the movcc to
19422 happen in another mode. */
19423 if (data_mode == mode)
19424 {
19425 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
19426 operands[1+negate], operands[2-negate]);
19427 }
19428 else
19429 {
19430 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
19431 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
19432 code, cop0, cop1,
19433 operands[1+negate], operands[2-negate]);
19434 x = gen_lowpart (data_mode, x);
19435 }
19436
19437 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
19438 operands[2-negate]);
19439 return true;
19440 }
19441
19442 /* Expand a variable vector permutation. */
19443
19444 void
19445 ix86_expand_vec_perm (rtx operands[])
19446 {
19447 rtx target = operands[0];
19448 rtx op0 = operands[1];
19449 rtx op1 = operands[2];
19450 rtx mask = operands[3];
19451 rtx t1, t2, t3, t4, vt, vt2, vec[32];
19452 enum machine_mode mode = GET_MODE (op0);
19453 enum machine_mode maskmode = GET_MODE (mask);
19454 int w, e, i;
19455 bool one_operand_shuffle = rtx_equal_p (op0, op1);
19456
19457 /* Number of elements in the vector. */
19458 w = GET_MODE_NUNITS (mode);
19459 e = GET_MODE_UNIT_SIZE (mode);
19460 gcc_assert (w <= 32);
19461
19462 if (TARGET_AVX2)
19463 {
19464 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
19465 {
19466 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
19467 a constant shuffle operand. With a tiny bit of effort we can
19468 use VPERMD instead. A re-interpretation stall for V4DFmode is
19469 unfortunate but there's no avoiding it.
19470 Similarly, for V16HImode we don't have instructions for variable
19471 shuffling, while for V32QImode we can, after preparing suitable
19472 masks, use vpshufb; vpshufb; vpermq; vpor. */
19473
19474 if (mode == V16HImode)
19475 {
19476 maskmode = mode = V32QImode;
19477 w = 32;
19478 e = 1;
19479 }
19480 else
19481 {
19482 maskmode = mode = V8SImode;
19483 w = 8;
19484 e = 4;
19485 }
19486 t1 = gen_reg_rtx (maskmode);
19487
19488 /* Replicate the low bits of the V4DImode mask into V8SImode:
19489 mask = { A B C D }
19490 t1 = { A A B B C C D D }. */
19491 for (i = 0; i < w / 2; ++i)
19492 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
19493 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19494 vt = force_reg (maskmode, vt);
19495 mask = gen_lowpart (maskmode, mask);
19496 if (maskmode == V8SImode)
19497 emit_insn (gen_avx2_permvarv8si (t1, vt, mask));
19498 else
19499 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
19500
19501 /* Multiply the shuffle indices by two. */
19502 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
19503 OPTAB_DIRECT);
19504
19505 /* Add one to the odd shuffle indices:
19506 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
19507 for (i = 0; i < w / 2; ++i)
19508 {
19509 vec[i * 2] = const0_rtx;
19510 vec[i * 2 + 1] = const1_rtx;
19511 }
19512 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19513 vt = force_const_mem (maskmode, vt);
19514 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
19515 OPTAB_DIRECT);
19516
19517 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
19518 operands[3] = mask = t1;
19519 target = gen_lowpart (mode, target);
19520 op0 = gen_lowpart (mode, op0);
19521 op1 = gen_lowpart (mode, op1);
19522 }
19523
19524 switch (mode)
19525 {
19526 case V8SImode:
19527 /* The VPERMD and VPERMPS instructions already properly ignore
19528 the high bits of the shuffle elements. No need for us to
19529 perform an AND ourselves. */
19530 if (one_operand_shuffle)
19531 emit_insn (gen_avx2_permvarv8si (target, mask, op0));
19532 else
19533 {
19534 t1 = gen_reg_rtx (V8SImode);
19535 t2 = gen_reg_rtx (V8SImode);
19536 emit_insn (gen_avx2_permvarv8si (t1, mask, op0));
19537 emit_insn (gen_avx2_permvarv8si (t2, mask, op1));
19538 goto merge_two;
19539 }
19540 return;
19541
19542 case V8SFmode:
19543 mask = gen_lowpart (V8SFmode, mask);
19544 if (one_operand_shuffle)
19545 emit_insn (gen_avx2_permvarv8sf (target, mask, op0));
19546 else
19547 {
19548 t1 = gen_reg_rtx (V8SFmode);
19549 t2 = gen_reg_rtx (V8SFmode);
19550 emit_insn (gen_avx2_permvarv8sf (t1, mask, op0));
19551 emit_insn (gen_avx2_permvarv8sf (t2, mask, op1));
19552 goto merge_two;
19553 }
19554 return;
19555
19556 case V4SImode:
19557 /* By combining the two 128-bit input vectors into one 256-bit
19558 input vector, we can use VPERMD and VPERMPS for the full
19559 two-operand shuffle. */
19560 t1 = gen_reg_rtx (V8SImode);
19561 t2 = gen_reg_rtx (V8SImode);
19562 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
19563 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
19564 emit_insn (gen_avx2_permvarv8si (t1, t2, t1));
19565 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
19566 return;
19567
19568 case V4SFmode:
19569 t1 = gen_reg_rtx (V8SFmode);
19570 t2 = gen_reg_rtx (V8SFmode);
19571 mask = gen_lowpart (V4SFmode, mask);
19572 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
19573 emit_insn (gen_avx_vec_concatv8sf (t2, mask, mask));
19574 emit_insn (gen_avx2_permvarv8sf (t1, t2, t1));
19575 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
19576 return;
19577
19578 case V32QImode:
19579 t1 = gen_reg_rtx (V32QImode);
19580 t2 = gen_reg_rtx (V32QImode);
19581 t3 = gen_reg_rtx (V32QImode);
19582 vt2 = GEN_INT (128);
19583 for (i = 0; i < 32; i++)
19584 vec[i] = vt2;
19585 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
19586 vt = force_reg (V32QImode, vt);
19587 for (i = 0; i < 32; i++)
19588 vec[i] = i < 16 ? vt2 : const0_rtx;
19589 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
19590 vt2 = force_reg (V32QImode, vt2);
19591 /* From mask create two adjusted masks, which contain the same
19592 bits as mask in the low 7 bits of each vector element.
19593 The first mask will have the most significant bit clear
19594 if it requests element from the same 128-bit lane
19595 and MSB set if it requests element from the other 128-bit lane.
19596 The second mask will have the opposite values of the MSB,
19597 and additionally will have its 128-bit lanes swapped.
19598 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
19599 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
19600 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
19601 stands for the other 12 bytes. */
19602 /* The bit whether element is from the same lane or the other
19603 lane is bit 4, so shift it up by 3 to the MSB position. */
19604 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
19605 gen_lowpart (V4DImode, mask),
19606 GEN_INT (3)));
19607 /* Clear MSB bits from the mask just in case it had them set. */
19608 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
19609 /* After this t1 will have MSB set for elements from other lane. */
19610 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
19611 /* Clear bits other than MSB. */
19612 emit_insn (gen_andv32qi3 (t1, t1, vt));
19613 /* Or in the lower bits from mask into t3. */
19614 emit_insn (gen_iorv32qi3 (t3, t1, t2));
19615 /* And invert MSB bits in t1, so MSB is set for elements from the same
19616 lane. */
19617 emit_insn (gen_xorv32qi3 (t1, t1, vt));
19618 /* Swap 128-bit lanes in t3. */
19619 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19620 gen_lowpart (V4DImode, t3),
19621 const2_rtx, GEN_INT (3),
19622 const0_rtx, const1_rtx));
19623 /* And or in the lower bits from mask into t1. */
19624 emit_insn (gen_iorv32qi3 (t1, t1, t2));
19625 if (one_operand_shuffle)
19626 {
19627 /* Each of these shuffles will put 0s in places where
19628 element from the other 128-bit lane is needed, otherwise
19629 will shuffle in the requested value. */
19630 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
19631 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
19632 /* For t3 the 128-bit lanes are swapped again. */
19633 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19634 gen_lowpart (V4DImode, t3),
19635 const2_rtx, GEN_INT (3),
19636 const0_rtx, const1_rtx));
19637 /* And oring both together leads to the result. */
19638 emit_insn (gen_iorv32qi3 (target, t1, t3));
19639 return;
19640 }
19641
19642 t4 = gen_reg_rtx (V32QImode);
19643 /* Similarly to the above one_operand_shuffle code,
19644 just repeated twice, once for each operand. The merge_two:
19645 code will merge the two results together. */
19646 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
19647 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
19648 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
19649 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
19650 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
19651 gen_lowpart (V4DImode, t4),
19652 const2_rtx, GEN_INT (3),
19653 const0_rtx, const1_rtx));
19654 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19655 gen_lowpart (V4DImode, t3),
19656 const2_rtx, GEN_INT (3),
19657 const0_rtx, const1_rtx));
19658 emit_insn (gen_iorv32qi3 (t4, t2, t4));
19659 emit_insn (gen_iorv32qi3 (t3, t1, t3));
19660 t1 = t4;
19661 t2 = t3;
19662 goto merge_two;
19663
19664 default:
19665 gcc_assert (GET_MODE_SIZE (mode) <= 16);
19666 break;
19667 }
19668 }
19669
19670 if (TARGET_XOP)
19671 {
19672 /* The XOP VPPERM insn supports three inputs. By ignoring the
19673 one_operand_shuffle special case, we avoid creating another
19674 set of constant vectors in memory. */
19675 one_operand_shuffle = false;
19676
19677 /* mask = mask & {2*w-1, ...} */
19678 vt = GEN_INT (2*w - 1);
19679 }
19680 else
19681 {
19682 /* mask = mask & {w-1, ...} */
19683 vt = GEN_INT (w - 1);
19684 }
19685
19686 for (i = 0; i < w; i++)
19687 vec[i] = vt;
19688 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19689 mask = expand_simple_binop (maskmode, AND, mask, vt,
19690 NULL_RTX, 0, OPTAB_DIRECT);
19691
19692 /* For non-QImode operations, convert the word permutation control
19693 into a byte permutation control. */
19694 if (mode != V16QImode)
19695 {
19696 mask = expand_simple_binop (maskmode, ASHIFT, mask,
19697 GEN_INT (exact_log2 (e)),
19698 NULL_RTX, 0, OPTAB_DIRECT);
19699
19700 /* Convert mask to vector of chars. */
19701 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
19702
19703 /* Replicate each of the input bytes into byte positions:
19704 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
19705 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
19706 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
19707 for (i = 0; i < 16; ++i)
19708 vec[i] = GEN_INT (i/e * e);
19709 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
19710 vt = force_const_mem (V16QImode, vt);
19711 if (TARGET_XOP)
19712 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
19713 else
19714 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
19715
19716 /* Convert it into the byte positions by doing
19717 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
19718 for (i = 0; i < 16; ++i)
19719 vec[i] = GEN_INT (i % e);
19720 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
19721 vt = force_const_mem (V16QImode, vt);
19722 emit_insn (gen_addv16qi3 (mask, mask, vt));
19723 }
19724
19725 /* The actual shuffle operations all operate on V16QImode. */
19726 op0 = gen_lowpart (V16QImode, op0);
19727 op1 = gen_lowpart (V16QImode, op1);
19728 target = gen_lowpart (V16QImode, target);
19729
19730 if (TARGET_XOP)
19731 {
19732 emit_insn (gen_xop_pperm (target, op0, op1, mask));
19733 }
19734 else if (one_operand_shuffle)
19735 {
19736 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
19737 }
19738 else
19739 {
19740 rtx xops[6];
19741 bool ok;
19742
19743 /* Shuffle the two input vectors independently. */
19744 t1 = gen_reg_rtx (V16QImode);
19745 t2 = gen_reg_rtx (V16QImode);
19746 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
19747 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
19748
19749 merge_two:
19750 /* Then merge them together. The key is whether any given control
19751 element contained a bit set that indicates the second word. */
19752 mask = operands[3];
19753 vt = GEN_INT (w);
19754 if (maskmode == V2DImode && !TARGET_SSE4_1)
19755 {
19756 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
19757 more shuffle to convert the V2DI input mask into a V4SI
19758 input mask, at which point the masking done by
19759 ix86_expand_int_vcond will work as desired. */
19760 rtx t3 = gen_reg_rtx (V4SImode);
19761 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
19762 const0_rtx, const0_rtx,
19763 const2_rtx, const2_rtx));
19764 mask = t3;
19765 maskmode = V4SImode;
19766 e = w = 4;
19767 }
19768
19769 for (i = 0; i < w; i++)
19770 vec[i] = vt;
19771 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19772 vt = force_reg (maskmode, vt);
19773 mask = expand_simple_binop (maskmode, AND, mask, vt,
19774 NULL_RTX, 0, OPTAB_DIRECT);
19775
19776 xops[0] = gen_lowpart (mode, operands[0]);
19777 xops[1] = gen_lowpart (mode, t2);
19778 xops[2] = gen_lowpart (mode, t1);
19779 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
19780 xops[4] = mask;
19781 xops[5] = vt;
19782 ok = ix86_expand_int_vcond (xops);
19783 gcc_assert (ok);
19784 }
19785 }
19786
19787 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
19788 true if we should do zero extension, else sign extension. HIGH_P is
19789 true if we want the N/2 high elements, else the low elements. */
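 /* Without SSE4.1 sign extension is done by interleaving the input with
    a sign mask: comparing 0 > x per element yields all-ones for negative
    elements, and the interleave then produces the widened values.  */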
19790
19791 void
19792 ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p)
19793 {
19794 enum machine_mode imode = GET_MODE (operands[1]);
19795 rtx tmp, dest;
19796
19797 if (TARGET_SSE4_1)
19798 {
19799 rtx (*unpack)(rtx, rtx);
19800 rtx (*extract)(rtx, rtx) = NULL;
19801 enum machine_mode halfmode = BLKmode;
19802
19803 switch (imode)
19804 {
19805 case V32QImode:
19806 if (unsigned_p)
19807 unpack = gen_avx2_zero_extendv16qiv16hi2;
19808 else
19809 unpack = gen_avx2_sign_extendv16qiv16hi2;
19810 halfmode = V16QImode;
19811 extract
19812 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
19813 break;
19814 case V16HImode:
19815 if (unsigned_p)
19816 unpack = gen_avx2_zero_extendv8hiv8si2;
19817 else
19818 unpack = gen_avx2_sign_extendv8hiv8si2;
19819 halfmode = V8HImode;
19820 extract
19821 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
19822 break;
19823 case V8SImode:
19824 if (unsigned_p)
19825 unpack = gen_avx2_zero_extendv4siv4di2;
19826 else
19827 unpack = gen_avx2_sign_extendv4siv4di2;
19828 halfmode = V4SImode;
19829 extract
19830 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
19831 break;
19832 case V16QImode:
19833 if (unsigned_p)
19834 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
19835 else
19836 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
19837 break;
19838 case V8HImode:
19839 if (unsigned_p)
19840 unpack = gen_sse4_1_zero_extendv4hiv4si2;
19841 else
19842 unpack = gen_sse4_1_sign_extendv4hiv4si2;
19843 break;
19844 case V4SImode:
19845 if (unsigned_p)
19846 unpack = gen_sse4_1_zero_extendv2siv2di2;
19847 else
19848 unpack = gen_sse4_1_sign_extendv2siv2di2;
19849 break;
19850 default:
19851 gcc_unreachable ();
19852 }
19853
19854 if (GET_MODE_SIZE (imode) == 32)
19855 {
19856 tmp = gen_reg_rtx (halfmode);
19857 emit_insn (extract (tmp, operands[1]));
19858 }
19859 else if (high_p)
19860 {
19861 /* Shift higher 8 bytes to lower 8 bytes. */
19862 tmp = gen_reg_rtx (imode);
19863 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
19864 gen_lowpart (V1TImode, operands[1]),
19865 GEN_INT (64)));
19866 }
19867 else
19868 tmp = operands[1];
19869
19870 emit_insn (unpack (operands[0], tmp));
19871 }
19872 else
19873 {
19874 rtx (*unpack)(rtx, rtx, rtx);
19875
19876 switch (imode)
19877 {
19878 case V16QImode:
19879 if (high_p)
19880 unpack = gen_vec_interleave_highv16qi;
19881 else
19882 unpack = gen_vec_interleave_lowv16qi;
19883 break;
19884 case V8HImode:
19885 if (high_p)
19886 unpack = gen_vec_interleave_highv8hi;
19887 else
19888 unpack = gen_vec_interleave_lowv8hi;
19889 break;
19890 case V4SImode:
19891 if (high_p)
19892 unpack = gen_vec_interleave_highv4si;
19893 else
19894 unpack = gen_vec_interleave_lowv4si;
19895 break;
19896 default:
19897 gcc_unreachable ();
19898 }
19899
19900 dest = gen_lowpart (imode, operands[0]);
19901
19902 if (unsigned_p)
19903 tmp = force_reg (imode, CONST0_RTX (imode));
19904 else
19905 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
19906 operands[1], pc_rtx, pc_rtx);
19907
19908 emit_insn (unpack (dest, operands[1], tmp));
19909 }
19910 }
19911
19912 /* Expand conditional increment or decrement using adc/sbb instructions.
19913 The default case using setcc followed by the conditional move can be
19914 done by generic code. */
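 /* E.g. "x += (a < b)" with an unsigned comparison becomes, roughly,
        cmpl  b, a
        adcl  $0, x
    using the carry flag produced by the compare directly.  */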
19915 bool
19916 ix86_expand_int_addcc (rtx operands[])
19917 {
19918 enum rtx_code code = GET_CODE (operands[1]);
19919 rtx flags;
19920 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
19921 rtx compare_op;
19922 rtx val = const0_rtx;
19923 bool fpcmp = false;
19924 enum machine_mode mode;
19925 rtx op0 = XEXP (operands[1], 0);
19926 rtx op1 = XEXP (operands[1], 1);
19927
19928 if (operands[3] != const1_rtx
19929 && operands[3] != constm1_rtx)
19930 return false;
19931 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
19932 return false;
19933 code = GET_CODE (compare_op);
19934
19935 flags = XEXP (compare_op, 0);
19936
19937 if (GET_MODE (flags) == CCFPmode
19938 || GET_MODE (flags) == CCFPUmode)
19939 {
19940 fpcmp = true;
19941 code = ix86_fp_compare_code_to_integer (code);
19942 }
19943
19944 if (code != LTU)
19945 {
19946 val = constm1_rtx;
19947 if (fpcmp)
19948 PUT_CODE (compare_op,
19949 reverse_condition_maybe_unordered
19950 (GET_CODE (compare_op)));
19951 else
19952 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
19953 }
19954
19955 mode = GET_MODE (operands[0]);
19956
19957 /* Construct either adc or sbb insn. */
19958 if ((code == LTU) == (operands[3] == constm1_rtx))
19959 {
19960 switch (mode)
19961 {
19962 case QImode:
19963 insn = gen_subqi3_carry;
19964 break;
19965 case HImode:
19966 insn = gen_subhi3_carry;
19967 break;
19968 case SImode:
19969 insn = gen_subsi3_carry;
19970 break;
19971 case DImode:
19972 insn = gen_subdi3_carry;
19973 break;
19974 default:
19975 gcc_unreachable ();
19976 }
19977 }
19978 else
19979 {
19980 switch (mode)
19981 {
19982 case QImode:
19983 insn = gen_addqi3_carry;
19984 break;
19985 case HImode:
19986 insn = gen_addhi3_carry;
19987 break;
19988 case SImode:
19989 insn = gen_addsi3_carry;
19990 break;
19991 case DImode:
19992 insn = gen_adddi3_carry;
19993 break;
19994 default:
19995 gcc_unreachable ();
19996 }
19997 }
19998 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
19999
20000 return true;
20001 }
20002
20003
20004 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
20005 but works for floating point parameters and non-offsettable memories.
20006 For pushes, it returns just stack offsets; the values will be saved
20007 in the right order. At most four parts are generated. */
20008
20009 static int
20010 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
20011 {
20012 int size;
20013
20014 if (!TARGET_64BIT)
20015 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
20016 else
20017 size = (GET_MODE_SIZE (mode) + 4) / 8;
20018
20019 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
20020 gcc_assert (size >= 2 && size <= 4);
20021
20022 /* Optimize constant pool reference to immediates. This is used by fp
20023 moves, that force all constants to memory to allow combining. */
20024 if (MEM_P (operand) && MEM_READONLY_P (operand))
20025 {
20026 rtx tmp = maybe_get_pool_constant (operand);
20027 if (tmp)
20028 operand = tmp;
20029 }
20030
20031 if (MEM_P (operand) && !offsettable_memref_p (operand))
20032 {
20033 /* The only non-offsettable memories we handle are pushes. */
20034 int ok = push_operand (operand, VOIDmode);
20035
20036 gcc_assert (ok);
20037
20038 operand = copy_rtx (operand);
20039 PUT_MODE (operand, Pmode);
20040 parts[0] = parts[1] = parts[2] = parts[3] = operand;
20041 return size;
20042 }
20043
20044 if (GET_CODE (operand) == CONST_VECTOR)
20045 {
20046 enum machine_mode imode = int_mode_for_mode (mode);
20047 /* Caution: if we looked through a constant pool memory above,
20048 the operand may actually have a different mode now. That's
20049 ok, since we want to pun this all the way back to an integer. */
20050 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
20051 gcc_assert (operand != NULL);
20052 mode = imode;
20053 }
20054
20055 if (!TARGET_64BIT)
20056 {
20057 if (mode == DImode)
20058 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20059 else
20060 {
20061 int i;
20062
20063 if (REG_P (operand))
20064 {
20065 gcc_assert (reload_completed);
20066 for (i = 0; i < size; i++)
20067 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
20068 }
20069 else if (offsettable_memref_p (operand))
20070 {
20071 operand = adjust_address (operand, SImode, 0);
20072 parts[0] = operand;
20073 for (i = 1; i < size; i++)
20074 parts[i] = adjust_address (operand, SImode, 4 * i);
20075 }
20076 else if (GET_CODE (operand) == CONST_DOUBLE)
20077 {
20078 REAL_VALUE_TYPE r;
20079 long l[4];
20080
20081 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20082 switch (mode)
20083 {
20084 case TFmode:
20085 real_to_target (l, &r, mode);
20086 parts[3] = gen_int_mode (l[3], SImode);
20087 parts[2] = gen_int_mode (l[2], SImode);
20088 break;
20089 case XFmode:
20090 REAL_VALUE_TO_TARGET_LONG_DOUBLE (r, l);
20091 parts[2] = gen_int_mode (l[2], SImode);
20092 break;
20093 case DFmode:
20094 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
20095 break;
20096 default:
20097 gcc_unreachable ();
20098 }
20099 parts[1] = gen_int_mode (l[1], SImode);
20100 parts[0] = gen_int_mode (l[0], SImode);
20101 }
20102 else
20103 gcc_unreachable ();
20104 }
20105 }
20106 else
20107 {
20108 if (mode == TImode)
20109 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20110 if (mode == XFmode || mode == TFmode)
20111 {
20112 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
20113 if (REG_P (operand))
20114 {
20115 gcc_assert (reload_completed);
20116 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
20117 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
20118 }
20119 else if (offsettable_memref_p (operand))
20120 {
20121 operand = adjust_address (operand, DImode, 0);
20122 parts[0] = operand;
20123 parts[1] = adjust_address (operand, upper_mode, 8);
20124 }
20125 else if (GET_CODE (operand) == CONST_DOUBLE)
20126 {
20127 REAL_VALUE_TYPE r;
20128 long l[4];
20129
20130 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20131 real_to_target (l, &r, mode);
20132
20133 /* Do not use shift by 32 to avoid warning on 32bit systems. */
20134 if (HOST_BITS_PER_WIDE_INT >= 64)
20135 parts[0]
20136 = gen_int_mode
20137 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
20138 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
20139 DImode);
20140 else
20141 parts[0] = immed_double_const (l[0], l[1], DImode);
20142
20143 if (upper_mode == SImode)
20144 parts[1] = gen_int_mode (l[2], SImode);
20145 else if (HOST_BITS_PER_WIDE_INT >= 64)
20146 parts[1]
20147 = gen_int_mode
20148 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
20149 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
20150 DImode);
20151 else
20152 parts[1] = immed_double_const (l[2], l[3], DImode);
20153 }
20154 else
20155 gcc_unreachable ();
20156 }
20157 }
20158
20159 return size;
20160 }
20161
20162 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
20163 All required move insns are emitted here.  Operands 2-5 receive the
20164 destination parts in the correct order; operands 6-9 receive the
20165 corresponding source parts. */
20166
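/* For example (illustrative), a 32-bit DImode memory-to-register move is
   decomposed into two SImode moves: part[0][0..1] hold the halves of the
   destination, part[1][0..1] the halves of the source, and the halves are
   copied in whichever order avoids clobbering a source address register
   that overlaps the destination. */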
20167 void
20168 ix86_split_long_move (rtx operands[])
20169 {
20170 rtx part[2][4];
20171 int nparts, i, j;
20172 int push = 0;
20173 int collisions = 0;
20174 enum machine_mode mode = GET_MODE (operands[0]);
20175 bool collisionparts[4];
20176
20177 /* The DFmode expanders may ask us to move double.
20178 For 64bit target this is single move. By hiding the fact
20179 here we simplify i386.md splitters. */
20180 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
20181 {
20182 /* Optimize constant pool reference to immediates. This is used by
20183 fp moves, that force all constants to memory to allow combining. */
20184
20185 if (MEM_P (operands[1])
20186 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
20187 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
20188 operands[1] = get_pool_constant (XEXP (operands[1], 0));
20189 if (push_operand (operands[0], VOIDmode))
20190 {
20191 operands[0] = copy_rtx (operands[0]);
20192 PUT_MODE (operands[0], Pmode);
20193 }
20194 else
20195 operands[0] = gen_lowpart (DImode, operands[0]);
20196 operands[1] = gen_lowpart (DImode, operands[1]);
20197 emit_move_insn (operands[0], operands[1]);
20198 return;
20199 }
20200
20201 /* The only non-offsettable memory we handle is push. */
20202 if (push_operand (operands[0], VOIDmode))
20203 push = 1;
20204 else
20205 gcc_assert (!MEM_P (operands[0])
20206 || offsettable_memref_p (operands[0]));
20207
20208 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
20209 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
20210
20211 /* When emitting push, take care for source operands on the stack. */
20212 if (push && MEM_P (operands[1])
20213 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
20214 {
20215 rtx src_base = XEXP (part[1][nparts - 1], 0);
20216
20217 /* Compensate for the stack decrement by 4. */
20218 if (!TARGET_64BIT && nparts == 3
20219 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
20220 src_base = plus_constant (src_base, 4);
20221
20222 /* src_base refers to the stack pointer and is
20223 automatically decreased by emitted push. */
20224 for (i = 0; i < nparts; i++)
20225 part[1][i] = change_address (part[1][i],
20226 GET_MODE (part[1][i]), src_base);
20227 }
20228
20229 /* We need to do the copy in the right order in case an address register
20230 of the source overlaps the destination. */
20231 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
20232 {
20233 rtx tmp;
20234
20235 for (i = 0; i < nparts; i++)
20236 {
20237 collisionparts[i]
20238 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
20239 if (collisionparts[i])
20240 collisions++;
20241 }
20242
20243 /* Collision in the middle part can be handled by reordering. */
20244 if (collisions == 1 && nparts == 3 && collisionparts [1])
20245 {
20246 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20247 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20248 }
20249 else if (collisions == 1
20250 && nparts == 4
20251 && (collisionparts [1] || collisionparts [2]))
20252 {
20253 if (collisionparts [1])
20254 {
20255 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20256 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20257 }
20258 else
20259 {
20260 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
20261 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
20262 }
20263 }
20264
20265 /* If there are more collisions, we can't handle it by reordering.
20266 Do an lea to the last part and use only one colliding move. */
20267 else if (collisions > 1)
20268 {
20269 rtx base;
20270
20271 collisions = 1;
20272
20273 base = part[0][nparts - 1];
20274
20275 /* Handle the case when the last part isn't valid for lea.
20276 Happens in 64-bit mode storing the 12-byte XFmode. */
20277 if (GET_MODE (base) != Pmode)
20278 base = gen_rtx_REG (Pmode, REGNO (base));
20279
20280 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
20281 part[1][0] = replace_equiv_address (part[1][0], base);
20282 for (i = 1; i < nparts; i++)
20283 {
20284 tmp = plus_constant (base, UNITS_PER_WORD * i);
20285 part[1][i] = replace_equiv_address (part[1][i], tmp);
20286 }
20287 }
20288 }
20289
20290 if (push)
20291 {
20292 if (!TARGET_64BIT)
20293 {
20294 if (nparts == 3)
20295 {
20296 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
20297 emit_insn (gen_addsi3 (stack_pointer_rtx,
20298 stack_pointer_rtx, GEN_INT (-4)));
20299 emit_move_insn (part[0][2], part[1][2]);
20300 }
20301 else if (nparts == 4)
20302 {
20303 emit_move_insn (part[0][3], part[1][3]);
20304 emit_move_insn (part[0][2], part[1][2]);
20305 }
20306 }
20307 else
20308 {
20309 /* In 64-bit mode there is no 32-bit push available.  If the operand is
20310 a register, that is fine - we simply use the larger counterpart.  We
20311 also retype memory - this comes from an attempt to avoid the REX
20312 prefix when moving the second half of a TFmode value. */
20313 if (GET_MODE (part[1][1]) == SImode)
20314 {
20315 switch (GET_CODE (part[1][1]))
20316 {
20317 case MEM:
20318 part[1][1] = adjust_address (part[1][1], DImode, 0);
20319 break;
20320
20321 case REG:
20322 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
20323 break;
20324
20325 default:
20326 gcc_unreachable ();
20327 }
20328
20329 if (GET_MODE (part[1][0]) == SImode)
20330 part[1][0] = part[1][1];
20331 }
20332 }
20333 emit_move_insn (part[0][1], part[1][1]);
20334 emit_move_insn (part[0][0], part[1][0]);
20335 return;
20336 }
20337
20338 /* Choose the correct order so as not to overwrite the source before it is copied. */
20339 if ((REG_P (part[0][0])
20340 && REG_P (part[1][1])
20341 && (REGNO (part[0][0]) == REGNO (part[1][1])
20342 || (nparts == 3
20343 && REGNO (part[0][0]) == REGNO (part[1][2]))
20344 || (nparts == 4
20345 && REGNO (part[0][0]) == REGNO (part[1][3]))))
20346 || (collisions > 0
20347 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
20348 {
20349 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
20350 {
20351 operands[2 + i] = part[0][j];
20352 operands[6 + i] = part[1][j];
20353 }
20354 }
20355 else
20356 {
20357 for (i = 0; i < nparts; i++)
20358 {
20359 operands[2 + i] = part[0][i];
20360 operands[6 + i] = part[1][i];
20361 }
20362 }
20363
20364 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
20365 if (optimize_insn_for_size_p ())
20366 {
20367 for (j = 0; j < nparts - 1; j++)
20368 if (CONST_INT_P (operands[6 + j])
20369 && operands[6 + j] != const0_rtx
20370 && REG_P (operands[2 + j]))
20371 for (i = j; i < nparts - 1; i++)
20372 if (CONST_INT_P (operands[7 + i])
20373 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
20374 operands[7 + i] = operands[2 + j];
20375 }
20376
20377 for (i = 0; i < nparts; i++)
20378 emit_move_insn (operands[2 + i], operands[6 + i]);
20379
20380 return;
20381 }
20382
20383 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
20384 left shift by a constant, either using a single shift or
20385 a sequence of add instructions. */
20386
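/* For instance (illustrative operands), a left shift by 2 of a register
   may be emitted as two self-adds, "addl %eax, %eax" twice, when that is
   no more expensive than "sall $2, %eax" and we are not optimizing for
   size; otherwise a single shift-by-constant insn is emitted. */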
20387 static void
20388 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
20389 {
20390 rtx (*insn)(rtx, rtx, rtx);
20391
20392 if (count == 1
20393 || (count * ix86_cost->add <= ix86_cost->shift_const
20394 && !optimize_insn_for_size_p ()))
20395 {
20396 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
20397 while (count-- > 0)
20398 emit_insn (insn (operand, operand, operand));
20399 }
20400 else
20401 {
20402 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20403 emit_insn (insn (operand, operand, GEN_INT (count)));
20404 }
20405 }
20406
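/* Split a double-word left shift (DImode on 32-bit targets, TImode on
   64-bit) into word-sized operations.  For a constant count below the
   half width the emitted sequence is roughly (illustrative, DImode):

       shld $count, %low, %high
       shl  $count, %low

   For larger constant counts the low word is moved into the high word,
   the low word is cleared and any remaining shift is applied to the high
   word.  A variable count uses the same shld/shl pair followed by an
   adjustment for counts of half width or more, using cmov and a scratch
   register when available. */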
20407 void
20408 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
20409 {
20410 rtx (*gen_ashl3)(rtx, rtx, rtx);
20411 rtx (*gen_shld)(rtx, rtx, rtx);
20412 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20413
20414 rtx low[2], high[2];
20415 int count;
20416
20417 if (CONST_INT_P (operands[2]))
20418 {
20419 split_double_mode (mode, operands, 2, low, high);
20420 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20421
20422 if (count >= half_width)
20423 {
20424 emit_move_insn (high[0], low[1]);
20425 emit_move_insn (low[0], const0_rtx);
20426
20427 if (count > half_width)
20428 ix86_expand_ashl_const (high[0], count - half_width, mode);
20429 }
20430 else
20431 {
20432 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20433
20434 if (!rtx_equal_p (operands[0], operands[1]))
20435 emit_move_insn (operands[0], operands[1]);
20436
20437 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
20438 ix86_expand_ashl_const (low[0], count, mode);
20439 }
20440 return;
20441 }
20442
20443 split_double_mode (mode, operands, 1, low, high);
20444
20445 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20446
20447 if (operands[1] == const1_rtx)
20448 {
20449 /* Assuming we've chosen QImode-capable registers, 1 << N
20450 can be done with two 32/64-bit shifts, no branches, no cmoves. */
20451 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
20452 {
20453 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
20454
20455 ix86_expand_clear (low[0]);
20456 ix86_expand_clear (high[0]);
20457 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
20458
20459 d = gen_lowpart (QImode, low[0]);
20460 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20461 s = gen_rtx_EQ (QImode, flags, const0_rtx);
20462 emit_insn (gen_rtx_SET (VOIDmode, d, s));
20463
20464 d = gen_lowpart (QImode, high[0]);
20465 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20466 s = gen_rtx_NE (QImode, flags, const0_rtx);
20467 emit_insn (gen_rtx_SET (VOIDmode, d, s));
20468 }
20469
20470 /* Otherwise, we can get the same results by manually performing
20471 a bit extract operation on bit 5/6, and then performing the two
20472 shifts. The two methods of getting 0/1 into low/high are exactly
20473 the same size. Avoiding the shift in the bit extract case helps
20474 pentium4 a bit; no one else seems to care much either way. */
20475 else
20476 {
20477 enum machine_mode half_mode;
20478 rtx (*gen_lshr3)(rtx, rtx, rtx);
20479 rtx (*gen_and3)(rtx, rtx, rtx);
20480 rtx (*gen_xor3)(rtx, rtx, rtx);
20481 HOST_WIDE_INT bits;
20482 rtx x;
20483
20484 if (mode == DImode)
20485 {
20486 half_mode = SImode;
20487 gen_lshr3 = gen_lshrsi3;
20488 gen_and3 = gen_andsi3;
20489 gen_xor3 = gen_xorsi3;
20490 bits = 5;
20491 }
20492 else
20493 {
20494 half_mode = DImode;
20495 gen_lshr3 = gen_lshrdi3;
20496 gen_and3 = gen_anddi3;
20497 gen_xor3 = gen_xordi3;
20498 bits = 6;
20499 }
20500
20501 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
20502 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
20503 else
20504 x = gen_lowpart (half_mode, operands[2]);
20505 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
20506
20507 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
20508 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
20509 emit_move_insn (low[0], high[0]);
20510 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
20511 }
20512
20513 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
20514 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
20515 return;
20516 }
20517
20518 if (operands[1] == constm1_rtx)
20519 {
20520 /* For -1 << N, we can avoid the shld instruction, because we
20521 know that we're shifting 0...31/63 ones into a -1. */
20522 emit_move_insn (low[0], constm1_rtx);
20523 if (optimize_insn_for_size_p ())
20524 emit_move_insn (high[0], low[0]);
20525 else
20526 emit_move_insn (high[0], constm1_rtx);
20527 }
20528 else
20529 {
20530 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20531
20532 if (!rtx_equal_p (operands[0], operands[1]))
20533 emit_move_insn (operands[0], operands[1]);
20534
20535 split_double_mode (mode, operands, 1, low, high);
20536 emit_insn (gen_shld (high[0], low[0], operands[2]));
20537 }
20538
20539 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
20540
20541 if (TARGET_CMOVE && scratch)
20542 {
20543 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20544 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20545
20546 ix86_expand_clear (scratch);
20547 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
20548 }
20549 else
20550 {
20551 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
20552 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
20553
20554 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
20555 }
20556 }
20557
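/* Split a double-word arithmetic right shift into word-sized operations.
   Roughly (illustrative, DImode with constant count below 32):

       shrd $count, %high, %low
       sar  $count, %high

   For counts of 32 or more the high word is copied into the low word and
   the high word is sign-filled with "sar $31, %high".  A variable count
   uses the same shrd/sar pair followed by an adjustment, using cmov and a
   scratch register when available. */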
20558 void
20559 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
20560 {
20561 rtx (*gen_ashr3)(rtx, rtx, rtx)
20562 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
20563 rtx (*gen_shrd)(rtx, rtx, rtx);
20564 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20565
20566 rtx low[2], high[2];
20567 int count;
20568
20569 if (CONST_INT_P (operands[2]))
20570 {
20571 split_double_mode (mode, operands, 2, low, high);
20572 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20573
20574 if (count == GET_MODE_BITSIZE (mode) - 1)
20575 {
20576 emit_move_insn (high[0], high[1]);
20577 emit_insn (gen_ashr3 (high[0], high[0],
20578 GEN_INT (half_width - 1)));
20579 emit_move_insn (low[0], high[0]);
20580
20581 }
20582 else if (count >= half_width)
20583 {
20584 emit_move_insn (low[0], high[1]);
20585 emit_move_insn (high[0], low[0]);
20586 emit_insn (gen_ashr3 (high[0], high[0],
20587 GEN_INT (half_width - 1)));
20588
20589 if (count > half_width)
20590 emit_insn (gen_ashr3 (low[0], low[0],
20591 GEN_INT (count - half_width)));
20592 }
20593 else
20594 {
20595 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20596
20597 if (!rtx_equal_p (operands[0], operands[1]))
20598 emit_move_insn (operands[0], operands[1]);
20599
20600 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
20601 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
20602 }
20603 }
20604 else
20605 {
20606 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20607
20608 if (!rtx_equal_p (operands[0], operands[1]))
20609 emit_move_insn (operands[0], operands[1]);
20610
20611 split_double_mode (mode, operands, 1, low, high);
20612
20613 emit_insn (gen_shrd (low[0], high[0], operands[2]));
20614 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
20615
20616 if (TARGET_CMOVE && scratch)
20617 {
20618 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20619 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20620
20621 emit_move_insn (scratch, high[0]);
20622 emit_insn (gen_ashr3 (scratch, scratch,
20623 GEN_INT (half_width - 1)));
20624 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
20625 scratch));
20626 }
20627 else
20628 {
20629 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
20630 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
20631
20632 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
20633 }
20634 }
20635 }
20636
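/* Split a double-word logical right shift into word-sized operations.
   This mirrors ix86_split_ashr except that the vacated high word is
   cleared rather than sign-filled; e.g. for a constant DImode count of
   32 or more, the high word is moved into the low word, the high word is
   zeroed and the low word is shifted by the remaining count. */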
20637 void
20638 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
20639 {
20640 rtx (*gen_lshr3)(rtx, rtx, rtx)
20641 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
20642 rtx (*gen_shrd)(rtx, rtx, rtx);
20643 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20644
20645 rtx low[2], high[2];
20646 int count;
20647
20648 if (CONST_INT_P (operands[2]))
20649 {
20650 split_double_mode (mode, operands, 2, low, high);
20651 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20652
20653 if (count >= half_width)
20654 {
20655 emit_move_insn (low[0], high[1]);
20656 ix86_expand_clear (high[0]);
20657
20658 if (count > half_width)
20659 emit_insn (gen_lshr3 (low[0], low[0],
20660 GEN_INT (count - half_width)));
20661 }
20662 else
20663 {
20664 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20665
20666 if (!rtx_equal_p (operands[0], operands[1]))
20667 emit_move_insn (operands[0], operands[1]);
20668
20669 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
20670 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
20671 }
20672 }
20673 else
20674 {
20675 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20676
20677 if (!rtx_equal_p (operands[0], operands[1]))
20678 emit_move_insn (operands[0], operands[1]);
20679
20680 split_double_mode (mode, operands, 1, low, high);
20681
20682 emit_insn (gen_shrd (low[0], high[0], operands[2]));
20683 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
20684
20685 if (TARGET_CMOVE && scratch)
20686 {
20687 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20688 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20689
20690 ix86_expand_clear (scratch);
20691 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
20692 scratch));
20693 }
20694 else
20695 {
20696 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
20697 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
20698
20699 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
20700 }
20701 }
20702 }
20703
20704 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
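/* Probabilities are fractions of REG_BR_PROB_BASE, so for example
   predict_jump (REG_BR_PROB_BASE * 90 / 100) marks the just-emitted jump
   as taken roughly 90% of the time. */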
20705 static void
20706 predict_jump (int prob)
20707 {
20708 rtx insn = get_last_insn ();
20709 gcc_assert (JUMP_P (insn));
20710 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
20711 }
20712
20713 /* Helper function for the string operations below.  Test whether the bits of
20714 VARIABLE selected by VALUE are zero; if so, jump to the returned label. */
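/* A typical use in this file (illustrative) is:

       rtx label = ix86_expand_aligntest (destptr, 2, false);
       ... handle the misaligned case ...
       emit_label (label);
       LABEL_NUSES (label) = 1;

   so the code between the call and the label runs only when the tested
   bit of DESTPTR is set. */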
20715 static rtx
20716 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
20717 {
20718 rtx label = gen_label_rtx ();
20719 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
20720 if (GET_MODE (variable) == DImode)
20721 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
20722 else
20723 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
20724 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
20725 1, label);
20726 if (epilogue)
20727 predict_jump (REG_BR_PROB_BASE * 50 / 100);
20728 else
20729 predict_jump (REG_BR_PROB_BASE * 90 / 100);
20730 return label;
20731 }
20732
20733 /* Decrease COUNTREG by VALUE. */
20734 static void
20735 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
20736 {
20737 rtx (*gen_add)(rtx, rtx, rtx)
20738 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
20739
20740 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
20741 }
20742
20743 /* Zero extend possibly SImode EXP to Pmode register. */
20744 rtx
20745 ix86_zero_extend_to_Pmode (rtx exp)
20746 {
20747 rtx r;
20748 if (GET_MODE (exp) == VOIDmode)
20749 return force_reg (Pmode, exp);
20750 if (GET_MODE (exp) == Pmode)
20751 return copy_to_mode_reg (Pmode, exp);
20752 r = gen_reg_rtx (Pmode);
20753 emit_insn (gen_zero_extendsidi2 (r, exp));
20754 return r;
20755 }
20756
20757 /* Divide COUNTREG by SCALE, rounding the result down. */
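/* For example (illustrative), scale_counter (GEN_INT (37), 4) returns
   GEN_INT (9), while for a register count it emits a logical shift right
   by log2 (SCALE) and returns the shifted value. */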
20758 static rtx
20759 scale_counter (rtx countreg, int scale)
20760 {
20761 rtx sc;
20762
20763 if (scale == 1)
20764 return countreg;
20765 if (CONST_INT_P (countreg))
20766 return GEN_INT (INTVAL (countreg) / scale);
20767 gcc_assert (REG_P (countreg));
20768
20769 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
20770 GEN_INT (exact_log2 (scale)),
20771 NULL, 1, OPTAB_DIRECT);
20772 return sc;
20773 }
20774
20775 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
20776 DImode for constant loop counts. */
20777
20778 static enum machine_mode
20779 counter_mode (rtx count_exp)
20780 {
20781 if (GET_MODE (count_exp) != VOIDmode)
20782 return GET_MODE (count_exp);
20783 if (!CONST_INT_P (count_exp))
20784 return Pmode;
20785 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
20786 return DImode;
20787 return SImode;
20788 }
20789
20790 /* When SRCPTR is non-NULL, output a simple loop to move memory pointed to
20791 by SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times; the overall
20792 size is COUNT, specified in bytes.  When SRCPTR is NULL, output the
20793 equivalent loop to set memory to VALUE (which is expected to be in MODE).
20794 
20795 The size is rounded down to a whole number of chunks moved at once.
20796 SRCMEM and DESTMEM provide the MEM rtxes to feed proper aliasing info. */
20797
20798
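/* The emitted code has roughly this shape (illustrative, UNROLL == 1):

       size = count & ~(piece_size - 1);
       iter = 0;
     top:
       *(dest + iter) = *(src + iter);     (or = value for memset)
       iter += piece_size;
       if (iter < size) goto top;
       dest += iter;  src += iter;
*/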
20799 static void
20800 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
20801 rtx destptr, rtx srcptr, rtx value,
20802 rtx count, enum machine_mode mode, int unroll,
20803 int expected_size)
20804 {
20805 rtx out_label, top_label, iter, tmp;
20806 enum machine_mode iter_mode = counter_mode (count);
20807 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
20808 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
20809 rtx size;
20810 rtx x_addr;
20811 rtx y_addr;
20812 int i;
20813
20814 top_label = gen_label_rtx ();
20815 out_label = gen_label_rtx ();
20816 iter = gen_reg_rtx (iter_mode);
20817
20818 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
20819 NULL, 1, OPTAB_DIRECT);
20820 /* Those two should combine. */
20821 if (piece_size == const1_rtx)
20822 {
20823 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
20824 true, out_label);
20825 predict_jump (REG_BR_PROB_BASE * 10 / 100);
20826 }
20827 emit_move_insn (iter, const0_rtx);
20828
20829 emit_label (top_label);
20830
20831 tmp = convert_modes (Pmode, iter_mode, iter, true);
20832 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
20833 destmem = change_address (destmem, mode, x_addr);
20834
20835 if (srcmem)
20836 {
20837 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
20838 srcmem = change_address (srcmem, mode, y_addr);
20839
20840 /* When unrolling for chips that reorder memory reads and writes,
20841 we can save registers by using a single temporary.
20842 Using four temporaries is also overkill in 32-bit mode. */
20843 if (!TARGET_64BIT && 0)
20844 {
20845 for (i = 0; i < unroll; i++)
20846 {
20847 if (i)
20848 {
20849 destmem =
20850 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
20851 srcmem =
20852 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
20853 }
20854 emit_move_insn (destmem, srcmem);
20855 }
20856 }
20857 else
20858 {
20859 rtx tmpreg[4];
20860 gcc_assert (unroll <= 4);
20861 for (i = 0; i < unroll; i++)
20862 {
20863 tmpreg[i] = gen_reg_rtx (mode);
20864 if (i)
20865 {
20866 srcmem =
20867 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
20868 }
20869 emit_move_insn (tmpreg[i], srcmem);
20870 }
20871 for (i = 0; i < unroll; i++)
20872 {
20873 if (i)
20874 {
20875 destmem =
20876 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
20877 }
20878 emit_move_insn (destmem, tmpreg[i]);
20879 }
20880 }
20881 }
20882 else
20883 for (i = 0; i < unroll; i++)
20884 {
20885 if (i)
20886 destmem =
20887 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
20888 emit_move_insn (destmem, value);
20889 }
20890
20891 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
20892 true, OPTAB_LIB_WIDEN);
20893 if (tmp != iter)
20894 emit_move_insn (iter, tmp);
20895
20896 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
20897 true, top_label);
20898 if (expected_size != -1)
20899 {
20900 expected_size /= GET_MODE_SIZE (mode) * unroll;
20901 if (expected_size == 0)
20902 predict_jump (0);
20903 else if (expected_size > REG_BR_PROB_BASE)
20904 predict_jump (REG_BR_PROB_BASE - 1);
20905 else
20906 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
20907 }
20908 else
20909 predict_jump (REG_BR_PROB_BASE * 80 / 100);
20910 iter = ix86_zero_extend_to_Pmode (iter);
20911 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
20912 true, OPTAB_LIB_WIDEN);
20913 if (tmp != destptr)
20914 emit_move_insn (destptr, tmp);
20915 if (srcptr)
20916 {
20917 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
20918 true, OPTAB_LIB_WIDEN);
20919 if (tmp != srcptr)
20920 emit_move_insn (srcptr, tmp);
20921 }
20922 emit_label (out_label);
20923 }
20924
20925 /* Output a "rep; mov" instruction.
20926 Arguments have the same meaning as for the previous function. */
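/* Depending on MODE this expands to "rep movsb", "rep movsl" or
   "rep movsq" (illustrative mnemonics); the count register holds the
   number of chunks, i.e. bytes divided by GET_MODE_SIZE (MODE). */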
20927 static void
20928 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
20929 rtx destptr, rtx srcptr,
20930 rtx count,
20931 enum machine_mode mode)
20932 {
20933 rtx destexp;
20934 rtx srcexp;
20935 rtx countreg;
20936 HOST_WIDE_INT rounded_count;
20937
20938 /* If the size is known and a multiple of 4, use SImode rather than QImode rep movs; it needs only a quarter of the iterations. */
20939 if (mode == QImode && CONST_INT_P (count)
20940 && !(INTVAL (count) & 3))
20941 mode = SImode;
20942
20943 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
20944 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
20945 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
20946 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
20947 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
20948 if (mode != QImode)
20949 {
20950 destexp = gen_rtx_ASHIFT (Pmode, countreg,
20951 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
20952 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
20953 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
20954 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
20955 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
20956 }
20957 else
20958 {
20959 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
20960 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
20961 }
20962 if (CONST_INT_P (count))
20963 {
20964 rounded_count = (INTVAL (count)
20965 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
20966 destmem = shallow_copy_rtx (destmem);
20967 srcmem = shallow_copy_rtx (srcmem);
20968 set_mem_size (destmem, rounded_count);
20969 set_mem_size (srcmem, rounded_count);
20970 }
20971 else
20972 {
20973 if (MEM_SIZE_KNOWN_P (destmem))
20974 clear_mem_size (destmem);
20975 if (MEM_SIZE_KNOWN_P (srcmem))
20976 clear_mem_size (srcmem);
20977 }
20978 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
20979 destexp, srcexp));
20980 }
20981
20982 /* Output a "rep; stos" instruction.
20983 Arguments have the same meaning as for the previous function. */
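/* As above but for "rep stos": VALUE is forced into a register of MODE
   (the stos family stores from the accumulator) and the count register
   holds the number of MODE-sized chunks. */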
20984 static void
20985 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
20986 rtx count, enum machine_mode mode,
20987 rtx orig_value)
20988 {
20989 rtx destexp;
20990 rtx countreg;
20991 HOST_WIDE_INT rounded_count;
20992
20993 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
20994 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
20995 value = force_reg (mode, gen_lowpart (mode, value));
20996 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
20997 if (mode != QImode)
20998 {
20999 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21000 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21001 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21002 }
21003 else
21004 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21005 if (orig_value == const0_rtx && CONST_INT_P (count))
21006 {
21007 rounded_count = (INTVAL (count)
21008 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21009 destmem = shallow_copy_rtx (destmem);
21010 set_mem_size (destmem, rounded_count);
21011 }
21012 else if (MEM_SIZE_KNOWN_P (destmem))
21013 clear_mem_size (destmem);
21014 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
21015 }
21016
21017 static void
21018 emit_strmov (rtx destmem, rtx srcmem,
21019 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
21020 {
21021 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
21022 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
21023 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21024 }
21025
21026 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
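/* For a known constant count the tail is copied with a fixed sequence of
   moves; e.g. (illustrative) 7 remaining bytes on a 32-bit target become
   one SImode, one HImode and one QImode strmov.  For an unknown count
   either a small byte-copy loop (max_size > 8) or a chain of tests on the
   low bits of COUNT is emitted instead. */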
21027 static void
21028 expand_movmem_epilogue (rtx destmem, rtx srcmem,
21029 rtx destptr, rtx srcptr, rtx count, int max_size)
21030 {
21031 rtx src, dest;
21032 if (CONST_INT_P (count))
21033 {
21034 HOST_WIDE_INT countval = INTVAL (count);
21035 int offset = 0;
21036
21037 if ((countval & 0x10) && max_size > 16)
21038 {
21039 if (TARGET_64BIT)
21040 {
21041 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21042 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
21043 }
21044 else
21045 gcc_unreachable ();
21046 offset += 16;
21047 }
21048 if ((countval & 0x08) && max_size > 8)
21049 {
21050 if (TARGET_64BIT)
21051 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21052 else
21053 {
21054 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21055 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
21056 }
21057 offset += 8;
21058 }
21059 if ((countval & 0x04) && max_size > 4)
21060 {
21061 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21062 offset += 4;
21063 }
21064 if ((countval & 0x02) && max_size > 2)
21065 {
21066 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
21067 offset += 2;
21068 }
21069 if ((countval & 0x01) && max_size > 1)
21070 {
21071 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
21072 offset += 1;
21073 }
21074 return;
21075 }
21076 if (max_size > 8)
21077 {
21078 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
21079 count, 1, OPTAB_DIRECT);
21080 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
21081 count, QImode, 1, 4);
21082 return;
21083 }
21084
21085 /* When single-instruction stringops are available, we can cheaply advance
21086 the dest and src pointers.  Otherwise we save code size by maintaining an
21087 offset (zero is readily available from the preceding rep operation) and
21088 using x86 addressing modes. */
21089 if (TARGET_SINGLE_STRINGOP)
21090 {
21091 if (max_size > 4)
21092 {
21093 rtx label = ix86_expand_aligntest (count, 4, true);
21094 src = change_address (srcmem, SImode, srcptr);
21095 dest = change_address (destmem, SImode, destptr);
21096 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21097 emit_label (label);
21098 LABEL_NUSES (label) = 1;
21099 }
21100 if (max_size > 2)
21101 {
21102 rtx label = ix86_expand_aligntest (count, 2, true);
21103 src = change_address (srcmem, HImode, srcptr);
21104 dest = change_address (destmem, HImode, destptr);
21105 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21106 emit_label (label);
21107 LABEL_NUSES (label) = 1;
21108 }
21109 if (max_size > 1)
21110 {
21111 rtx label = ix86_expand_aligntest (count, 1, true);
21112 src = change_address (srcmem, QImode, srcptr);
21113 dest = change_address (destmem, QImode, destptr);
21114 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21115 emit_label (label);
21116 LABEL_NUSES (label) = 1;
21117 }
21118 }
21119 else
21120 {
21121 rtx offset = force_reg (Pmode, const0_rtx);
21122 rtx tmp;
21123
21124 if (max_size > 4)
21125 {
21126 rtx label = ix86_expand_aligntest (count, 4, true);
21127 src = change_address (srcmem, SImode, srcptr);
21128 dest = change_address (destmem, SImode, destptr);
21129 emit_move_insn (dest, src);
21130 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
21131 true, OPTAB_LIB_WIDEN);
21132 if (tmp != offset)
21133 emit_move_insn (offset, tmp);
21134 emit_label (label);
21135 LABEL_NUSES (label) = 1;
21136 }
21137 if (max_size > 2)
21138 {
21139 rtx label = ix86_expand_aligntest (count, 2, true);
21140 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21141 src = change_address (srcmem, HImode, tmp);
21142 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21143 dest = change_address (destmem, HImode, tmp);
21144 emit_move_insn (dest, src);
21145 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
21146 true, OPTAB_LIB_WIDEN);
21147 if (tmp != offset)
21148 emit_move_insn (offset, tmp);
21149 emit_label (label);
21150 LABEL_NUSES (label) = 1;
21151 }
21152 if (max_size > 1)
21153 {
21154 rtx label = ix86_expand_aligntest (count, 1, true);
21155 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21156 src = change_address (srcmem, QImode, tmp);
21157 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21158 dest = change_address (destmem, QImode, tmp);
21159 emit_move_insn (dest, src);
21160 emit_label (label);
21161 LABEL_NUSES (label) = 1;
21162 }
21163 }
21164 }
21165
21166 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
21167 static void
21168 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
21169 rtx count, int max_size)
21170 {
21171 count =
21172 expand_simple_binop (counter_mode (count), AND, count,
21173 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
21174 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
21175 gen_lowpart (QImode, value), count, QImode,
21176 1, max_size / 2);
21177 }
21178
21179 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
21180 static void
21181 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
21182 {
21183 rtx dest;
21184
21185 if (CONST_INT_P (count))
21186 {
21187 HOST_WIDE_INT countval = INTVAL (count);
21188 int offset = 0;
21189
21190 if ((countval & 0x10) && max_size > 16)
21191 {
21192 if (TARGET_64BIT)
21193 {
21194 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21195 emit_insn (gen_strset (destptr, dest, value));
21196 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
21197 emit_insn (gen_strset (destptr, dest, value));
21198 }
21199 else
21200 gcc_unreachable ();
21201 offset += 16;
21202 }
21203 if ((countval & 0x08) && max_size > 8)
21204 {
21205 if (TARGET_64BIT)
21206 {
21207 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21208 emit_insn (gen_strset (destptr, dest, value));
21209 }
21210 else
21211 {
21212 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21213 emit_insn (gen_strset (destptr, dest, value));
21214 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
21215 emit_insn (gen_strset (destptr, dest, value));
21216 }
21217 offset += 8;
21218 }
21219 if ((countval & 0x04) && max_size > 4)
21220 {
21221 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21222 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21223 offset += 4;
21224 }
21225 if ((countval & 0x02) && max_size > 2)
21226 {
21227 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
21228 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21229 offset += 2;
21230 }
21231 if ((countval & 0x01) && max_size > 1)
21232 {
21233 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
21234 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21235 offset += 1;
21236 }
21237 return;
21238 }
21239 if (max_size > 32)
21240 {
21241 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
21242 return;
21243 }
21244 if (max_size > 16)
21245 {
21246 rtx label = ix86_expand_aligntest (count, 16, true);
21247 if (TARGET_64BIT)
21248 {
21249 dest = change_address (destmem, DImode, destptr);
21250 emit_insn (gen_strset (destptr, dest, value));
21251 emit_insn (gen_strset (destptr, dest, value));
21252 }
21253 else
21254 {
21255 dest = change_address (destmem, SImode, destptr);
21256 emit_insn (gen_strset (destptr, dest, value));
21257 emit_insn (gen_strset (destptr, dest, value));
21258 emit_insn (gen_strset (destptr, dest, value));
21259 emit_insn (gen_strset (destptr, dest, value));
21260 }
21261 emit_label (label);
21262 LABEL_NUSES (label) = 1;
21263 }
21264 if (max_size > 8)
21265 {
21266 rtx label = ix86_expand_aligntest (count, 8, true);
21267 if (TARGET_64BIT)
21268 {
21269 dest = change_address (destmem, DImode, destptr);
21270 emit_insn (gen_strset (destptr, dest, value));
21271 }
21272 else
21273 {
21274 dest = change_address (destmem, SImode, destptr);
21275 emit_insn (gen_strset (destptr, dest, value));
21276 emit_insn (gen_strset (destptr, dest, value));
21277 }
21278 emit_label (label);
21279 LABEL_NUSES (label) = 1;
21280 }
21281 if (max_size > 4)
21282 {
21283 rtx label = ix86_expand_aligntest (count, 4, true);
21284 dest = change_address (destmem, SImode, destptr);
21285 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21286 emit_label (label);
21287 LABEL_NUSES (label) = 1;
21288 }
21289 if (max_size > 2)
21290 {
21291 rtx label = ix86_expand_aligntest (count, 2, true);
21292 dest = change_address (destmem, HImode, destptr);
21293 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21294 emit_label (label);
21295 LABEL_NUSES (label) = 1;
21296 }
21297 if (max_size > 1)
21298 {
21299 rtx label = ix86_expand_aligntest (count, 1, true);
21300 dest = change_address (destmem, QImode, destptr);
21301 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21302 emit_label (label);
21303 LABEL_NUSES (label) = 1;
21304 }
21305 }
21306
21307 /* Copy enough bytes from SRC to DEST to align DEST, known to be aligned
21308 to ALIGN, up to DESIRED_ALIGNMENT. */
21309 static void
21310 expand_movmem_prologue (rtx destmem, rtx srcmem,
21311 rtx destptr, rtx srcptr, rtx count,
21312 int align, int desired_alignment)
21313 {
21314 if (align <= 1 && desired_alignment > 1)
21315 {
21316 rtx label = ix86_expand_aligntest (destptr, 1, false);
21317 srcmem = change_address (srcmem, QImode, srcptr);
21318 destmem = change_address (destmem, QImode, destptr);
21319 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21320 ix86_adjust_counter (count, 1);
21321 emit_label (label);
21322 LABEL_NUSES (label) = 1;
21323 }
21324 if (align <= 2 && desired_alignment > 2)
21325 {
21326 rtx label = ix86_expand_aligntest (destptr, 2, false);
21327 srcmem = change_address (srcmem, HImode, srcptr);
21328 destmem = change_address (destmem, HImode, destptr);
21329 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21330 ix86_adjust_counter (count, 2);
21331 emit_label (label);
21332 LABEL_NUSES (label) = 1;
21333 }
21334 if (align <= 4 && desired_alignment > 4)
21335 {
21336 rtx label = ix86_expand_aligntest (destptr, 4, false);
21337 srcmem = change_address (srcmem, SImode, srcptr);
21338 destmem = change_address (destmem, SImode, destptr);
21339 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21340 ix86_adjust_counter (count, 4);
21341 emit_label (label);
21342 LABEL_NUSES (label) = 1;
21343 }
21344 gcc_assert (desired_alignment <= 8);
21345 }
21346
21347 /* Copy enough bytes from SRC to DST to align DST to DESIRED_ALIGN.
21348 ALIGN_BYTES is how many bytes need to be copied. */
21349 static rtx
21350 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
21351 int desired_align, int align_bytes)
21352 {
21353 rtx src = *srcp;
21354 rtx orig_dst = dst;
21355 rtx orig_src = src;
21356 int off = 0;
21357 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
21358 if (src_align_bytes >= 0)
21359 src_align_bytes = desired_align - src_align_bytes;
21360 if (align_bytes & 1)
21361 {
21362 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21363 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
21364 off = 1;
21365 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21366 }
21367 if (align_bytes & 2)
21368 {
21369 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21370 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
21371 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21372 set_mem_align (dst, 2 * BITS_PER_UNIT);
21373 if (src_align_bytes >= 0
21374 && (src_align_bytes & 1) == (align_bytes & 1)
21375 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
21376 set_mem_align (src, 2 * BITS_PER_UNIT);
21377 off = 2;
21378 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21379 }
21380 if (align_bytes & 4)
21381 {
21382 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21383 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
21384 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21385 set_mem_align (dst, 4 * BITS_PER_UNIT);
21386 if (src_align_bytes >= 0)
21387 {
21388 unsigned int src_align = 0;
21389 if ((src_align_bytes & 3) == (align_bytes & 3))
21390 src_align = 4;
21391 else if ((src_align_bytes & 1) == (align_bytes & 1))
21392 src_align = 2;
21393 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21394 set_mem_align (src, src_align * BITS_PER_UNIT);
21395 }
21396 off = 4;
21397 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21398 }
21399 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21400 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
21401 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21402 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21403 if (src_align_bytes >= 0)
21404 {
21405 unsigned int src_align = 0;
21406 if ((src_align_bytes & 7) == (align_bytes & 7))
21407 src_align = 8;
21408 else if ((src_align_bytes & 3) == (align_bytes & 3))
21409 src_align = 4;
21410 else if ((src_align_bytes & 1) == (align_bytes & 1))
21411 src_align = 2;
21412 if (src_align > (unsigned int) desired_align)
21413 src_align = desired_align;
21414 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21415 set_mem_align (src, src_align * BITS_PER_UNIT);
21416 }
21417 if (MEM_SIZE_KNOWN_P (orig_dst))
21418 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21419 if (MEM_SIZE_KNOWN_P (orig_src))
21420 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
21421 *srcp = src;
21422 return dst;
21423 }
21424
21425 /* Store enough bytes at DEST to align DEST, known to be aligned to ALIGN,
21426 up to DESIRED_ALIGNMENT. */
21427 static void
21428 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
21429 int align, int desired_alignment)
21430 {
21431 if (align <= 1 && desired_alignment > 1)
21432 {
21433 rtx label = ix86_expand_aligntest (destptr, 1, false);
21434 destmem = change_address (destmem, QImode, destptr);
21435 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
21436 ix86_adjust_counter (count, 1);
21437 emit_label (label);
21438 LABEL_NUSES (label) = 1;
21439 }
21440 if (align <= 2 && desired_alignment > 2)
21441 {
21442 rtx label = ix86_expand_aligntest (destptr, 2, false);
21443 destmem = change_address (destmem, HImode, destptr);
21444 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
21445 ix86_adjust_counter (count, 2);
21446 emit_label (label);
21447 LABEL_NUSES (label) = 1;
21448 }
21449 if (align <= 4 && desired_alignment > 4)
21450 {
21451 rtx label = ix86_expand_aligntest (destptr, 4, false);
21452 destmem = change_address (destmem, SImode, destptr);
21453 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
21454 ix86_adjust_counter (count, 4);
21455 emit_label (label);
21456 LABEL_NUSES (label) = 1;
21457 }
21458 gcc_assert (desired_alignment <= 8);
21459 }
21460
21461 /* Store enough bytes at DST to align DST to DESIRED_ALIGN.
21462 ALIGN_BYTES is how many bytes need to be stored. */
21463 static rtx
21464 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
21465 int desired_align, int align_bytes)
21466 {
21467 int off = 0;
21468 rtx orig_dst = dst;
21469 if (align_bytes & 1)
21470 {
21471 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21472 off = 1;
21473 emit_insn (gen_strset (destreg, dst,
21474 gen_lowpart (QImode, value)));
21475 }
21476 if (align_bytes & 2)
21477 {
21478 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21479 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21480 set_mem_align (dst, 2 * BITS_PER_UNIT);
21481 off = 2;
21482 emit_insn (gen_strset (destreg, dst,
21483 gen_lowpart (HImode, value)));
21484 }
21485 if (align_bytes & 4)
21486 {
21487 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21488 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21489 set_mem_align (dst, 4 * BITS_PER_UNIT);
21490 off = 4;
21491 emit_insn (gen_strset (destreg, dst,
21492 gen_lowpart (SImode, value)));
21493 }
21494 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21495 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21496 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21497 if (MEM_SIZE_KNOWN_P (orig_dst))
21498 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21499 return dst;
21500 }
21501
21502 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
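/* For instance (illustrative), when optimizing for size a count that is
   unknown or not a multiple of 4 selects rep_prefix_1_byte (when the rep
   prefix is usable at all), a very small expected size selects
   loop_1_byte, and otherwise the per-size table in the cost tables picks
   the algorithm. */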
21503 static enum stringop_alg
21504 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
21505 int *dynamic_check)
21506 {
21507 const struct stringop_algs * algs;
21508 bool optimize_for_speed;
21509 /* Algorithms using the rep prefix want at least edi and ecx;
21510 additionally, memset wants eax and memcpy wants esi. Don't
21511 consider such algorithms if the user has appropriated those
21512 registers for their own purposes. */
21513 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
21514 || (memset
21515 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
21516
21517 #define ALG_USABLE_P(alg) (rep_prefix_usable \
21518 || (alg != rep_prefix_1_byte \
21519 && alg != rep_prefix_4_byte \
21520 && alg != rep_prefix_8_byte))
21521 const struct processor_costs *cost;
21522
21523 /* Even if the string operation call is cold, we still might spend a lot
21524 of time processing large blocks. */
21525 if (optimize_function_for_size_p (cfun)
21526 || (optimize_insn_for_size_p ()
21527 && expected_size != -1 && expected_size < 256))
21528 optimize_for_speed = false;
21529 else
21530 optimize_for_speed = true;
21531
21532 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
21533
21534 *dynamic_check = -1;
21535 if (memset)
21536 algs = &cost->memset[TARGET_64BIT != 0];
21537 else
21538 algs = &cost->memcpy[TARGET_64BIT != 0];
21539 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
21540 return ix86_stringop_alg;
21541 /* rep; movq or rep; movl is the smallest variant. */
21542 else if (!optimize_for_speed)
21543 {
21544 if (!count || (count & 3))
21545 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
21546 else
21547 return rep_prefix_usable ? rep_prefix_4_byte : loop;
21548 }
21549 /* Very tiny blocks are best handled via the loop; REP is expensive to
21550 set up. */
21551 else if (expected_size != -1 && expected_size < 4)
21552 return loop_1_byte;
21553 else if (expected_size != -1)
21554 {
21555 unsigned int i;
21556 enum stringop_alg alg = libcall;
21557 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
21558 {
21559 /* We get here if the algorithms that were not libcall-based
21560 were rep-prefix based and we are unable to use rep prefixes
21561 based on global register usage. Break out of the loop and
21562 use the heuristic below. */
21563 if (algs->size[i].max == 0)
21564 break;
21565 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
21566 {
21567 enum stringop_alg candidate = algs->size[i].alg;
21568
21569 if (candidate != libcall && ALG_USABLE_P (candidate))
21570 alg = candidate;
21571 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
21572 last non-libcall inline algorithm. */
21573 if (TARGET_INLINE_ALL_STRINGOPS)
21574 {
21575 /* When the current size is best to be copied by a libcall,
21576 but we are still forced to inline, run the heuristic below
21577 that will pick code for medium sized blocks. */
21578 if (alg != libcall)
21579 return alg;
21580 break;
21581 }
21582 else if (ALG_USABLE_P (candidate))
21583 return candidate;
21584 }
21585 }
21586 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
21587 }
21588 /* When asked to inline the call anyway, try to pick a meaningful choice.
21589 We look for the maximal block size that is faster to copy by hand and
21590 handle blocks of at most that size, guessing that the average size will
21591 be roughly half of the maximum.
21592 
21593 If this turns out to be bad, we might simply specify the preferred
21594 choice in ix86_costs. */
21595 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21596 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
21597 {
21598 int max = -1;
21599 enum stringop_alg alg;
21600 int i;
21601 bool any_alg_usable_p = true;
21602
21603 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
21604 {
21605 enum stringop_alg candidate = algs->size[i].alg;
21606 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
21607
21608 if (candidate != libcall && candidate
21609 && ALG_USABLE_P (candidate))
21610 max = algs->size[i].max;
21611 }
21612 /* If there aren't any usable algorithms, then recursing on
21613 smaller sizes isn't going to find anything. Just return the
21614 simple byte-at-a-time copy loop. */
21615 if (!any_alg_usable_p)
21616 {
21617 /* Pick something reasonable. */
21618 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21619 *dynamic_check = 128;
21620 return loop_1_byte;
21621 }
21622 if (max == -1)
21623 max = 4096;
21624 alg = decide_alg (count, max / 2, memset, dynamic_check);
21625 gcc_assert (*dynamic_check == -1);
21626 gcc_assert (alg != libcall);
21627 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21628 *dynamic_check = max;
21629 return alg;
21630 }
21631 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
21632 #undef ALG_USABLE_P
21633 }
21634
21635 /* Decide on alignment. We know that the operand is already aligned to ALIGN
21636 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
21637 static int
21638 decide_alignment (int align,
21639 enum stringop_alg alg,
21640 int expected_size)
21641 {
21642 int desired_align = 0;
21643 switch (alg)
21644 {
21645 case no_stringop:
21646 gcc_unreachable ();
21647 case loop:
21648 case unrolled_loop:
21649 desired_align = GET_MODE_SIZE (Pmode);
21650 break;
21651 case rep_prefix_8_byte:
21652 desired_align = 8;
21653 break;
21654 case rep_prefix_4_byte:
21655 /* PentiumPro has special logic triggering for 8-byte aligned blocks,
21656 copying a whole cache line at once. */
21657 if (TARGET_PENTIUMPRO)
21658 desired_align = 8;
21659 else
21660 desired_align = 4;
21661 break;
21662 case rep_prefix_1_byte:
21663 /* PentiumPro has special logic triggering for 8-byte aligned blocks,
21664 copying a whole cache line at once. */
21665 if (TARGET_PENTIUMPRO)
21666 desired_align = 8;
21667 else
21668 desired_align = 1;
21669 break;
21670 case loop_1_byte:
21671 desired_align = 1;
21672 break;
21673 case libcall:
21674 return 0;
21675 }
21676
21677 if (optimize_size)
21678 desired_align = 1;
21679 if (desired_align < align)
21680 desired_align = align;
21681 if (expected_size != -1 && expected_size < 4)
21682 desired_align = align;
21683 return desired_align;
21684 }
21685
21686 /* Return the smallest power of 2 greater than VAL. */
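/* E.g. smallest_pow2_greater_than (7) == 8 and
   smallest_pow2_greater_than (8) == 16; the result is strictly greater
   than VAL. */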
21687 static int
21688 smallest_pow2_greater_than (int val)
21689 {
21690 int ret = 1;
21691 while (ret <= val)
21692 ret <<= 1;
21693 return ret;
21694 }
21695
21696 /* Expand string move (memcpy) operation. Use i386 string operations
21697 when profitable. expand_setmem contains similar code. The code
21698 depends upon architecture, block size and alignment, but always has
21699 the same overall structure:
21700
21701 1) Prologue guard: Conditional that jumps up to the epilogue for small
21702 blocks that can be handled by the epilogue alone.  This is faster,
21703 but also needed for correctness, since the prologue assumes the block
21704 is larger than the desired alignment.
21705
21706 Optional dynamic check for size and libcall for large
21707 blocks is emitted here too, with -minline-stringops-dynamically.
21708
21709 2) Prologue: copy first few bytes in order to get destination
21710 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
21711 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
21712 copied. We emit either a jump tree on power of two sized
21713 blocks, or a byte loop.
21714
21715 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
21716 with specified algorithm.
21717
21718 4) Epilogue: code copying tail of the block that is too small to be
21719 handled by main body (or up to size guarded by prologue guard). */
21720
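/* Sketch of the emitted code for a variable-sized copy (illustrative;
   details depend on the chosen algorithm and target):

       if (count < epilogue_size_needed) goto epilogue;
       <optionally: if (count >= dynamic_check) call memcpy and skip>
       <copy a few bytes so dst reaches desired_align, adjust count>
       <main body: unrolled loop or rep-prefixed insn on size_needed chunks>
     epilogue:
       <copy the remaining count & (epilogue_size_needed - 1) bytes>
*/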
21721 bool
21722 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
21723 rtx expected_align_exp, rtx expected_size_exp)
21724 {
21725 rtx destreg;
21726 rtx srcreg;
21727 rtx label = NULL;
21728 rtx tmp;
21729 rtx jump_around_label = NULL;
21730 HOST_WIDE_INT align = 1;
21731 unsigned HOST_WIDE_INT count = 0;
21732 HOST_WIDE_INT expected_size = -1;
21733 int size_needed = 0, epilogue_size_needed;
21734 int desired_align = 0, align_bytes = 0;
21735 enum stringop_alg alg;
21736 int dynamic_check;
21737 bool need_zero_guard = false;
21738
21739 if (CONST_INT_P (align_exp))
21740 align = INTVAL (align_exp);
21741 /* i386 can do misaligned access at a reasonably small extra cost. */
21742 if (CONST_INT_P (expected_align_exp)
21743 && INTVAL (expected_align_exp) > align)
21744 align = INTVAL (expected_align_exp);
21745 /* ALIGN is the minimum of destination and source alignment, but we care here
21746 just about destination alignment. */
21747 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
21748 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
21749
21750 if (CONST_INT_P (count_exp))
21751 count = expected_size = INTVAL (count_exp);
21752 if (CONST_INT_P (expected_size_exp) && count == 0)
21753 expected_size = INTVAL (expected_size_exp);
21754
21755 /* Make sure we don't need to care about overflow later on. */
21756 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
21757 return false;
21758
21759 /* Step 0: Decide on preferred algorithm, desired alignment and
21760 size of chunks to be copied by main loop. */
21761
21762 alg = decide_alg (count, expected_size, false, &dynamic_check);
21763 desired_align = decide_alignment (align, alg, expected_size);
21764
21765 if (!TARGET_ALIGN_STRINGOPS)
21766 align = desired_align;
21767
21768 if (alg == libcall)
21769 return false;
21770 gcc_assert (alg != no_stringop);
21771 if (!count)
21772 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
21773 destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
21774 srcreg = copy_to_mode_reg (Pmode, XEXP (src, 0));
21775 switch (alg)
21776 {
21777 case libcall:
21778 case no_stringop:
21779 gcc_unreachable ();
21780 case loop:
21781 need_zero_guard = true;
21782 size_needed = GET_MODE_SIZE (Pmode);
21783 break;
21784 case unrolled_loop:
21785 need_zero_guard = true;
21786 size_needed = GET_MODE_SIZE (Pmode) * (TARGET_64BIT ? 4 : 2);
21787 break;
21788 case rep_prefix_8_byte:
21789 size_needed = 8;
21790 break;
21791 case rep_prefix_4_byte:
21792 size_needed = 4;
21793 break;
21794 case rep_prefix_1_byte:
21795 size_needed = 1;
21796 break;
21797 case loop_1_byte:
21798 need_zero_guard = true;
21799 size_needed = 1;
21800 break;
21801 }
21802
21803 epilogue_size_needed = size_needed;
21804
21805 /* Step 1: Prologue guard. */
21806
21807 /* Alignment code needs count to be in register. */
21808 if (CONST_INT_P (count_exp) && desired_align > align)
21809 {
21810 if (INTVAL (count_exp) > desired_align
21811 && INTVAL (count_exp) > size_needed)
21812 {
21813 align_bytes
21814 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
21815 if (align_bytes <= 0)
21816 align_bytes = 0;
21817 else
21818 align_bytes = desired_align - align_bytes;
21819 }
21820 if (align_bytes == 0)
21821 count_exp = force_reg (counter_mode (count_exp), count_exp);
21822 }
21823 gcc_assert (desired_align >= 1 && align >= 1);
21824
21825 /* Ensure that alignment prologue won't copy past end of block. */
21826 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
21827 {
21828 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
21829 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
21830 Make sure it is a power of 2. */
21831 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
21832
21833 if (count)
21834 {
21835 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
21836 {
21837 /* If main algorithm works on QImode, no epilogue is needed.
21838 For small sizes just don't align anything. */
21839 if (size_needed == 1)
21840 desired_align = align;
21841 else
21842 goto epilogue;
21843 }
21844 }
21845 else
21846 {
21847 label = gen_label_rtx ();
21848 emit_cmp_and_jump_insns (count_exp,
21849 GEN_INT (epilogue_size_needed),
21850 LTU, 0, counter_mode (count_exp), 1, label);
21851 if (expected_size == -1 || expected_size < epilogue_size_needed)
21852 predict_jump (REG_BR_PROB_BASE * 60 / 100);
21853 else
21854 predict_jump (REG_BR_PROB_BASE * 20 / 100);
21855 }
21856 }
21857
21858 /* Emit code to decide at runtime whether a library call or inline code
21859 should be used. */
21860 if (dynamic_check != -1)
21861 {
21862 if (CONST_INT_P (count_exp))
21863 {
21864 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
21865 {
21866 emit_block_move_via_libcall (dst, src, count_exp, false);
21867 count_exp = const0_rtx;
21868 goto epilogue;
21869 }
21870 }
21871 else
21872 {
21873 rtx hot_label = gen_label_rtx ();
21874 jump_around_label = gen_label_rtx ();
21875 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
21876 LEU, 0, GET_MODE (count_exp), 1, hot_label);
21877 predict_jump (REG_BR_PROB_BASE * 90 / 100);
21878 emit_block_move_via_libcall (dst, src, count_exp, false);
21879 emit_jump (jump_around_label);
21880 emit_label (hot_label);
21881 }
21882 }
21883
21884 /* Step 2: Alignment prologue. */
21885
21886 if (desired_align > align)
21887 {
21888 if (align_bytes == 0)
21889 {
21890 /* Except for the first move in the epilogue, we no longer know
21891 the constant offset in the aliasing info. It does not seem worth
21892 the pain to maintain it for the first move, so throw away
21893 the info early. */
21894 src = change_address (src, BLKmode, srcreg);
21895 dst = change_address (dst, BLKmode, destreg);
21896 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
21897 desired_align);
21898 }
21899 else
21900 {
21901 /* If we know how many bytes need to be stored before dst is
21902 sufficiently aligned, maintain aliasing info accurately. */
21903 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
21904 desired_align, align_bytes);
21905 count_exp = plus_constant (count_exp, -align_bytes);
21906 count -= align_bytes;
21907 }
21908 if (need_zero_guard
21909 && (count < (unsigned HOST_WIDE_INT) size_needed
21910 || (align_bytes == 0
21911 && count < ((unsigned HOST_WIDE_INT) size_needed
21912 + desired_align - align))))
21913 {
21914 /* It is possible that we copied enough so the main loop will not
21915 execute. */
21916 gcc_assert (size_needed > 1);
21917 if (label == NULL_RTX)
21918 label = gen_label_rtx ();
21919 emit_cmp_and_jump_insns (count_exp,
21920 GEN_INT (size_needed),
21921 LTU, 0, counter_mode (count_exp), 1, label);
21922 if (expected_size == -1
21923 || expected_size < (desired_align - align) / 2 + size_needed)
21924 predict_jump (REG_BR_PROB_BASE * 20 / 100);
21925 else
21926 predict_jump (REG_BR_PROB_BASE * 60 / 100);
21927 }
21928 }
21929 if (label && size_needed == 1)
21930 {
21931 emit_label (label);
21932 LABEL_NUSES (label) = 1;
21933 label = NULL;
21934 epilogue_size_needed = 1;
21935 }
21936 else if (label == NULL_RTX)
21937 epilogue_size_needed = size_needed;
21938
21939 /* Step 3: Main loop. */
21940
21941 switch (alg)
21942 {
21943 case libcall:
21944 case no_stringop:
21945 gcc_unreachable ();
21946 case loop_1_byte:
21947 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
21948 count_exp, QImode, 1, expected_size);
21949 break;
21950 case loop:
21951 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
21952 count_exp, Pmode, 1, expected_size);
21953 break;
21954 case unrolled_loop:
21955 /* Unroll only by factor of 2 in 32bit mode, since we don't have enough
21956 registers for 4 temporaries anyway. */
21957 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
21958 count_exp, Pmode, TARGET_64BIT ? 4 : 2,
21959 expected_size);
21960 break;
21961 case rep_prefix_8_byte:
21962 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
21963 DImode);
21964 break;
21965 case rep_prefix_4_byte:
21966 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
21967 SImode);
21968 break;
21969 case rep_prefix_1_byte:
21970 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
21971 QImode);
21972 break;
21973 }
21974 /* Properly adjust the offsets of the src and dest memory for aliasing. */
21975 if (CONST_INT_P (count_exp))
21976 {
21977 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
21978 (count / size_needed) * size_needed);
21979 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
21980 (count / size_needed) * size_needed);
21981 }
21982 else
21983 {
21984 src = change_address (src, BLKmode, srcreg);
21985 dst = change_address (dst, BLKmode, destreg);
21986 }
21987
21988 /* Step 4: Epilogue to copy the remaining bytes. */
21989 epilogue:
21990 if (label)
21991 {
21992 /* When the main loop is done, COUNT_EXP might hold the original count,
21993 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
21994 The epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
21995 bytes. Compensate if needed. */
21996
21997 if (size_needed < epilogue_size_needed)
21998 {
21999 tmp =
22000 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22001 GEN_INT (size_needed - 1), count_exp, 1,
22002 OPTAB_DIRECT);
22003 if (tmp != count_exp)
22004 emit_move_insn (count_exp, tmp);
22005 }
22006 emit_label (label);
22007 LABEL_NUSES (label) = 1;
22008 }
22009
22010 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22011 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
22012 epilogue_size_needed);
22013 if (jump_around_label)
22014 emit_label (jump_around_label);
22015 return true;
22016 }
22017
22018 /* Helper function for memset. For the QImode value 0xXY, produce
22019 0xXYXYXYXY of the width specified by MODE. This is essentially
22020 a * 0x01010101, but we can do slightly better than
22021 synth_mult by unwinding the sequence by hand on CPUs with
22022 a slow multiply. */
22023 static rtx
22024 promote_duplicated_reg (enum machine_mode mode, rtx val)
22025 {
22026 enum machine_mode valmode = GET_MODE (val);
22027 rtx tmp;
22028 int nops = mode == DImode ? 3 : 2;
22029
22030 gcc_assert (mode == SImode || mode == DImode);
22031 if (val == const0_rtx)
22032 return copy_to_mode_reg (mode, const0_rtx);
22033 if (CONST_INT_P (val))
22034 {
22035 HOST_WIDE_INT v = INTVAL (val) & 255;
22036
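      /* Worked example (illustrative): for val == 0xAB,
           v              = 0x000000AB
           v |= v << 8   -> 0x0000ABAB
           v |= v << 16  -> 0xABABABAB
         and for DImode the extra (v << 16) << 16 step replicates the
         pattern into the upper 32 bits as well. */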
22037 v |= v << 8;
22038 v |= v << 16;
22039 if (mode == DImode)
22040 v |= (v << 16) << 16;
22041 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
22042 }
22043
22044 if (valmode == VOIDmode)
22045 valmode = QImode;
22046 if (valmode != QImode)
22047 val = gen_lowpart (QImode, val);
22048 if (mode == QImode)
22049 return val;
22050 if (!TARGET_PARTIAL_REG_STALL)
22051 nops--;
22052 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
22053 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
22054 <= (ix86_cost->shift_const + ix86_cost->add) * nops
22055 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
22056 {
22057 rtx reg = convert_modes (mode, QImode, val, true);
22058 tmp = promote_duplicated_reg (mode, const1_rtx);
22059 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
22060 OPTAB_DIRECT);
22061 }
22062 else
22063 {
22064 rtx reg = convert_modes (mode, QImode, val, true);
22065
22066 if (!TARGET_PARTIAL_REG_STALL)
22067 if (mode == SImode)
22068 emit_insn (gen_movsi_insv_1 (reg, reg));
22069 else
22070 emit_insn (gen_movdi_insv_1 (reg, reg));
22071 else
22072 {
22073 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
22074 NULL, 1, OPTAB_DIRECT);
22075 reg =
22076 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22077 }
22078 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
22079 NULL, 1, OPTAB_DIRECT);
22080 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22081 if (mode == SImode)
22082 return reg;
22083 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
22084 NULL, 1, OPTAB_DIRECT);
22085 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22086 return reg;
22087 }
22088 }
22089
22090 /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
22091 be needed by main loop copying SIZE_NEEDED chunks and prologue getting
22092 alignment from ALIGN to DESIRED_ALIGN. */
22093 static rtx
22094 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
22095 {
22096 rtx promoted_val;
22097
22098 if (TARGET_64BIT
22099 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
22100 promoted_val = promote_duplicated_reg (DImode, val);
22101 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
22102 promoted_val = promote_duplicated_reg (SImode, val);
22103 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
22104 promoted_val = promote_duplicated_reg (HImode, val);
22105 else
22106 promoted_val = val;
22107
22108 return promoted_val;
22109 }
22110
22111 /* Expand string clear operation (bzero). Use i386 string operations when
22112 profitable. See expand_movmem comment for explanation of individual
22113 steps performed. */
22114 bool
22115 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
22116 rtx expected_align_exp, rtx expected_size_exp)
22117 {
22118 rtx destreg;
22119 rtx label = NULL;
22120 rtx tmp;
22121 rtx jump_around_label = NULL;
22122 HOST_WIDE_INT align = 1;
22123 unsigned HOST_WIDE_INT count = 0;
22124 HOST_WIDE_INT expected_size = -1;
22125 int size_needed = 0, epilogue_size_needed;
22126 int desired_align = 0, align_bytes = 0;
22127 enum stringop_alg alg;
22128 rtx promoted_val = NULL;
22129 bool force_loopy_epilogue = false;
22130 int dynamic_check;
22131 bool need_zero_guard = false;
22132
22133 if (CONST_INT_P (align_exp))
22134 align = INTVAL (align_exp);
22135 /* i386 can do misaligned access at a reasonably increased cost. */
22136 if (CONST_INT_P (expected_align_exp)
22137 && INTVAL (expected_align_exp) > align)
22138 align = INTVAL (expected_align_exp);
22139 if (CONST_INT_P (count_exp))
22140 count = expected_size = INTVAL (count_exp);
22141 if (CONST_INT_P (expected_size_exp) && count == 0)
22142 expected_size = INTVAL (expected_size_exp);
22143
22144 /* Make sure we don't need to care about overflow later on. */
22145 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22146 return false;
22147
22148 /* Step 0: Decide on preferred algorithm, desired alignment and
22149 size of chunks to be copied by main loop. */
22150
22151 alg = decide_alg (count, expected_size, true, &dynamic_check);
22152 desired_align = decide_alignment (align, alg, expected_size);
22153
22154 if (!TARGET_ALIGN_STRINGOPS)
22155 align = desired_align;
22156
22157 if (alg == libcall)
22158 return false;
22159 gcc_assert (alg != no_stringop);
22160 if (!count)
22161 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
22162 destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
22163 switch (alg)
22164 {
22165 case libcall:
22166 case no_stringop:
22167 gcc_unreachable ();
22168 case loop:
22169 need_zero_guard = true;
22170 size_needed = GET_MODE_SIZE (Pmode);
22171 break;
22172 case unrolled_loop:
22173 need_zero_guard = true;
22174 size_needed = GET_MODE_SIZE (Pmode) * 4;
22175 break;
22176 case rep_prefix_8_byte:
22177 size_needed = 8;
22178 break;
22179 case rep_prefix_4_byte:
22180 size_needed = 4;
22181 break;
22182 case rep_prefix_1_byte:
22183 size_needed = 1;
22184 break;
22185 case loop_1_byte:
22186 need_zero_guard = true;
22187 size_needed = 1;
22188 break;
22189 }
22190 epilogue_size_needed = size_needed;
22191
22192 /* Step 1: Prologue guard. */
22193
22194 /* Alignment code needs count to be in register. */
22195 if (CONST_INT_P (count_exp) && desired_align > align)
22196 {
22197 if (INTVAL (count_exp) > desired_align
22198 && INTVAL (count_exp) > size_needed)
22199 {
22200 align_bytes
22201 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22202 if (align_bytes <= 0)
22203 align_bytes = 0;
22204 else
22205 align_bytes = desired_align - align_bytes;
22206 }
22207 if (align_bytes == 0)
22208 {
22209 enum machine_mode mode = SImode;
22210 if (TARGET_64BIT && (count & ~0xffffffff))
22211 mode = DImode;
22212 count_exp = force_reg (mode, count_exp);
22213 }
22214 }
22215 /* Do the cheap promotion to allow better CSE across the
22216 main loop and epilogue (i.e. one load of the big constant in
22217 front of all the code). */
22218 if (CONST_INT_P (val_exp))
22219 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22220 desired_align, align);
22221 /* Ensure that alignment prologue won't copy past end of block. */
22222 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22223 {
22224 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22225 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22226 Make sure it is a power of 2. */
22227 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22228
22229 /* To improve performance of small blocks, we jump around the VAL
22230 promoting code. This means that if the promoted VAL is not a constant,
22231 we might not use it in the epilogue and have to use the byte
22232 loop variant. */
22233 if (epilogue_size_needed > 2 && !promoted_val)
22234 force_loopy_epilogue = true;
22235 if (count)
22236 {
22237 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22238 {
22239 /* If main algorithm works on QImode, no epilogue is needed.
22240 For small sizes just don't align anything. */
22241 if (size_needed == 1)
22242 desired_align = align;
22243 else
22244 goto epilogue;
22245 }
22246 }
22247 else
22248 {
22249 label = gen_label_rtx ();
22250 emit_cmp_and_jump_insns (count_exp,
22251 GEN_INT (epilogue_size_needed),
22252 LTU, 0, counter_mode (count_exp), 1, label);
22253 if (expected_size == -1 || expected_size <= epilogue_size_needed)
22254 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22255 else
22256 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22257 }
22258 }
22259 if (dynamic_check != -1)
22260 {
22261 rtx hot_label = gen_label_rtx ();
22262 jump_around_label = gen_label_rtx ();
22263 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22264 LEU, 0, counter_mode (count_exp), 1, hot_label);
22265 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22266 set_storage_via_libcall (dst, count_exp, val_exp, false);
22267 emit_jump (jump_around_label);
22268 emit_label (hot_label);
22269 }
22270
22271 /* Step 2: Alignment prologue. */
22272
22273 /* Do the expensive promotion once we branched off the small blocks. */
22274 if (!promoted_val)
22275 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22276 desired_align, align);
22277 gcc_assert (desired_align >= 1 && align >= 1);
22278
22279 if (desired_align > align)
22280 {
22281 if (align_bytes == 0)
22282 {
22283 /* Except for the first move in the epilogue, we no longer know
22284 the constant offset in the aliasing info. It does not seem worth
22285 the pain to maintain it for the first move, so throw away
22286 the info early. */
22287 dst = change_address (dst, BLKmode, destreg);
22288 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
22289 desired_align);
22290 }
22291 else
22292 {
22293 /* If we know how many bytes need to be stored before dst is
22294 sufficiently aligned, maintain aliasing info accurately. */
22295 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
22296 desired_align, align_bytes);
22297 count_exp = plus_constant (count_exp, -align_bytes);
22298 count -= align_bytes;
22299 }
22300 if (need_zero_guard
22301 && (count < (unsigned HOST_WIDE_INT) size_needed
22302 || (align_bytes == 0
22303 && count < ((unsigned HOST_WIDE_INT) size_needed
22304 + desired_align - align))))
22305 {
22306 /* It is possible that we copied enough so the main loop will not
22307 execute. */
22308 gcc_assert (size_needed > 1);
22309 if (label == NULL_RTX)
22310 label = gen_label_rtx ();
22311 emit_cmp_and_jump_insns (count_exp,
22312 GEN_INT (size_needed),
22313 LTU, 0, counter_mode (count_exp), 1, label);
22314 if (expected_size == -1
22315 || expected_size < (desired_align - align) / 2 + size_needed)
22316 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22317 else
22318 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22319 }
22320 }
22321 if (label && size_needed == 1)
22322 {
22323 emit_label (label);
22324 LABEL_NUSES (label) = 1;
22325 label = NULL;
22326 promoted_val = val_exp;
22327 epilogue_size_needed = 1;
22328 }
22329 else if (label == NULL_RTX)
22330 epilogue_size_needed = size_needed;
22331
22332 /* Step 3: Main loop. */
22333
22334 switch (alg)
22335 {
22336 case libcall:
22337 case no_stringop:
22338 gcc_unreachable ();
22339 case loop_1_byte:
22340 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22341 count_exp, QImode, 1, expected_size);
22342 break;
22343 case loop:
22344 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22345 count_exp, Pmode, 1, expected_size);
22346 break;
22347 case unrolled_loop:
22348 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22349 count_exp, Pmode, 4, expected_size);
22350 break;
22351 case rep_prefix_8_byte:
22352 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22353 DImode, val_exp);
22354 break;
22355 case rep_prefix_4_byte:
22356 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22357 SImode, val_exp);
22358 break;
22359 case rep_prefix_1_byte:
22360 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22361 QImode, val_exp);
22362 break;
22363 }
22364 /* Properly adjust the offset of the dest memory for aliasing. */
22365 if (CONST_INT_P (count_exp))
22366 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22367 (count / size_needed) * size_needed);
22368 else
22369 dst = change_address (dst, BLKmode, destreg);
22370
22371 /* Step 4: Epilogue to copy the remaining bytes. */
22372
22373 if (label)
22374 {
22375 /* When the main loop is done, COUNT_EXP might hold the original count,
22376 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22377 The epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22378 bytes. Compensate if needed. */
22379
22380 if (size_needed < epilogue_size_needed)
22381 {
22382 tmp =
22383 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22384 GEN_INT (size_needed - 1), count_exp, 1,
22385 OPTAB_DIRECT);
22386 if (tmp != count_exp)
22387 emit_move_insn (count_exp, tmp);
22388 }
22389 emit_label (label);
22390 LABEL_NUSES (label) = 1;
22391 }
22392 epilogue:
22393 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22394 {
22395 if (force_loopy_epilogue)
22396 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
22397 epilogue_size_needed);
22398 else
22399 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
22400 epilogue_size_needed);
22401 }
22402 if (jump_around_label)
22403 emit_label (jump_around_label);
22404 return true;
22405 }
22406
22407 /* Expand the appropriate insns for doing strlen if not just doing
22408 repnz; scasb
22409
22410 out = result, initialized with the start address
22411 align_rtx = alignment of the address.
22412 scratch = scratch register, initialized with the start address when
22413 not aligned, otherwise undefined
22414
22415 This is just the body. It needs the initializations mentioned above and
22416 some address computing at the end. These things are done in i386.md. */
22417
22418 static void
22419 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
22420 {
22421 int align;
22422 rtx tmp;
22423 rtx align_2_label = NULL_RTX;
22424 rtx align_3_label = NULL_RTX;
22425 rtx align_4_label = gen_label_rtx ();
22426 rtx end_0_label = gen_label_rtx ();
22427 rtx mem;
22428 rtx tmpreg = gen_reg_rtx (SImode);
22429 rtx scratch = gen_reg_rtx (SImode);
22430 rtx cmp;
22431
22432 align = 0;
22433 if (CONST_INT_P (align_rtx))
22434 align = INTVAL (align_rtx);
22435
22436 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
22437
22438 /* Is there a known alignment and is it less than 4? */
22439 if (align < 4)
22440 {
22441 rtx scratch1 = gen_reg_rtx (Pmode);
22442 emit_move_insn (scratch1, out);
22443 /* Is there a known alignment and is it not 2? */
22444 if (align != 2)
22445 {
22446 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
22447 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
22448
22449 /* Leave just the 3 lower bits. */
22450 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
22451 NULL_RTX, 0, OPTAB_WIDEN);
22452
22453 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22454 Pmode, 1, align_4_label);
22455 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
22456 Pmode, 1, align_2_label);
22457 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
22458 Pmode, 1, align_3_label);
22459 }
22460 else
22461 {
22462 /* Since the alignment is 2, we have to check 2 or 0 bytes;
22463 check whether it is aligned to a 4-byte boundary. */
22464
22465 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
22466 NULL_RTX, 0, OPTAB_WIDEN);
22467
22468 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22469 Pmode, 1, align_4_label);
22470 }
22471
22472 mem = change_address (src, QImode, out);
22473
22474 /* Now compare the bytes. */
22475
22476 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
22477 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
22478 QImode, 1, end_0_label);
22479
22480 /* Increment the address. */
22481 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22482
22483 /* Not needed with an alignment of 2 */
22484 if (align != 2)
22485 {
22486 emit_label (align_2_label);
22487
22488 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
22489 end_0_label);
22490
22491 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22492
22493 emit_label (align_3_label);
22494 }
22495
22496 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
22497 end_0_label);
22498
22499 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22500 }
22501
22502 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
22503 align this loop; it only makes the program bigger and does not help
22504 speed. */
22505 emit_label (align_4_label);
22506
22507 mem = change_address (src, SImode, out);
22508 emit_move_insn (scratch, mem);
22509 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
22510
22511 /* This formula yields a nonzero result iff one of the bytes is zero.
22512 This saves three branches inside the loop and many cycles. */
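  /* Illustration with a hypothetical word value 0x41004242 (which contains
     a zero byte):
       scratch - 0x01010101  = 0x3FFF4141
       ~scratch              = 0xBEFFBDBD
       AND of the two        = 0x3EFF0101
       AND 0x80808080        = 0x00800000   -> nonzero, a zero byte was found
     whereas for 0x41424344 (no zero byte) the final AND yields 0. */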
22513
22514 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
22515 emit_insn (gen_one_cmplsi2 (scratch, scratch));
22516 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
22517 emit_insn (gen_andsi3 (tmpreg, tmpreg,
22518 gen_int_mode (0x80808080, SImode)));
22519 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
22520 align_4_label);
22521
22522 if (TARGET_CMOVE)
22523 {
22524 rtx reg = gen_reg_rtx (SImode);
22525 rtx reg2 = gen_reg_rtx (Pmode);
22526 emit_move_insn (reg, tmpreg);
22527 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
22528
22529 /* If zero is not in the first two bytes, move two bytes forward. */
22530 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
22531 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22532 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
22533 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
22534 gen_rtx_IF_THEN_ELSE (SImode, tmp,
22535 reg,
22536 tmpreg)));
22537 /* Emit lea manually to avoid clobbering of flags. */
22538 emit_insn (gen_rtx_SET (SImode, reg2,
22539 gen_rtx_PLUS (Pmode, out, const2_rtx)));
22540
22541 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22542 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
22543 emit_insn (gen_rtx_SET (VOIDmode, out,
22544 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
22545 reg2,
22546 out)));
22547 }
22548 else
22549 {
22550 rtx end_2_label = gen_label_rtx ();
22551 /* Is zero in the first two bytes? */
22552
22553 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
22554 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22555 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
22556 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22557 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
22558 pc_rtx);
22559 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
22560 JUMP_LABEL (tmp) = end_2_label;
22561
22562 /* Not in the first two. Move two bytes forward. */
22563 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
22564 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
22565
22566 emit_label (end_2_label);
22567
22568 }
22569
22570 /* Avoid branch in fixing the byte. */
22571 tmpreg = gen_lowpart (QImode, tmpreg);
22572 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
22573 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
22574 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
22575 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
22576
22577 emit_label (end_0_label);
22578 }
22579
22580 /* Expand strlen. */
22581
22582 bool
22583 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
22584 {
22585 rtx addr, scratch1, scratch2, scratch3, scratch4;
22586
22587 /* The generic case of the strlen expander is long. Avoid expanding
22588 it unless TARGET_INLINE_ALL_STRINGOPS. */
22589
22590 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
22591 && !TARGET_INLINE_ALL_STRINGOPS
22592 && !optimize_insn_for_size_p ()
22593 && (!CONST_INT_P (align) || INTVAL (align) < 4))
22594 return false;
22595
22596 addr = force_reg (Pmode, XEXP (src, 0));
22597 scratch1 = gen_reg_rtx (Pmode);
22598
22599 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
22600 && !optimize_insn_for_size_p ())
22601 {
22602 /* Well, it seems that some optimizer does not combine a call like
22603 foo(strlen(bar), strlen(bar));
22604 when the move and the subtraction are done here. It does calculate
22605 the length just once when these instructions are done inside of
22606 output_strlen_unroll(). But I think that since &bar[strlen(bar)] is
22607 often used and I use one fewer register for the lifetime of
22608 output_strlen_unroll(), this is better. */
22609
22610 emit_move_insn (out, addr);
22611
22612 ix86_expand_strlensi_unroll_1 (out, src, align);
22613
22614 /* strlensi_unroll_1 returns the address of the zero at the end of
22615 the string, like memchr(), so compute the length by subtracting
22616 the start address. */
22617 emit_insn (ix86_gen_sub3 (out, out, addr));
22618 }
22619 else
22620 {
22621 rtx unspec;
22622
22623 /* Can't use this if the user has appropriated eax, ecx, or edi. */
22624 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
22625 return false;
22626
22627 scratch2 = gen_reg_rtx (Pmode);
22628 scratch3 = gen_reg_rtx (Pmode);
22629 scratch4 = force_reg (Pmode, constm1_rtx);
22630
22631 emit_move_insn (scratch3, addr);
22632 eoschar = force_reg (QImode, eoschar);
22633
22634 src = replace_equiv_address_nv (src, scratch3);
22635
22636 /* If .md starts supporting :P, this can be done in .md. */
22637 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
22638 scratch4), UNSPEC_SCAS);
22639 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
22640 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
22641 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
22642 }
22643 return true;
22644 }
22645
22646 /* For a given symbol (function), construct code to compute the address of its
22647 PLT entry in the large x86-64 PIC model. */
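/* Roughly (a sketch of the intent rather than literal assembly): the code
   materializes the constant symbol@PLTOFF in a temporary register and adds
   pic_offset_table_rtx to it; @PLTOFF is the offset of the symbol's PLT
   entry from the GOT base held in the PIC register, so the sum is the
   absolute address of the PLT entry. */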
22648 rtx
22649 construct_plt_address (rtx symbol)
22650 {
22651 rtx tmp = gen_reg_rtx (Pmode);
22652 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
22653
22654 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
22655 gcc_assert (ix86_cmodel == CM_LARGE_PIC);
22656
22657 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
22658 emit_insn (gen_adddi3 (tmp, tmp, pic_offset_table_rtx));
22659 return tmp;
22660 }
22661
22662 rtx
22663 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
22664 rtx callarg2,
22665 rtx pop, bool sibcall)
22666 {
22667 /* We need to represent that the SI, DI and XMM6-XMM15 registers, which
22668 the MS ABI treats as call-saved, are clobbered by SYSV calls. */
22669 static int clobbered_registers[] = {
22670 XMM6_REG, XMM7_REG, XMM8_REG,
22671 XMM9_REG, XMM10_REG, XMM11_REG,
22672 XMM12_REG, XMM13_REG, XMM14_REG,
22673 XMM15_REG, SI_REG, DI_REG
22674 };
22675 rtx vec[ARRAY_SIZE (clobbered_registers) + 3];
22676 rtx use = NULL, call;
22677 unsigned int vec_len;
22678
22679 if (pop == const0_rtx)
22680 pop = NULL;
22681 gcc_assert (!TARGET_64BIT || !pop);
22682
22683 if (TARGET_MACHO && !TARGET_64BIT)
22684 {
22685 #if TARGET_MACHO
22686 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
22687 fnaddr = machopic_indirect_call_target (fnaddr);
22688 #endif
22689 }
22690 else
22691 {
22692 /* Static functions and indirect calls don't need the pic register. */
22693 if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
22694 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
22695 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
22696 use_reg (&use, pic_offset_table_rtx);
22697 }
22698
22699 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
22700 {
22701 rtx al = gen_rtx_REG (QImode, AX_REG);
22702 emit_move_insn (al, callarg2);
22703 use_reg (&use, al);
22704 }
22705
22706 if (ix86_cmodel == CM_LARGE_PIC
22707 && MEM_P (fnaddr)
22708 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
22709 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
22710 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
22711 else if (sibcall
22712 ? !sibcall_insn_operand (XEXP (fnaddr, 0), Pmode)
22713 : !call_insn_operand (XEXP (fnaddr, 0), Pmode))
22714 {
22715 fnaddr = XEXP (fnaddr, 0);
22716 if (GET_MODE (fnaddr) != Pmode)
22717 fnaddr = convert_to_mode (Pmode, fnaddr, 1);
22718 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (Pmode, fnaddr));
22719 }
22720
22721 vec_len = 0;
22722 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
22723 if (retval)
22724 call = gen_rtx_SET (VOIDmode, retval, call);
22725 vec[vec_len++] = call;
22726
22727 if (pop)
22728 {
22729 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
22730 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
22731 vec[vec_len++] = pop;
22732 }
22733
22734 if (TARGET_64BIT_MS_ABI
22735 && (!callarg2 || INTVAL (callarg2) != -2))
22736 {
22737 unsigned i;
22738
22739 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
22740 UNSPEC_MS_TO_SYSV_CALL);
22741
22742 for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
22743 vec[vec_len++]
22744 = gen_rtx_CLOBBER (SSE_REGNO_P (clobbered_registers[i])
22745 ? TImode : DImode,
22746 gen_rtx_REG (SSE_REGNO_P (clobbered_registers[i])
22747 ? TImode : DImode,
22748 clobbered_registers[i]));
22749 }
22750
22751 /* Add UNSPEC_CALL_NEEDS_VZEROUPPER decoration. */
22752 if (TARGET_VZEROUPPER)
22753 {
22754 int avx256;
22755 if (cfun->machine->callee_pass_avx256_p)
22756 {
22757 if (cfun->machine->callee_return_avx256_p)
22758 avx256 = callee_return_pass_avx256;
22759 else
22760 avx256 = callee_pass_avx256;
22761 }
22762 else if (cfun->machine->callee_return_avx256_p)
22763 avx256 = callee_return_avx256;
22764 else
22765 avx256 = call_no_avx256;
22766
22767 if (reload_completed)
22768 emit_insn (gen_avx_vzeroupper (GEN_INT (avx256)));
22769 else
22770 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode,
22771 gen_rtvec (1, GEN_INT (avx256)),
22772 UNSPEC_CALL_NEEDS_VZEROUPPER);
22773 }
22774
22775 if (vec_len > 1)
22776 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
22777 call = emit_call_insn (call);
22778 if (use)
22779 CALL_INSN_FUNCTION_USAGE (call) = use;
22780
22781 return call;
22782 }
22783
22784 void
22785 ix86_split_call_vzeroupper (rtx insn, rtx vzeroupper)
22786 {
22787 rtx pat = PATTERN (insn);
22788 rtvec vec = XVEC (pat, 0);
22789 int len = GET_NUM_ELEM (vec) - 1;
22790
22791 /* Strip off the last entry of the parallel. */
22792 gcc_assert (GET_CODE (RTVEC_ELT (vec, len)) == UNSPEC);
22793 gcc_assert (XINT (RTVEC_ELT (vec, len), 1) == UNSPEC_CALL_NEEDS_VZEROUPPER);
22794 if (len == 1)
22795 pat = RTVEC_ELT (vec, 0);
22796 else
22797 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (len, &RTVEC_ELT (vec, 0)));
22798
22799 emit_insn (gen_avx_vzeroupper (vzeroupper));
22800 emit_call_insn (pat);
22801 }
22802
22803 /* Output the assembly for a call instruction. */
22804
22805 const char *
22806 ix86_output_call_insn (rtx insn, rtx call_op)
22807 {
22808 bool direct_p = constant_call_address_operand (call_op, Pmode);
22809 bool seh_nop_p = false;
22810 const char *xasm;
22811
22812 if (SIBLING_CALL_P (insn))
22813 {
22814 if (direct_p)
22815 xasm = "jmp\t%P0";
22816 /* SEH epilogue detection requires the indirect branch case
22817 to include REX.W. */
22818 else if (TARGET_SEH)
22819 xasm = "rex.W jmp %A0";
22820 else
22821 xasm = "jmp\t%A0";
22822
22823 output_asm_insn (xasm, &call_op);
22824 return "";
22825 }
22826
22827 /* SEH unwinding can require an extra nop to be emitted in several
22828 circumstances. Determine if we have one of those. */
22829 if (TARGET_SEH)
22830 {
22831 rtx i;
22832
22833 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
22834 {
22835 /* If we get to another real insn, we don't need the nop. */
22836 if (INSN_P (i))
22837 break;
22838
22839 /* If we get to the epilogue note, prevent a catch region from
22840 being adjacent to the standard epilogue sequence. If non-
22841 call-exceptions, we'll have done this during epilogue emission. */
22842 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
22843 && !flag_non_call_exceptions
22844 && !can_throw_internal (insn))
22845 {
22846 seh_nop_p = true;
22847 break;
22848 }
22849 }
22850
22851 /* If we didn't find a real insn following the call, prevent the
22852 unwinder from looking into the next function. */
22853 if (i == NULL)
22854 seh_nop_p = true;
22855 }
22856
22857 if (direct_p)
22858 xasm = "call\t%P0";
22859 else
22860 xasm = "call\t%A0";
22861
22862 output_asm_insn (xasm, &call_op);
22863
22864 if (seh_nop_p)
22865 return "nop";
22866
22867 return "";
22868 }
22869 \f
22870 /* Clear stack slot assignments remembered from previous functions.
22871 This is called from INIT_EXPANDERS once before RTL is emitted for each
22872 function. */
22873
22874 static struct machine_function *
22875 ix86_init_machine_status (void)
22876 {
22877 struct machine_function *f;
22878
22879 f = ggc_alloc_cleared_machine_function ();
22880 f->use_fast_prologue_epilogue_nregs = -1;
22881 f->tls_descriptor_call_expanded_p = 0;
22882 f->call_abi = ix86_abi;
22883
22884 return f;
22885 }
22886
22887 /* Return a MEM corresponding to a stack slot with mode MODE.
22888 Allocate a new slot if necessary.
22889
22890 The RTL for a function can have several slots available: N is
22891 which slot to use. */
22892
22893 rtx
22894 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
22895 {
22896 struct stack_local_entry *s;
22897
22898 gcc_assert (n < MAX_386_STACK_LOCALS);
22899
22900 /* Virtual slot is valid only before vregs are instantiated. */
22901 gcc_assert ((n == SLOT_VIRTUAL) == !virtuals_instantiated);
22902
22903 for (s = ix86_stack_locals; s; s = s->next)
22904 if (s->mode == mode && s->n == n)
22905 return validize_mem (copy_rtx (s->rtl));
22906
22907 s = ggc_alloc_stack_local_entry ();
22908 s->n = n;
22909 s->mode = mode;
22910 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
22911
22912 s->next = ix86_stack_locals;
22913 ix86_stack_locals = s;
22914 return validize_mem (s->rtl);
22915 }
22916 \f
22917 /* Calculate the length of the memory address in the instruction encoding.
22918 Includes addr32 prefix, does not include the one-byte modrm, opcode,
22919 or other prefixes. */
22920
22921 int
22922 memory_address_length (rtx addr)
22923 {
22924 struct ix86_address parts;
22925 rtx base, index, disp;
22926 int len;
22927 int ok;
22928
22929 if (GET_CODE (addr) == PRE_DEC
22930 || GET_CODE (addr) == POST_INC
22931 || GET_CODE (addr) == PRE_MODIFY
22932 || GET_CODE (addr) == POST_MODIFY)
22933 return 0;
22934
22935 ok = ix86_decompose_address (addr, &parts);
22936 gcc_assert (ok);
22937
22938 if (parts.base && GET_CODE (parts.base) == SUBREG)
22939 parts.base = SUBREG_REG (parts.base);
22940 if (parts.index && GET_CODE (parts.index) == SUBREG)
22941 parts.index = SUBREG_REG (parts.index);
22942
22943 base = parts.base;
22944 index = parts.index;
22945 disp = parts.disp;
22946
22947 /* Add length of addr32 prefix. */
22948 len = (GET_CODE (addr) == ZERO_EXTEND
22949 || GET_CODE (addr) == AND);
22950
22951 /* Rule of thumb:
22952 - esp as the base always wants an index,
22953 - ebp as the base always wants a displacement,
22954 - r12 as the base always wants an index,
22955 - r13 as the base always wants a displacement. */
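  /* The underlying reason is the ModRM encoding: with mod = 00, r/m = 100
     (esp/r12) selects a SIB byte rather than a plain base register, and
     r/m = 101 (ebp/r13) selects disp32 (or RIP-relative addressing in
     64-bit mode) rather than a plain base, so those bases need either an
     explicit SIB byte or an explicit (possibly zero) displacement. */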
22956
22957 /* Register Indirect. */
22958 if (base && !index && !disp)
22959 {
22960 /* esp (for its index) and ebp (for its displacement) need
22961 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
22962 code. */
22963 if (REG_P (addr)
22964 && (addr == arg_pointer_rtx
22965 || addr == frame_pointer_rtx
22966 || REGNO (addr) == SP_REG
22967 || REGNO (addr) == BP_REG
22968 || REGNO (addr) == R12_REG
22969 || REGNO (addr) == R13_REG))
22970 len = 1;
22971 }
22972
22973 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
22974 is not disp32, but disp32(%rip), so for disp32
22975 SIB byte is needed, unless print_operand_address
22976 optimizes it into disp32(%rip) or (%rip) is implied
22977 by UNSPEC. */
22978 else if (disp && !base && !index)
22979 {
22980 len = 4;
22981 if (TARGET_64BIT)
22982 {
22983 rtx symbol = disp;
22984
22985 if (GET_CODE (disp) == CONST)
22986 symbol = XEXP (disp, 0);
22987 if (GET_CODE (symbol) == PLUS
22988 && CONST_INT_P (XEXP (symbol, 1)))
22989 symbol = XEXP (symbol, 0);
22990
22991 if (GET_CODE (symbol) != LABEL_REF
22992 && (GET_CODE (symbol) != SYMBOL_REF
22993 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
22994 && (GET_CODE (symbol) != UNSPEC
22995 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
22996 && XINT (symbol, 1) != UNSPEC_PCREL
22997 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
22998 len += 1;
22999 }
23000 }
23001
23002 else
23003 {
23004 /* Find the length of the displacement constant. */
23005 if (disp)
23006 {
23007 if (base && satisfies_constraint_K (disp))
23008 len = 1;
23009 else
23010 len = 4;
23011 }
23012 /* ebp always wants a displacement. Similarly r13. */
23013 else if (base && REG_P (base)
23014 && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
23015 len = 1;
23016
23017 /* An index requires the two-byte modrm form.... */
23018 if (index
23019 /* ...like esp (or r12), which always wants an index. */
23020 || base == arg_pointer_rtx
23021 || base == frame_pointer_rtx
23022 || (base && REG_P (base)
23023 && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
23024 len += 1;
23025 }
23026
23027 switch (parts.seg)
23028 {
23029 case SEG_FS:
23030 case SEG_GS:
23031 len += 1;
23032 break;
23033 default:
23034 break;
23035 }
23036
23037 return len;
23038 }
23039
23040 /* Compute the default value for the "length_immediate" attribute. When
23041 SHORTFORM is set, expect that the insn has an 8-bit immediate alternative. */
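/* For instance (an illustrative sketch, not taken from the insn patterns):
   "add $3, %eax" can use the sign-extended 8-bit immediate encoding
   (opcode 0x83), so its immediate contributes 1 byte, while
   "add $1000, %eax" needs a 32-bit immediate encoding and therefore
   contributes 4 bytes. */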
23042 int
23043 ix86_attr_length_immediate_default (rtx insn, bool shortform)
23044 {
23045 int len = 0;
23046 int i;
23047 extract_insn_cached (insn);
23048 for (i = recog_data.n_operands - 1; i >= 0; --i)
23049 if (CONSTANT_P (recog_data.operand[i]))
23050 {
23051 enum attr_mode mode = get_attr_mode (insn);
23052
23053 gcc_assert (!len);
23054 if (shortform && CONST_INT_P (recog_data.operand[i]))
23055 {
23056 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
23057 switch (mode)
23058 {
23059 case MODE_QI:
23060 len = 1;
23061 continue;
23062 case MODE_HI:
23063 ival = trunc_int_for_mode (ival, HImode);
23064 break;
23065 case MODE_SI:
23066 ival = trunc_int_for_mode (ival, SImode);
23067 break;
23068 default:
23069 break;
23070 }
23071 if (IN_RANGE (ival, -128, 127))
23072 {
23073 len = 1;
23074 continue;
23075 }
23076 }
23077 switch (mode)
23078 {
23079 case MODE_QI:
23080 len = 1;
23081 break;
23082 case MODE_HI:
23083 len = 2;
23084 break;
23085 case MODE_SI:
23086 len = 4;
23087 break;
23088 /* Immediates for DImode instructions are encoded as 32bit sign extended values. */
23089 case MODE_DI:
23090 len = 4;
23091 break;
23092 default:
23093 fatal_insn ("unknown insn mode", insn);
23094 }
23095 }
23096 return len;
23097 }
23098 /* Compute default value for "length_address" attribute. */
23099 int
23100 ix86_attr_length_address_default (rtx insn)
23101 {
23102 int i;
23103
23104 if (get_attr_type (insn) == TYPE_LEA)
23105 {
23106 rtx set = PATTERN (insn), addr;
23107
23108 if (GET_CODE (set) == PARALLEL)
23109 set = XVECEXP (set, 0, 0);
23110
23111 gcc_assert (GET_CODE (set) == SET);
23112
23113 addr = SET_SRC (set);
23114 if (TARGET_64BIT && get_attr_mode (insn) == MODE_SI)
23115 {
23116 if (GET_CODE (addr) == ZERO_EXTEND)
23117 addr = XEXP (addr, 0);
23118 if (GET_CODE (addr) == SUBREG)
23119 addr = SUBREG_REG (addr);
23120 }
23121
23122 return memory_address_length (addr);
23123 }
23124
23125 extract_insn_cached (insn);
23126 for (i = recog_data.n_operands - 1; i >= 0; --i)
23127 if (MEM_P (recog_data.operand[i]))
23128 {
23129 constrain_operands_cached (reload_completed);
23130 if (which_alternative != -1)
23131 {
23132 const char *constraints = recog_data.constraints[i];
23133 int alt = which_alternative;
23134
23135 while (*constraints == '=' || *constraints == '+')
23136 constraints++;
23137 while (alt-- > 0)
23138 while (*constraints++ != ',')
23139 ;
23140 /* Skip ignored operands. */
23141 if (*constraints == 'X')
23142 continue;
23143 }
23144 return memory_address_length (XEXP (recog_data.operand[i], 0));
23145 }
23146 return 0;
23147 }
23148
23149 /* Compute default value for "length_vex" attribute. It includes
23150 2 or 3 byte VEX prefix and 1 opcode byte. */
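/* For instance, "vaddps %xmm1, %xmm2, %xmm0" fits the 2-byte VEX prefix
   (0xC5 ..), giving 2 + 1 = 3 bytes here, while anything that needs
   VEX.W, VEX.X or VEX.B (e.g. a DImode general register operand, or an
   extended register used in a memory address) needs the 3-byte prefix
   (0xC4 .. ..), i.e. 3 + 1 = 4 bytes. */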
23151
23152 int
23153 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
23154 {
23155 int i;
23156
23157 /* Only the 0f opcode map can use the 2-byte VEX prefix, and the VEX.W
23158 bit requires the 3-byte VEX prefix. */
23159 if (!has_0f_opcode || has_vex_w)
23160 return 3 + 1;
23161
23162 /* We can always use 2 byte VEX prefix in 32bit. */
23163 if (!TARGET_64BIT)
23164 return 2 + 1;
23165
23166 extract_insn_cached (insn);
23167
23168 for (i = recog_data.n_operands - 1; i >= 0; --i)
23169 if (REG_P (recog_data.operand[i]))
23170 {
23171 /* REX.W bit uses 3 byte VEX prefix. */
23172 if (GET_MODE (recog_data.operand[i]) == DImode
23173 && GENERAL_REG_P (recog_data.operand[i]))
23174 return 3 + 1;
23175 }
23176 else
23177 {
23178 /* REX.X or REX.B bits use 3 byte VEX prefix. */
23179 if (MEM_P (recog_data.operand[i])
23180 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
23181 return 3 + 1;
23182 }
23183
23184 return 2 + 1;
23185 }
23186 \f
23187 /* Return the maximum number of instructions a cpu can issue. */
23188
23189 static int
23190 ix86_issue_rate (void)
23191 {
23192 switch (ix86_tune)
23193 {
23194 case PROCESSOR_PENTIUM:
23195 case PROCESSOR_ATOM:
23196 case PROCESSOR_K6:
23197 return 2;
23198
23199 case PROCESSOR_PENTIUMPRO:
23200 case PROCESSOR_PENTIUM4:
23201 case PROCESSOR_CORE2_32:
23202 case PROCESSOR_CORE2_64:
23203 case PROCESSOR_COREI7_32:
23204 case PROCESSOR_COREI7_64:
23205 case PROCESSOR_ATHLON:
23206 case PROCESSOR_K8:
23207 case PROCESSOR_AMDFAM10:
23208 case PROCESSOR_NOCONA:
23209 case PROCESSOR_GENERIC32:
23210 case PROCESSOR_GENERIC64:
23211 case PROCESSOR_BDVER1:
23212 case PROCESSOR_BDVER2:
23213 case PROCESSOR_BTVER1:
23214 return 3;
23215
23216 default:
23217 return 1;
23218 }
23219 }
23220
23221 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads the flags
23222 set by DEP_INSN and no other register set by DEP_INSN. */
23223
23224 static bool
23225 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
23226 {
23227 rtx set, set2;
23228
23229 /* Simplify the test for uninteresting insns. */
23230 if (insn_type != TYPE_SETCC
23231 && insn_type != TYPE_ICMOV
23232 && insn_type != TYPE_FCMOV
23233 && insn_type != TYPE_IBR)
23234 return false;
23235
23236 if ((set = single_set (dep_insn)) != 0)
23237 {
23238 set = SET_DEST (set);
23239 set2 = NULL_RTX;
23240 }
23241 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
23242 && XVECLEN (PATTERN (dep_insn), 0) == 2
23243 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
23244 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
23245 {
23246 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
23247 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
23248 }
23249 else
23250 return false;
23251
23252 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
23253 return false;
23254
23255 /* This test is true if the dependent insn reads the flags but
23256 not any other potentially set register. */
23257 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
23258 return false;
23259
23260 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
23261 return false;
23262
23263 return true;
23264 }
23265
23266 /* Return true iff USE_INSN has a memory address with operands set by
23267 SET_INSN. */
23268
23269 bool
23270 ix86_agi_dependent (rtx set_insn, rtx use_insn)
23271 {
23272 int i;
23273 extract_insn_cached (use_insn);
23274 for (i = recog_data.n_operands - 1; i >= 0; --i)
23275 if (MEM_P (recog_data.operand[i]))
23276 {
23277 rtx addr = XEXP (recog_data.operand[i], 0);
23278 return modified_in_p (addr, set_insn) != 0;
23279 }
23280 return false;
23281 }
23282
23283 static int
23284 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
23285 {
23286 enum attr_type insn_type, dep_insn_type;
23287 enum attr_memory memory;
23288 rtx set, set2;
23289 int dep_insn_code_number;
23290
23291 /* Anti and output dependencies have zero cost on all CPUs. */
23292 if (REG_NOTE_KIND (link) != 0)
23293 return 0;
23294
23295 dep_insn_code_number = recog_memoized (dep_insn);
23296
23297 /* If we can't recognize the insns, we can't really do anything. */
23298 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
23299 return cost;
23300
23301 insn_type = get_attr_type (insn);
23302 dep_insn_type = get_attr_type (dep_insn);
23303
23304 switch (ix86_tune)
23305 {
23306 case PROCESSOR_PENTIUM:
23307 /* Address Generation Interlock adds a cycle of latency. */
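      /* Illustrative example (not taken from the insn stream): on Pentium,
           mov %eax, %ebx
           mov (%ebx), %ecx
         pays an extra cycle because the load's address register is written
         by the immediately preceding instruction. */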
23308 if (insn_type == TYPE_LEA)
23309 {
23310 rtx addr = PATTERN (insn);
23311
23312 if (GET_CODE (addr) == PARALLEL)
23313 addr = XVECEXP (addr, 0, 0);
23314
23315 gcc_assert (GET_CODE (addr) == SET);
23316
23317 addr = SET_SRC (addr);
23318 if (modified_in_p (addr, dep_insn))
23319 cost += 1;
23320 }
23321 else if (ix86_agi_dependent (dep_insn, insn))
23322 cost += 1;
23323
23324 /* ??? Compares pair with jump/setcc. */
23325 if (ix86_flags_dependent (insn, dep_insn, insn_type))
23326 cost = 0;
23327
23328 /* Floating point stores require value to be ready one cycle earlier. */
23329 if (insn_type == TYPE_FMOV
23330 && get_attr_memory (insn) == MEMORY_STORE
23331 && !ix86_agi_dependent (dep_insn, insn))
23332 cost += 1;
23333 break;
23334
23335 case PROCESSOR_PENTIUMPRO:
23336 memory = get_attr_memory (insn);
23337
23338 /* INT->FP conversion is expensive. */
23339 if (get_attr_fp_int_src (dep_insn))
23340 cost += 5;
23341
23342 /* There is one cycle extra latency between an FP op and a store. */
23343 if (insn_type == TYPE_FMOV
23344 && (set = single_set (dep_insn)) != NULL_RTX
23345 && (set2 = single_set (insn)) != NULL_RTX
23346 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
23347 && MEM_P (SET_DEST (set2)))
23348 cost += 1;
23349
23350 /* Account for the ability of the reorder buffer to hide the latency of a
23351 load by executing it in parallel with the previous instruction, when the
23352 previous instruction is not needed to compute the address. */
23353 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23354 && !ix86_agi_dependent (dep_insn, insn))
23355 {
23356 /* Claim moves take one cycle, as the core can issue one load
23357 at a time and the next load can start a cycle later. */
23358 if (dep_insn_type == TYPE_IMOV
23359 || dep_insn_type == TYPE_FMOV)
23360 cost = 1;
23361 else if (cost > 1)
23362 cost--;
23363 }
23364 break;
23365
23366 case PROCESSOR_K6:
23367 memory = get_attr_memory (insn);
23368
23369 /* The esp dependency is resolved before the instruction is really
23370 finished. */
23371 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
23372 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
23373 return 1;
23374
23375 /* INT->FP conversion is expensive. */
23376 if (get_attr_fp_int_src (dep_insn))
23377 cost += 5;
23378
23379 /* Account for the ability of the reorder buffer to hide the latency of a
23380 load by executing it in parallel with the previous instruction, when the
23381 previous instruction is not needed to compute the address. */
23382 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23383 && !ix86_agi_dependent (dep_insn, insn))
23384 {
23385 /* Claim moves take one cycle, as the core can issue one load
23386 at a time and the next load can start a cycle later. */
23387 if (dep_insn_type == TYPE_IMOV
23388 || dep_insn_type == TYPE_FMOV)
23389 cost = 1;
23390 else if (cost > 2)
23391 cost -= 2;
23392 else
23393 cost = 1;
23394 }
23395 break;
23396
23397 case PROCESSOR_ATHLON:
23398 case PROCESSOR_K8:
23399 case PROCESSOR_AMDFAM10:
23400 case PROCESSOR_BDVER1:
23401 case PROCESSOR_BDVER2:
23402 case PROCESSOR_BTVER1:
23403 case PROCESSOR_ATOM:
23404 case PROCESSOR_GENERIC32:
23405 case PROCESSOR_GENERIC64:
23406 memory = get_attr_memory (insn);
23407
23408 /* Account for the ability of the reorder buffer to hide the latency of a
23409 load by executing it in parallel with the previous instruction, when the
23410 previous instruction is not needed to compute the address. */
23411 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23412 && !ix86_agi_dependent (dep_insn, insn))
23413 {
23414 enum attr_unit unit = get_attr_unit (insn);
23415 int loadcost = 3;
23416
23417 /* Because of the difference between the lengths of the integer and
23418 floating point unit pipeline preparation stages, the memory operands
23419 for floating point are cheaper.
23420
23421 ??? For Athlon the difference is most probably 2. */
23422 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
23423 loadcost = 3;
23424 else
23425 loadcost = TARGET_ATHLON ? 2 : 0;
23426
23427 if (cost >= loadcost)
23428 cost -= loadcost;
23429 else
23430 cost = 0;
23431 }
23432
23433 default:
23434 break;
23435 }
23436
23437 return cost;
23438 }
23439
23440 /* How many alternative schedules to try. This should be as wide as the
23441 scheduling freedom in the DFA, but no wider. Making this value too
23442 large results in extra work for the scheduler. */
23443
23444 static int
23445 ia32_multipass_dfa_lookahead (void)
23446 {
23447 switch (ix86_tune)
23448 {
23449 case PROCESSOR_PENTIUM:
23450 return 2;
23451
23452 case PROCESSOR_PENTIUMPRO:
23453 case PROCESSOR_K6:
23454 return 1;
23455
23456 case PROCESSOR_CORE2_32:
23457 case PROCESSOR_CORE2_64:
23458 case PROCESSOR_COREI7_32:
23459 case PROCESSOR_COREI7_64:
23460 /* Generally, we want haifa-sched:max_issue() to look ahead as far
23461 as the number of instructions that can be executed in one cycle, i.e.,
23462 issue_rate. I wonder why tuning for many CPUs does not do this. */
23463 return ix86_issue_rate ();
23464
23465 default:
23466 return 0;
23467 }
23468 }
23469
23470 \f
23471
23472 /* Model decoder of Core 2/i7.
23473 The hooks below for multipass scheduling (see haifa-sched.c:max_issue)
23474 track the instruction fetch block boundaries and make sure that long
23475 (9+ bytes) instructions are assigned to D0. */
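/* Background assumed by the parameters set below: these CPUs fetch a
   16-byte block per cycle and have one complex decoder (D0) alongside
   the simple decoders; only D0 can handle long instructions, which is
   what the length tracking below models. */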
23476
23477 /* Maximum length of an insn that can be handled by
23478 a secondary decoder unit. '8' for Core 2/i7. */
23479 static int core2i7_secondary_decoder_max_insn_size;
23480
23481 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
23482 '16' for Core 2/i7. */
23483 static int core2i7_ifetch_block_size;
23484
23485 /* Maximum number of instructions decoder can handle per cycle.
23486 '6' for Core 2/i7. */
23487 static int core2i7_ifetch_block_max_insns;
23488
23489 typedef struct ix86_first_cycle_multipass_data_ *
23490 ix86_first_cycle_multipass_data_t;
23491 typedef const struct ix86_first_cycle_multipass_data_ *
23492 const_ix86_first_cycle_multipass_data_t;
23493
23494 /* A variable to store target state across calls to max_issue within
23495 one cycle. */
23496 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
23497 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
23498
23499 /* Initialize DATA. */
23500 static void
23501 core2i7_first_cycle_multipass_init (void *_data)
23502 {
23503 ix86_first_cycle_multipass_data_t data
23504 = (ix86_first_cycle_multipass_data_t) _data;
23505
23506 data->ifetch_block_len = 0;
23507 data->ifetch_block_n_insns = 0;
23508 data->ready_try_change = NULL;
23509 data->ready_try_change_size = 0;
23510 }
23511
23512 /* Advancing the cycle; reset ifetch block counts. */
23513 static void
23514 core2i7_dfa_post_advance_cycle (void)
23515 {
23516 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
23517
23518 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
23519
23520 data->ifetch_block_len = 0;
23521 data->ifetch_block_n_insns = 0;
23522 }
23523
23524 static int min_insn_size (rtx);
23525
23526 /* Filter out insns from ready_try that the core will not be able to issue
23527 on current cycle due to decoder. */
23528 static void
23529 core2i7_first_cycle_multipass_filter_ready_try
23530 (const_ix86_first_cycle_multipass_data_t data,
23531 char *ready_try, int n_ready, bool first_cycle_insn_p)
23532 {
23533 while (n_ready--)
23534 {
23535 rtx insn;
23536 int insn_size;
23537
23538 if (ready_try[n_ready])
23539 continue;
23540
23541 insn = get_ready_element (n_ready);
23542 insn_size = min_insn_size (insn);
23543
23544 if (/* If this insn is too long for a secondary decoder ... */
23545 (!first_cycle_insn_p
23546 && insn_size > core2i7_secondary_decoder_max_insn_size)
23547 /* ... or it would not fit into the ifetch block ... */
23548 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
23549 /* ... or the decoder is full already ... */
23550 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
23551 /* ... mask the insn out. */
23552 {
23553 ready_try[n_ready] = 1;
23554
23555 if (data->ready_try_change)
23556 SET_BIT (data->ready_try_change, n_ready);
23557 }
23558 }
23559 }
23560
23561 /* Prepare for a new round of multipass lookahead scheduling. */
23562 static void
23563 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
23564 bool first_cycle_insn_p)
23565 {
23566 ix86_first_cycle_multipass_data_t data
23567 = (ix86_first_cycle_multipass_data_t) _data;
23568 const_ix86_first_cycle_multipass_data_t prev_data
23569 = ix86_first_cycle_multipass_data;
23570
23571 /* Restore the state from the end of the previous round. */
23572 data->ifetch_block_len = prev_data->ifetch_block_len;
23573 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
23574
23575 /* Filter instructions that cannot be issued on current cycle due to
23576 decoder restrictions. */
23577 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
23578 first_cycle_insn_p);
23579 }
23580
23581 /* INSN is being issued in current solution. Account for its impact on
23582 the decoder model. */
23583 static void
23584 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
23585 rtx insn, const void *_prev_data)
23586 {
23587 ix86_first_cycle_multipass_data_t data
23588 = (ix86_first_cycle_multipass_data_t) _data;
23589 const_ix86_first_cycle_multipass_data_t prev_data
23590 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
23591
23592 int insn_size = min_insn_size (insn);
23593
23594 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
23595 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
23596 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
23597 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
23598
23599 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
23600 if (!data->ready_try_change)
23601 {
23602 data->ready_try_change = sbitmap_alloc (n_ready);
23603 data->ready_try_change_size = n_ready;
23604 }
23605 else if (data->ready_try_change_size < n_ready)
23606 {
23607 data->ready_try_change = sbitmap_resize (data->ready_try_change,
23608 n_ready, 0);
23609 data->ready_try_change_size = n_ready;
23610 }
23611 sbitmap_zero (data->ready_try_change);
23612
23613 /* Filter out insns from ready_try that the core will not be able to issue
23614      on the current cycle due to decoder restrictions.  */
23615 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
23616 false);
23617 }
23618
23619 /* Revert the effect on ready_try. */
23620 static void
23621 core2i7_first_cycle_multipass_backtrack (const void *_data,
23622 char *ready_try,
23623 int n_ready ATTRIBUTE_UNUSED)
23624 {
23625 const_ix86_first_cycle_multipass_data_t data
23626 = (const_ix86_first_cycle_multipass_data_t) _data;
23627 unsigned int i = 0;
23628 sbitmap_iterator sbi;
23629
23630 gcc_assert (sbitmap_last_set_bit (data->ready_try_change) < n_ready);
23631 EXECUTE_IF_SET_IN_SBITMAP (data->ready_try_change, 0, i, sbi)
23632 {
23633 ready_try[i] = 0;
23634 }
23635 }
23636
23637 /* Save the result of multipass lookahead scheduling for the next round. */
23638 static void
23639 core2i7_first_cycle_multipass_end (const void *_data)
23640 {
23641 const_ix86_first_cycle_multipass_data_t data
23642 = (const_ix86_first_cycle_multipass_data_t) _data;
23643 ix86_first_cycle_multipass_data_t next_data
23644 = ix86_first_cycle_multipass_data;
23645
23646 if (data != NULL)
23647 {
23648 next_data->ifetch_block_len = data->ifetch_block_len;
23649 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
23650 }
23651 }
23652
23653 /* Deallocate target data. */
23654 static void
23655 core2i7_first_cycle_multipass_fini (void *_data)
23656 {
23657 ix86_first_cycle_multipass_data_t data
23658 = (ix86_first_cycle_multipass_data_t) _data;
23659
23660 if (data->ready_try_change)
23661 {
23662 sbitmap_free (data->ready_try_change);
23663 data->ready_try_change = NULL;
23664 data->ready_try_change_size = 0;
23665 }
23666 }
23667
23668 /* Prepare for scheduling pass. */
23669 static void
23670 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
23671 int verbose ATTRIBUTE_UNUSED,
23672 int max_uid ATTRIBUTE_UNUSED)
23673 {
23674   /* Install scheduling hooks for the current CPU.  Some of these hooks are used
23675 in time-critical parts of the scheduler, so we only set them up when
23676 they are actually used. */
23677 switch (ix86_tune)
23678 {
23679 case PROCESSOR_CORE2_32:
23680 case PROCESSOR_CORE2_64:
23681 case PROCESSOR_COREI7_32:
23682 case PROCESSOR_COREI7_64:
23683 targetm.sched.dfa_post_advance_cycle
23684 = core2i7_dfa_post_advance_cycle;
23685 targetm.sched.first_cycle_multipass_init
23686 = core2i7_first_cycle_multipass_init;
23687 targetm.sched.first_cycle_multipass_begin
23688 = core2i7_first_cycle_multipass_begin;
23689 targetm.sched.first_cycle_multipass_issue
23690 = core2i7_first_cycle_multipass_issue;
23691 targetm.sched.first_cycle_multipass_backtrack
23692 = core2i7_first_cycle_multipass_backtrack;
23693 targetm.sched.first_cycle_multipass_end
23694 = core2i7_first_cycle_multipass_end;
23695 targetm.sched.first_cycle_multipass_fini
23696 = core2i7_first_cycle_multipass_fini;
23697
23698 /* Set decoder parameters. */
23699 core2i7_secondary_decoder_max_insn_size = 8;
23700 core2i7_ifetch_block_size = 16;
23701 core2i7_ifetch_block_max_insns = 6;
23702 break;
23703
23704 default:
23705 targetm.sched.dfa_post_advance_cycle = NULL;
23706 targetm.sched.first_cycle_multipass_init = NULL;
23707 targetm.sched.first_cycle_multipass_begin = NULL;
23708 targetm.sched.first_cycle_multipass_issue = NULL;
23709 targetm.sched.first_cycle_multipass_backtrack = NULL;
23710 targetm.sched.first_cycle_multipass_end = NULL;
23711 targetm.sched.first_cycle_multipass_fini = NULL;
23712 break;
23713 }
23714 }
23715
23716 \f
23717 /* Compute the alignment given to a constant that is being placed in memory.
23718 EXP is the constant and ALIGN is the alignment that the object would
23719 ordinarily have.
23720 The value of this function is used instead of that alignment to align
23721 the object. */
23722
23723 int
23724 ix86_constant_alignment (tree exp, int align)
23725 {
23726 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
23727 || TREE_CODE (exp) == INTEGER_CST)
23728 {
23729 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
23730 return 64;
23731 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
23732 return 128;
23733 }
23734 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
23735 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
23736 return BITS_PER_WORD;
23737
23738 return align;
23739 }
23740
23741 /* Compute the alignment for a static variable.
23742 TYPE is the data type, and ALIGN is the alignment that
23743 the object would ordinarily have. The value of this function is used
23744 instead of that alignment to align the object. */
23745
23746 int
23747 ix86_data_alignment (tree type, int align)
23748 {
23749 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
23750
23751 if (AGGREGATE_TYPE_P (type)
23752 && TYPE_SIZE (type)
23753 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
23754 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
23755 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
23756 && align < max_align)
23757 align = max_align;
23758
23759   /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
23760      to a 16-byte boundary.  */
23761 if (TARGET_64BIT)
23762 {
23763 if (AGGREGATE_TYPE_P (type)
23764 && TYPE_SIZE (type)
23765 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
23766 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
23767 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
23768 return 128;
23769 }
23770
23771 if (TREE_CODE (type) == ARRAY_TYPE)
23772 {
23773 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
23774 return 64;
23775 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
23776 return 128;
23777 }
23778 else if (TREE_CODE (type) == COMPLEX_TYPE)
23779 {
23780
23781 if (TYPE_MODE (type) == DCmode && align < 64)
23782 return 64;
23783 if ((TYPE_MODE (type) == XCmode
23784 || TYPE_MODE (type) == TCmode) && align < 128)
23785 return 128;
23786 }
23787 else if ((TREE_CODE (type) == RECORD_TYPE
23788 || TREE_CODE (type) == UNION_TYPE
23789 || TREE_CODE (type) == QUAL_UNION_TYPE)
23790 && TYPE_FIELDS (type))
23791 {
23792 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
23793 return 64;
23794 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
23795 return 128;
23796 }
23797 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
23798 || TREE_CODE (type) == INTEGER_TYPE)
23799 {
23800 if (TYPE_MODE (type) == DFmode && align < 64)
23801 return 64;
23802 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
23803 return 128;
23804 }
23805
23806 return align;
23807 }
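/* A sketch of the effect of the rules above (illustrative, assuming
   MAX_OFILE_ALIGNMENT allows at least 256-bit alignment): a static
   "double d[8]" (512 bits) compiled for speed has its alignment raised to
   max_align (256 bits); a plain "double" is aligned to at least 64 bits;
   and on x86-64 any aggregate of 128 bits or more is given at least
   128-bit alignment.  */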
23808
23809 /* Compute the alignment for a local variable or a stack slot. EXP is
23810 the data type or decl itself, MODE is the widest mode available and
23811 ALIGN is the alignment that the object would ordinarily have. The
23812 value of this macro is used instead of that alignment to align the
23813 object. */
23814
23815 unsigned int
23816 ix86_local_alignment (tree exp, enum machine_mode mode,
23817 unsigned int align)
23818 {
23819 tree type, decl;
23820
23821 if (exp && DECL_P (exp))
23822 {
23823 type = TREE_TYPE (exp);
23824 decl = exp;
23825 }
23826 else
23827 {
23828 type = exp;
23829 decl = NULL;
23830 }
23831
23832 /* Don't do dynamic stack realignment for long long objects with
23833 -mpreferred-stack-boundary=2. */
23834 if (!TARGET_64BIT
23835 && align == 64
23836 && ix86_preferred_stack_boundary < 64
23837 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
23838 && (!type || !TYPE_USER_ALIGN (type))
23839 && (!decl || !DECL_USER_ALIGN (decl)))
23840 align = 32;
23841
23842   /* If TYPE is NULL, we are allocating a stack slot for a caller-save
23843 register in MODE. We will return the largest alignment of XF
23844 and DF. */
23845 if (!type)
23846 {
23847 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
23848 align = GET_MODE_ALIGNMENT (DFmode);
23849 return align;
23850 }
23851
23852   /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
23853      to a 16-byte boundary.  The exact wording is:
23854
23855      An array uses the same alignment as its elements, except that a local or
23856      global array variable of length at least 16 bytes or
23857      a C99 variable-length array variable always has alignment of at least 16 bytes.
23858
23859      This was added to allow use of aligned SSE instructions on arrays.  The
23860      rule is meant for static storage (where the compiler cannot do the analysis
23861      by itself).  We follow it for automatic variables only when convenient:
23862      we fully control everything in the function being compiled, and functions
23863      from other units cannot rely on the alignment.
23864
23865      Exclude the va_list type.  It is the common case of a local array where
23866      we cannot benefit from the alignment.  */
23867 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
23868 && TARGET_SSE)
23869 {
23870 if (AGGREGATE_TYPE_P (type)
23871 && (va_list_type_node == NULL_TREE
23872 || (TYPE_MAIN_VARIANT (type)
23873 != TYPE_MAIN_VARIANT (va_list_type_node)))
23874 && TYPE_SIZE (type)
23875 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
23876 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
23877 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
23878 return 128;
23879 }
23880 if (TREE_CODE (type) == ARRAY_TYPE)
23881 {
23882 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
23883 return 64;
23884 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
23885 return 128;
23886 }
23887 else if (TREE_CODE (type) == COMPLEX_TYPE)
23888 {
23889 if (TYPE_MODE (type) == DCmode && align < 64)
23890 return 64;
23891 if ((TYPE_MODE (type) == XCmode
23892 || TYPE_MODE (type) == TCmode) && align < 128)
23893 return 128;
23894 }
23895 else if ((TREE_CODE (type) == RECORD_TYPE
23896 || TREE_CODE (type) == UNION_TYPE
23897 || TREE_CODE (type) == QUAL_UNION_TYPE)
23898 && TYPE_FIELDS (type))
23899 {
23900 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
23901 return 64;
23902 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
23903 return 128;
23904 }
23905 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
23906 || TREE_CODE (type) == INTEGER_TYPE)
23907 {
23908
23909 if (TYPE_MODE (type) == DFmode && align < 64)
23910 return 64;
23911 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
23912 return 128;
23913 }
23914 return align;
23915 }
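/* A small example of the automatic-variable rule above (illustrative only):
   in a function optimized for speed on x86-64 with SSE enabled, a local
   "char buf[32]" is given 128-bit alignment so that aligned SSE accesses
   can be used on it, while a local va_list, or the same array in a function
   optimized for size, keeps its ordinary alignment.  */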
23916
23917 /* Compute the minimum required alignment for dynamic stack realignment
23918 purposes for a local variable, parameter or a stack slot. EXP is
23919 the data type or decl itself, MODE is its mode and ALIGN is the
23920 alignment that the object would ordinarily have. */
23921
23922 unsigned int
23923 ix86_minimum_alignment (tree exp, enum machine_mode mode,
23924 unsigned int align)
23925 {
23926 tree type, decl;
23927
23928 if (exp && DECL_P (exp))
23929 {
23930 type = TREE_TYPE (exp);
23931 decl = exp;
23932 }
23933 else
23934 {
23935 type = exp;
23936 decl = NULL;
23937 }
23938
23939 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
23940 return align;
23941
23942 /* Don't do dynamic stack realignment for long long objects with
23943 -mpreferred-stack-boundary=2. */
23944 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
23945 && (!type || !TYPE_USER_ALIGN (type))
23946 && (!decl || !DECL_USER_ALIGN (decl)))
23947 return 32;
23948
23949 return align;
23950 }
23951 \f
23952 /* Find a location for the static chain incoming to a nested function.
23953 This is a register, unless all free registers are used by arguments. */
23954
23955 static rtx
23956 ix86_static_chain (const_tree fndecl, bool incoming_p)
23957 {
23958 unsigned regno;
23959
23960 if (!DECL_STATIC_CHAIN (fndecl))
23961 return NULL;
23962
23963 if (TARGET_64BIT)
23964 {
23965 /* We always use R10 in 64-bit mode. */
23966 regno = R10_REG;
23967 }
23968 else
23969 {
23970 tree fntype;
23971 unsigned int ccvt;
23972
23973 /* By default in 32-bit mode we use ECX to pass the static chain. */
23974 regno = CX_REG;
23975
23976 fntype = TREE_TYPE (fndecl);
23977 ccvt = ix86_get_callcvt (fntype);
23978 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
23979 {
23980 /* Fastcall functions use ecx/edx for arguments, which leaves
23981 us with EAX for the static chain.
23982 Thiscall functions use ecx for arguments, which also
23983 leaves us with EAX for the static chain. */
23984 regno = AX_REG;
23985 }
23986 else if (ix86_function_regparm (fntype, fndecl) == 3)
23987 {
23988 /* For regparm 3, we have no free call-clobbered registers in
23989 which to store the static chain. In order to implement this,
23990 we have the trampoline push the static chain to the stack.
23991 However, we can't push a value below the return address when
23992 we call the nested function directly, so we have to use an
23993 alternate entry point. For this we use ESI, and have the
23994 alternate entry point push ESI, so that things appear the
23995 same once we're executing the nested function. */
23996 if (incoming_p)
23997 {
23998 if (fndecl == current_function_decl)
23999 ix86_static_chain_on_stack = true;
24000 return gen_frame_mem (SImode,
24001 plus_constant (arg_pointer_rtx, -8));
24002 }
24003 regno = SI_REG;
24004 }
24005 }
24006
24007 return gen_rtx_REG (Pmode, regno);
24008 }
24009
24010 /* Emit RTL insns to initialize the variable parts of a trampoline.
24011 FNDECL is the decl of the target address; M_TRAMP is a MEM for
24012 the trampoline, and CHAIN_VALUE is an RTX for the static chain
24013 to be passed to the target function. */
24014
24015 static void
24016 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
24017 {
24018 rtx mem, fnaddr;
24019 int opcode;
24020 int offset = 0;
24021
24022 fnaddr = XEXP (DECL_RTL (fndecl), 0);
24023
24024 if (TARGET_64BIT)
24025 {
24026 int size;
24027
24028       /* Load the function address into r11.  Try to load the address
24029 	 using the shorter movl instead of movabs.  We may want to support
24030 	 movq for kernel mode, but the kernel does not use trampolines at
24031 	 the moment.  */
24032 if (x86_64_zext_immediate_operand (fnaddr, VOIDmode))
24033 {
24034 fnaddr = copy_to_mode_reg (DImode, fnaddr);
24035
24036 mem = adjust_address (m_tramp, HImode, offset);
24037 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
24038
24039 mem = adjust_address (m_tramp, SImode, offset + 2);
24040 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
24041 offset += 6;
24042 }
24043 else
24044 {
24045 mem = adjust_address (m_tramp, HImode, offset);
24046 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
24047
24048 mem = adjust_address (m_tramp, DImode, offset + 2);
24049 emit_move_insn (mem, fnaddr);
24050 offset += 10;
24051 }
24052
24053       /* Load the static chain into r10 using movabs.  Use the
24054 	 shorter movl instead of movabs for x32.  */
24055 if (TARGET_X32)
24056 {
24057 opcode = 0xba41;
24058 size = 6;
24059 }
24060 else
24061 {
24062 opcode = 0xba49;
24063 size = 10;
24064 }
24065
24066 mem = adjust_address (m_tramp, HImode, offset);
24067 emit_move_insn (mem, gen_int_mode (opcode, HImode));
24068
24069 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
24070 emit_move_insn (mem, chain_value);
24071 offset += size;
24072
24073 /* Jump to r11; the last (unused) byte is a nop, only there to
24074 pad the write out to a single 32-bit store. */
24075 mem = adjust_address (m_tramp, SImode, offset);
24076 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
24077 offset += 4;
24078 }
24079 else
24080 {
24081 rtx disp, chain;
24082
24083 /* Depending on the static chain location, either load a register
24084 with a constant, or push the constant to the stack. All of the
24085 instructions are the same size. */
24086 chain = ix86_static_chain (fndecl, true);
24087 if (REG_P (chain))
24088 {
24089 switch (REGNO (chain))
24090 {
24091 case AX_REG:
24092 opcode = 0xb8; break;
24093 case CX_REG:
24094 opcode = 0xb9; break;
24095 default:
24096 gcc_unreachable ();
24097 }
24098 }
24099 else
24100 opcode = 0x68;
24101
24102 mem = adjust_address (m_tramp, QImode, offset);
24103 emit_move_insn (mem, gen_int_mode (opcode, QImode));
24104
24105 mem = adjust_address (m_tramp, SImode, offset + 1);
24106 emit_move_insn (mem, chain_value);
24107 offset += 5;
24108
24109 mem = adjust_address (m_tramp, QImode, offset);
24110 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
24111
24112 mem = adjust_address (m_tramp, SImode, offset + 1);
24113
24114       /* Compute the offset from the end of the jmp to the target function.
24115 	 When the trampoline stores the static chain on the stack, we need
24116 	 to skip the first insn, which pushes the (call-saved) register
24117 	 static chain; this push is 1 byte.  */
24118 offset += 5;
24119 disp = expand_binop (SImode, sub_optab, fnaddr,
24120 plus_constant (XEXP (m_tramp, 0),
24121 offset - (MEM_P (chain) ? 1 : 0)),
24122 NULL_RTX, 1, OPTAB_DIRECT);
24123 emit_move_insn (mem, disp);
24124 }
24125
24126 gcc_assert (offset <= TRAMPOLINE_SIZE);
24127
24128 #ifdef HAVE_ENABLE_EXECUTE_STACK
24129 #ifdef CHECK_EXECUTE_STACK_ENABLED
24130 if (CHECK_EXECUTE_STACK_ENABLED)
24131 #endif
24132 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
24133 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
24134 #endif
24135 }
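/* For reference, the 64-bit trampoline emitted above decodes roughly as
   follows (illustrative disassembly of the large-address case):

	49 bb <8-byte fnaddr>	movabs $fnaddr, %r11
	49 ba <8-byte chain>	movabs $chain, %r10
	49 ff e3		rex.W jmp *%r11
	90			nop	(pads the write to a full 32-bit store)

   The shorter 41 bb / 41 ba movl forms are used instead when the function
   address fits in 32 bits or for the x32 static chain, as selected above.  */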
24136 \f
24137 /* The following file contains several enumerations and data structures
24138 built from the definitions in i386-builtin-types.def. */
24139
24140 #include "i386-builtin-types.inc"
24141
24142 /* Table for the ix86 builtin non-function types. */
24143 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
24144
24145 /* Retrieve an element from the above table, building some of
24146 the types lazily. */
24147
24148 static tree
24149 ix86_get_builtin_type (enum ix86_builtin_type tcode)
24150 {
24151 unsigned int index;
24152 tree type, itype;
24153
24154 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
24155
24156 type = ix86_builtin_type_tab[(int) tcode];
24157 if (type != NULL)
24158 return type;
24159
24160 gcc_assert (tcode > IX86_BT_LAST_PRIM);
24161 if (tcode <= IX86_BT_LAST_VECT)
24162 {
24163 enum machine_mode mode;
24164
24165 index = tcode - IX86_BT_LAST_PRIM - 1;
24166 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
24167 mode = ix86_builtin_type_vect_mode[index];
24168
24169 type = build_vector_type_for_mode (itype, mode);
24170 }
24171 else
24172 {
24173 int quals;
24174
24175 index = tcode - IX86_BT_LAST_VECT - 1;
24176 if (tcode <= IX86_BT_LAST_PTR)
24177 quals = TYPE_UNQUALIFIED;
24178 else
24179 quals = TYPE_QUAL_CONST;
24180
24181 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
24182 if (quals != TYPE_UNQUALIFIED)
24183 itype = build_qualified_type (itype, quals);
24184
24185 type = build_pointer_type (itype);
24186 }
24187
24188 ix86_builtin_type_tab[(int) tcode] = type;
24189 return type;
24190 }
24191
24192 /* Table for the ix86 builtin function types. */
24193 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
24194
24195 /* Retrieve an element from the above table, building some of
24196 the types lazily. */
24197
24198 static tree
24199 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
24200 {
24201 tree type;
24202
24203 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
24204
24205 type = ix86_builtin_func_type_tab[(int) tcode];
24206 if (type != NULL)
24207 return type;
24208
24209 if (tcode <= IX86_BT_LAST_FUNC)
24210 {
24211 unsigned start = ix86_builtin_func_start[(int) tcode];
24212 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
24213 tree rtype, atype, args = void_list_node;
24214 unsigned i;
24215
24216 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
24217 for (i = after - 1; i > start; --i)
24218 {
24219 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
24220 args = tree_cons (NULL, atype, args);
24221 }
24222
24223 type = build_function_type (rtype, args);
24224 }
24225 else
24226 {
24227 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
24228 enum ix86_builtin_func_type icode;
24229
24230 icode = ix86_builtin_func_alias_base[index];
24231 type = ix86_get_builtin_func_type (icode);
24232 }
24233
24234 ix86_builtin_func_type_tab[(int) tcode] = type;
24235 return type;
24236 }
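/* A sketch of the table layout used above (hypothetical entries): if the
   slice of ix86_builtin_func_args selected by ix86_builtin_func_start for
   TCODE is { V4SF, V4SF, V4SF }, the first slot is the return type and the
   loop builds the argument list back to front, yielding the function type
   "V4SF (V4SF, V4SF)".  */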
24237
24238
24239 /* Codes for all the SSE/MMX builtins. */
24240 enum ix86_builtins
24241 {
24242 IX86_BUILTIN_ADDPS,
24243 IX86_BUILTIN_ADDSS,
24244 IX86_BUILTIN_DIVPS,
24245 IX86_BUILTIN_DIVSS,
24246 IX86_BUILTIN_MULPS,
24247 IX86_BUILTIN_MULSS,
24248 IX86_BUILTIN_SUBPS,
24249 IX86_BUILTIN_SUBSS,
24250
24251 IX86_BUILTIN_CMPEQPS,
24252 IX86_BUILTIN_CMPLTPS,
24253 IX86_BUILTIN_CMPLEPS,
24254 IX86_BUILTIN_CMPGTPS,
24255 IX86_BUILTIN_CMPGEPS,
24256 IX86_BUILTIN_CMPNEQPS,
24257 IX86_BUILTIN_CMPNLTPS,
24258 IX86_BUILTIN_CMPNLEPS,
24259 IX86_BUILTIN_CMPNGTPS,
24260 IX86_BUILTIN_CMPNGEPS,
24261 IX86_BUILTIN_CMPORDPS,
24262 IX86_BUILTIN_CMPUNORDPS,
24263 IX86_BUILTIN_CMPEQSS,
24264 IX86_BUILTIN_CMPLTSS,
24265 IX86_BUILTIN_CMPLESS,
24266 IX86_BUILTIN_CMPNEQSS,
24267 IX86_BUILTIN_CMPNLTSS,
24268 IX86_BUILTIN_CMPNLESS,
24269 IX86_BUILTIN_CMPNGTSS,
24270 IX86_BUILTIN_CMPNGESS,
24271 IX86_BUILTIN_CMPORDSS,
24272 IX86_BUILTIN_CMPUNORDSS,
24273
24274 IX86_BUILTIN_COMIEQSS,
24275 IX86_BUILTIN_COMILTSS,
24276 IX86_BUILTIN_COMILESS,
24277 IX86_BUILTIN_COMIGTSS,
24278 IX86_BUILTIN_COMIGESS,
24279 IX86_BUILTIN_COMINEQSS,
24280 IX86_BUILTIN_UCOMIEQSS,
24281 IX86_BUILTIN_UCOMILTSS,
24282 IX86_BUILTIN_UCOMILESS,
24283 IX86_BUILTIN_UCOMIGTSS,
24284 IX86_BUILTIN_UCOMIGESS,
24285 IX86_BUILTIN_UCOMINEQSS,
24286
24287 IX86_BUILTIN_CVTPI2PS,
24288 IX86_BUILTIN_CVTPS2PI,
24289 IX86_BUILTIN_CVTSI2SS,
24290 IX86_BUILTIN_CVTSI642SS,
24291 IX86_BUILTIN_CVTSS2SI,
24292 IX86_BUILTIN_CVTSS2SI64,
24293 IX86_BUILTIN_CVTTPS2PI,
24294 IX86_BUILTIN_CVTTSS2SI,
24295 IX86_BUILTIN_CVTTSS2SI64,
24296
24297 IX86_BUILTIN_MAXPS,
24298 IX86_BUILTIN_MAXSS,
24299 IX86_BUILTIN_MINPS,
24300 IX86_BUILTIN_MINSS,
24301
24302 IX86_BUILTIN_LOADUPS,
24303 IX86_BUILTIN_STOREUPS,
24304 IX86_BUILTIN_MOVSS,
24305
24306 IX86_BUILTIN_MOVHLPS,
24307 IX86_BUILTIN_MOVLHPS,
24308 IX86_BUILTIN_LOADHPS,
24309 IX86_BUILTIN_LOADLPS,
24310 IX86_BUILTIN_STOREHPS,
24311 IX86_BUILTIN_STORELPS,
24312
24313 IX86_BUILTIN_MASKMOVQ,
24314 IX86_BUILTIN_MOVMSKPS,
24315 IX86_BUILTIN_PMOVMSKB,
24316
24317 IX86_BUILTIN_MOVNTPS,
24318 IX86_BUILTIN_MOVNTQ,
24319
24320 IX86_BUILTIN_LOADDQU,
24321 IX86_BUILTIN_STOREDQU,
24322
24323 IX86_BUILTIN_PACKSSWB,
24324 IX86_BUILTIN_PACKSSDW,
24325 IX86_BUILTIN_PACKUSWB,
24326
24327 IX86_BUILTIN_PADDB,
24328 IX86_BUILTIN_PADDW,
24329 IX86_BUILTIN_PADDD,
24330 IX86_BUILTIN_PADDQ,
24331 IX86_BUILTIN_PADDSB,
24332 IX86_BUILTIN_PADDSW,
24333 IX86_BUILTIN_PADDUSB,
24334 IX86_BUILTIN_PADDUSW,
24335 IX86_BUILTIN_PSUBB,
24336 IX86_BUILTIN_PSUBW,
24337 IX86_BUILTIN_PSUBD,
24338 IX86_BUILTIN_PSUBQ,
24339 IX86_BUILTIN_PSUBSB,
24340 IX86_BUILTIN_PSUBSW,
24341 IX86_BUILTIN_PSUBUSB,
24342 IX86_BUILTIN_PSUBUSW,
24343
24344 IX86_BUILTIN_PAND,
24345 IX86_BUILTIN_PANDN,
24346 IX86_BUILTIN_POR,
24347 IX86_BUILTIN_PXOR,
24348
24349 IX86_BUILTIN_PAVGB,
24350 IX86_BUILTIN_PAVGW,
24351
24352 IX86_BUILTIN_PCMPEQB,
24353 IX86_BUILTIN_PCMPEQW,
24354 IX86_BUILTIN_PCMPEQD,
24355 IX86_BUILTIN_PCMPGTB,
24356 IX86_BUILTIN_PCMPGTW,
24357 IX86_BUILTIN_PCMPGTD,
24358
24359 IX86_BUILTIN_PMADDWD,
24360
24361 IX86_BUILTIN_PMAXSW,
24362 IX86_BUILTIN_PMAXUB,
24363 IX86_BUILTIN_PMINSW,
24364 IX86_BUILTIN_PMINUB,
24365
24366 IX86_BUILTIN_PMULHUW,
24367 IX86_BUILTIN_PMULHW,
24368 IX86_BUILTIN_PMULLW,
24369
24370 IX86_BUILTIN_PSADBW,
24371 IX86_BUILTIN_PSHUFW,
24372
24373 IX86_BUILTIN_PSLLW,
24374 IX86_BUILTIN_PSLLD,
24375 IX86_BUILTIN_PSLLQ,
24376 IX86_BUILTIN_PSRAW,
24377 IX86_BUILTIN_PSRAD,
24378 IX86_BUILTIN_PSRLW,
24379 IX86_BUILTIN_PSRLD,
24380 IX86_BUILTIN_PSRLQ,
24381 IX86_BUILTIN_PSLLWI,
24382 IX86_BUILTIN_PSLLDI,
24383 IX86_BUILTIN_PSLLQI,
24384 IX86_BUILTIN_PSRAWI,
24385 IX86_BUILTIN_PSRADI,
24386 IX86_BUILTIN_PSRLWI,
24387 IX86_BUILTIN_PSRLDI,
24388 IX86_BUILTIN_PSRLQI,
24389
24390 IX86_BUILTIN_PUNPCKHBW,
24391 IX86_BUILTIN_PUNPCKHWD,
24392 IX86_BUILTIN_PUNPCKHDQ,
24393 IX86_BUILTIN_PUNPCKLBW,
24394 IX86_BUILTIN_PUNPCKLWD,
24395 IX86_BUILTIN_PUNPCKLDQ,
24396
24397 IX86_BUILTIN_SHUFPS,
24398
24399 IX86_BUILTIN_RCPPS,
24400 IX86_BUILTIN_RCPSS,
24401 IX86_BUILTIN_RSQRTPS,
24402 IX86_BUILTIN_RSQRTPS_NR,
24403 IX86_BUILTIN_RSQRTSS,
24404 IX86_BUILTIN_RSQRTF,
24405 IX86_BUILTIN_SQRTPS,
24406 IX86_BUILTIN_SQRTPS_NR,
24407 IX86_BUILTIN_SQRTSS,
24408
24409 IX86_BUILTIN_UNPCKHPS,
24410 IX86_BUILTIN_UNPCKLPS,
24411
24412 IX86_BUILTIN_ANDPS,
24413 IX86_BUILTIN_ANDNPS,
24414 IX86_BUILTIN_ORPS,
24415 IX86_BUILTIN_XORPS,
24416
24417 IX86_BUILTIN_EMMS,
24418 IX86_BUILTIN_LDMXCSR,
24419 IX86_BUILTIN_STMXCSR,
24420 IX86_BUILTIN_SFENCE,
24421
24422 /* 3DNow! Original */
24423 IX86_BUILTIN_FEMMS,
24424 IX86_BUILTIN_PAVGUSB,
24425 IX86_BUILTIN_PF2ID,
24426 IX86_BUILTIN_PFACC,
24427 IX86_BUILTIN_PFADD,
24428 IX86_BUILTIN_PFCMPEQ,
24429 IX86_BUILTIN_PFCMPGE,
24430 IX86_BUILTIN_PFCMPGT,
24431 IX86_BUILTIN_PFMAX,
24432 IX86_BUILTIN_PFMIN,
24433 IX86_BUILTIN_PFMUL,
24434 IX86_BUILTIN_PFRCP,
24435 IX86_BUILTIN_PFRCPIT1,
24436 IX86_BUILTIN_PFRCPIT2,
24437 IX86_BUILTIN_PFRSQIT1,
24438 IX86_BUILTIN_PFRSQRT,
24439 IX86_BUILTIN_PFSUB,
24440 IX86_BUILTIN_PFSUBR,
24441 IX86_BUILTIN_PI2FD,
24442 IX86_BUILTIN_PMULHRW,
24443
24444 /* 3DNow! Athlon Extensions */
24445 IX86_BUILTIN_PF2IW,
24446 IX86_BUILTIN_PFNACC,
24447 IX86_BUILTIN_PFPNACC,
24448 IX86_BUILTIN_PI2FW,
24449 IX86_BUILTIN_PSWAPDSI,
24450 IX86_BUILTIN_PSWAPDSF,
24451
24452 /* SSE2 */
24453 IX86_BUILTIN_ADDPD,
24454 IX86_BUILTIN_ADDSD,
24455 IX86_BUILTIN_DIVPD,
24456 IX86_BUILTIN_DIVSD,
24457 IX86_BUILTIN_MULPD,
24458 IX86_BUILTIN_MULSD,
24459 IX86_BUILTIN_SUBPD,
24460 IX86_BUILTIN_SUBSD,
24461
24462 IX86_BUILTIN_CMPEQPD,
24463 IX86_BUILTIN_CMPLTPD,
24464 IX86_BUILTIN_CMPLEPD,
24465 IX86_BUILTIN_CMPGTPD,
24466 IX86_BUILTIN_CMPGEPD,
24467 IX86_BUILTIN_CMPNEQPD,
24468 IX86_BUILTIN_CMPNLTPD,
24469 IX86_BUILTIN_CMPNLEPD,
24470 IX86_BUILTIN_CMPNGTPD,
24471 IX86_BUILTIN_CMPNGEPD,
24472 IX86_BUILTIN_CMPORDPD,
24473 IX86_BUILTIN_CMPUNORDPD,
24474 IX86_BUILTIN_CMPEQSD,
24475 IX86_BUILTIN_CMPLTSD,
24476 IX86_BUILTIN_CMPLESD,
24477 IX86_BUILTIN_CMPNEQSD,
24478 IX86_BUILTIN_CMPNLTSD,
24479 IX86_BUILTIN_CMPNLESD,
24480 IX86_BUILTIN_CMPORDSD,
24481 IX86_BUILTIN_CMPUNORDSD,
24482
24483 IX86_BUILTIN_COMIEQSD,
24484 IX86_BUILTIN_COMILTSD,
24485 IX86_BUILTIN_COMILESD,
24486 IX86_BUILTIN_COMIGTSD,
24487 IX86_BUILTIN_COMIGESD,
24488 IX86_BUILTIN_COMINEQSD,
24489 IX86_BUILTIN_UCOMIEQSD,
24490 IX86_BUILTIN_UCOMILTSD,
24491 IX86_BUILTIN_UCOMILESD,
24492 IX86_BUILTIN_UCOMIGTSD,
24493 IX86_BUILTIN_UCOMIGESD,
24494 IX86_BUILTIN_UCOMINEQSD,
24495
24496 IX86_BUILTIN_MAXPD,
24497 IX86_BUILTIN_MAXSD,
24498 IX86_BUILTIN_MINPD,
24499 IX86_BUILTIN_MINSD,
24500
24501 IX86_BUILTIN_ANDPD,
24502 IX86_BUILTIN_ANDNPD,
24503 IX86_BUILTIN_ORPD,
24504 IX86_BUILTIN_XORPD,
24505
24506 IX86_BUILTIN_SQRTPD,
24507 IX86_BUILTIN_SQRTSD,
24508
24509 IX86_BUILTIN_UNPCKHPD,
24510 IX86_BUILTIN_UNPCKLPD,
24511
24512 IX86_BUILTIN_SHUFPD,
24513
24514 IX86_BUILTIN_LOADUPD,
24515 IX86_BUILTIN_STOREUPD,
24516 IX86_BUILTIN_MOVSD,
24517
24518 IX86_BUILTIN_LOADHPD,
24519 IX86_BUILTIN_LOADLPD,
24520
24521 IX86_BUILTIN_CVTDQ2PD,
24522 IX86_BUILTIN_CVTDQ2PS,
24523
24524 IX86_BUILTIN_CVTPD2DQ,
24525 IX86_BUILTIN_CVTPD2PI,
24526 IX86_BUILTIN_CVTPD2PS,
24527 IX86_BUILTIN_CVTTPD2DQ,
24528 IX86_BUILTIN_CVTTPD2PI,
24529
24530 IX86_BUILTIN_CVTPI2PD,
24531 IX86_BUILTIN_CVTSI2SD,
24532 IX86_BUILTIN_CVTSI642SD,
24533
24534 IX86_BUILTIN_CVTSD2SI,
24535 IX86_BUILTIN_CVTSD2SI64,
24536 IX86_BUILTIN_CVTSD2SS,
24537 IX86_BUILTIN_CVTSS2SD,
24538 IX86_BUILTIN_CVTTSD2SI,
24539 IX86_BUILTIN_CVTTSD2SI64,
24540
24541 IX86_BUILTIN_CVTPS2DQ,
24542 IX86_BUILTIN_CVTPS2PD,
24543 IX86_BUILTIN_CVTTPS2DQ,
24544
24545 IX86_BUILTIN_MOVNTI,
24546 IX86_BUILTIN_MOVNTPD,
24547 IX86_BUILTIN_MOVNTDQ,
24548
24549 IX86_BUILTIN_MOVQ128,
24550
24551 /* SSE2 MMX */
24552 IX86_BUILTIN_MASKMOVDQU,
24553 IX86_BUILTIN_MOVMSKPD,
24554 IX86_BUILTIN_PMOVMSKB128,
24555
24556 IX86_BUILTIN_PACKSSWB128,
24557 IX86_BUILTIN_PACKSSDW128,
24558 IX86_BUILTIN_PACKUSWB128,
24559
24560 IX86_BUILTIN_PADDB128,
24561 IX86_BUILTIN_PADDW128,
24562 IX86_BUILTIN_PADDD128,
24563 IX86_BUILTIN_PADDQ128,
24564 IX86_BUILTIN_PADDSB128,
24565 IX86_BUILTIN_PADDSW128,
24566 IX86_BUILTIN_PADDUSB128,
24567 IX86_BUILTIN_PADDUSW128,
24568 IX86_BUILTIN_PSUBB128,
24569 IX86_BUILTIN_PSUBW128,
24570 IX86_BUILTIN_PSUBD128,
24571 IX86_BUILTIN_PSUBQ128,
24572 IX86_BUILTIN_PSUBSB128,
24573 IX86_BUILTIN_PSUBSW128,
24574 IX86_BUILTIN_PSUBUSB128,
24575 IX86_BUILTIN_PSUBUSW128,
24576
24577 IX86_BUILTIN_PAND128,
24578 IX86_BUILTIN_PANDN128,
24579 IX86_BUILTIN_POR128,
24580 IX86_BUILTIN_PXOR128,
24581
24582 IX86_BUILTIN_PAVGB128,
24583 IX86_BUILTIN_PAVGW128,
24584
24585 IX86_BUILTIN_PCMPEQB128,
24586 IX86_BUILTIN_PCMPEQW128,
24587 IX86_BUILTIN_PCMPEQD128,
24588 IX86_BUILTIN_PCMPGTB128,
24589 IX86_BUILTIN_PCMPGTW128,
24590 IX86_BUILTIN_PCMPGTD128,
24591
24592 IX86_BUILTIN_PMADDWD128,
24593
24594 IX86_BUILTIN_PMAXSW128,
24595 IX86_BUILTIN_PMAXUB128,
24596 IX86_BUILTIN_PMINSW128,
24597 IX86_BUILTIN_PMINUB128,
24598
24599 IX86_BUILTIN_PMULUDQ,
24600 IX86_BUILTIN_PMULUDQ128,
24601 IX86_BUILTIN_PMULHUW128,
24602 IX86_BUILTIN_PMULHW128,
24603 IX86_BUILTIN_PMULLW128,
24604
24605 IX86_BUILTIN_PSADBW128,
24606 IX86_BUILTIN_PSHUFHW,
24607 IX86_BUILTIN_PSHUFLW,
24608 IX86_BUILTIN_PSHUFD,
24609
24610 IX86_BUILTIN_PSLLDQI128,
24611 IX86_BUILTIN_PSLLWI128,
24612 IX86_BUILTIN_PSLLDI128,
24613 IX86_BUILTIN_PSLLQI128,
24614 IX86_BUILTIN_PSRAWI128,
24615 IX86_BUILTIN_PSRADI128,
24616 IX86_BUILTIN_PSRLDQI128,
24617 IX86_BUILTIN_PSRLWI128,
24618 IX86_BUILTIN_PSRLDI128,
24619 IX86_BUILTIN_PSRLQI128,
24620
24621 IX86_BUILTIN_PSLLDQ128,
24622 IX86_BUILTIN_PSLLW128,
24623 IX86_BUILTIN_PSLLD128,
24624 IX86_BUILTIN_PSLLQ128,
24625 IX86_BUILTIN_PSRAW128,
24626 IX86_BUILTIN_PSRAD128,
24627 IX86_BUILTIN_PSRLW128,
24628 IX86_BUILTIN_PSRLD128,
24629 IX86_BUILTIN_PSRLQ128,
24630
24631 IX86_BUILTIN_PUNPCKHBW128,
24632 IX86_BUILTIN_PUNPCKHWD128,
24633 IX86_BUILTIN_PUNPCKHDQ128,
24634 IX86_BUILTIN_PUNPCKHQDQ128,
24635 IX86_BUILTIN_PUNPCKLBW128,
24636 IX86_BUILTIN_PUNPCKLWD128,
24637 IX86_BUILTIN_PUNPCKLDQ128,
24638 IX86_BUILTIN_PUNPCKLQDQ128,
24639
24640 IX86_BUILTIN_CLFLUSH,
24641 IX86_BUILTIN_MFENCE,
24642 IX86_BUILTIN_LFENCE,
24643 IX86_BUILTIN_PAUSE,
24644
24645 IX86_BUILTIN_BSRSI,
24646 IX86_BUILTIN_BSRDI,
24647 IX86_BUILTIN_RDPMC,
24648 IX86_BUILTIN_RDTSC,
24649 IX86_BUILTIN_RDTSCP,
24650 IX86_BUILTIN_ROLQI,
24651 IX86_BUILTIN_ROLHI,
24652 IX86_BUILTIN_RORQI,
24653 IX86_BUILTIN_RORHI,
24654
24655 /* SSE3. */
24656 IX86_BUILTIN_ADDSUBPS,
24657 IX86_BUILTIN_HADDPS,
24658 IX86_BUILTIN_HSUBPS,
24659 IX86_BUILTIN_MOVSHDUP,
24660 IX86_BUILTIN_MOVSLDUP,
24661 IX86_BUILTIN_ADDSUBPD,
24662 IX86_BUILTIN_HADDPD,
24663 IX86_BUILTIN_HSUBPD,
24664 IX86_BUILTIN_LDDQU,
24665
24666 IX86_BUILTIN_MONITOR,
24667 IX86_BUILTIN_MWAIT,
24668
24669 /* SSSE3. */
24670 IX86_BUILTIN_PHADDW,
24671 IX86_BUILTIN_PHADDD,
24672 IX86_BUILTIN_PHADDSW,
24673 IX86_BUILTIN_PHSUBW,
24674 IX86_BUILTIN_PHSUBD,
24675 IX86_BUILTIN_PHSUBSW,
24676 IX86_BUILTIN_PMADDUBSW,
24677 IX86_BUILTIN_PMULHRSW,
24678 IX86_BUILTIN_PSHUFB,
24679 IX86_BUILTIN_PSIGNB,
24680 IX86_BUILTIN_PSIGNW,
24681 IX86_BUILTIN_PSIGND,
24682 IX86_BUILTIN_PALIGNR,
24683 IX86_BUILTIN_PABSB,
24684 IX86_BUILTIN_PABSW,
24685 IX86_BUILTIN_PABSD,
24686
24687 IX86_BUILTIN_PHADDW128,
24688 IX86_BUILTIN_PHADDD128,
24689 IX86_BUILTIN_PHADDSW128,
24690 IX86_BUILTIN_PHSUBW128,
24691 IX86_BUILTIN_PHSUBD128,
24692 IX86_BUILTIN_PHSUBSW128,
24693 IX86_BUILTIN_PMADDUBSW128,
24694 IX86_BUILTIN_PMULHRSW128,
24695 IX86_BUILTIN_PSHUFB128,
24696 IX86_BUILTIN_PSIGNB128,
24697 IX86_BUILTIN_PSIGNW128,
24698 IX86_BUILTIN_PSIGND128,
24699 IX86_BUILTIN_PALIGNR128,
24700 IX86_BUILTIN_PABSB128,
24701 IX86_BUILTIN_PABSW128,
24702 IX86_BUILTIN_PABSD128,
24703
24704 /* AMDFAM10 - SSE4A New Instructions. */
24705 IX86_BUILTIN_MOVNTSD,
24706 IX86_BUILTIN_MOVNTSS,
24707 IX86_BUILTIN_EXTRQI,
24708 IX86_BUILTIN_EXTRQ,
24709 IX86_BUILTIN_INSERTQI,
24710 IX86_BUILTIN_INSERTQ,
24711
24712 /* SSE4.1. */
24713 IX86_BUILTIN_BLENDPD,
24714 IX86_BUILTIN_BLENDPS,
24715 IX86_BUILTIN_BLENDVPD,
24716 IX86_BUILTIN_BLENDVPS,
24717 IX86_BUILTIN_PBLENDVB128,
24718 IX86_BUILTIN_PBLENDW128,
24719
24720 IX86_BUILTIN_DPPD,
24721 IX86_BUILTIN_DPPS,
24722
24723 IX86_BUILTIN_INSERTPS128,
24724
24725 IX86_BUILTIN_MOVNTDQA,
24726 IX86_BUILTIN_MPSADBW128,
24727 IX86_BUILTIN_PACKUSDW128,
24728 IX86_BUILTIN_PCMPEQQ,
24729 IX86_BUILTIN_PHMINPOSUW128,
24730
24731 IX86_BUILTIN_PMAXSB128,
24732 IX86_BUILTIN_PMAXSD128,
24733 IX86_BUILTIN_PMAXUD128,
24734 IX86_BUILTIN_PMAXUW128,
24735
24736 IX86_BUILTIN_PMINSB128,
24737 IX86_BUILTIN_PMINSD128,
24738 IX86_BUILTIN_PMINUD128,
24739 IX86_BUILTIN_PMINUW128,
24740
24741 IX86_BUILTIN_PMOVSXBW128,
24742 IX86_BUILTIN_PMOVSXBD128,
24743 IX86_BUILTIN_PMOVSXBQ128,
24744 IX86_BUILTIN_PMOVSXWD128,
24745 IX86_BUILTIN_PMOVSXWQ128,
24746 IX86_BUILTIN_PMOVSXDQ128,
24747
24748 IX86_BUILTIN_PMOVZXBW128,
24749 IX86_BUILTIN_PMOVZXBD128,
24750 IX86_BUILTIN_PMOVZXBQ128,
24751 IX86_BUILTIN_PMOVZXWD128,
24752 IX86_BUILTIN_PMOVZXWQ128,
24753 IX86_BUILTIN_PMOVZXDQ128,
24754
24755 IX86_BUILTIN_PMULDQ128,
24756 IX86_BUILTIN_PMULLD128,
24757
24758 IX86_BUILTIN_ROUNDPD,
24759 IX86_BUILTIN_ROUNDPS,
24760 IX86_BUILTIN_ROUNDSD,
24761 IX86_BUILTIN_ROUNDSS,
24762
24763 IX86_BUILTIN_FLOORPD,
24764 IX86_BUILTIN_CEILPD,
24765 IX86_BUILTIN_TRUNCPD,
24766 IX86_BUILTIN_RINTPD,
24767 IX86_BUILTIN_ROUNDPD_AZ,
24768 IX86_BUILTIN_FLOORPS,
24769 IX86_BUILTIN_CEILPS,
24770 IX86_BUILTIN_TRUNCPS,
24771 IX86_BUILTIN_RINTPS,
24772 IX86_BUILTIN_ROUNDPS_AZ,
24773
24774 IX86_BUILTIN_PTESTZ,
24775 IX86_BUILTIN_PTESTC,
24776 IX86_BUILTIN_PTESTNZC,
24777
24778 IX86_BUILTIN_VEC_INIT_V2SI,
24779 IX86_BUILTIN_VEC_INIT_V4HI,
24780 IX86_BUILTIN_VEC_INIT_V8QI,
24781 IX86_BUILTIN_VEC_EXT_V2DF,
24782 IX86_BUILTIN_VEC_EXT_V2DI,
24783 IX86_BUILTIN_VEC_EXT_V4SF,
24784 IX86_BUILTIN_VEC_EXT_V4SI,
24785 IX86_BUILTIN_VEC_EXT_V8HI,
24786 IX86_BUILTIN_VEC_EXT_V2SI,
24787 IX86_BUILTIN_VEC_EXT_V4HI,
24788 IX86_BUILTIN_VEC_EXT_V16QI,
24789 IX86_BUILTIN_VEC_SET_V2DI,
24790 IX86_BUILTIN_VEC_SET_V4SF,
24791 IX86_BUILTIN_VEC_SET_V4SI,
24792 IX86_BUILTIN_VEC_SET_V8HI,
24793 IX86_BUILTIN_VEC_SET_V4HI,
24794 IX86_BUILTIN_VEC_SET_V16QI,
24795
24796 IX86_BUILTIN_VEC_PACK_SFIX,
24797
24798 /* SSE4.2. */
24799 IX86_BUILTIN_CRC32QI,
24800 IX86_BUILTIN_CRC32HI,
24801 IX86_BUILTIN_CRC32SI,
24802 IX86_BUILTIN_CRC32DI,
24803
24804 IX86_BUILTIN_PCMPESTRI128,
24805 IX86_BUILTIN_PCMPESTRM128,
24806 IX86_BUILTIN_PCMPESTRA128,
24807 IX86_BUILTIN_PCMPESTRC128,
24808 IX86_BUILTIN_PCMPESTRO128,
24809 IX86_BUILTIN_PCMPESTRS128,
24810 IX86_BUILTIN_PCMPESTRZ128,
24811 IX86_BUILTIN_PCMPISTRI128,
24812 IX86_BUILTIN_PCMPISTRM128,
24813 IX86_BUILTIN_PCMPISTRA128,
24814 IX86_BUILTIN_PCMPISTRC128,
24815 IX86_BUILTIN_PCMPISTRO128,
24816 IX86_BUILTIN_PCMPISTRS128,
24817 IX86_BUILTIN_PCMPISTRZ128,
24818
24819 IX86_BUILTIN_PCMPGTQ,
24820
24821 /* AES instructions */
24822 IX86_BUILTIN_AESENC128,
24823 IX86_BUILTIN_AESENCLAST128,
24824 IX86_BUILTIN_AESDEC128,
24825 IX86_BUILTIN_AESDECLAST128,
24826 IX86_BUILTIN_AESIMC128,
24827 IX86_BUILTIN_AESKEYGENASSIST128,
24828
24829 /* PCLMUL instruction */
24830 IX86_BUILTIN_PCLMULQDQ128,
24831
24832 /* AVX */
24833 IX86_BUILTIN_ADDPD256,
24834 IX86_BUILTIN_ADDPS256,
24835 IX86_BUILTIN_ADDSUBPD256,
24836 IX86_BUILTIN_ADDSUBPS256,
24837 IX86_BUILTIN_ANDPD256,
24838 IX86_BUILTIN_ANDPS256,
24839 IX86_BUILTIN_ANDNPD256,
24840 IX86_BUILTIN_ANDNPS256,
24841 IX86_BUILTIN_BLENDPD256,
24842 IX86_BUILTIN_BLENDPS256,
24843 IX86_BUILTIN_BLENDVPD256,
24844 IX86_BUILTIN_BLENDVPS256,
24845 IX86_BUILTIN_DIVPD256,
24846 IX86_BUILTIN_DIVPS256,
24847 IX86_BUILTIN_DPPS256,
24848 IX86_BUILTIN_HADDPD256,
24849 IX86_BUILTIN_HADDPS256,
24850 IX86_BUILTIN_HSUBPD256,
24851 IX86_BUILTIN_HSUBPS256,
24852 IX86_BUILTIN_MAXPD256,
24853 IX86_BUILTIN_MAXPS256,
24854 IX86_BUILTIN_MINPD256,
24855 IX86_BUILTIN_MINPS256,
24856 IX86_BUILTIN_MULPD256,
24857 IX86_BUILTIN_MULPS256,
24858 IX86_BUILTIN_ORPD256,
24859 IX86_BUILTIN_ORPS256,
24860 IX86_BUILTIN_SHUFPD256,
24861 IX86_BUILTIN_SHUFPS256,
24862 IX86_BUILTIN_SUBPD256,
24863 IX86_BUILTIN_SUBPS256,
24864 IX86_BUILTIN_XORPD256,
24865 IX86_BUILTIN_XORPS256,
24866 IX86_BUILTIN_CMPSD,
24867 IX86_BUILTIN_CMPSS,
24868 IX86_BUILTIN_CMPPD,
24869 IX86_BUILTIN_CMPPS,
24870 IX86_BUILTIN_CMPPD256,
24871 IX86_BUILTIN_CMPPS256,
24872 IX86_BUILTIN_CVTDQ2PD256,
24873 IX86_BUILTIN_CVTDQ2PS256,
24874 IX86_BUILTIN_CVTPD2PS256,
24875 IX86_BUILTIN_CVTPS2DQ256,
24876 IX86_BUILTIN_CVTPS2PD256,
24877 IX86_BUILTIN_CVTTPD2DQ256,
24878 IX86_BUILTIN_CVTPD2DQ256,
24879 IX86_BUILTIN_CVTTPS2DQ256,
24880 IX86_BUILTIN_EXTRACTF128PD256,
24881 IX86_BUILTIN_EXTRACTF128PS256,
24882 IX86_BUILTIN_EXTRACTF128SI256,
24883 IX86_BUILTIN_VZEROALL,
24884 IX86_BUILTIN_VZEROUPPER,
24885 IX86_BUILTIN_VPERMILVARPD,
24886 IX86_BUILTIN_VPERMILVARPS,
24887 IX86_BUILTIN_VPERMILVARPD256,
24888 IX86_BUILTIN_VPERMILVARPS256,
24889 IX86_BUILTIN_VPERMILPD,
24890 IX86_BUILTIN_VPERMILPS,
24891 IX86_BUILTIN_VPERMILPD256,
24892 IX86_BUILTIN_VPERMILPS256,
24893 IX86_BUILTIN_VPERMIL2PD,
24894 IX86_BUILTIN_VPERMIL2PS,
24895 IX86_BUILTIN_VPERMIL2PD256,
24896 IX86_BUILTIN_VPERMIL2PS256,
24897 IX86_BUILTIN_VPERM2F128PD256,
24898 IX86_BUILTIN_VPERM2F128PS256,
24899 IX86_BUILTIN_VPERM2F128SI256,
24900 IX86_BUILTIN_VBROADCASTSS,
24901 IX86_BUILTIN_VBROADCASTSD256,
24902 IX86_BUILTIN_VBROADCASTSS256,
24903 IX86_BUILTIN_VBROADCASTPD256,
24904 IX86_BUILTIN_VBROADCASTPS256,
24905 IX86_BUILTIN_VINSERTF128PD256,
24906 IX86_BUILTIN_VINSERTF128PS256,
24907 IX86_BUILTIN_VINSERTF128SI256,
24908 IX86_BUILTIN_LOADUPD256,
24909 IX86_BUILTIN_LOADUPS256,
24910 IX86_BUILTIN_STOREUPD256,
24911 IX86_BUILTIN_STOREUPS256,
24912 IX86_BUILTIN_LDDQU256,
24913 IX86_BUILTIN_MOVNTDQ256,
24914 IX86_BUILTIN_MOVNTPD256,
24915 IX86_BUILTIN_MOVNTPS256,
24916 IX86_BUILTIN_LOADDQU256,
24917 IX86_BUILTIN_STOREDQU256,
24918 IX86_BUILTIN_MASKLOADPD,
24919 IX86_BUILTIN_MASKLOADPS,
24920 IX86_BUILTIN_MASKSTOREPD,
24921 IX86_BUILTIN_MASKSTOREPS,
24922 IX86_BUILTIN_MASKLOADPD256,
24923 IX86_BUILTIN_MASKLOADPS256,
24924 IX86_BUILTIN_MASKSTOREPD256,
24925 IX86_BUILTIN_MASKSTOREPS256,
24926 IX86_BUILTIN_MOVSHDUP256,
24927 IX86_BUILTIN_MOVSLDUP256,
24928 IX86_BUILTIN_MOVDDUP256,
24929
24930 IX86_BUILTIN_SQRTPD256,
24931 IX86_BUILTIN_SQRTPS256,
24932 IX86_BUILTIN_SQRTPS_NR256,
24933 IX86_BUILTIN_RSQRTPS256,
24934 IX86_BUILTIN_RSQRTPS_NR256,
24935
24936 IX86_BUILTIN_RCPPS256,
24937
24938 IX86_BUILTIN_ROUNDPD256,
24939 IX86_BUILTIN_ROUNDPS256,
24940
24941 IX86_BUILTIN_FLOORPD256,
24942 IX86_BUILTIN_CEILPD256,
24943 IX86_BUILTIN_TRUNCPD256,
24944 IX86_BUILTIN_RINTPD256,
24945 IX86_BUILTIN_ROUNDPD_AZ256,
24946 IX86_BUILTIN_FLOORPS256,
24947 IX86_BUILTIN_CEILPS256,
24948 IX86_BUILTIN_TRUNCPS256,
24949 IX86_BUILTIN_RINTPS256,
24950 IX86_BUILTIN_ROUNDPS_AZ256,
24951
24952 IX86_BUILTIN_UNPCKHPD256,
24953 IX86_BUILTIN_UNPCKLPD256,
24954 IX86_BUILTIN_UNPCKHPS256,
24955 IX86_BUILTIN_UNPCKLPS256,
24956
24957 IX86_BUILTIN_SI256_SI,
24958 IX86_BUILTIN_PS256_PS,
24959 IX86_BUILTIN_PD256_PD,
24960 IX86_BUILTIN_SI_SI256,
24961 IX86_BUILTIN_PS_PS256,
24962 IX86_BUILTIN_PD_PD256,
24963
24964 IX86_BUILTIN_VTESTZPD,
24965 IX86_BUILTIN_VTESTCPD,
24966 IX86_BUILTIN_VTESTNZCPD,
24967 IX86_BUILTIN_VTESTZPS,
24968 IX86_BUILTIN_VTESTCPS,
24969 IX86_BUILTIN_VTESTNZCPS,
24970 IX86_BUILTIN_VTESTZPD256,
24971 IX86_BUILTIN_VTESTCPD256,
24972 IX86_BUILTIN_VTESTNZCPD256,
24973 IX86_BUILTIN_VTESTZPS256,
24974 IX86_BUILTIN_VTESTCPS256,
24975 IX86_BUILTIN_VTESTNZCPS256,
24976 IX86_BUILTIN_PTESTZ256,
24977 IX86_BUILTIN_PTESTC256,
24978 IX86_BUILTIN_PTESTNZC256,
24979
24980 IX86_BUILTIN_MOVMSKPD256,
24981 IX86_BUILTIN_MOVMSKPS256,
24982
24983 /* AVX2 */
24984 IX86_BUILTIN_MPSADBW256,
24985 IX86_BUILTIN_PABSB256,
24986 IX86_BUILTIN_PABSW256,
24987 IX86_BUILTIN_PABSD256,
24988 IX86_BUILTIN_PACKSSDW256,
24989 IX86_BUILTIN_PACKSSWB256,
24990 IX86_BUILTIN_PACKUSDW256,
24991 IX86_BUILTIN_PACKUSWB256,
24992 IX86_BUILTIN_PADDB256,
24993 IX86_BUILTIN_PADDW256,
24994 IX86_BUILTIN_PADDD256,
24995 IX86_BUILTIN_PADDQ256,
24996 IX86_BUILTIN_PADDSB256,
24997 IX86_BUILTIN_PADDSW256,
24998 IX86_BUILTIN_PADDUSB256,
24999 IX86_BUILTIN_PADDUSW256,
25000 IX86_BUILTIN_PALIGNR256,
25001 IX86_BUILTIN_AND256I,
25002 IX86_BUILTIN_ANDNOT256I,
25003 IX86_BUILTIN_PAVGB256,
25004 IX86_BUILTIN_PAVGW256,
25005 IX86_BUILTIN_PBLENDVB256,
25006 IX86_BUILTIN_PBLENDVW256,
25007 IX86_BUILTIN_PCMPEQB256,
25008 IX86_BUILTIN_PCMPEQW256,
25009 IX86_BUILTIN_PCMPEQD256,
25010 IX86_BUILTIN_PCMPEQQ256,
25011 IX86_BUILTIN_PCMPGTB256,
25012 IX86_BUILTIN_PCMPGTW256,
25013 IX86_BUILTIN_PCMPGTD256,
25014 IX86_BUILTIN_PCMPGTQ256,
25015 IX86_BUILTIN_PHADDW256,
25016 IX86_BUILTIN_PHADDD256,
25017 IX86_BUILTIN_PHADDSW256,
25018 IX86_BUILTIN_PHSUBW256,
25019 IX86_BUILTIN_PHSUBD256,
25020 IX86_BUILTIN_PHSUBSW256,
25021 IX86_BUILTIN_PMADDUBSW256,
25022 IX86_BUILTIN_PMADDWD256,
25023 IX86_BUILTIN_PMAXSB256,
25024 IX86_BUILTIN_PMAXSW256,
25025 IX86_BUILTIN_PMAXSD256,
25026 IX86_BUILTIN_PMAXUB256,
25027 IX86_BUILTIN_PMAXUW256,
25028 IX86_BUILTIN_PMAXUD256,
25029 IX86_BUILTIN_PMINSB256,
25030 IX86_BUILTIN_PMINSW256,
25031 IX86_BUILTIN_PMINSD256,
25032 IX86_BUILTIN_PMINUB256,
25033 IX86_BUILTIN_PMINUW256,
25034 IX86_BUILTIN_PMINUD256,
25035 IX86_BUILTIN_PMOVMSKB256,
25036 IX86_BUILTIN_PMOVSXBW256,
25037 IX86_BUILTIN_PMOVSXBD256,
25038 IX86_BUILTIN_PMOVSXBQ256,
25039 IX86_BUILTIN_PMOVSXWD256,
25040 IX86_BUILTIN_PMOVSXWQ256,
25041 IX86_BUILTIN_PMOVSXDQ256,
25042 IX86_BUILTIN_PMOVZXBW256,
25043 IX86_BUILTIN_PMOVZXBD256,
25044 IX86_BUILTIN_PMOVZXBQ256,
25045 IX86_BUILTIN_PMOVZXWD256,
25046 IX86_BUILTIN_PMOVZXWQ256,
25047 IX86_BUILTIN_PMOVZXDQ256,
25048 IX86_BUILTIN_PMULDQ256,
25049 IX86_BUILTIN_PMULHRSW256,
25050 IX86_BUILTIN_PMULHUW256,
25051 IX86_BUILTIN_PMULHW256,
25052 IX86_BUILTIN_PMULLW256,
25053 IX86_BUILTIN_PMULLD256,
25054 IX86_BUILTIN_PMULUDQ256,
25055 IX86_BUILTIN_POR256,
25056 IX86_BUILTIN_PSADBW256,
25057 IX86_BUILTIN_PSHUFB256,
25058 IX86_BUILTIN_PSHUFD256,
25059 IX86_BUILTIN_PSHUFHW256,
25060 IX86_BUILTIN_PSHUFLW256,
25061 IX86_BUILTIN_PSIGNB256,
25062 IX86_BUILTIN_PSIGNW256,
25063 IX86_BUILTIN_PSIGND256,
25064 IX86_BUILTIN_PSLLDQI256,
25065 IX86_BUILTIN_PSLLWI256,
25066 IX86_BUILTIN_PSLLW256,
25067 IX86_BUILTIN_PSLLDI256,
25068 IX86_BUILTIN_PSLLD256,
25069 IX86_BUILTIN_PSLLQI256,
25070 IX86_BUILTIN_PSLLQ256,
25071 IX86_BUILTIN_PSRAWI256,
25072 IX86_BUILTIN_PSRAW256,
25073 IX86_BUILTIN_PSRADI256,
25074 IX86_BUILTIN_PSRAD256,
25075 IX86_BUILTIN_PSRLDQI256,
25076 IX86_BUILTIN_PSRLWI256,
25077 IX86_BUILTIN_PSRLW256,
25078 IX86_BUILTIN_PSRLDI256,
25079 IX86_BUILTIN_PSRLD256,
25080 IX86_BUILTIN_PSRLQI256,
25081 IX86_BUILTIN_PSRLQ256,
25082 IX86_BUILTIN_PSUBB256,
25083 IX86_BUILTIN_PSUBW256,
25084 IX86_BUILTIN_PSUBD256,
25085 IX86_BUILTIN_PSUBQ256,
25086 IX86_BUILTIN_PSUBSB256,
25087 IX86_BUILTIN_PSUBSW256,
25088 IX86_BUILTIN_PSUBUSB256,
25089 IX86_BUILTIN_PSUBUSW256,
25090 IX86_BUILTIN_PUNPCKHBW256,
25091 IX86_BUILTIN_PUNPCKHWD256,
25092 IX86_BUILTIN_PUNPCKHDQ256,
25093 IX86_BUILTIN_PUNPCKHQDQ256,
25094 IX86_BUILTIN_PUNPCKLBW256,
25095 IX86_BUILTIN_PUNPCKLWD256,
25096 IX86_BUILTIN_PUNPCKLDQ256,
25097 IX86_BUILTIN_PUNPCKLQDQ256,
25098 IX86_BUILTIN_PXOR256,
25099 IX86_BUILTIN_MOVNTDQA256,
25100 IX86_BUILTIN_VBROADCASTSS_PS,
25101 IX86_BUILTIN_VBROADCASTSS_PS256,
25102 IX86_BUILTIN_VBROADCASTSD_PD256,
25103 IX86_BUILTIN_VBROADCASTSI256,
25104 IX86_BUILTIN_PBLENDD256,
25105 IX86_BUILTIN_PBLENDD128,
25106 IX86_BUILTIN_PBROADCASTB256,
25107 IX86_BUILTIN_PBROADCASTW256,
25108 IX86_BUILTIN_PBROADCASTD256,
25109 IX86_BUILTIN_PBROADCASTQ256,
25110 IX86_BUILTIN_PBROADCASTB128,
25111 IX86_BUILTIN_PBROADCASTW128,
25112 IX86_BUILTIN_PBROADCASTD128,
25113 IX86_BUILTIN_PBROADCASTQ128,
25114 IX86_BUILTIN_VPERMVARSI256,
25115 IX86_BUILTIN_VPERMDF256,
25116 IX86_BUILTIN_VPERMVARSF256,
25117 IX86_BUILTIN_VPERMDI256,
25118 IX86_BUILTIN_VPERMTI256,
25119 IX86_BUILTIN_VEXTRACT128I256,
25120 IX86_BUILTIN_VINSERT128I256,
25121 IX86_BUILTIN_MASKLOADD,
25122 IX86_BUILTIN_MASKLOADQ,
25123 IX86_BUILTIN_MASKLOADD256,
25124 IX86_BUILTIN_MASKLOADQ256,
25125 IX86_BUILTIN_MASKSTORED,
25126 IX86_BUILTIN_MASKSTOREQ,
25127 IX86_BUILTIN_MASKSTORED256,
25128 IX86_BUILTIN_MASKSTOREQ256,
25129 IX86_BUILTIN_PSLLVV4DI,
25130 IX86_BUILTIN_PSLLVV2DI,
25131 IX86_BUILTIN_PSLLVV8SI,
25132 IX86_BUILTIN_PSLLVV4SI,
25133 IX86_BUILTIN_PSRAVV8SI,
25134 IX86_BUILTIN_PSRAVV4SI,
25135 IX86_BUILTIN_PSRLVV4DI,
25136 IX86_BUILTIN_PSRLVV2DI,
25137 IX86_BUILTIN_PSRLVV8SI,
25138 IX86_BUILTIN_PSRLVV4SI,
25139
25140 IX86_BUILTIN_GATHERSIV2DF,
25141 IX86_BUILTIN_GATHERSIV4DF,
25142 IX86_BUILTIN_GATHERDIV2DF,
25143 IX86_BUILTIN_GATHERDIV4DF,
25144 IX86_BUILTIN_GATHERSIV4SF,
25145 IX86_BUILTIN_GATHERSIV8SF,
25146 IX86_BUILTIN_GATHERDIV4SF,
25147 IX86_BUILTIN_GATHERDIV8SF,
25148 IX86_BUILTIN_GATHERSIV2DI,
25149 IX86_BUILTIN_GATHERSIV4DI,
25150 IX86_BUILTIN_GATHERDIV2DI,
25151 IX86_BUILTIN_GATHERDIV4DI,
25152 IX86_BUILTIN_GATHERSIV4SI,
25153 IX86_BUILTIN_GATHERSIV8SI,
25154 IX86_BUILTIN_GATHERDIV4SI,
25155 IX86_BUILTIN_GATHERDIV8SI,
25156
25157 /* TFmode support builtins. */
25158 IX86_BUILTIN_INFQ,
25159 IX86_BUILTIN_HUGE_VALQ,
25160 IX86_BUILTIN_FABSQ,
25161 IX86_BUILTIN_COPYSIGNQ,
25162
25163 /* Vectorizer support builtins. */
25164 IX86_BUILTIN_CPYSGNPS,
25165 IX86_BUILTIN_CPYSGNPD,
25166 IX86_BUILTIN_CPYSGNPS256,
25167 IX86_BUILTIN_CPYSGNPD256,
25168
25169 /* FMA4 instructions. */
25170 IX86_BUILTIN_VFMADDSS,
25171 IX86_BUILTIN_VFMADDSD,
25172 IX86_BUILTIN_VFMADDPS,
25173 IX86_BUILTIN_VFMADDPD,
25174 IX86_BUILTIN_VFMADDPS256,
25175 IX86_BUILTIN_VFMADDPD256,
25176 IX86_BUILTIN_VFMADDSUBPS,
25177 IX86_BUILTIN_VFMADDSUBPD,
25178 IX86_BUILTIN_VFMADDSUBPS256,
25179 IX86_BUILTIN_VFMADDSUBPD256,
25180
25181 /* FMA3 instructions. */
25182 IX86_BUILTIN_VFMADDSS3,
25183 IX86_BUILTIN_VFMADDSD3,
25184
25185 /* XOP instructions. */
25186 IX86_BUILTIN_VPCMOV,
25187 IX86_BUILTIN_VPCMOV_V2DI,
25188 IX86_BUILTIN_VPCMOV_V4SI,
25189 IX86_BUILTIN_VPCMOV_V8HI,
25190 IX86_BUILTIN_VPCMOV_V16QI,
25191 IX86_BUILTIN_VPCMOV_V4SF,
25192 IX86_BUILTIN_VPCMOV_V2DF,
25193 IX86_BUILTIN_VPCMOV256,
25194 IX86_BUILTIN_VPCMOV_V4DI256,
25195 IX86_BUILTIN_VPCMOV_V8SI256,
25196 IX86_BUILTIN_VPCMOV_V16HI256,
25197 IX86_BUILTIN_VPCMOV_V32QI256,
25198 IX86_BUILTIN_VPCMOV_V8SF256,
25199 IX86_BUILTIN_VPCMOV_V4DF256,
25200
25201 IX86_BUILTIN_VPPERM,
25202
25203 IX86_BUILTIN_VPMACSSWW,
25204 IX86_BUILTIN_VPMACSWW,
25205 IX86_BUILTIN_VPMACSSWD,
25206 IX86_BUILTIN_VPMACSWD,
25207 IX86_BUILTIN_VPMACSSDD,
25208 IX86_BUILTIN_VPMACSDD,
25209 IX86_BUILTIN_VPMACSSDQL,
25210 IX86_BUILTIN_VPMACSSDQH,
25211 IX86_BUILTIN_VPMACSDQL,
25212 IX86_BUILTIN_VPMACSDQH,
25213 IX86_BUILTIN_VPMADCSSWD,
25214 IX86_BUILTIN_VPMADCSWD,
25215
25216 IX86_BUILTIN_VPHADDBW,
25217 IX86_BUILTIN_VPHADDBD,
25218 IX86_BUILTIN_VPHADDBQ,
25219 IX86_BUILTIN_VPHADDWD,
25220 IX86_BUILTIN_VPHADDWQ,
25221 IX86_BUILTIN_VPHADDDQ,
25222 IX86_BUILTIN_VPHADDUBW,
25223 IX86_BUILTIN_VPHADDUBD,
25224 IX86_BUILTIN_VPHADDUBQ,
25225 IX86_BUILTIN_VPHADDUWD,
25226 IX86_BUILTIN_VPHADDUWQ,
25227 IX86_BUILTIN_VPHADDUDQ,
25228 IX86_BUILTIN_VPHSUBBW,
25229 IX86_BUILTIN_VPHSUBWD,
25230 IX86_BUILTIN_VPHSUBDQ,
25231
25232 IX86_BUILTIN_VPROTB,
25233 IX86_BUILTIN_VPROTW,
25234 IX86_BUILTIN_VPROTD,
25235 IX86_BUILTIN_VPROTQ,
25236 IX86_BUILTIN_VPROTB_IMM,
25237 IX86_BUILTIN_VPROTW_IMM,
25238 IX86_BUILTIN_VPROTD_IMM,
25239 IX86_BUILTIN_VPROTQ_IMM,
25240
25241 IX86_BUILTIN_VPSHLB,
25242 IX86_BUILTIN_VPSHLW,
25243 IX86_BUILTIN_VPSHLD,
25244 IX86_BUILTIN_VPSHLQ,
25245 IX86_BUILTIN_VPSHAB,
25246 IX86_BUILTIN_VPSHAW,
25247 IX86_BUILTIN_VPSHAD,
25248 IX86_BUILTIN_VPSHAQ,
25249
25250 IX86_BUILTIN_VFRCZSS,
25251 IX86_BUILTIN_VFRCZSD,
25252 IX86_BUILTIN_VFRCZPS,
25253 IX86_BUILTIN_VFRCZPD,
25254 IX86_BUILTIN_VFRCZPS256,
25255 IX86_BUILTIN_VFRCZPD256,
25256
25257 IX86_BUILTIN_VPCOMEQUB,
25258 IX86_BUILTIN_VPCOMNEUB,
25259 IX86_BUILTIN_VPCOMLTUB,
25260 IX86_BUILTIN_VPCOMLEUB,
25261 IX86_BUILTIN_VPCOMGTUB,
25262 IX86_BUILTIN_VPCOMGEUB,
25263 IX86_BUILTIN_VPCOMFALSEUB,
25264 IX86_BUILTIN_VPCOMTRUEUB,
25265
25266 IX86_BUILTIN_VPCOMEQUW,
25267 IX86_BUILTIN_VPCOMNEUW,
25268 IX86_BUILTIN_VPCOMLTUW,
25269 IX86_BUILTIN_VPCOMLEUW,
25270 IX86_BUILTIN_VPCOMGTUW,
25271 IX86_BUILTIN_VPCOMGEUW,
25272 IX86_BUILTIN_VPCOMFALSEUW,
25273 IX86_BUILTIN_VPCOMTRUEUW,
25274
25275 IX86_BUILTIN_VPCOMEQUD,
25276 IX86_BUILTIN_VPCOMNEUD,
25277 IX86_BUILTIN_VPCOMLTUD,
25278 IX86_BUILTIN_VPCOMLEUD,
25279 IX86_BUILTIN_VPCOMGTUD,
25280 IX86_BUILTIN_VPCOMGEUD,
25281 IX86_BUILTIN_VPCOMFALSEUD,
25282 IX86_BUILTIN_VPCOMTRUEUD,
25283
25284 IX86_BUILTIN_VPCOMEQUQ,
25285 IX86_BUILTIN_VPCOMNEUQ,
25286 IX86_BUILTIN_VPCOMLTUQ,
25287 IX86_BUILTIN_VPCOMLEUQ,
25288 IX86_BUILTIN_VPCOMGTUQ,
25289 IX86_BUILTIN_VPCOMGEUQ,
25290 IX86_BUILTIN_VPCOMFALSEUQ,
25291 IX86_BUILTIN_VPCOMTRUEUQ,
25292
25293 IX86_BUILTIN_VPCOMEQB,
25294 IX86_BUILTIN_VPCOMNEB,
25295 IX86_BUILTIN_VPCOMLTB,
25296 IX86_BUILTIN_VPCOMLEB,
25297 IX86_BUILTIN_VPCOMGTB,
25298 IX86_BUILTIN_VPCOMGEB,
25299 IX86_BUILTIN_VPCOMFALSEB,
25300 IX86_BUILTIN_VPCOMTRUEB,
25301
25302 IX86_BUILTIN_VPCOMEQW,
25303 IX86_BUILTIN_VPCOMNEW,
25304 IX86_BUILTIN_VPCOMLTW,
25305 IX86_BUILTIN_VPCOMLEW,
25306 IX86_BUILTIN_VPCOMGTW,
25307 IX86_BUILTIN_VPCOMGEW,
25308 IX86_BUILTIN_VPCOMFALSEW,
25309 IX86_BUILTIN_VPCOMTRUEW,
25310
25311 IX86_BUILTIN_VPCOMEQD,
25312 IX86_BUILTIN_VPCOMNED,
25313 IX86_BUILTIN_VPCOMLTD,
25314 IX86_BUILTIN_VPCOMLED,
25315 IX86_BUILTIN_VPCOMGTD,
25316 IX86_BUILTIN_VPCOMGED,
25317 IX86_BUILTIN_VPCOMFALSED,
25318 IX86_BUILTIN_VPCOMTRUED,
25319
25320 IX86_BUILTIN_VPCOMEQQ,
25321 IX86_BUILTIN_VPCOMNEQ,
25322 IX86_BUILTIN_VPCOMLTQ,
25323 IX86_BUILTIN_VPCOMLEQ,
25324 IX86_BUILTIN_VPCOMGTQ,
25325 IX86_BUILTIN_VPCOMGEQ,
25326 IX86_BUILTIN_VPCOMFALSEQ,
25327 IX86_BUILTIN_VPCOMTRUEQ,
25328
25329 /* LWP instructions. */
25330 IX86_BUILTIN_LLWPCB,
25331 IX86_BUILTIN_SLWPCB,
25332 IX86_BUILTIN_LWPVAL32,
25333 IX86_BUILTIN_LWPVAL64,
25334 IX86_BUILTIN_LWPINS32,
25335 IX86_BUILTIN_LWPINS64,
25336
25337 IX86_BUILTIN_CLZS,
25338
25339 /* BMI instructions. */
25340 IX86_BUILTIN_BEXTR32,
25341 IX86_BUILTIN_BEXTR64,
25342 IX86_BUILTIN_CTZS,
25343
25344 /* TBM instructions. */
25345 IX86_BUILTIN_BEXTRI32,
25346 IX86_BUILTIN_BEXTRI64,
25347
25348 /* BMI2 instructions. */
25349 IX86_BUILTIN_BZHI32,
25350 IX86_BUILTIN_BZHI64,
25351 IX86_BUILTIN_PDEP32,
25352 IX86_BUILTIN_PDEP64,
25353 IX86_BUILTIN_PEXT32,
25354 IX86_BUILTIN_PEXT64,
25355
25356 /* FSGSBASE instructions. */
25357 IX86_BUILTIN_RDFSBASE32,
25358 IX86_BUILTIN_RDFSBASE64,
25359 IX86_BUILTIN_RDGSBASE32,
25360 IX86_BUILTIN_RDGSBASE64,
25361 IX86_BUILTIN_WRFSBASE32,
25362 IX86_BUILTIN_WRFSBASE64,
25363 IX86_BUILTIN_WRGSBASE32,
25364 IX86_BUILTIN_WRGSBASE64,
25365
25366 /* RDRND instructions. */
25367 IX86_BUILTIN_RDRAND16_STEP,
25368 IX86_BUILTIN_RDRAND32_STEP,
25369 IX86_BUILTIN_RDRAND64_STEP,
25370
25371 /* F16C instructions. */
25372 IX86_BUILTIN_CVTPH2PS,
25373 IX86_BUILTIN_CVTPH2PS256,
25374 IX86_BUILTIN_CVTPS2PH,
25375 IX86_BUILTIN_CVTPS2PH256,
25376
25377 /* CFString built-in for darwin */
25378 IX86_BUILTIN_CFSTRING,
25379
25380 IX86_BUILTIN_MAX
25381 };
25382
25383 /* Table for the ix86 builtin decls. */
25384 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
25385
25386 /* Table of all of the builtin functions that are possible with different ISAs
25387 but are waiting to be built until a function is declared to use that
25388 ISA. */
25389 struct builtin_isa {
25390 const char *name; /* function name */
25391 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
25392 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
25393 bool const_p; /* true if the declaration is constant */
25394   bool set_and_not_built_p;	    /* true if builtin was deferred, not yet built */
25395 };
25396
25397 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
25398
25399
25400 /* Add an ix86 target builtin function with MASK, NAME, TCODE and CODE.  Save
25401    MASK, the set of isa_flags this builtin needs, in the ix86_builtins_isa
25402    array.  Store the function decl in the ix86_builtins array.  Return the
25403    function decl, or NULL_TREE if the builtin was not added.
25404
25405    If the front end has a special hook for builtin functions, delay adding
25406    builtin functions that aren't in the current ISA until the ISA is changed
25407    with function specific optimization.  Doing so can save about 300K for the
25408    default compiler.  When the builtin is expanded, check at that time whether
25409    it is valid.
25410
25411    If the front end doesn't have a special hook, record all builtins, even if
25412    they aren't in the current ISA, in case the user uses function specific
25413    options for a different ISA; that way we don't get scope errors if a
25414    builtin is added in the middle of a function scope. */
25415
25416 static inline tree
25417 def_builtin (HOST_WIDE_INT mask, const char *name,
25418 enum ix86_builtin_func_type tcode,
25419 enum ix86_builtins code)
25420 {
25421 tree decl = NULL_TREE;
25422
25423 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
25424 {
25425 ix86_builtins_isa[(int) code].isa = mask;
25426
25427 mask &= ~OPTION_MASK_ISA_64BIT;
25428 if (mask == 0
25429 || (mask & ix86_isa_flags) != 0
25430 || (lang_hooks.builtin_function
25431 == lang_hooks.builtin_function_ext_scope))
25432
25433 {
25434 tree type = ix86_get_builtin_func_type (tcode);
25435 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
25436 NULL, NULL_TREE);
25437 ix86_builtins[(int) code] = decl;
25438 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
25439 }
25440 else
25441 {
25442 ix86_builtins[(int) code] = NULL_TREE;
25443 ix86_builtins_isa[(int) code].tcode = tcode;
25444 ix86_builtins_isa[(int) code].name = name;
25445 ix86_builtins_isa[(int) code].const_p = false;
25446 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
25447 }
25448 }
25449
25450 return decl;
25451 }
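/* For illustration, a registration later in this file looks roughly like:

     def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
		  VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);

   (illustrative; the exact type codes come from i386-builtin-types.def).  */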
25452
25453 /* Like def_builtin, but also marks the function decl "const". */
25454
25455 static inline tree
25456 def_builtin_const (HOST_WIDE_INT mask, const char *name,
25457 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
25458 {
25459 tree decl = def_builtin (mask, name, tcode, code);
25460 if (decl)
25461 TREE_READONLY (decl) = 1;
25462 else
25463 ix86_builtins_isa[(int) code].const_p = true;
25464
25465 return decl;
25466 }
25467
25468 /* Add any new builtin functions for a given ISA that may not have been
25469 declared. This saves a bit of space compared to adding all of the
25470 declarations to the tree, even if we didn't use them. */
25471
25472 static void
25473 ix86_add_new_builtins (HOST_WIDE_INT isa)
25474 {
25475 int i;
25476
25477 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
25478 {
25479 if ((ix86_builtins_isa[i].isa & isa) != 0
25480 && ix86_builtins_isa[i].set_and_not_built_p)
25481 {
25482 tree decl, type;
25483
25484 /* Don't define the builtin again. */
25485 ix86_builtins_isa[i].set_and_not_built_p = false;
25486
25487 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
25488 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
25489 type, i, BUILT_IN_MD, NULL,
25490 NULL_TREE);
25491
25492 ix86_builtins[i] = decl;
25493 if (ix86_builtins_isa[i].const_p)
25494 TREE_READONLY (decl) = 1;
25495 }
25496 }
25497 }
25498
25499 /* Bits for builtin_description.flag. */
25500
25501 /* Set when we don't support the comparison natively, and should
25502    swap the comparison operands in order to support it.  */
25503 #define BUILTIN_DESC_SWAP_OPERANDS 1
25504
25505 struct builtin_description
25506 {
25507 const HOST_WIDE_INT mask;
25508 const enum insn_code icode;
25509 const char *const name;
25510 const enum ix86_builtins code;
25511 const enum rtx_code comparison;
25512 const int flag;
25513 };
25514
25515 static const struct builtin_description bdesc_comi[] =
25516 {
25517 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
25518 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
25519 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
25520 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
25521 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
25522 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
25523 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
25524 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
25525 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
25526 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
25527 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
25528 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
25529 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
25530 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
25531 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
25532 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
25533 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
25534 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
25535 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
25536 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
25537 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
25538 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
25539 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
25540 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
25541 };
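
/* Illustrative only: each bdesc_comi entry above becomes a builtin taking
   two V4SF vectors and returning an int flag, which is roughly how the
   <xmmintrin.h> style wrappers use it.  Sketch under that assumption:  */
#if 0
typedef float __v4sf __attribute__ ((__vector_size__ (16)));

static int
example_scalar_equal (__v4sf a, __v4sf b)
{
  /* COMISS-based compare of the low elements; nonzero when equal.  */
  return __builtin_ia32_comieq (a, b);
}
#endif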
25542
25543 static const struct builtin_description bdesc_pcmpestr[] =
25544 {
25545 /* SSE4.2 */
25546 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
25547 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
25548 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
25549 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
25550 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
25551 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
25552 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
25553 };
25554
25555 static const struct builtin_description bdesc_pcmpistr[] =
25556 {
25557 /* SSE4.2 */
25558 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
25559 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
25560 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
25561 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
25562 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
25563 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
25564 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
25565 };
25566
25567 /* Special builtins with variable number of arguments. */
25568 static const struct builtin_description bdesc_special_args[] =
25569 {
25570 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtsc, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
25571 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtscp, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
25572 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
25573
25574 /* MMX */
25575 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
25576
25577 /* 3DNow! */
25578 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
25579
25580 /* SSE */
25581 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
25582 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
25583 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
25584
25585 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
25586 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
25587 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
25588 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
25589
25590 /* SSE or 3DNow!A */
25591 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
25592 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntdi, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
25593
25594 /* SSE2 */
25595 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
25596 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
25597 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
25598 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
25599 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
25600 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
25601 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntsi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
25602 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
25603 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
25604
25605 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
25606 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
25607
25608 /* SSE3 */
25609 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
25610
25611 /* SSE4.1 */
25612 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
25613
25614 /* SSE4A */
25615 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
25616 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
25617
25618 /* AVX */
25619 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
25620 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
25621
25622 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
25623 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
25624 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
25625 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
25626 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
25627
25628 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
25629 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
25630 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
25631 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
25632 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
25633 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
25634 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
25635
25636 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
25637 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
25638 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
25639
25640 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
25641 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
25642 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
25643 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
25644 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
25645 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
25646 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
25647 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
25648
25649 /* AVX2 */
25650 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
25651 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
25652 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
25653 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
25654 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
25655 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
25656 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
25657 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
25658 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
25659
25660 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
25661 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
25662 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
25663 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
25664 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
25665 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
25666
25667 /* FSGSBASE */
25668 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
25669 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
25670 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
25671 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
25672 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
25673 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
25674 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
25675 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
25676 };
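
/* Illustrative only: the last field of each entry above names the builtin's
   signature.  For instance, VOID_FTYPE_PFLOAT_V4SF on the
   "__builtin_ia32_movntps" entry means a builtin taking a float pointer and
   a V4SF vector, used from client code roughly as sketched below.  */
#if 0
typedef float __v4sf __attribute__ ((__vector_size__ (16)));

static void
example_stream_store (float *dst, __v4sf value)
{
  /* Non-temporal store of VALUE to *DST via the SSE builtin declared by
     the bdesc_special_args entry above.  */
  __builtin_ia32_movntps (dst, value);
}
#endif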
25677
25678 /* Builtins with variable number of arguments. */
25679 static const struct builtin_description bdesc_args[] =
25680 {
25681 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
25682 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
25683 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdpmc, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
25684 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
25685 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
25686 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
25687 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
25688
25689 /* MMX */
25690 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25691 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25692 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25693 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25694 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25695 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25696
25697 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25698 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25699 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25700 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25701 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25702 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25703 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25704 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25705
25706 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25707 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25708
25709 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25710 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25711 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25712 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25713
25714 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25715 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25716 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25717 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25718 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25719 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25720
25721 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25722 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25723 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25724 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25725 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25726 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25727
25728 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
25729 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
25730 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
25731
25732 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
25733
25734 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
25735 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
25736 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
25737 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
25738 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
25739 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
25740
25741 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
25742 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
25743 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
25744 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
25745 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
25746 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
25747
25748 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
25749 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
25750 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
25751 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
25752
25753 /* 3DNow! */
25754 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
25755 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
25756 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
25757 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
25758
25759 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25760 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25761 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25762 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
25763 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
25764 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
25765 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25766 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25767 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25768 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25769 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25770 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25771 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25772 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25773 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25774
25775 /* 3DNow!A */
25776 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
25777 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
25778 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
25779 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
25780 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25781 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25782
25783 /* SSE */
25784 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
25785 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
25786 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
25787 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
25788 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
25789 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
25790 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
25791 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
25792 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
25793 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
25794 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
25795 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
25796
25797 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
25798
25799 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25800 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25801 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25802 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25803 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25804 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25805 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25806 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25807
25808 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
25809 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
25810 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
25811 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
25812 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
25813 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
25814 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
25815 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
25816 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
25817 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
25818 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP},
25819 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
25820 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
25821 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
25822 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
25823 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
25824 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
25825 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
25826 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
25827 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
25828 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
25829 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
25830
25831 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25832 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25833 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25834 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25835
25836 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25837 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25838 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25839 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25840
25841 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25842
25843 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25844 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25845 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25846 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25847 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25848
25849 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
25850 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
25851 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
25852
25853 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
25854
25855 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
25856 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
25857 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
25858
25859 /* SSE MMX or 3DNow!A */
25860 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25861 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25862 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25863
25864 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25865 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25866 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25867 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25868
25869 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
25870 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
25871
25872 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
25873
25874 /* SSE2 */
25875 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
25876
25877 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
25878 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
25879 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
25880 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
25881 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
25882
25883 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
25884 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
25885 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
25886 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
25887 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
25888
25889 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
25890
25891 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
25892 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
25893 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
25894 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
25895
25896 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
25897 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
25898 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
25899
25900 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25901 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25902 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25903 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25904 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25905 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25906 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25907 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25908
25909 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
25910 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
25911 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
25912 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
25913 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP},
25914 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
25915 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
25916 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
25917 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
25918 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
25919 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
25920 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
25921 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
25922 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
25923 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
25924 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
25925 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
25926 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
25927 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
25928 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
25929
25930 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25931 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25932 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25933 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25934
25935 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25936 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25937 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25938 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25939
25940 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25941
25942 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25943 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25944 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25945
25946 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
25947
25948 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
25949 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25950 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
25951 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
25952 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
25953 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25954 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
25955 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
25956
25957 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
25958 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25959 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
25960 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25961 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
25962 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25963 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
25964 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25965
25966 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25967 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25968
25969 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
25970 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
25971 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
25972 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
25973
25974 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
25975 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25976
25977 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
25978 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25979 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
25980 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
25981 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25982 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
25983
25984 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
25985 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25986 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
25987 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25988
25989 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
25990 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25991 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
25992 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
25993 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
25994 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25995 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
25996 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
25997
25998 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
25999 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26000 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26001
26002 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26003 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
26004
26005 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
26006 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26007
26008 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
26009
26010 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
26011 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
26012 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
26013 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
26014
26015 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26016 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26017 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26018 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26019 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26020 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26021 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26022
26023 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26024 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26025 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26026 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26027 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26028 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26029 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26030
26031 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26032 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26033 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26034 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26035
26036 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
26037 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26038 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26039
26040 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
26041
26042 { OPTION_MASK_ISA_SSE2, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
26043 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
26044
26045 { OPTION_MASK_ISA_SSE, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26046
26047 /* SSE2 MMX */
26048 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26049 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26050
26051 /* SSE3 */
26052 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26053 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26054
26055 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26056 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26057 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26058 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26059 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26060 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26061
26062 /* SSSE3 */
26063 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
26064 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
26065 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26066 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
26067 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
26068 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26069
26070 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26071 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26072 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26073 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26074 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26075 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26076 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26077 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26078 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26079 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26080 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26081 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26082 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
26083 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
26084 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26085 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26086 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26087 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26088 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26089 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26090 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26091 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26092 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26093 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26094
26095 /* SSSE3 palignr */
26096 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
26097 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
26098
26099 /* SSE4.1 */
26100 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26101 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26102 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
26103 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
26104 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26105 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26106 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26107 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
26108 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
26109 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
26110
26111 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26112 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26113 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26114 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26115 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26116 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26117 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26118 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26119 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26120 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26121 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26122 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26123 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26124
26125 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26126 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26127 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26128 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26129 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26130 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26131 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26132 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26133 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26134 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26135 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26136 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26137
26138 /* SSE4.1 (rounding and ptest, gated by OPTION_MASK_ISA_ROUND) */
26139 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26140 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26141 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26142 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26143
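/* For the floor/ceil/trunc/rint variants below, the ROUND_* value
   stored in the comparison slot supplies the rounding-mode immediate
   when the roundpd/roundps pattern is expanded.  */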
26144 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
26145 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
26146 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
26147 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
26148
26149 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26150
26151 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
26152 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
26153 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
26154 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
26155
26156 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26157
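/* The ptest (and the later vtest) entries use the rtx code to select
   which condition of the compare result is returned: EQ reads ZF
   (testz), LTU reads CF (testc), and GTU reads the neither-zero-nor-
   carry combination (testnzc).  */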
26158 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26159 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26160 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26161
26162 /* SSE4.2 */
26163 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26164 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
26165 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
26166 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26167 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26168
26169 /* SSE4A */
26170 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
26171 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
26172 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
26173 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26174
26175 /* AES */
26176 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
26177 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26178
26179 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26180 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26181 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26182 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26183
26184 /* PCLMUL */
26185 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
26186
26187 /* AVX */
26188 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26189 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26190 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26191 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26192 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26193 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26194 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26195 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26196 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26197 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26198 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26199 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26200 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26201 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26202 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26203 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26204 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26205 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26206 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26207 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26208 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26209 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26210 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26211 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26212 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26213 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26214
26215 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
26216 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
26217 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
26218 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
26219
26220 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26221 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26222 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
26223 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
26224 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26225 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26226 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26227 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26228 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26229 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26230 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26231 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26232 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26233 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
26234 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
26235 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
26236 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
26237 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
26238 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
26239 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26240 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
26241 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26242 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26243 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26244 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26245 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26246 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
26247 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26248 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26249 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26250 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26251 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
26252 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
26253 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
26254
26255 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26256 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26257 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26258
26259 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26260 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26261 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26262 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26263 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26264
26265 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26266
26267 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26268 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26269
26270 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
26271 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
26272 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
26273 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
26274
26275 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26276
26277 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
26278 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
26279 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
26280 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
26281
26282 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26283
26284 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26285 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26286 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26287 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26288
26289 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
26290 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
26291 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
26292 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
26293 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
26294 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
26295
26296 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26297 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26298 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26299 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26300 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26301 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26302 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26303 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26304 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26305 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26306 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26307 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26308 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26309 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26310 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26311
26312 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
26313 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
26314
26315 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26316 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26317
26318 /* AVX2 */
26319 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
26320 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
26321 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
26322 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
26323 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26324 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26325 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26326 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26327 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26328 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26329 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26330 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26331 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26332 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26333 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26334 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26335 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
26336 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26337 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26338 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26339 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26340 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
26341 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
26342 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26343 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26344 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26345 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26346 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26347 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26348 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26349 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26350 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26351 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26352 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26353 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26354 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26355 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26356 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26357 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
26358 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26359 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26360 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26361 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26362 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26363 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26364 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26365 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26366 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26367 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26368 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26369 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26370 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
26371 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26372 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26373 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26374 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26375 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26376 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
26377 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26378 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26379 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26380 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26381 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26382 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
26383 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mulv4siv4di3 , "__builtin_ia32_pmuldq256" , IX86_BUILTIN_PMULDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
26384 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26385 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26386 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26387 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26388 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26389 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulv4siv4di3 , "__builtin_ia32_pmuludq256" , IX86_BUILTIN_PMULUDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
26390 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26391 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26392 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26393 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
26394 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
26395 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
26396 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26397 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26398 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26399 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
26400 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26401 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26402 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26403 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26404 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
26405 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
26406 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26407 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26408 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26409 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26410 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
26411 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26412 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26413 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26414 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26415 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
26416 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
26417 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26418 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26419 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26420 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26421 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26422 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26423 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26424 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26425 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26426 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26427 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26428 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26429 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26430 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26431 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26432 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26433 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26434 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26435 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
26436 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
26437 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
26438 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
26439 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
26440 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
26441 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
26442 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
26443 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
26444 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
26445 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26446 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
26447 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26448 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26449 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26450 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26451 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
26452 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
26453 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
26454 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
26455 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26456 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26457 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26458 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26459 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26460 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26461 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26462 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26463 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26464 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26465
26466 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
26467
26468 /* BMI */
26469 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26470 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26471 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
26472
26473 /* TBM */
26474 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26475 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26476
26477 /* F16C */
26478 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
26479 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
26480 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
26481 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
26482
26483 /* BMI2 */
26484 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26485 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26486 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26487 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26488 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26489 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26490 };
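/* An illustrative reading of one entry in the table above: the AVX2 line

     { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi,
       "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256,
       UNKNOWN, (int) V32QI_FTYPE_V16QI }

   is consumed by ix86_init_mmx_sse_builtins below, which registers a const
   builtin named __builtin_ia32_pbroadcastb256, gated on OPTION_MASK_ISA_AVX2,
   taking one V16QI argument and returning V32QI, to be expanded later through
   the avx2_pbroadcastv32qi insn pattern. */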
26491
26492 /* FMA4 and XOP. */
26493 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
26494 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
26495 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
26496 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
26497 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
26498 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
26499 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
26500 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
26501 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
26502 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
26503 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
26504 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
26505 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
26506 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
26507 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
26508 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
26509 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
26510 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
26511 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
26512 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
26513 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
26514 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
26515 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
26516 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
26517 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
26518 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
26519 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
26520 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
26521 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
26522 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
26523 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
26524 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
26525 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
26526 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
26527 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
26528 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
26529 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
26530 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
26531 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
26532 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
26533 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
26534 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
26535 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
26536 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
26537 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
26538 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
26539 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
26540 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
26541 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
26542 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
26543 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
26544 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
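/* These MULTI_ARG_* names are simply short aliases for ix86_builtin_func_type
   values so that the bdesc_multi_arg table below stays readable.  For example,
   MULTI_ARG_3_SF stands for V4SF_FTYPE_V4SF_V4SF_V4SF, i.e. a three-operand
   single-precision builtin such as __builtin_ia32_vfmaddss;
   ix86_expand_multi_arg_builtin switches on these same names to recover the
   operand count. */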
26545
26546 static const struct builtin_description bdesc_multi_arg[] =
26547 {
26548 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
26549 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
26550 UNKNOWN, (int)MULTI_ARG_3_SF },
26551 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
26552 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
26553 UNKNOWN, (int)MULTI_ARG_3_DF },
26554
26555 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
26556 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
26557 UNKNOWN, (int)MULTI_ARG_3_SF },
26558 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
26559 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
26560 UNKNOWN, (int)MULTI_ARG_3_DF },
26561
26562 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
26563 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
26564 UNKNOWN, (int)MULTI_ARG_3_SF },
26565 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
26566 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
26567 UNKNOWN, (int)MULTI_ARG_3_DF },
26568 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
26569 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
26570 UNKNOWN, (int)MULTI_ARG_3_SF2 },
26571 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
26572 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
26573 UNKNOWN, (int)MULTI_ARG_3_DF2 },
26574
26575 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
26576 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
26577 UNKNOWN, (int)MULTI_ARG_3_SF },
26578 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
26579 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
26580 UNKNOWN, (int)MULTI_ARG_3_DF },
26581 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
26582 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
26583 UNKNOWN, (int)MULTI_ARG_3_SF2 },
26584 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
26585 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
26586 UNKNOWN, (int)MULTI_ARG_3_DF2 },
26587
26588 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
26589 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
26590 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
26591 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
26592 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi",IX86_BUILTIN_VPCMOV_V16QI,UNKNOWN, (int)MULTI_ARG_3_QI },
26593 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
26594 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
26595
26596 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
26597 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
26598 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
26599 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
26600 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
26601 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
26602 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
26603
26604 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
26605
26606 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
26607 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
26608 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
26609 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
26610 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
26611 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
26612 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
26613 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
26614 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
26615 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
26616 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
26617 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
26618
26619 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
26620 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
26621 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
26622 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
26623 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
26624 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
26625 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
26626 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
26627 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
26628 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
26629 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
26630 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
26631 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
26632 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
26633 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
26634 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
26635
26636 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
26637 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
26638 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
26639 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
26640 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
26641 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
26642
26643 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
26644 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
26645 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
26646 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
26647 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
26648 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
26649 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
26650 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
26651 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
26652 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
26653 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
26654 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
26655 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
26656 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
26657 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
26658
26659 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
26660 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
26661 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
26662 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
26663 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
26664 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
26665 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
26666
26667 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
26668 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
26669 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
26670 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
26671 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
26672 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
26673 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
26674
26675 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
26676 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
26677 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
26678 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
26679 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
26680 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
26681 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
26682
26683 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
26684 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
26685 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
26686 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
26687 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
26688 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
26689 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
26690
26691 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
26692 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
26693 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
26694 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
26695 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
26696 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
26697 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
26698
26699 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
26700 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
26701 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
26702 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
26703 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
26704 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
26705 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
26706
26707 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
26708 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
26709 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
26710 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
26711 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
26712 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
26713 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
26714
26715 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
26716 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
26717 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
26718 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
26719 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
26720 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
26721 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
26722
26723 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
26724 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
26725 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
26726 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
26727 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
26728 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
26729 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
26730 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
26731
26732 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
26733 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
26734 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
26735 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
26736 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
26737 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
26738 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
26739 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
26740
26741 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
26742 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
26743 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
26744 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
26745
26746 };
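/* A note on the comparison entries above (an illustrative sketch): for the
   vpcom* builtins the fifth field, which is UNKNOWN elsewhere in the table,
   holds the rtx comparison code (EQ, NE, LT, LTU, ...) that
   ix86_expand_multi_arg_builtin later receives as its SUB_CODE argument and
   wraps around the operands.  Note also that two spellings such as
   __builtin_ia32_vpcomneb and __builtin_ia32_vpcomneqb are registered for the
   same builtin code (IX86_BUILTIN_VPCOMNEB), so both names are accepted. */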
26747
26748 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
26749 not in the current target ISA, so that the user can compile particular
26750 modules with target-specific options that differ from the command-line
26751 options. */
26752 static void
26753 ix86_init_mmx_sse_builtins (void)
26754 {
26755 const struct builtin_description * d;
26756 enum ix86_builtin_func_type ftype;
26757 size_t i;
26758
26759 /* Add all special builtins with a variable number of operands. */
26760 for (i = 0, d = bdesc_special_args;
26761 i < ARRAY_SIZE (bdesc_special_args);
26762 i++, d++)
26763 {
26764 if (d->name == 0)
26765 continue;
26766
26767 ftype = (enum ix86_builtin_func_type) d->flag;
26768 def_builtin (d->mask, d->name, ftype, d->code);
26769 }
26770
26771 /* Add all builtins with a variable number of operands. */
26772 for (i = 0, d = bdesc_args;
26773 i < ARRAY_SIZE (bdesc_args);
26774 i++, d++)
26775 {
26776 if (d->name == 0)
26777 continue;
26778
26779 ftype = (enum ix86_builtin_func_type) d->flag;
26780 def_builtin_const (d->mask, d->name, ftype, d->code);
26781 }
26782
26783 /* pcmpestr[im] insns. */
26784 for (i = 0, d = bdesc_pcmpestr;
26785 i < ARRAY_SIZE (bdesc_pcmpestr);
26786 i++, d++)
26787 {
26788 if (d->code == IX86_BUILTIN_PCMPESTRM128)
26789 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
26790 else
26791 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
26792 def_builtin_const (d->mask, d->name, ftype, d->code);
26793 }
26794
26795 /* pcmpistr[im] insns. */
26796 for (i = 0, d = bdesc_pcmpistr;
26797 i < ARRAY_SIZE (bdesc_pcmpistr);
26798 i++, d++)
26799 {
26800 if (d->code == IX86_BUILTIN_PCMPISTRM128)
26801 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
26802 else
26803 ftype = INT_FTYPE_V16QI_V16QI_INT;
26804 def_builtin_const (d->mask, d->name, ftype, d->code);
26805 }
26806
26807 /* comi/ucomi insns. */
26808 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
26809 {
26810 if (d->mask == OPTION_MASK_ISA_SSE2)
26811 ftype = INT_FTYPE_V2DF_V2DF;
26812 else
26813 ftype = INT_FTYPE_V4SF_V4SF;
26814 def_builtin_const (d->mask, d->name, ftype, d->code);
26815 }
26816
26817 /* SSE */
26818 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
26819 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
26820 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
26821 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
26822
26823 /* SSE or 3DNow!A */
26824 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
26825 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
26826 IX86_BUILTIN_MASKMOVQ);
26827
26828 /* SSE2 */
26829 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
26830 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
26831
26832 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
26833 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
26834 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
26835 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
26836
26837 /* SSE3. */
26838 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
26839 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
26840 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
26841 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
26842
26843 /* AES */
26844 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
26845 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
26846 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
26847 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
26848 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
26849 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
26850 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
26851 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
26852 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
26853 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
26854 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
26855 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
26856
26857 /* PCLMUL */
26858 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
26859 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
26860
26861 /* RDRND */
26862 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
26863 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
26864 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
26865 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
26866 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
26867 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
26868 IX86_BUILTIN_RDRAND64_STEP);
26869
26870 /* AVX2 */
26871 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
26872 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
26873 IX86_BUILTIN_GATHERSIV2DF);
26874
26875 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
26876 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
26877 IX86_BUILTIN_GATHERSIV4DF);
26878
26879 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
26880 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
26881 IX86_BUILTIN_GATHERDIV2DF);
26882
26883 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
26884 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
26885 IX86_BUILTIN_GATHERDIV4DF);
26886
26887 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
26888 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
26889 IX86_BUILTIN_GATHERSIV4SF);
26890
26891 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
26892 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
26893 IX86_BUILTIN_GATHERSIV8SF);
26894
26895 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
26896 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
26897 IX86_BUILTIN_GATHERDIV4SF);
26898
26899 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
26900 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
26901 IX86_BUILTIN_GATHERDIV8SF);
26902
26903 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
26904 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
26905 IX86_BUILTIN_GATHERSIV2DI);
26906
26907 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
26908 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
26909 IX86_BUILTIN_GATHERSIV4DI);
26910
26911 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
26912 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
26913 IX86_BUILTIN_GATHERDIV2DI);
26914
26915 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
26916 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
26917 IX86_BUILTIN_GATHERDIV4DI);
26918
26919 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
26920 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
26921 IX86_BUILTIN_GATHERSIV4SI);
26922
26923 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
26924 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
26925 IX86_BUILTIN_GATHERSIV8SI);
26926
26927 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
26928 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
26929 IX86_BUILTIN_GATHERDIV4SI);
26930
26931 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
26932 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
26933 IX86_BUILTIN_GATHERDIV8SI);
26934
26935 /* MMX access to the vec_init patterns. */
26936 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
26937 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
26938
26939 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
26940 V4HI_FTYPE_HI_HI_HI_HI,
26941 IX86_BUILTIN_VEC_INIT_V4HI);
26942
26943 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
26944 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
26945 IX86_BUILTIN_VEC_INIT_V8QI);
26946
26947 /* Access to the vec_extract patterns. */
26948 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
26949 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
26950 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
26951 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
26952 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
26953 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
26954 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
26955 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
26956 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
26957 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
26958
26959 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
26960 "__builtin_ia32_vec_ext_v4hi",
26961 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
26962
26963 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
26964 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
26965
26966 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
26967 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
26968
26969 /* Access to the vec_set patterns. */
26970 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
26971 "__builtin_ia32_vec_set_v2di",
26972 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
26973
26974 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
26975 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
26976
26977 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
26978 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
26979
26980 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
26981 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
26982
26983 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
26984 "__builtin_ia32_vec_set_v4hi",
26985 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
26986
26987 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
26988 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
26989
26990 /* Add the FMA4 and XOP multi-arg builtin instructions. */
26991 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
26992 {
26993 if (d->name == 0)
26994 continue;
26995
26996 ftype = (enum ix86_builtin_func_type) d->flag;
26997 def_builtin_const (d->mask, d->name, ftype, d->code);
26998 }
26999 }
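/* A minimal usage sketch for one of the gather builtins registered above
   (the local variable names here are hypothetical): with -mavx2 in effect,
   the V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT signature of
   __builtin_ia32_gathersiv4df corresponds to a call of the form

     res = __builtin_ia32_gathersiv4df (src, base, index, mask, 8);

   where SRC and MASK are V4DF vectors, BASE is a const double *, INDEX is a
   V4SI vector of 32-bit offsets and 8 is the element scale.  Intrinsics in
   the _mm256_i32gather_pd style from avx2intrin.h are, as a rule, thin
   wrappers around builtins like this one. */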
27000
27001 /* Internal subroutine of ix86_init_builtins. */
27002
27003 static void
27004 ix86_init_builtins_va_builtins_abi (void)
27005 {
27006 tree ms_va_ref, sysv_va_ref;
27007 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
27008 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
27009 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
27010 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
27011
27012 if (!TARGET_64BIT)
27013 return;
27014 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
27015 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
27016 ms_va_ref = build_reference_type (ms_va_list_type_node);
27017 sysv_va_ref =
27018 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
27019
27020 fnvoid_va_end_ms =
27021 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
27022 fnvoid_va_start_ms =
27023 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
27024 fnvoid_va_end_sysv =
27025 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
27026 fnvoid_va_start_sysv =
27027 build_varargs_function_type_list (void_type_node, sysv_va_ref,
27028 NULL_TREE);
27029 fnvoid_va_copy_ms =
27030 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
27031 NULL_TREE);
27032 fnvoid_va_copy_sysv =
27033 build_function_type_list (void_type_node, sysv_va_ref,
27034 sysv_va_ref, NULL_TREE);
27035
27036 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
27037 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
27038 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
27039 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
27040 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
27041 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
27042 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
27043 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27044 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
27045 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27046 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
27047 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27048 }
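/* A rough usage sketch (with a hypothetical declaration) for the builtins
   registered above: on a 64-bit target a function declared as

     void msabi_vprintf (const char *fmt, ...) __attribute__ ((ms_abi));

   uses the Microsoft variable-argument convention, and va_start, va_end and
   va_copy inside such a function resolve to __builtin_ms_va_start,
   __builtin_ms_va_end and __builtin_ms_va_copy; the __builtin_sysv_* variants
   play the same role for functions using the sysv_abi convention. */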
27049
27050 static void
27051 ix86_init_builtin_types (void)
27052 {
27053 tree float128_type_node, float80_type_node;
27054
27055 /* The __float80 type. */
27056 float80_type_node = long_double_type_node;
27057 if (TYPE_MODE (float80_type_node) != XFmode)
27058 {
27059 /* long double is not the 80-bit x87 type on this target, so create a distinct 80-bit type for __float80. */
27060 float80_type_node = make_node (REAL_TYPE);
27061
27062 TYPE_PRECISION (float80_type_node) = 80;
27063 layout_type (float80_type_node);
27064 }
27065 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
27066
27067 /* The __float128 type. */
27068 float128_type_node = make_node (REAL_TYPE);
27069 TYPE_PRECISION (float128_type_node) = 128;
27070 layout_type (float128_type_node);
27071 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
27072
27073 /* This macro is built by i386-builtin-types.awk. */
27074 DEFINE_BUILTIN_PRIMITIVE_TYPES;
27075 }
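/* After the registrations above, both extended types are usable by name in C
   code on this target, e.g. (illustrative only):

     __float80  e;      80-bit x87 extended precision
     __float128 q;      128-bit quad precision

   __float80 is simply long double when long double already has XFmode. */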
27076
27077 static void
27078 ix86_init_builtins (void)
27079 {
27080 tree t;
27081
27082 ix86_init_builtin_types ();
27083
27084 /* TFmode support builtins. */
27085 def_builtin_const (0, "__builtin_infq",
27086 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
27087 def_builtin_const (0, "__builtin_huge_valq",
27088 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
27089
27090 /* We will expand them to a normal call if SSE2 isn't available, since
27091 they are used by libgcc. */
27092 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
27093 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
27094 BUILT_IN_MD, "__fabstf2", NULL_TREE);
27095 TREE_READONLY (t) = 1;
27096 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
27097
27098 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
27099 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
27100 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
27101 TREE_READONLY (t) = 1;
27102 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
27103
27104 ix86_init_mmx_sse_builtins ();
27105
27106 if (TARGET_LP64)
27107 ix86_init_builtins_va_builtins_abi ();
27108
27109 #ifdef SUBTARGET_INIT_BUILTINS
27110 SUBTARGET_INIT_BUILTINS;
27111 #endif
27112 }
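/* Usage sketch for the TFmode builtins defined above (illustrative values
   only): a call such as

     __float128 x = __builtin_fabsq (__builtin_infq ());

   yields +inf; when SSE2 is not available, __builtin_fabsq and
   __builtin_copysignq fall back to the libgcc routines __fabstf2 and
   __copysigntf3 named in their definitions above. */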
27113
27114 /* Return the ix86 builtin for CODE. */
27115
27116 static tree
27117 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
27118 {
27119 if (code >= IX86_BUILTIN_MAX)
27120 return error_mark_node;
27121
27122 return ix86_builtins[code];
27123 }
27124
27125 /* Errors in the source file can cause expand_expr to return const0_rtx
27126 where we expect a vector. To avoid crashing, use one of the vector
27127 clear instructions. */
27128 static rtx
27129 safe_vector_operand (rtx x, enum machine_mode mode)
27130 {
27131 if (x == const0_rtx)
27132 x = CONST0_RTX (mode);
27133 return x;
27134 }
27135
27136 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
27137
27138 static rtx
27139 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
27140 {
27141 rtx pat;
27142 tree arg0 = CALL_EXPR_ARG (exp, 0);
27143 tree arg1 = CALL_EXPR_ARG (exp, 1);
27144 rtx op0 = expand_normal (arg0);
27145 rtx op1 = expand_normal (arg1);
27146 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27147 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
27148 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
27149
27150 if (VECTOR_MODE_P (mode0))
27151 op0 = safe_vector_operand (op0, mode0);
27152 if (VECTOR_MODE_P (mode1))
27153 op1 = safe_vector_operand (op1, mode1);
27154
27155 if (optimize || !target
27156 || GET_MODE (target) != tmode
27157 || !insn_data[icode].operand[0].predicate (target, tmode))
27158 target = gen_reg_rtx (tmode);
27159
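  /* If the insn wants a full 128-bit TImode operand but the caller supplied a
     plain SImode value (as happens for some shift-count operands), load the
     value into an XMM register with movd and use the TImode view of that
     register. */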
27160 if (GET_MODE (op1) == SImode && mode1 == TImode)
27161 {
27162 rtx x = gen_reg_rtx (V4SImode);
27163 emit_insn (gen_sse2_loadd (x, op1));
27164 op1 = gen_lowpart (TImode, x);
27165 }
27166
27167 if (!insn_data[icode].operand[1].predicate (op0, mode0))
27168 op0 = copy_to_mode_reg (mode0, op0);
27169 if (!insn_data[icode].operand[2].predicate (op1, mode1))
27170 op1 = copy_to_mode_reg (mode1, op1);
27171
27172 pat = GEN_FCN (icode) (target, op0, op1);
27173 if (! pat)
27174 return 0;
27175
27176 emit_insn (pat);
27177
27178 return target;
27179 }
27180
27181 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
27182
27183 static rtx
27184 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
27185 enum ix86_builtin_func_type m_type,
27186 enum rtx_code sub_code)
27187 {
27188 rtx pat;
27189 int i;
27190 int nargs;
27191 bool comparison_p = false;
27192 bool tf_p = false;
27193 bool last_arg_constant = false;
27194 int num_memory = 0;
27195 struct {
27196 rtx op;
27197 enum machine_mode mode;
27198 } args[4];
27199
27200 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27201
27202 switch (m_type)
27203 {
27204 case MULTI_ARG_4_DF2_DI_I:
27205 case MULTI_ARG_4_DF2_DI_I1:
27206 case MULTI_ARG_4_SF2_SI_I:
27207 case MULTI_ARG_4_SF2_SI_I1:
27208 nargs = 4;
27209 last_arg_constant = true;
27210 break;
27211
27212 case MULTI_ARG_3_SF:
27213 case MULTI_ARG_3_DF:
27214 case MULTI_ARG_3_SF2:
27215 case MULTI_ARG_3_DF2:
27216 case MULTI_ARG_3_DI:
27217 case MULTI_ARG_3_SI:
27218 case MULTI_ARG_3_SI_DI:
27219 case MULTI_ARG_3_HI:
27220 case MULTI_ARG_3_HI_SI:
27221 case MULTI_ARG_3_QI:
27222 case MULTI_ARG_3_DI2:
27223 case MULTI_ARG_3_SI2:
27224 case MULTI_ARG_3_HI2:
27225 case MULTI_ARG_3_QI2:
27226 nargs = 3;
27227 break;
27228
27229 case MULTI_ARG_2_SF:
27230 case MULTI_ARG_2_DF:
27231 case MULTI_ARG_2_DI:
27232 case MULTI_ARG_2_SI:
27233 case MULTI_ARG_2_HI:
27234 case MULTI_ARG_2_QI:
27235 nargs = 2;
27236 break;
27237
27238 case MULTI_ARG_2_DI_IMM:
27239 case MULTI_ARG_2_SI_IMM:
27240 case MULTI_ARG_2_HI_IMM:
27241 case MULTI_ARG_2_QI_IMM:
27242 nargs = 2;
27243 last_arg_constant = true;
27244 break;
27245
27246 case MULTI_ARG_1_SF:
27247 case MULTI_ARG_1_DF:
27248 case MULTI_ARG_1_SF2:
27249 case MULTI_ARG_1_DF2:
27250 case MULTI_ARG_1_DI:
27251 case MULTI_ARG_1_SI:
27252 case MULTI_ARG_1_HI:
27253 case MULTI_ARG_1_QI:
27254 case MULTI_ARG_1_SI_DI:
27255 case MULTI_ARG_1_HI_DI:
27256 case MULTI_ARG_1_HI_SI:
27257 case MULTI_ARG_1_QI_DI:
27258 case MULTI_ARG_1_QI_SI:
27259 case MULTI_ARG_1_QI_HI:
27260 nargs = 1;
27261 break;
27262
27263 case MULTI_ARG_2_DI_CMP:
27264 case MULTI_ARG_2_SI_CMP:
27265 case MULTI_ARG_2_HI_CMP:
27266 case MULTI_ARG_2_QI_CMP:
27267 nargs = 2;
27268 comparison_p = true;
27269 break;
27270
27271 case MULTI_ARG_2_SF_TF:
27272 case MULTI_ARG_2_DF_TF:
27273 case MULTI_ARG_2_DI_TF:
27274 case MULTI_ARG_2_SI_TF:
27275 case MULTI_ARG_2_HI_TF:
27276 case MULTI_ARG_2_QI_TF:
27277 nargs = 2;
27278 tf_p = true;
27279 break;
27280
27281 default:
27282 gcc_unreachable ();
27283 }
27284
27285 if (optimize || !target
27286 || GET_MODE (target) != tmode
27287 || !insn_data[icode].operand[0].predicate (target, tmode))
27288 target = gen_reg_rtx (tmode);
27289
27290 gcc_assert (nargs <= 4);
27291
27292 for (i = 0; i < nargs; i++)
27293 {
27294 tree arg = CALL_EXPR_ARG (exp, i);
27295 rtx op = expand_normal (arg);
27296 int adjust = (comparison_p) ? 1 : 0;
27297 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
27298
27299 if (last_arg_constant && i == nargs - 1)
27300 {
27301 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
27302 {
27303 enum insn_code new_icode = icode;
27304 switch (icode)
27305 {
27306 case CODE_FOR_xop_vpermil2v2df3:
27307 case CODE_FOR_xop_vpermil2v4sf3:
27308 case CODE_FOR_xop_vpermil2v4df3:
27309 case CODE_FOR_xop_vpermil2v8sf3:
27310 error ("the last argument must be a 2-bit immediate");
27311 return gen_reg_rtx (tmode);
27312 case CODE_FOR_xop_rotlv2di3:
27313 new_icode = CODE_FOR_rotlv2di3;
27314 goto xop_rotl;
27315 case CODE_FOR_xop_rotlv4si3:
27316 new_icode = CODE_FOR_rotlv4si3;
27317 goto xop_rotl;
27318 case CODE_FOR_xop_rotlv8hi3:
27319 new_icode = CODE_FOR_rotlv8hi3;
27320 goto xop_rotl;
27321 case CODE_FOR_xop_rotlv16qi3:
27322 new_icode = CODE_FOR_rotlv16qi3;
27323 xop_rotl:
27324 if (CONST_INT_P (op))
27325 {
27326 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
27327 op = GEN_INT (INTVAL (op) & mask);
27328 gcc_checking_assert
27329 (insn_data[icode].operand[i + 1].predicate (op, mode));
27330 }
27331 else
27332 {
27333 gcc_checking_assert
27334 (nargs == 2
27335 && insn_data[new_icode].operand[0].mode == tmode
27336 && insn_data[new_icode].operand[1].mode == tmode
27337 && insn_data[new_icode].operand[2].mode == mode
27338 && insn_data[new_icode].operand[0].predicate
27339 == insn_data[icode].operand[0].predicate
27340 && insn_data[new_icode].operand[1].predicate
27341 == insn_data[icode].operand[1].predicate);
27342 icode = new_icode;
27343 goto non_constant;
27344 }
27345 break;
27346 default:
27347 gcc_unreachable ();
27348 }
27349 }
27350 }
27351 else
27352 {
27353 non_constant:
27354 if (VECTOR_MODE_P (mode))
27355 op = safe_vector_operand (op, mode);
27356
27357 /* If we aren't optimizing, only allow one memory operand to be
27358 generated. */
27359 if (memory_operand (op, mode))
27360 num_memory++;
27361
27362 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
27363
27364 if (optimize
27365 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
27366 || num_memory > 1)
27367 op = force_reg (mode, op);
27368 }
27369
27370 args[i].op = op;
27371 args[i].mode = mode;
27372 }
27373
27374 switch (nargs)
27375 {
27376 case 1:
27377 pat = GEN_FCN (icode) (target, args[0].op);
27378 break;
27379
27380 case 2:
27381 if (tf_p)
27382 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
27383 GEN_INT ((int)sub_code));
27384 else if (! comparison_p)
27385 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
27386 else
27387 {
27388 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
27389 args[0].op,
27390 args[1].op);
27391
27392 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
27393 }
27394 break;
27395
27396 case 3:
27397 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
27398 break;
27399
27400 case 4:
27401 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
27402 break;
27403
27404 default:
27405 gcc_unreachable ();
27406 }
27407
27408 if (! pat)
27409 return 0;
27410
27411 emit_insn (pat);
27412 return target;
27413 }
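/* A sketch of how the comparison variants flow through the routine above
   (illustrative, tied to the bdesc_multi_arg table earlier in this file):
   for __builtin_ia32_vpcomltb the type is MULTI_ARG_2_QI_CMP, so NARGS is 2
   and COMPARISON_P is true; the nargs == 2 branch then builds
   gen_rtx_fmt_ee (LT, <target mode>, op0, op1) and passes that rtx as the
   first input of the xop_maskcmpv16qi3 pattern together with the two original
   operands. */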
27414
27415 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
27416 insns with vec_merge. */
27417
27418 static rtx
27419 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
27420 rtx target)
27421 {
27422 rtx pat;
27423 tree arg0 = CALL_EXPR_ARG (exp, 0);
27424 rtx op1, op0 = expand_normal (arg0);
27425 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27426 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
27427
27428 if (optimize || !target
27429 || GET_MODE (target) != tmode
27430 || !insn_data[icode].operand[0].predicate (target, tmode))
27431 target = gen_reg_rtx (tmode);
27432
27433 if (VECTOR_MODE_P (mode0))
27434 op0 = safe_vector_operand (op0, mode0);
27435
27436 if ((optimize && !register_operand (op0, mode0))
27437 || !insn_data[icode].operand[1].predicate (op0, mode0))
27438 op0 = copy_to_mode_reg (mode0, op0);
27439
27440 op1 = op0;
27441 if (!insn_data[icode].operand[2].predicate (op1, mode0))
27442 op1 = copy_to_mode_reg (mode0, op1);
27443
27444 pat = GEN_FCN (icode) (target, op0, op1);
27445 if (! pat)
27446 return 0;
27447 emit_insn (pat);
27448 return target;
27449 }
27450
27451 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
27452
27453 static rtx
27454 ix86_expand_sse_compare (const struct builtin_description *d,
27455 tree exp, rtx target, bool swap)
27456 {
27457 rtx pat;
27458 tree arg0 = CALL_EXPR_ARG (exp, 0);
27459 tree arg1 = CALL_EXPR_ARG (exp, 1);
27460 rtx op0 = expand_normal (arg0);
27461 rtx op1 = expand_normal (arg1);
27462 rtx op2;
27463 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
27464 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
27465 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
27466 enum rtx_code comparison = d->comparison;
27467
27468 if (VECTOR_MODE_P (mode0))
27469 op0 = safe_vector_operand (op0, mode0);
27470 if (VECTOR_MODE_P (mode1))
27471 op1 = safe_vector_operand (op1, mode1);
27472
27473 /* Swap operands if we have a comparison that isn't available in
27474 hardware. */
27475 if (swap)
27476 {
27477 rtx tmp = gen_reg_rtx (mode1);
27478 emit_move_insn (tmp, op1);
27479 op1 = op0;
27480 op0 = tmp;
27481 }
27482
27483 if (optimize || !target
27484 || GET_MODE (target) != tmode
27485 || !insn_data[d->icode].operand[0].predicate (target, tmode))
27486 target = gen_reg_rtx (tmode);
27487
27488 if ((optimize && !register_operand (op0, mode0))
27489 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
27490 op0 = copy_to_mode_reg (mode0, op0);
27491 if ((optimize && !register_operand (op1, mode1))
27492 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
27493 op1 = copy_to_mode_reg (mode1, op1);
27494
27495 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
27496 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
27497 if (! pat)
27498 return 0;
27499 emit_insn (pat);
27500 return target;
27501 }
27502
27503 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
27504
27505 static rtx
27506 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
27507 rtx target)
27508 {
27509 rtx pat;
27510 tree arg0 = CALL_EXPR_ARG (exp, 0);
27511 tree arg1 = CALL_EXPR_ARG (exp, 1);
27512 rtx op0 = expand_normal (arg0);
27513 rtx op1 = expand_normal (arg1);
27514 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
27515 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
27516 enum rtx_code comparison = d->comparison;
27517
27518 if (VECTOR_MODE_P (mode0))
27519 op0 = safe_vector_operand (op0, mode0);
27520 if (VECTOR_MODE_P (mode1))
27521 op1 = safe_vector_operand (op1, mode1);
27522
27523 /* Swap operands if we have a comparison that isn't available in
27524 hardware. */
27525 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
27526 {
27527 rtx tmp = op1;
27528 op1 = op0;
27529 op0 = tmp;
27530 }
27531
27532 target = gen_reg_rtx (SImode);
27533 emit_move_insn (target, const0_rtx);
27534 target = gen_rtx_SUBREG (QImode, target, 0);
27535
27536 if ((optimize && !register_operand (op0, mode0))
27537 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
27538 op0 = copy_to_mode_reg (mode0, op0);
27539 if ((optimize && !register_operand (op1, mode1))
27540 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
27541 op1 = copy_to_mode_reg (mode1, op1);
27542
27543 pat = GEN_FCN (d->icode) (op0, op1);
27544 if (! pat)
27545 return 0;
27546 emit_insn (pat);
27547 emit_insn (gen_rtx_SET (VOIDmode,
27548 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
27549 gen_rtx_fmt_ee (comparison, QImode,
27550 SET_DEST (pat),
27551 const0_rtx)));
27552
27553 return SUBREG_REG (target);
27554 }
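/* The comi/ucomi expansion above produces an integer 0/1 result without a
   dedicated compare-and-set pattern: the comi patterns set only the flags
   register, so the routine zeroes an SImode pseudo, emits the comi insn, and
   then stores COMPARISON applied to the flags result into the low byte of
   that pseudo through a STRICT_LOW_PART set, finally returning the full
   SImode register. */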
27555
27556 /* Subroutine of ix86_expand_args_builtin to take care of round insns. */
27557
27558 static rtx
27559 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
27560 rtx target)
27561 {
27562 rtx pat;
27563 tree arg0 = CALL_EXPR_ARG (exp, 0);
27564 rtx op1, op0 = expand_normal (arg0);
27565 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
27566 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
27567
27568 if (optimize || target == 0
27569 || GET_MODE (target) != tmode
27570 || !insn_data[d->icode].operand[0].predicate (target, tmode))
27571 target = gen_reg_rtx (tmode);
27572
27573 if (VECTOR_MODE_P (mode0))
27574 op0 = safe_vector_operand (op0, mode0);
27575
27576 if ((optimize && !register_operand (op0, mode0))
27577 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
27578 op0 = copy_to_mode_reg (mode0, op0);
27579
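  /* For the ROUND builtins the descriptor's comparison field is reused to
     carry the rounding-control immediate, which becomes the insn's last
     operand.  */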
27580 op1 = GEN_INT (d->comparison);
27581
27582 pat = GEN_FCN (d->icode) (target, op0, op1);
27583 if (! pat)
27584 return 0;
27585 emit_insn (pat);
27586 return target;
27587 }
27588
27589 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
27590
27591 static rtx
27592 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
27593 rtx target)
27594 {
27595 rtx pat;
27596 tree arg0 = CALL_EXPR_ARG (exp, 0);
27597 tree arg1 = CALL_EXPR_ARG (exp, 1);
27598 rtx op0 = expand_normal (arg0);
27599 rtx op1 = expand_normal (arg1);
27600 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
27601 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
27602 enum rtx_code comparison = d->comparison;
27603
27604 if (VECTOR_MODE_P (mode0))
27605 op0 = safe_vector_operand (op0, mode0);
27606 if (VECTOR_MODE_P (mode1))
27607 op1 = safe_vector_operand (op1, mode1);
27608
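  /* The ptest insn only sets the flags register; the integer result is
     recovered below by comparing the emitted pattern's SET_DEST
     (FLAGS_REG) against zero with the descriptor's comparison code.  */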
27609 target = gen_reg_rtx (SImode);
27610 emit_move_insn (target, const0_rtx);
27611 target = gen_rtx_SUBREG (QImode, target, 0);
27612
27613 if ((optimize && !register_operand (op0, mode0))
27614 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
27615 op0 = copy_to_mode_reg (mode0, op0);
27616 if ((optimize && !register_operand (op1, mode1))
27617 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
27618 op1 = copy_to_mode_reg (mode1, op1);
27619
27620 pat = GEN_FCN (d->icode) (op0, op1);
27621 if (! pat)
27622 return 0;
27623 emit_insn (pat);
27624 emit_insn (gen_rtx_SET (VOIDmode,
27625 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
27626 gen_rtx_fmt_ee (comparison, QImode,
27627 SET_DEST (pat),
27628 const0_rtx)));
27629
27630 return SUBREG_REG (target);
27631 }
27632
27633 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
27634
27635 static rtx
27636 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
27637 tree exp, rtx target)
27638 {
27639 rtx pat;
27640 tree arg0 = CALL_EXPR_ARG (exp, 0);
27641 tree arg1 = CALL_EXPR_ARG (exp, 1);
27642 tree arg2 = CALL_EXPR_ARG (exp, 2);
27643 tree arg3 = CALL_EXPR_ARG (exp, 3);
27644 tree arg4 = CALL_EXPR_ARG (exp, 4);
27645 rtx scratch0, scratch1;
27646 rtx op0 = expand_normal (arg0);
27647 rtx op1 = expand_normal (arg1);
27648 rtx op2 = expand_normal (arg2);
27649 rtx op3 = expand_normal (arg3);
27650 rtx op4 = expand_normal (arg4);
27651 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
27652
27653 tmode0 = insn_data[d->icode].operand[0].mode;
27654 tmode1 = insn_data[d->icode].operand[1].mode;
27655 modev2 = insn_data[d->icode].operand[2].mode;
27656 modei3 = insn_data[d->icode].operand[3].mode;
27657 modev4 = insn_data[d->icode].operand[4].mode;
27658 modei5 = insn_data[d->icode].operand[5].mode;
27659 modeimm = insn_data[d->icode].operand[6].mode;
27660
27661 if (VECTOR_MODE_P (modev2))
27662 op0 = safe_vector_operand (op0, modev2);
27663 if (VECTOR_MODE_P (modev4))
27664 op2 = safe_vector_operand (op2, modev4);
27665
27666 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
27667 op0 = copy_to_mode_reg (modev2, op0);
27668 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
27669 op1 = copy_to_mode_reg (modei3, op1);
27670 if ((optimize && !register_operand (op2, modev4))
27671 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
27672 op2 = copy_to_mode_reg (modev4, op2);
27673 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
27674 op3 = copy_to_mode_reg (modei5, op3);
27675
27676 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
27677 {
27678 error ("the fifth argument must be an 8-bit immediate");
27679 return const0_rtx;
27680 }
27681
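  /* Three result flavors: pcmpestri returns the index (tmode0),
     pcmpestrm returns the mask vector (tmode1), and the remaining
     builtins extract a single condition bit from the flags register
     mode recorded in d->flag.  */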
27682 if (d->code == IX86_BUILTIN_PCMPESTRI128)
27683 {
27684 if (optimize || !target
27685 || GET_MODE (target) != tmode0
27686 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
27687 target = gen_reg_rtx (tmode0);
27688
27689 scratch1 = gen_reg_rtx (tmode1);
27690
27691 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
27692 }
27693 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
27694 {
27695 if (optimize || !target
27696 || GET_MODE (target) != tmode1
27697 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
27698 target = gen_reg_rtx (tmode1);
27699
27700 scratch0 = gen_reg_rtx (tmode0);
27701
27702 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
27703 }
27704 else
27705 {
27706 gcc_assert (d->flag);
27707
27708 scratch0 = gen_reg_rtx (tmode0);
27709 scratch1 = gen_reg_rtx (tmode1);
27710
27711 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
27712 }
27713
27714 if (! pat)
27715 return 0;
27716
27717 emit_insn (pat);
27718
27719 if (d->flag)
27720 {
27721 target = gen_reg_rtx (SImode);
27722 emit_move_insn (target, const0_rtx);
27723 target = gen_rtx_SUBREG (QImode, target, 0);
27724
27725 emit_insn
27726 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
27727 gen_rtx_fmt_ee (EQ, QImode,
27728 gen_rtx_REG ((enum machine_mode) d->flag,
27729 FLAGS_REG),
27730 const0_rtx)));
27731 return SUBREG_REG (target);
27732 }
27733 else
27734 return target;
27735 }
27736
27737
27738 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
27739
27740 static rtx
27741 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
27742 tree exp, rtx target)
27743 {
27744 rtx pat;
27745 tree arg0 = CALL_EXPR_ARG (exp, 0);
27746 tree arg1 = CALL_EXPR_ARG (exp, 1);
27747 tree arg2 = CALL_EXPR_ARG (exp, 2);
27748 rtx scratch0, scratch1;
27749 rtx op0 = expand_normal (arg0);
27750 rtx op1 = expand_normal (arg1);
27751 rtx op2 = expand_normal (arg2);
27752 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
27753
27754 tmode0 = insn_data[d->icode].operand[0].mode;
27755 tmode1 = insn_data[d->icode].operand[1].mode;
27756 modev2 = insn_data[d->icode].operand[2].mode;
27757 modev3 = insn_data[d->icode].operand[3].mode;
27758 modeimm = insn_data[d->icode].operand[4].mode;
27759
27760 if (VECTOR_MODE_P (modev2))
27761 op0 = safe_vector_operand (op0, modev2);
27762 if (VECTOR_MODE_P (modev3))
27763 op1 = safe_vector_operand (op1, modev3);
27764
27765 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
27766 op0 = copy_to_mode_reg (modev2, op0);
27767 if ((optimize && !register_operand (op1, modev3))
27768 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
27769 op1 = copy_to_mode_reg (modev3, op1);
27770
27771 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
27772 {
27773 error ("the third argument must be an 8-bit immediate");
27774 return const0_rtx;
27775 }
27776
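  /* Same three result flavors as in ix86_expand_sse_pcmpestr above.  */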
27777 if (d->code == IX86_BUILTIN_PCMPISTRI128)
27778 {
27779 if (optimize || !target
27780 || GET_MODE (target) != tmode0
27781 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
27782 target = gen_reg_rtx (tmode0);
27783
27784 scratch1 = gen_reg_rtx (tmode1);
27785
27786 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
27787 }
27788 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
27789 {
27790 if (optimize || !target
27791 || GET_MODE (target) != tmode1
27792 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
27793 target = gen_reg_rtx (tmode1);
27794
27795 scratch0 = gen_reg_rtx (tmode0);
27796
27797 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
27798 }
27799 else
27800 {
27801 gcc_assert (d->flag);
27802
27803 scratch0 = gen_reg_rtx (tmode0);
27804 scratch1 = gen_reg_rtx (tmode1);
27805
27806 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
27807 }
27808
27809 if (! pat)
27810 return 0;
27811
27812 emit_insn (pat);
27813
27814 if (d->flag)
27815 {
27816 target = gen_reg_rtx (SImode);
27817 emit_move_insn (target, const0_rtx);
27818 target = gen_rtx_SUBREG (QImode, target, 0);
27819
27820 emit_insn
27821 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
27822 gen_rtx_fmt_ee (EQ, QImode,
27823 gen_rtx_REG ((enum machine_mode) d->flag,
27824 FLAGS_REG),
27825 const0_rtx)));
27826 return SUBREG_REG (target);
27827 }
27828 else
27829 return target;
27830 }
27831
27832 /* Subroutine of ix86_expand_builtin to take care of insns with
27833 variable number of operands. */
27834
27835 static rtx
27836 ix86_expand_args_builtin (const struct builtin_description *d,
27837 tree exp, rtx target)
27838 {
27839 rtx pat, real_target;
27840 unsigned int i, nargs;
27841 unsigned int nargs_constant = 0;
27842 int num_memory = 0;
27843 struct
27844 {
27845 rtx op;
27846 enum machine_mode mode;
27847 } args[4];
27848 bool last_arg_count = false;
27849 enum insn_code icode = d->icode;
27850 const struct insn_data_d *insn_p = &insn_data[icode];
27851 enum machine_mode tmode = insn_p->operand[0].mode;
27852 enum machine_mode rmode = VOIDmode;
27853 bool swap = false;
27854 enum rtx_code comparison = d->comparison;
27855
27856 switch ((enum ix86_builtin_func_type) d->flag)
27857 {
27858 case V2DF_FTYPE_V2DF_ROUND:
27859 case V4DF_FTYPE_V4DF_ROUND:
27860 case V4SF_FTYPE_V4SF_ROUND:
27861 case V8SF_FTYPE_V8SF_ROUND:
27862 return ix86_expand_sse_round (d, exp, target);
27863 case INT_FTYPE_V8SF_V8SF_PTEST:
27864 case INT_FTYPE_V4DI_V4DI_PTEST:
27865 case INT_FTYPE_V4DF_V4DF_PTEST:
27866 case INT_FTYPE_V4SF_V4SF_PTEST:
27867 case INT_FTYPE_V2DI_V2DI_PTEST:
27868 case INT_FTYPE_V2DF_V2DF_PTEST:
27869 return ix86_expand_sse_ptest (d, exp, target);
27870 case FLOAT128_FTYPE_FLOAT128:
27871 case FLOAT_FTYPE_FLOAT:
27872 case INT_FTYPE_INT:
27873 case UINT64_FTYPE_INT:
27874 case UINT16_FTYPE_UINT16:
27875 case INT64_FTYPE_INT64:
27876 case INT64_FTYPE_V4SF:
27877 case INT64_FTYPE_V2DF:
27878 case INT_FTYPE_V16QI:
27879 case INT_FTYPE_V8QI:
27880 case INT_FTYPE_V8SF:
27881 case INT_FTYPE_V4DF:
27882 case INT_FTYPE_V4SF:
27883 case INT_FTYPE_V2DF:
27884 case INT_FTYPE_V32QI:
27885 case V16QI_FTYPE_V16QI:
27886 case V8SI_FTYPE_V8SF:
27887 case V8SI_FTYPE_V4SI:
27888 case V8HI_FTYPE_V8HI:
27889 case V8HI_FTYPE_V16QI:
27890 case V8QI_FTYPE_V8QI:
27891 case V8SF_FTYPE_V8SF:
27892 case V8SF_FTYPE_V8SI:
27893 case V8SF_FTYPE_V4SF:
27894 case V8SF_FTYPE_V8HI:
27895 case V4SI_FTYPE_V4SI:
27896 case V4SI_FTYPE_V16QI:
27897 case V4SI_FTYPE_V4SF:
27898 case V4SI_FTYPE_V8SI:
27899 case V4SI_FTYPE_V8HI:
27900 case V4SI_FTYPE_V4DF:
27901 case V4SI_FTYPE_V2DF:
27902 case V4HI_FTYPE_V4HI:
27903 case V4DF_FTYPE_V4DF:
27904 case V4DF_FTYPE_V4SI:
27905 case V4DF_FTYPE_V4SF:
27906 case V4DF_FTYPE_V2DF:
27907 case V4SF_FTYPE_V4SF:
27908 case V4SF_FTYPE_V4SI:
27909 case V4SF_FTYPE_V8SF:
27910 case V4SF_FTYPE_V4DF:
27911 case V4SF_FTYPE_V8HI:
27912 case V4SF_FTYPE_V2DF:
27913 case V2DI_FTYPE_V2DI:
27914 case V2DI_FTYPE_V16QI:
27915 case V2DI_FTYPE_V8HI:
27916 case V2DI_FTYPE_V4SI:
27917 case V2DF_FTYPE_V2DF:
27918 case V2DF_FTYPE_V4SI:
27919 case V2DF_FTYPE_V4DF:
27920 case V2DF_FTYPE_V4SF:
27921 case V2DF_FTYPE_V2SI:
27922 case V2SI_FTYPE_V2SI:
27923 case V2SI_FTYPE_V4SF:
27924 case V2SI_FTYPE_V2SF:
27925 case V2SI_FTYPE_V2DF:
27926 case V2SF_FTYPE_V2SF:
27927 case V2SF_FTYPE_V2SI:
27928 case V32QI_FTYPE_V32QI:
27929 case V32QI_FTYPE_V16QI:
27930 case V16HI_FTYPE_V16HI:
27931 case V16HI_FTYPE_V8HI:
27932 case V8SI_FTYPE_V8SI:
27933 case V16HI_FTYPE_V16QI:
27934 case V8SI_FTYPE_V16QI:
27935 case V4DI_FTYPE_V16QI:
27936 case V8SI_FTYPE_V8HI:
27937 case V4DI_FTYPE_V8HI:
27938 case V4DI_FTYPE_V4SI:
27939 case V4DI_FTYPE_V2DI:
27940 nargs = 1;
27941 break;
27942 case V4SF_FTYPE_V4SF_VEC_MERGE:
27943 case V2DF_FTYPE_V2DF_VEC_MERGE:
27944 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
27945 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
27946 case V16QI_FTYPE_V16QI_V16QI:
27947 case V16QI_FTYPE_V8HI_V8HI:
27948 case V8QI_FTYPE_V8QI_V8QI:
27949 case V8QI_FTYPE_V4HI_V4HI:
27950 case V8HI_FTYPE_V8HI_V8HI:
27951 case V8HI_FTYPE_V16QI_V16QI:
27952 case V8HI_FTYPE_V4SI_V4SI:
27953 case V8SF_FTYPE_V8SF_V8SF:
27954 case V8SF_FTYPE_V8SF_V8SI:
27955 case V4SI_FTYPE_V4SI_V4SI:
27956 case V4SI_FTYPE_V8HI_V8HI:
27957 case V4SI_FTYPE_V4SF_V4SF:
27958 case V4SI_FTYPE_V2DF_V2DF:
27959 case V4HI_FTYPE_V4HI_V4HI:
27960 case V4HI_FTYPE_V8QI_V8QI:
27961 case V4HI_FTYPE_V2SI_V2SI:
27962 case V4DF_FTYPE_V4DF_V4DF:
27963 case V4DF_FTYPE_V4DF_V4DI:
27964 case V4SF_FTYPE_V4SF_V4SF:
27965 case V4SF_FTYPE_V4SF_V4SI:
27966 case V4SF_FTYPE_V4SF_V2SI:
27967 case V4SF_FTYPE_V4SF_V2DF:
27968 case V4SF_FTYPE_V4SF_DI:
27969 case V4SF_FTYPE_V4SF_SI:
27970 case V2DI_FTYPE_V2DI_V2DI:
27971 case V2DI_FTYPE_V16QI_V16QI:
27972 case V2DI_FTYPE_V4SI_V4SI:
27973 case V2DI_FTYPE_V2DI_V16QI:
27974 case V2DI_FTYPE_V2DF_V2DF:
27975 case V2SI_FTYPE_V2SI_V2SI:
27976 case V2SI_FTYPE_V4HI_V4HI:
27977 case V2SI_FTYPE_V2SF_V2SF:
27978 case V2DF_FTYPE_V2DF_V2DF:
27979 case V2DF_FTYPE_V2DF_V4SF:
27980 case V2DF_FTYPE_V2DF_V2DI:
27981 case V2DF_FTYPE_V2DF_DI:
27982 case V2DF_FTYPE_V2DF_SI:
27983 case V2SF_FTYPE_V2SF_V2SF:
27984 case V1DI_FTYPE_V1DI_V1DI:
27985 case V1DI_FTYPE_V8QI_V8QI:
27986 case V1DI_FTYPE_V2SI_V2SI:
27987 case V32QI_FTYPE_V16HI_V16HI:
27988 case V16HI_FTYPE_V8SI_V8SI:
27989 case V32QI_FTYPE_V32QI_V32QI:
27990 case V16HI_FTYPE_V32QI_V32QI:
27991 case V16HI_FTYPE_V16HI_V16HI:
27992 case V8SI_FTYPE_V8SI_V8SI:
27993 case V8SI_FTYPE_V16HI_V16HI:
27994 case V4DI_FTYPE_V4DI_V4DI:
27995 case V4DI_FTYPE_V8SI_V8SI:
27996 if (comparison == UNKNOWN)
27997 return ix86_expand_binop_builtin (icode, exp, target);
27998 nargs = 2;
27999 break;
28000 case V4SF_FTYPE_V4SF_V4SF_SWAP:
28001 case V2DF_FTYPE_V2DF_V2DF_SWAP:
28002 gcc_assert (comparison != UNKNOWN);
28003 nargs = 2;
28004 swap = true;
28005 break;
28006 case V16HI_FTYPE_V16HI_V8HI_COUNT:
28007 case V16HI_FTYPE_V16HI_SI_COUNT:
28008 case V8SI_FTYPE_V8SI_V4SI_COUNT:
28009 case V8SI_FTYPE_V8SI_SI_COUNT:
28010 case V4DI_FTYPE_V4DI_V2DI_COUNT:
28011 case V4DI_FTYPE_V4DI_INT_COUNT:
28012 case V8HI_FTYPE_V8HI_V8HI_COUNT:
28013 case V8HI_FTYPE_V8HI_SI_COUNT:
28014 case V4SI_FTYPE_V4SI_V4SI_COUNT:
28015 case V4SI_FTYPE_V4SI_SI_COUNT:
28016 case V4HI_FTYPE_V4HI_V4HI_COUNT:
28017 case V4HI_FTYPE_V4HI_SI_COUNT:
28018 case V2DI_FTYPE_V2DI_V2DI_COUNT:
28019 case V2DI_FTYPE_V2DI_SI_COUNT:
28020 case V2SI_FTYPE_V2SI_V2SI_COUNT:
28021 case V2SI_FTYPE_V2SI_SI_COUNT:
28022 case V1DI_FTYPE_V1DI_V1DI_COUNT:
28023 case V1DI_FTYPE_V1DI_SI_COUNT:
28024 nargs = 2;
28025 last_arg_count = true;
28026 break;
28027 case UINT64_FTYPE_UINT64_UINT64:
28028 case UINT_FTYPE_UINT_UINT:
28029 case UINT_FTYPE_UINT_USHORT:
28030 case UINT_FTYPE_UINT_UCHAR:
28031 case UINT16_FTYPE_UINT16_INT:
28032 case UINT8_FTYPE_UINT8_INT:
28033 nargs = 2;
28034 break;
28035 case V2DI_FTYPE_V2DI_INT_CONVERT:
28036 nargs = 2;
28037 rmode = V1TImode;
28038 nargs_constant = 1;
28039 break;
28040 case V4DI_FTYPE_V4DI_INT_CONVERT:
28041 nargs = 2;
28042 rmode = V2TImode;
28043 nargs_constant = 1;
28044 break;
28045 case V8HI_FTYPE_V8HI_INT:
28046 case V8HI_FTYPE_V8SF_INT:
28047 case V8HI_FTYPE_V4SF_INT:
28048 case V8SF_FTYPE_V8SF_INT:
28049 case V4SI_FTYPE_V4SI_INT:
28050 case V4SI_FTYPE_V8SI_INT:
28051 case V4HI_FTYPE_V4HI_INT:
28052 case V4DF_FTYPE_V4DF_INT:
28053 case V4SF_FTYPE_V4SF_INT:
28054 case V4SF_FTYPE_V8SF_INT:
28055 case V2DI_FTYPE_V2DI_INT:
28056 case V2DF_FTYPE_V2DF_INT:
28057 case V2DF_FTYPE_V4DF_INT:
28058 case V16HI_FTYPE_V16HI_INT:
28059 case V8SI_FTYPE_V8SI_INT:
28060 case V4DI_FTYPE_V4DI_INT:
28061 case V2DI_FTYPE_V4DI_INT:
28062 nargs = 2;
28063 nargs_constant = 1;
28064 break;
28065 case V16QI_FTYPE_V16QI_V16QI_V16QI:
28066 case V8SF_FTYPE_V8SF_V8SF_V8SF:
28067 case V4DF_FTYPE_V4DF_V4DF_V4DF:
28068 case V4SF_FTYPE_V4SF_V4SF_V4SF:
28069 case V2DF_FTYPE_V2DF_V2DF_V2DF:
28070 case V32QI_FTYPE_V32QI_V32QI_V32QI:
28071 nargs = 3;
28072 break;
28073 case V32QI_FTYPE_V32QI_V32QI_INT:
28074 case V16HI_FTYPE_V16HI_V16HI_INT:
28075 case V16QI_FTYPE_V16QI_V16QI_INT:
28076 case V4DI_FTYPE_V4DI_V4DI_INT:
28077 case V8HI_FTYPE_V8HI_V8HI_INT:
28078 case V8SI_FTYPE_V8SI_V8SI_INT:
28079 case V8SI_FTYPE_V8SI_V4SI_INT:
28080 case V8SF_FTYPE_V8SF_V8SF_INT:
28081 case V8SF_FTYPE_V8SF_V4SF_INT:
28082 case V4SI_FTYPE_V4SI_V4SI_INT:
28083 case V4DF_FTYPE_V4DF_V4DF_INT:
28084 case V4DF_FTYPE_V4DF_V2DF_INT:
28085 case V4SF_FTYPE_V4SF_V4SF_INT:
28086 case V2DI_FTYPE_V2DI_V2DI_INT:
28087 case V4DI_FTYPE_V4DI_V2DI_INT:
28088 case V2DF_FTYPE_V2DF_V2DF_INT:
28089 nargs = 3;
28090 nargs_constant = 1;
28091 break;
28092 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
28093 nargs = 3;
28094 rmode = V4DImode;
28095 nargs_constant = 1;
28096 break;
28097 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
28098 nargs = 3;
28099 rmode = V2DImode;
28100 nargs_constant = 1;
28101 break;
28102 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
28103 nargs = 3;
28104 rmode = DImode;
28105 nargs_constant = 1;
28106 break;
28107 case V2DI_FTYPE_V2DI_UINT_UINT:
28108 nargs = 3;
28109 nargs_constant = 2;
28110 break;
28111 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
28112 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
28113 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
28114 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
28115 nargs = 4;
28116 nargs_constant = 1;
28117 break;
28118 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
28119 nargs = 4;
28120 nargs_constant = 2;
28121 break;
28122 default:
28123 gcc_unreachable ();
28124 }
28125
28126 gcc_assert (nargs <= ARRAY_SIZE (args));
28127
28128 if (comparison != UNKNOWN)
28129 {
28130 gcc_assert (nargs == 2);
28131 return ix86_expand_sse_compare (d, exp, target, swap);
28132 }
28133
28134 if (rmode == VOIDmode || rmode == tmode)
28135 {
28136 if (optimize
28137 || target == 0
28138 || GET_MODE (target) != tmode
28139 || !insn_p->operand[0].predicate (target, tmode))
28140 target = gen_reg_rtx (tmode);
28141 real_target = target;
28142 }
28143 else
28144 {
28145 target = gen_reg_rtx (rmode);
28146 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
28147 }
28148
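  /* Expand and constrain each call argument.  Operand 0 of the insn is
     the target, so argument I corresponds to insn operand I + 1; the
     trailing NARGS_CONSTANT operands must be immediates.  */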
28149 for (i = 0; i < nargs; i++)
28150 {
28151 tree arg = CALL_EXPR_ARG (exp, i);
28152 rtx op = expand_normal (arg);
28153 enum machine_mode mode = insn_p->operand[i + 1].mode;
28154 bool match = insn_p->operand[i + 1].predicate (op, mode);
28155
28156 if (last_arg_count && (i + 1) == nargs)
28157 {
28158 /* SIMD shift insns take either an 8-bit immediate or a
28159 register as the count operand, but the builtin functions take
28160 an int.  If the count doesn't match, put it in a register. */
28161 if (!match)
28162 {
28163 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
28164 if (!insn_p->operand[i + 1].predicate (op, mode))
28165 op = copy_to_reg (op);
28166 }
28167 }
28168 else if ((nargs - i) <= nargs_constant)
28169 {
28170 if (!match)
28171 switch (icode)
28172 {
28173 case CODE_FOR_avx2_inserti128:
28174 case CODE_FOR_avx2_extracti128:
28175 error ("the last argument must be a 1-bit immediate");
28176 return const0_rtx;
28177
28178 case CODE_FOR_sse4_1_roundpd:
28179 case CODE_FOR_sse4_1_roundps:
28180 case CODE_FOR_sse4_1_roundsd:
28181 case CODE_FOR_sse4_1_roundss:
28182 case CODE_FOR_sse4_1_blendps:
28183 case CODE_FOR_avx_blendpd256:
28184 case CODE_FOR_avx_vpermilv4df:
28185 case CODE_FOR_avx_roundpd256:
28186 case CODE_FOR_avx_roundps256:
28187 error ("the last argument must be a 4-bit immediate");
28188 return const0_rtx;
28189
28190 case CODE_FOR_sse4_1_blendpd:
28191 case CODE_FOR_avx_vpermilv2df:
28192 case CODE_FOR_xop_vpermil2v2df3:
28193 case CODE_FOR_xop_vpermil2v4sf3:
28194 case CODE_FOR_xop_vpermil2v4df3:
28195 case CODE_FOR_xop_vpermil2v8sf3:
28196 error ("the last argument must be a 2-bit immediate");
28197 return const0_rtx;
28198
28199 case CODE_FOR_avx_vextractf128v4df:
28200 case CODE_FOR_avx_vextractf128v8sf:
28201 case CODE_FOR_avx_vextractf128v8si:
28202 case CODE_FOR_avx_vinsertf128v4df:
28203 case CODE_FOR_avx_vinsertf128v8sf:
28204 case CODE_FOR_avx_vinsertf128v8si:
28205 error ("the last argument must be a 1-bit immediate");
28206 return const0_rtx;
28207
28208 case CODE_FOR_avx_vmcmpv2df3:
28209 case CODE_FOR_avx_vmcmpv4sf3:
28210 case CODE_FOR_avx_cmpv2df3:
28211 case CODE_FOR_avx_cmpv4sf3:
28212 case CODE_FOR_avx_cmpv4df3:
28213 case CODE_FOR_avx_cmpv8sf3:
28214 error ("the last argument must be a 5-bit immediate");
28215 return const0_rtx;
28216
28217 default:
28218 switch (nargs_constant)
28219 {
28220 case 2:
28221 if ((nargs - i) == nargs_constant)
28222 {
28223 error ("the next to last argument must be an 8-bit immediate");
28224 break;
28225 }
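	      /* FALLTHRU */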
28226 case 1:
28227 error ("the last argument must be an 8-bit immediate");
28228 break;
28229 default:
28230 gcc_unreachable ();
28231 }
28232 return const0_rtx;
28233 }
28234 }
28235 else
28236 {
28237 if (VECTOR_MODE_P (mode))
28238 op = safe_vector_operand (op, mode);
28239
28240 /* If we aren't optimizing, only allow one memory operand to
28241 be generated. */
28242 if (memory_operand (op, mode))
28243 num_memory++;
28244
28245 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
28246 {
28247 if (optimize || !match || num_memory > 1)
28248 op = copy_to_mode_reg (mode, op);
28249 }
28250 else
28251 {
28252 op = copy_to_reg (op);
28253 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
28254 }
28255 }
28256
28257 args[i].op = op;
28258 args[i].mode = mode;
28259 }
28260
28261 switch (nargs)
28262 {
28263 case 1:
28264 pat = GEN_FCN (icode) (real_target, args[0].op);
28265 break;
28266 case 2:
28267 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
28268 break;
28269 case 3:
28270 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
28271 args[2].op);
28272 break;
28273 case 4:
28274 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
28275 args[2].op, args[3].op);
28276 break;
28277 default:
28278 gcc_unreachable ();
28279 }
28280
28281 if (! pat)
28282 return 0;
28283
28284 emit_insn (pat);
28285 return target;
28286 }
28287
28288 /* Subroutine of ix86_expand_builtin to take care of special insns
28289 with variable number of operands. */
28290
28291 static rtx
28292 ix86_expand_special_args_builtin (const struct builtin_description *d,
28293 tree exp, rtx target)
28294 {
28295 tree arg;
28296 rtx pat, op;
28297 unsigned int i, nargs, arg_adjust, memory;
28298 struct
28299 {
28300 rtx op;
28301 enum machine_mode mode;
28302 } args[3];
28303 enum insn_code icode = d->icode;
28304 bool last_arg_constant = false;
28305 const struct insn_data_d *insn_p = &insn_data[icode];
28306 enum machine_mode tmode = insn_p->operand[0].mode;
28307 enum { load, store } klass;
28308
28309 switch ((enum ix86_builtin_func_type) d->flag)
28310 {
28311 case VOID_FTYPE_VOID:
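      /* The explicit vzeroupper intrinsic is tagged with a marker operand
	 so the vzeroupper insertion pass can tell it apart from the
	 vzeroupper insns the compiler generates on its own.  */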
28312 if (icode == CODE_FOR_avx_vzeroupper)
28313 target = GEN_INT (vzeroupper_intrinsic);
28314 emit_insn (GEN_FCN (icode) (target));
28315 return 0;
28316 case VOID_FTYPE_UINT64:
28317 case VOID_FTYPE_UNSIGNED:
28318 nargs = 0;
28319 klass = store;
28320 memory = 0;
28321 break;
28322 case UINT64_FTYPE_VOID:
28323 case UNSIGNED_FTYPE_VOID:
28324 nargs = 0;
28325 klass = load;
28326 memory = 0;
28327 break;
28328 case UINT64_FTYPE_PUNSIGNED:
28329 case V2DI_FTYPE_PV2DI:
28330 case V4DI_FTYPE_PV4DI:
28331 case V32QI_FTYPE_PCCHAR:
28332 case V16QI_FTYPE_PCCHAR:
28333 case V8SF_FTYPE_PCV4SF:
28334 case V8SF_FTYPE_PCFLOAT:
28335 case V4SF_FTYPE_PCFLOAT:
28336 case V4DF_FTYPE_PCV2DF:
28337 case V4DF_FTYPE_PCDOUBLE:
28338 case V2DF_FTYPE_PCDOUBLE:
28339 case VOID_FTYPE_PVOID:
28340 nargs = 1;
28341 klass = load;
28342 memory = 0;
28343 break;
28344 case VOID_FTYPE_PV2SF_V4SF:
28345 case VOID_FTYPE_PV4DI_V4DI:
28346 case VOID_FTYPE_PV2DI_V2DI:
28347 case VOID_FTYPE_PCHAR_V32QI:
28348 case VOID_FTYPE_PCHAR_V16QI:
28349 case VOID_FTYPE_PFLOAT_V8SF:
28350 case VOID_FTYPE_PFLOAT_V4SF:
28351 case VOID_FTYPE_PDOUBLE_V4DF:
28352 case VOID_FTYPE_PDOUBLE_V2DF:
28353 case VOID_FTYPE_PULONGLONG_ULONGLONG:
28354 case VOID_FTYPE_PINT_INT:
28355 nargs = 1;
28356 klass = store;
28357 /* Reserve memory operand for target. */
28358 memory = ARRAY_SIZE (args);
28359 break;
28360 case V4SF_FTYPE_V4SF_PCV2SF:
28361 case V2DF_FTYPE_V2DF_PCDOUBLE:
28362 nargs = 2;
28363 klass = load;
28364 memory = 1;
28365 break;
28366 case V8SF_FTYPE_PCV8SF_V8SI:
28367 case V4DF_FTYPE_PCV4DF_V4DI:
28368 case V4SF_FTYPE_PCV4SF_V4SI:
28369 case V2DF_FTYPE_PCV2DF_V2DI:
28370 case V8SI_FTYPE_PCV8SI_V8SI:
28371 case V4DI_FTYPE_PCV4DI_V4DI:
28372 case V4SI_FTYPE_PCV4SI_V4SI:
28373 case V2DI_FTYPE_PCV2DI_V2DI:
28374 nargs = 2;
28375 klass = load;
28376 memory = 0;
28377 break;
28378 case VOID_FTYPE_PV8SF_V8SI_V8SF:
28379 case VOID_FTYPE_PV4DF_V4DI_V4DF:
28380 case VOID_FTYPE_PV4SF_V4SI_V4SF:
28381 case VOID_FTYPE_PV2DF_V2DI_V2DF:
28382 case VOID_FTYPE_PV8SI_V8SI_V8SI:
28383 case VOID_FTYPE_PV4DI_V4DI_V4DI:
28384 case VOID_FTYPE_PV4SI_V4SI_V4SI:
28385 case VOID_FTYPE_PV2DI_V2DI_V2DI:
28386 nargs = 2;
28387 klass = store;
28388 /* Reserve memory operand for target. */
28389 memory = ARRAY_SIZE (args);
28390 break;
28391 case VOID_FTYPE_UINT_UINT_UINT:
28392 case VOID_FTYPE_UINT64_UINT_UINT:
28393 case UCHAR_FTYPE_UINT_UINT_UINT:
28394 case UCHAR_FTYPE_UINT64_UINT_UINT:
28395 nargs = 3;
28396 klass = load;
28397 memory = ARRAY_SIZE (args);
28398 last_arg_constant = true;
28399 break;
28400 default:
28401 gcc_unreachable ();
28402 }
28403
28404 gcc_assert (nargs <= ARRAY_SIZE (args));
28405
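  /* For store-class builtins the first call argument is the destination
     (a pointer for the memory forms), so the remaining arguments start
     at index 1 (arg_adjust).  */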
28406 if (klass == store)
28407 {
28408 arg = CALL_EXPR_ARG (exp, 0);
28409 op = expand_normal (arg);
28410 gcc_assert (target == 0);
28411 if (memory)
28412 {
28413 if (GET_MODE (op) != Pmode)
28414 op = convert_to_mode (Pmode, op, 1);
28415 target = gen_rtx_MEM (tmode, force_reg (Pmode, op));
28416 }
28417 else
28418 target = force_reg (tmode, op);
28419 arg_adjust = 1;
28420 }
28421 else
28422 {
28423 arg_adjust = 0;
28424 if (optimize
28425 || target == 0
28426 || GET_MODE (target) != tmode
28427 || !insn_p->operand[0].predicate (target, tmode))
28428 target = gen_reg_rtx (tmode);
28429 }
28430
28431 for (i = 0; i < nargs; i++)
28432 {
28433 enum machine_mode mode = insn_p->operand[i + 1].mode;
28434 bool match;
28435
28436 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
28437 op = expand_normal (arg);
28438 match = insn_p->operand[i + 1].predicate (op, mode);
28439
28440 if (last_arg_constant && (i + 1) == nargs)
28441 {
28442 if (!match)
28443 {
28444 if (icode == CODE_FOR_lwp_lwpvalsi3
28445 || icode == CODE_FOR_lwp_lwpinssi3
28446 || icode == CODE_FOR_lwp_lwpvaldi3
28447 || icode == CODE_FOR_lwp_lwpinsdi3)
28448 error ("the last argument must be a 32-bit immediate");
28449 else
28450 error ("the last argument must be an 8-bit immediate");
28451 return const0_rtx;
28452 }
28453 }
28454 else
28455 {
28456 if (i == memory)
28457 {
28458 /* This must be the memory operand. */
28459 if (GET_MODE (op) != Pmode)
28460 op = convert_to_mode (Pmode, op, 1);
28461 op = gen_rtx_MEM (mode, force_reg (Pmode, op));
28462 gcc_assert (GET_MODE (op) == mode
28463 || GET_MODE (op) == VOIDmode);
28464 }
28465 else
28466 {
28467 /* This must be a register. */
28468 if (VECTOR_MODE_P (mode))
28469 op = safe_vector_operand (op, mode);
28470
28471 gcc_assert (GET_MODE (op) == mode
28472 || GET_MODE (op) == VOIDmode);
28473 op = copy_to_mode_reg (mode, op);
28474 }
28475 }
28476
28477 args[i].op = op;
28478 args[i].mode = mode;
28479 }
28480
28481 switch (nargs)
28482 {
28483 case 0:
28484 pat = GEN_FCN (icode) (target);
28485 break;
28486 case 1:
28487 pat = GEN_FCN (icode) (target, args[0].op);
28488 break;
28489 case 2:
28490 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
28491 break;
28492 case 3:
28493 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
28494 break;
28495 default:
28496 gcc_unreachable ();
28497 }
28498
28499 if (! pat)
28500 return 0;
28501 emit_insn (pat);
28502 return klass == store ? 0 : target;
28503 }
28504
28505 /* Return the integer constant in ARG. Constrain it to be in the range
28506 of the subparts of VEC_TYPE; issue an error if not. */
28507
28508 static int
28509 get_element_number (tree vec_type, tree arg)
28510 {
28511 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
28512
28513 if (!host_integerp (arg, 1)
28514 || (elt = tree_low_cst (arg, 1), elt > max))
28515 {
28516 error ("selector must be an integer constant in the range 0..%wi", max);
28517 return 0;
28518 }
28519
28520 return elt;
28521 }
28522
28523 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
28524 ix86_expand_vector_init. We DO have language-level syntax for this, in
28525 the form of (type){ init-list }. Except that since we can't place emms
28526 instructions from inside the compiler, we can't allow the use of MMX
28527 registers unless the user explicitly asks for it. So we do *not* define
28528 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
28529 we have builtins invoked by mmintrin.h that give us license to emit
28530 these sorts of instructions. */
28531
28532 static rtx
28533 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
28534 {
28535 enum machine_mode tmode = TYPE_MODE (type);
28536 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
28537 int i, n_elt = GET_MODE_NUNITS (tmode);
28538 rtvec v = rtvec_alloc (n_elt);
28539
28540 gcc_assert (VECTOR_MODE_P (tmode));
28541 gcc_assert (call_expr_nargs (exp) == n_elt);
28542
28543 for (i = 0; i < n_elt; ++i)
28544 {
28545 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
28546 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
28547 }
28548
28549 if (!target || !register_operand (target, tmode))
28550 target = gen_reg_rtx (tmode);
28551
28552 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
28553 return target;
28554 }
28555
28556 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
28557 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
28558 had a language-level syntax for referencing vector elements. */
28559
28560 static rtx
28561 ix86_expand_vec_ext_builtin (tree exp, rtx target)
28562 {
28563 enum machine_mode tmode, mode0;
28564 tree arg0, arg1;
28565 int elt;
28566 rtx op0;
28567
28568 arg0 = CALL_EXPR_ARG (exp, 0);
28569 arg1 = CALL_EXPR_ARG (exp, 1);
28570
28571 op0 = expand_normal (arg0);
28572 elt = get_element_number (TREE_TYPE (arg0), arg1);
28573
28574 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
28575 mode0 = TYPE_MODE (TREE_TYPE (arg0));
28576 gcc_assert (VECTOR_MODE_P (mode0));
28577
28578 op0 = force_reg (mode0, op0);
28579
28580 if (optimize || !target || !register_operand (target, tmode))
28581 target = gen_reg_rtx (tmode);
28582
28583 ix86_expand_vector_extract (true, target, op0, elt);
28584
28585 return target;
28586 }
28587
28588 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
28589 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
28590 a language-level syntax for referencing vector elements. */
28591
28592 static rtx
28593 ix86_expand_vec_set_builtin (tree exp)
28594 {
28595 enum machine_mode tmode, mode1;
28596 tree arg0, arg1, arg2;
28597 int elt;
28598 rtx op0, op1, target;
28599
28600 arg0 = CALL_EXPR_ARG (exp, 0);
28601 arg1 = CALL_EXPR_ARG (exp, 1);
28602 arg2 = CALL_EXPR_ARG (exp, 2);
28603
28604 tmode = TYPE_MODE (TREE_TYPE (arg0));
28605 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
28606 gcc_assert (VECTOR_MODE_P (tmode));
28607
28608 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
28609 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
28610 elt = get_element_number (TREE_TYPE (arg0), arg2);
28611
28612 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
28613 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
28614
28615 op0 = force_reg (tmode, op0);
28616 op1 = force_reg (mode1, op1);
28617
28618 /* OP0 is the source of these builtin functions and shouldn't be
28619 modified.  Create a copy, use it, and return it as the target. */
28620 target = gen_reg_rtx (tmode);
28621 emit_move_insn (target, op0);
28622 ix86_expand_vector_set (true, target, op1, elt);
28623
28624 return target;
28625 }
28626
28627 /* Expand an expression EXP that calls a built-in function,
28628 with result going to TARGET if that's convenient
28629 (and in mode MODE if that's convenient).
28630 SUBTARGET may be used as the target for computing one of EXP's operands.
28631 IGNORE is nonzero if the value is to be ignored. */
28632
28633 static rtx
28634 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
28635 enum machine_mode mode ATTRIBUTE_UNUSED,
28636 int ignore ATTRIBUTE_UNUSED)
28637 {
28638 const struct builtin_description *d;
28639 size_t i;
28640 enum insn_code icode;
28641 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
28642 tree arg0, arg1, arg2, arg3, arg4;
28643 rtx op0, op1, op2, op3, op4, pat;
28644 enum machine_mode mode0, mode1, mode2, mode3, mode4;
28645 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
28646
28647 /* Determine whether the builtin function is available under the current ISA.
28648 Originally the builtin was not created if it wasn't applicable to the
28649 current ISA based on the command line switches. With function specific
28650 options, we need to check in the context of the function making the call
28651 whether it is supported. */
28652 if (ix86_builtins_isa[fcode].isa
28653 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
28654 {
28655 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
28656 NULL, (enum fpmath_unit) 0, false);
28657
28658 if (!opts)
28659 error ("%qE needs unknown isa option", fndecl);
28660 else
28661 {
28662 gcc_assert (opts != NULL);
28663 error ("%qE needs isa option %s", fndecl, opts);
28664 free (opts);
28665 }
28666 return const0_rtx;
28667 }
28668
28669 switch (fcode)
28670 {
28671 case IX86_BUILTIN_MASKMOVQ:
28672 case IX86_BUILTIN_MASKMOVDQU:
28673 icode = (fcode == IX86_BUILTIN_MASKMOVQ
28674 ? CODE_FOR_mmx_maskmovq
28675 : CODE_FOR_sse2_maskmovdqu);
28676 /* Note the arg order is different from the operand order. */
28677 arg1 = CALL_EXPR_ARG (exp, 0);
28678 arg2 = CALL_EXPR_ARG (exp, 1);
28679 arg0 = CALL_EXPR_ARG (exp, 2);
28680 op0 = expand_normal (arg0);
28681 op1 = expand_normal (arg1);
28682 op2 = expand_normal (arg2);
28683 mode0 = insn_data[icode].operand[0].mode;
28684 mode1 = insn_data[icode].operand[1].mode;
28685 mode2 = insn_data[icode].operand[2].mode;
28686
28687 if (GET_MODE (op0) != Pmode)
28688 op0 = convert_to_mode (Pmode, op0, 1);
28689 op0 = gen_rtx_MEM (mode1, force_reg (Pmode, op0));
28690
28691 if (!insn_data[icode].operand[0].predicate (op0, mode0))
28692 op0 = copy_to_mode_reg (mode0, op0);
28693 if (!insn_data[icode].operand[1].predicate (op1, mode1))
28694 op1 = copy_to_mode_reg (mode1, op1);
28695 if (!insn_data[icode].operand[2].predicate (op2, mode2))
28696 op2 = copy_to_mode_reg (mode2, op2);
28697 pat = GEN_FCN (icode) (op0, op1, op2);
28698 if (! pat)
28699 return 0;
28700 emit_insn (pat);
28701 return 0;
28702
28703 case IX86_BUILTIN_LDMXCSR:
28704 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
28705 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
28706 emit_move_insn (target, op0);
28707 emit_insn (gen_sse_ldmxcsr (target));
28708 return 0;
28709
28710 case IX86_BUILTIN_STMXCSR:
28711 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
28712 emit_insn (gen_sse_stmxcsr (target));
28713 return copy_to_mode_reg (SImode, target);
28714
28715 case IX86_BUILTIN_CLFLUSH:
28716 arg0 = CALL_EXPR_ARG (exp, 0);
28717 op0 = expand_normal (arg0);
28718 icode = CODE_FOR_sse2_clflush;
28719 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
28720 {
28721 if (GET_MODE (op0) != Pmode)
28722 op0 = convert_to_mode (Pmode, op0, 1);
28723 op0 = force_reg (Pmode, op0);
28724 }
28725
28726 emit_insn (gen_sse2_clflush (op0));
28727 return 0;
28728
28729 case IX86_BUILTIN_MONITOR:
28730 arg0 = CALL_EXPR_ARG (exp, 0);
28731 arg1 = CALL_EXPR_ARG (exp, 1);
28732 arg2 = CALL_EXPR_ARG (exp, 2);
28733 op0 = expand_normal (arg0);
28734 op1 = expand_normal (arg1);
28735 op2 = expand_normal (arg2);
28736 if (!REG_P (op0))
28737 {
28738 if (GET_MODE (op0) != Pmode)
28739 op0 = convert_to_mode (Pmode, op0, 1);
28740 op0 = force_reg (Pmode, op0);
28741 }
28742 if (!REG_P (op1))
28743 op1 = copy_to_mode_reg (SImode, op1);
28744 if (!REG_P (op2))
28745 op2 = copy_to_mode_reg (SImode, op2);
28746 emit_insn (ix86_gen_monitor (op0, op1, op2));
28747 return 0;
28748
28749 case IX86_BUILTIN_MWAIT:
28750 arg0 = CALL_EXPR_ARG (exp, 0);
28751 arg1 = CALL_EXPR_ARG (exp, 1);
28752 op0 = expand_normal (arg0);
28753 op1 = expand_normal (arg1);
28754 if (!REG_P (op0))
28755 op0 = copy_to_mode_reg (SImode, op0);
28756 if (!REG_P (op1))
28757 op1 = copy_to_mode_reg (SImode, op1);
28758 emit_insn (gen_sse3_mwait (op0, op1));
28759 return 0;
28760
28761 case IX86_BUILTIN_VEC_INIT_V2SI:
28762 case IX86_BUILTIN_VEC_INIT_V4HI:
28763 case IX86_BUILTIN_VEC_INIT_V8QI:
28764 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
28765
28766 case IX86_BUILTIN_VEC_EXT_V2DF:
28767 case IX86_BUILTIN_VEC_EXT_V2DI:
28768 case IX86_BUILTIN_VEC_EXT_V4SF:
28769 case IX86_BUILTIN_VEC_EXT_V4SI:
28770 case IX86_BUILTIN_VEC_EXT_V8HI:
28771 case IX86_BUILTIN_VEC_EXT_V2SI:
28772 case IX86_BUILTIN_VEC_EXT_V4HI:
28773 case IX86_BUILTIN_VEC_EXT_V16QI:
28774 return ix86_expand_vec_ext_builtin (exp, target);
28775
28776 case IX86_BUILTIN_VEC_SET_V2DI:
28777 case IX86_BUILTIN_VEC_SET_V4SF:
28778 case IX86_BUILTIN_VEC_SET_V4SI:
28779 case IX86_BUILTIN_VEC_SET_V8HI:
28780 case IX86_BUILTIN_VEC_SET_V4HI:
28781 case IX86_BUILTIN_VEC_SET_V16QI:
28782 return ix86_expand_vec_set_builtin (exp);
28783
28784 case IX86_BUILTIN_INFQ:
28785 case IX86_BUILTIN_HUGE_VALQ:
28786 {
28787 REAL_VALUE_TYPE inf;
28788 rtx tmp;
28789
28790 real_inf (&inf);
28791 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
28792
28793 tmp = validize_mem (force_const_mem (mode, tmp));
28794
28795 if (target == 0)
28796 target = gen_reg_rtx (mode);
28797
28798 emit_move_insn (target, tmp);
28799 return target;
28800 }
28801
28802 case IX86_BUILTIN_LLWPCB:
28803 arg0 = CALL_EXPR_ARG (exp, 0);
28804 op0 = expand_normal (arg0);
28805 icode = CODE_FOR_lwp_llwpcb;
28806 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
28807 {
28808 if (GET_MODE (op0) != Pmode)
28809 op0 = convert_to_mode (Pmode, op0, 1);
28810 op0 = force_reg (Pmode, op0);
28811 }
28812 emit_insn (gen_lwp_llwpcb (op0));
28813 return 0;
28814
28815 case IX86_BUILTIN_SLWPCB:
28816 icode = CODE_FOR_lwp_slwpcb;
28817 if (!target
28818 || !insn_data[icode].operand[0].predicate (target, Pmode))
28819 target = gen_reg_rtx (Pmode);
28820 emit_insn (gen_lwp_slwpcb (target));
28821 return target;
28822
28823 case IX86_BUILTIN_BEXTRI32:
28824 case IX86_BUILTIN_BEXTRI64:
28825 arg0 = CALL_EXPR_ARG (exp, 0);
28826 arg1 = CALL_EXPR_ARG (exp, 1);
28827 op0 = expand_normal (arg0);
28828 op1 = expand_normal (arg1);
28829 icode = (fcode == IX86_BUILTIN_BEXTRI32
28830 ? CODE_FOR_tbm_bextri_si
28831 : CODE_FOR_tbm_bextri_di);
28832 if (!CONST_INT_P (op1))
28833 {
28834 error ("the last argument must be an immediate");
28835 return const0_rtx;
28836 }
28837 else
28838 {
28839 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
28840 unsigned char lsb_index = INTVAL (op1) & 0xFF;
28841 op1 = GEN_INT (length);
28842 op2 = GEN_INT (lsb_index);
28843 pat = GEN_FCN (icode) (target, op0, op1, op2);
28844 if (pat)
28845 emit_insn (pat);
28846 return target;
28847 }
28848
28849 case IX86_BUILTIN_RDRAND16_STEP:
28850 icode = CODE_FOR_rdrandhi_1;
28851 mode0 = HImode;
28852 goto rdrand_step;
28853
28854 case IX86_BUILTIN_RDRAND32_STEP:
28855 icode = CODE_FOR_rdrandsi_1;
28856 mode0 = SImode;
28857 goto rdrand_step;
28858
28859 case IX86_BUILTIN_RDRAND64_STEP:
28860 icode = CODE_FOR_rdranddi_1;
28861 mode0 = DImode;
28862
28863 rdrand_step:
28864 op0 = gen_reg_rtx (mode0);
28865 emit_insn (GEN_FCN (icode) (op0));
28866
28867 arg0 = CALL_EXPR_ARG (exp, 0);
28868 op1 = expand_normal (arg0);
28869 if (!address_operand (op1, VOIDmode))
28870 {
28871 op1 = convert_memory_address (Pmode, op1);
28872 op1 = copy_addr_to_reg (op1);
28873 }
28874 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
28875
28876 op1 = gen_reg_rtx (SImode);
28877 emit_move_insn (op1, CONST1_RTX (SImode));
28878
28879 /* Emit SImode conditional move. */
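  /* The *_step builtins return 1 on success and 0 on failure: select the
     constant 1 when the carry flag (set by rdrand on success) is set,
     and otherwise the zero-extended result, relying on rdrand clearing
     its destination when no random number was available.  */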
28880 if (mode0 == HImode)
28881 {
28882 op2 = gen_reg_rtx (SImode);
28883 emit_insn (gen_zero_extendhisi2 (op2, op0));
28884 }
28885 else if (mode0 == SImode)
28886 op2 = op0;
28887 else
28888 op2 = gen_rtx_SUBREG (SImode, op0, 0);
28889
28890 if (target == 0)
28891 target = gen_reg_rtx (SImode);
28892
28893 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
28894 const0_rtx);
28895 emit_insn (gen_rtx_SET (VOIDmode, target,
28896 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
28897 return target;
28898
28899 case IX86_BUILTIN_GATHERSIV2DF:
28900 icode = CODE_FOR_avx2_gathersiv2df;
28901 goto gather_gen;
28902 case IX86_BUILTIN_GATHERSIV4DF:
28903 icode = CODE_FOR_avx2_gathersiv4df;
28904 goto gather_gen;
28905 case IX86_BUILTIN_GATHERDIV2DF:
28906 icode = CODE_FOR_avx2_gatherdiv2df;
28907 goto gather_gen;
28908 case IX86_BUILTIN_GATHERDIV4DF:
28909 icode = CODE_FOR_avx2_gatherdiv4df;
28910 goto gather_gen;
28911 case IX86_BUILTIN_GATHERSIV4SF:
28912 icode = CODE_FOR_avx2_gathersiv4sf;
28913 goto gather_gen;
28914 case IX86_BUILTIN_GATHERSIV8SF:
28915 icode = CODE_FOR_avx2_gathersiv8sf;
28916 goto gather_gen;
28917 case IX86_BUILTIN_GATHERDIV4SF:
28918 icode = CODE_FOR_avx2_gatherdiv4sf;
28919 goto gather_gen;
28920 case IX86_BUILTIN_GATHERDIV8SF:
28921 icode = CODE_FOR_avx2_gatherdiv4sf256;
28922 goto gather_gen;
28923 case IX86_BUILTIN_GATHERSIV2DI:
28924 icode = CODE_FOR_avx2_gathersiv2di;
28925 goto gather_gen;
28926 case IX86_BUILTIN_GATHERSIV4DI:
28927 icode = CODE_FOR_avx2_gathersiv4di;
28928 goto gather_gen;
28929 case IX86_BUILTIN_GATHERDIV2DI:
28930 icode = CODE_FOR_avx2_gatherdiv2di;
28931 goto gather_gen;
28932 case IX86_BUILTIN_GATHERDIV4DI:
28933 icode = CODE_FOR_avx2_gatherdiv4di;
28934 goto gather_gen;
28935 case IX86_BUILTIN_GATHERSIV4SI:
28936 icode = CODE_FOR_avx2_gathersiv4si;
28937 goto gather_gen;
28938 case IX86_BUILTIN_GATHERSIV8SI:
28939 icode = CODE_FOR_avx2_gathersiv8si;
28940 goto gather_gen;
28941 case IX86_BUILTIN_GATHERDIV4SI:
28942 icode = CODE_FOR_avx2_gatherdiv4si;
28943 goto gather_gen;
28944 case IX86_BUILTIN_GATHERDIV8SI:
28945 icode = CODE_FOR_avx2_gatherdiv4si256;
28946
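  /* All of the gather builtins take five arguments: the source/merge
     vector, the base pointer, the index vector, the mask vector and the
     scale, which must be 1, 2, 4 or 8.  */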
28947 gather_gen:
28948 arg0 = CALL_EXPR_ARG (exp, 0);
28949 arg1 = CALL_EXPR_ARG (exp, 1);
28950 arg2 = CALL_EXPR_ARG (exp, 2);
28951 arg3 = CALL_EXPR_ARG (exp, 3);
28952 arg4 = CALL_EXPR_ARG (exp, 4);
28953 op0 = expand_normal (arg0);
28954 op1 = expand_normal (arg1);
28955 op2 = expand_normal (arg2);
28956 op3 = expand_normal (arg3);
28957 op4 = expand_normal (arg4);
28958 /* Note the arg order is different from the operand order. */
28959 mode0 = insn_data[icode].operand[1].mode;
28960 mode2 = insn_data[icode].operand[3].mode;
28961 mode3 = insn_data[icode].operand[4].mode;
28962 mode4 = insn_data[icode].operand[5].mode;
28963
28964 if (target == NULL_RTX)
28965 target = gen_reg_rtx (insn_data[icode].operand[0].mode);
28966
28967 /* Force the memory operand to use only a base register here; we
28968 don't want to do this for the memory operands of other builtin
28969 functions. */
28970 if (GET_MODE (op1) != Pmode)
28971 op1 = convert_to_mode (Pmode, op1, 1);
28972 op1 = force_reg (Pmode, op1);
28973
28974 if (!insn_data[icode].operand[1].predicate (op0, mode0))
28975 op0 = copy_to_mode_reg (mode0, op0);
28976 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
28977 op1 = copy_to_mode_reg (Pmode, op1);
28978 if (!insn_data[icode].operand[3].predicate (op2, mode2))
28979 op2 = copy_to_mode_reg (mode2, op2);
28980 if (!insn_data[icode].operand[4].predicate (op3, mode3))
28981 op3 = copy_to_mode_reg (mode3, op3);
28982 if (!insn_data[icode].operand[5].predicate (op4, mode4))
28983 {
28984 error ("the last argument must be scale 1, 2, 4 or 8");
28985 return const0_rtx;
28986 }
28987 pat = GEN_FCN (icode) (target, op0, op1, op2, op3, op4);
28988 if (! pat)
28989 return const0_rtx;
28990 emit_insn (pat);
28991 return target;
28992
28993 default:
28994 break;
28995 }
28996
28997 for (i = 0, d = bdesc_special_args;
28998 i < ARRAY_SIZE (bdesc_special_args);
28999 i++, d++)
29000 if (d->code == fcode)
29001 return ix86_expand_special_args_builtin (d, exp, target);
29002
29003 for (i = 0, d = bdesc_args;
29004 i < ARRAY_SIZE (bdesc_args);
29005 i++, d++)
29006 if (d->code == fcode)
29007 switch (fcode)
29008 {
29009 case IX86_BUILTIN_FABSQ:
29010 case IX86_BUILTIN_COPYSIGNQ:
29011 if (!TARGET_SSE2)
29012 /* Emit a normal call if SSE2 isn't available. */
29013 return expand_call (exp, target, ignore);
29014 default:
29015 return ix86_expand_args_builtin (d, exp, target);
29016 }
29017
29018 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
29019 if (d->code == fcode)
29020 return ix86_expand_sse_comi (d, exp, target);
29021
29022 for (i = 0, d = bdesc_pcmpestr;
29023 i < ARRAY_SIZE (bdesc_pcmpestr);
29024 i++, d++)
29025 if (d->code == fcode)
29026 return ix86_expand_sse_pcmpestr (d, exp, target);
29027
29028 for (i = 0, d = bdesc_pcmpistr;
29029 i < ARRAY_SIZE (bdesc_pcmpistr);
29030 i++, d++)
29031 if (d->code == fcode)
29032 return ix86_expand_sse_pcmpistr (d, exp, target);
29033
29034 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
29035 if (d->code == fcode)
29036 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
29037 (enum ix86_builtin_func_type)
29038 d->flag, d->comparison);
29039
29040 gcc_unreachable ();
29041 }
29042
29043 /* Returns a function decl for a vectorized version of the builtin function
29044 with builtin function code FN and the result vector type TYPE, or NULL_TREE
29045 if it is not available. */
29046
29047 static tree
29048 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
29049 tree type_in)
29050 {
29051 enum machine_mode in_mode, out_mode;
29052 int in_n, out_n;
29053 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
29054
29055 if (TREE_CODE (type_out) != VECTOR_TYPE
29056 || TREE_CODE (type_in) != VECTOR_TYPE
29057 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
29058 return NULL_TREE;
29059
29060 out_mode = TYPE_MODE (TREE_TYPE (type_out));
29061 out_n = TYPE_VECTOR_SUBPARTS (type_out);
29062 in_mode = TYPE_MODE (TREE_TYPE (type_in));
29063 in_n = TYPE_VECTOR_SUBPARTS (type_in);
29064
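  /* For example, when the vectorizer asks for BUILT_IN_SQRT with a V2DF
     result and V2DF argument it gets the decl for IX86_BUILTIN_SQRTPD
     back, so the loop can use sqrtpd instead of a scalar libcall.  */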
29065 switch (fn)
29066 {
29067 case BUILT_IN_SQRT:
29068 if (out_mode == DFmode && in_mode == DFmode)
29069 {
29070 if (out_n == 2 && in_n == 2)
29071 return ix86_builtins[IX86_BUILTIN_SQRTPD];
29072 else if (out_n == 4 && in_n == 4)
29073 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
29074 }
29075 break;
29076
29077 case BUILT_IN_SQRTF:
29078 if (out_mode == SFmode && in_mode == SFmode)
29079 {
29080 if (out_n == 4 && in_n == 4)
29081 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
29082 else if (out_n == 8 && in_n == 8)
29083 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
29084 }
29085 break;
29086
29087 case BUILT_IN_LRINT:
29088 if (out_mode == SImode && out_n == 4
29089 && in_mode == DFmode && in_n == 2)
29090 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
29091 break;
29092
29093 case BUILT_IN_LRINTF:
29094 if (out_mode == SImode && in_mode == SFmode)
29095 {
29096 if (out_n == 4 && in_n == 4)
29097 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
29098 else if (out_n == 8 && in_n == 8)
29099 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
29100 }
29101 break;
29102
29103 case BUILT_IN_COPYSIGN:
29104 if (out_mode == DFmode && in_mode == DFmode)
29105 {
29106 if (out_n == 2 && in_n == 2)
29107 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
29108 else if (out_n == 4 && in_n == 4)
29109 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
29110 }
29111 break;
29112
29113 case BUILT_IN_COPYSIGNF:
29114 if (out_mode == SFmode && in_mode == SFmode)
29115 {
29116 if (out_n == 4 && in_n == 4)
29117 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
29118 else if (out_n == 8 && in_n == 8)
29119 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
29120 }
29121 break;
29122
29123 case BUILT_IN_FLOOR:
29124 /* The round insn does not trap on denormals. */
29125 if (flag_trapping_math || !TARGET_ROUND)
29126 break;
29127
29128 if (out_mode == DFmode && in_mode == DFmode)
29129 {
29130 if (out_n == 2 && in_n == 2)
29131 return ix86_builtins[IX86_BUILTIN_FLOORPD];
29132 else if (out_n == 4 && in_n == 4)
29133 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
29134 }
29135 break;
29136
29137 case BUILT_IN_FLOORF:
29138 /* The round insn does not trap on denormals. */
29139 if (flag_trapping_math || !TARGET_ROUND)
29140 break;
29141
29142 if (out_mode == SFmode && in_mode == SFmode)
29143 {
29144 if (out_n == 4 && in_n == 4)
29145 return ix86_builtins[IX86_BUILTIN_FLOORPS];
29146 else if (out_n == 8 && in_n == 8)
29147 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
29148 }
29149 break;
29150
29151 case BUILT_IN_CEIL:
29152 /* The round insn does not trap on denormals. */
29153 if (flag_trapping_math || !TARGET_ROUND)
29154 break;
29155
29156 if (out_mode == DFmode && in_mode == DFmode)
29157 {
29158 if (out_n == 2 && in_n == 2)
29159 return ix86_builtins[IX86_BUILTIN_CEILPD];
29160 else if (out_n == 4 && in_n == 4)
29161 return ix86_builtins[IX86_BUILTIN_CEILPD256];
29162 }
29163 break;
29164
29165 case BUILT_IN_CEILF:
29166 /* The round insn does not trap on denormals. */
29167 if (flag_trapping_math || !TARGET_ROUND)
29168 break;
29169
29170 if (out_mode == SFmode && in_mode == SFmode)
29171 {
29172 if (out_n == 4 && in_n == 4)
29173 return ix86_builtins[IX86_BUILTIN_CEILPS];
29174 else if (out_n == 8 && in_n == 8)
29175 return ix86_builtins[IX86_BUILTIN_CEILPS256];
29176 }
29177 break;
29178
29179 case BUILT_IN_TRUNC:
29180 /* The round insn does not trap on denormals. */
29181 if (flag_trapping_math || !TARGET_ROUND)
29182 break;
29183
29184 if (out_mode == DFmode && in_mode == DFmode)
29185 {
29186 if (out_n == 2 && in_n == 2)
29187 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
29188 else if (out_n == 4 && in_n == 4)
29189 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
29190 }
29191 break;
29192
29193 case BUILT_IN_TRUNCF:
29194 /* The round insn does not trap on denormals. */
29195 if (flag_trapping_math || !TARGET_ROUND)
29196 break;
29197
29198 if (out_mode == SFmode && in_mode == SFmode)
29199 {
29200 if (out_n == 4 && in_n == 4)
29201 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
29202 else if (out_n == 8 && in_n == 8)
29203 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
29204 }
29205 break;
29206
29207 case BUILT_IN_RINT:
29208 /* The round insn does not trap on denormals. */
29209 if (flag_trapping_math || !TARGET_ROUND)
29210 break;
29211
29212 if (out_mode == DFmode && in_mode == DFmode)
29213 {
29214 if (out_n == 2 && in_n == 2)
29215 return ix86_builtins[IX86_BUILTIN_RINTPD];
29216 else if (out_n == 4 && in_n == 4)
29217 return ix86_builtins[IX86_BUILTIN_RINTPD256];
29218 }
29219 break;
29220
29221 case BUILT_IN_RINTF:
29222 /* The round insn does not trap on denormals. */
29223 if (flag_trapping_math || !TARGET_ROUND)
29224 break;
29225
29226 if (out_mode == SFmode && in_mode == SFmode)
29227 {
29228 if (out_n == 4 && in_n == 4)
29229 return ix86_builtins[IX86_BUILTIN_RINTPS];
29230 else if (out_n == 8 && in_n == 8)
29231 return ix86_builtins[IX86_BUILTIN_RINTPS256];
29232 }
29233 break;
29234
29235 case BUILT_IN_ROUND:
29236 /* The round insn does not trap on denormals. */
29237 if (flag_trapping_math || !TARGET_ROUND)
29238 break;
29239
29240 if (out_mode == DFmode && in_mode == DFmode)
29241 {
29242 if (out_n == 2 && in_n == 2)
29243 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
29244 else if (out_n == 4 && in_n == 4)
29245 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
29246 }
29247 break;
29248
29249 case BUILT_IN_ROUNDF:
29250 /* The round insn does not trap on denormals. */
29251 if (flag_trapping_math || !TARGET_ROUND)
29252 break;
29253
29254 if (out_mode == SFmode && in_mode == SFmode)
29255 {
29256 if (out_n == 4 && in_n == 4)
29257 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
29258 else if (out_n == 8 && in_n == 8)
29259 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
29260 }
29261 break;
29262
29263 case BUILT_IN_FMA:
29264 if (out_mode == DFmode && in_mode == DFmode)
29265 {
29266 if (out_n == 2 && in_n == 2)
29267 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
29268 if (out_n == 4 && in_n == 4)
29269 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
29270 }
29271 break;
29272
29273 case BUILT_IN_FMAF:
29274 if (out_mode == SFmode && in_mode == SFmode)
29275 {
29276 if (out_n == 4 && in_n == 4)
29277 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
29278 if (out_n == 8 && in_n == 8)
29279 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
29280 }
29281 break;
29282
29283 default:
29284 break;
29285 }
29286
29287 /* Dispatch to a handler for a vectorization library. */
29288 if (ix86_veclib_handler)
29289 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
29290 type_in);
29291
29292 return NULL_TREE;
29293 }
29294
29295 /* Handler for an SVML-style interface to
29296 a library with vectorized intrinsics. */
29297
29298 static tree
29299 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
29300 {
29301 char name[20];
29302 tree fntype, new_fndecl, args;
29303 unsigned arity;
29304 const char *bname;
29305 enum machine_mode el_mode, in_mode;
29306 int n, in_n;
29307
29308 /* The SVML is suitable for unsafe math only. */
29309 if (!flag_unsafe_math_optimizations)
29310 return NULL_TREE;
29311
29312 el_mode = TYPE_MODE (TREE_TYPE (type_out));
29313 n = TYPE_VECTOR_SUBPARTS (type_out);
29314 in_mode = TYPE_MODE (TREE_TYPE (type_in));
29315 in_n = TYPE_VECTOR_SUBPARTS (type_in);
29316 if (el_mode != in_mode
29317 || n != in_n)
29318 return NULL_TREE;
29319
29320 switch (fn)
29321 {
29322 case BUILT_IN_EXP:
29323 case BUILT_IN_LOG:
29324 case BUILT_IN_LOG10:
29325 case BUILT_IN_POW:
29326 case BUILT_IN_TANH:
29327 case BUILT_IN_TAN:
29328 case BUILT_IN_ATAN:
29329 case BUILT_IN_ATAN2:
29330 case BUILT_IN_ATANH:
29331 case BUILT_IN_CBRT:
29332 case BUILT_IN_SINH:
29333 case BUILT_IN_SIN:
29334 case BUILT_IN_ASINH:
29335 case BUILT_IN_ASIN:
29336 case BUILT_IN_COSH:
29337 case BUILT_IN_COS:
29338 case BUILT_IN_ACOSH:
29339 case BUILT_IN_ACOS:
29340 if (el_mode != DFmode || n != 2)
29341 return NULL_TREE;
29342 break;
29343
29344 case BUILT_IN_EXPF:
29345 case BUILT_IN_LOGF:
29346 case BUILT_IN_LOG10F:
29347 case BUILT_IN_POWF:
29348 case BUILT_IN_TANHF:
29349 case BUILT_IN_TANF:
29350 case BUILT_IN_ATANF:
29351 case BUILT_IN_ATAN2F:
29352 case BUILT_IN_ATANHF:
29353 case BUILT_IN_CBRTF:
29354 case BUILT_IN_SINHF:
29355 case BUILT_IN_SINF:
29356 case BUILT_IN_ASINHF:
29357 case BUILT_IN_ASINF:
29358 case BUILT_IN_COSHF:
29359 case BUILT_IN_COSF:
29360 case BUILT_IN_ACOSHF:
29361 case BUILT_IN_ACOSF:
29362 if (el_mode != SFmode || n != 4)
29363 return NULL_TREE;
29364 break;
29365
29366 default:
29367 return NULL_TREE;
29368 }
29369
29370 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
29371
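  /* Build the SVML entry point name: "vmld<Func>2" for the two-element
     double variants and "vmls<Func>4" for the four-element float
     variants, e.g. sin -> vmldSin2 and sinf -> vmlsSin4.  log is
     special-cased because SVML spells it "Ln".  */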
29372 if (fn == BUILT_IN_LOGF)
29373 strcpy (name, "vmlsLn4");
29374 else if (fn == BUILT_IN_LOG)
29375 strcpy (name, "vmldLn2");
29376 else if (n == 4)
29377 {
29378 sprintf (name, "vmls%s", bname+10);
29379 name[strlen (name)-1] = '4';
29380 }
29381 else
29382 sprintf (name, "vmld%s2", bname+10);
29383
29384 /* Convert to uppercase. */
29385 name[4] &= ~0x20;
29386
29387 arity = 0;
29388 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
29389 args;
29390 args = TREE_CHAIN (args))
29391 arity++;
29392
29393 if (arity == 1)
29394 fntype = build_function_type_list (type_out, type_in, NULL);
29395 else
29396 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
29397
29398 /* Build a function declaration for the vectorized function. */
29399 new_fndecl = build_decl (BUILTINS_LOCATION,
29400 FUNCTION_DECL, get_identifier (name), fntype);
29401 TREE_PUBLIC (new_fndecl) = 1;
29402 DECL_EXTERNAL (new_fndecl) = 1;
29403 DECL_IS_NOVOPS (new_fndecl) = 1;
29404 TREE_READONLY (new_fndecl) = 1;
29405
29406 return new_fndecl;
29407 }
29408
29409 /* Handler for an ACML-style interface to
29410 a library with vectorized intrinsics. */
29411
29412 static tree
29413 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
29414 {
29415 char name[20] = "__vr.._";
29416 tree fntype, new_fndecl, args;
29417 unsigned arity;
29418 const char *bname;
29419 enum machine_mode el_mode, in_mode;
29420 int n, in_n;
29421
29422 /* ACML is 64-bit only and suitable for unsafe math only, as it
29423 does not correctly support parts of IEEE arithmetic, such as
29424 denormals, with the required precision. */
29425 if (!TARGET_64BIT
29426 || !flag_unsafe_math_optimizations)
29427 return NULL_TREE;
29428
29429 el_mode = TYPE_MODE (TREE_TYPE (type_out));
29430 n = TYPE_VECTOR_SUBPARTS (type_out);
29431 in_mode = TYPE_MODE (TREE_TYPE (type_in));
29432 in_n = TYPE_VECTOR_SUBPARTS (type_in);
29433 if (el_mode != in_mode
29434 || n != in_n)
29435 return NULL_TREE;
29436
29437 switch (fn)
29438 {
29439 case BUILT_IN_SIN:
29440 case BUILT_IN_COS:
29441 case BUILT_IN_EXP:
29442 case BUILT_IN_LOG:
29443 case BUILT_IN_LOG2:
29444 case BUILT_IN_LOG10:
29445 name[4] = 'd';
29446 name[5] = '2';
29447 if (el_mode != DFmode
29448 || n != 2)
29449 return NULL_TREE;
29450 break;
29451
29452 case BUILT_IN_SINF:
29453 case BUILT_IN_COSF:
29454 case BUILT_IN_EXPF:
29455 case BUILT_IN_POWF:
29456 case BUILT_IN_LOGF:
29457 case BUILT_IN_LOG2F:
29458 case BUILT_IN_LOG10F:
29459 name[4] = 's';
29460 name[5] = '4';
29461 if (el_mode != SFmode
29462 || n != 4)
29463 return NULL_TREE;
29464 break;
29465
29466 default:
29467 return NULL_TREE;
29468 }
29469
29470 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
29471 sprintf (name + 7, "%s", bname+10);
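/* For example, BUILT_IN_SIN yields "__vrd2_sin" and BUILT_IN_SINF
   yields "__vrs4_sinf".  */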
29472
29473 arity = 0;
29474 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
29475 args;
29476 args = TREE_CHAIN (args))
29477 arity++;
29478
29479 if (arity == 1)
29480 fntype = build_function_type_list (type_out, type_in, NULL);
29481 else
29482 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
29483
29484 /* Build a function declaration for the vectorized function. */
29485 new_fndecl = build_decl (BUILTINS_LOCATION,
29486 FUNCTION_DECL, get_identifier (name), fntype);
29487 TREE_PUBLIC (new_fndecl) = 1;
29488 DECL_EXTERNAL (new_fndecl) = 1;
29489 DECL_IS_NOVOPS (new_fndecl) = 1;
29490 TREE_READONLY (new_fndecl) = 1;
29491
29492 return new_fndecl;
29493 }
29494
29495 /* Returns a code for a target-specific builtin that implements
29496 reciprocal of the function, or NULL_TREE if not available. */
29497
29498 static tree
29499 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
29500 bool sqrt ATTRIBUTE_UNUSED)
29501 {
29502 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
29503 && flag_finite_math_only && !flag_trapping_math
29504 && flag_unsafe_math_optimizations))
29505 return NULL_TREE;
29506
29507 if (md_fn)
29508 /* Machine dependent builtins. */
29509 switch (fn)
29510 {
29511 /* Vectorized version of sqrt to rsqrt conversion. */
29512 case IX86_BUILTIN_SQRTPS_NR:
29513 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
29514
29515 case IX86_BUILTIN_SQRTPS_NR256:
29516 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
29517
29518 default:
29519 return NULL_TREE;
29520 }
29521 else
29522 /* Normal builtins. */
29523 switch (fn)
29524 {
29525 /* Sqrt to rsqrt conversion. */
29526 case BUILT_IN_SQRTF:
29527 return ix86_builtins[IX86_BUILTIN_RSQRTF];
29528
29529 default:
29530 return NULL_TREE;
29531 }
29532 }
29533 \f
29534 /* Helper for avx_vpermilps256_operand et al. This is also used by
29535 the expansion functions to turn the parallel back into a mask.
29536 The return value is 0 for no match and the imm8+1 for a match. */
29537
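/* For example, a V8SFmode parallel selecting elements (1 0 3 2 5 4 7 6)
   encodes as imm8 0xb1, so this function returns 0xb2.  */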
29538 int
29539 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
29540 {
29541 unsigned i, nelt = GET_MODE_NUNITS (mode);
29542 unsigned mask = 0;
29543 unsigned char ipar[8];
29544
29545 if (XVECLEN (par, 0) != (int) nelt)
29546 return 0;
29547
29548 /* Validate that all of the elements are constants, and not totally
29549 out of range. Copy the data into an integral array to make the
29550 subsequent checks easier. */
29551 for (i = 0; i < nelt; ++i)
29552 {
29553 rtx er = XVECEXP (par, 0, i);
29554 unsigned HOST_WIDE_INT ei;
29555
29556 if (!CONST_INT_P (er))
29557 return 0;
29558 ei = INTVAL (er);
29559 if (ei >= nelt)
29560 return 0;
29561 ipar[i] = ei;
29562 }
29563
29564 switch (mode)
29565 {
29566 case V4DFmode:
29567 /* In the 256-bit DFmode case, we can only move elements within
29568 a 128-bit lane. */
29569 for (i = 0; i < 2; ++i)
29570 {
29571 if (ipar[i] >= 2)
29572 return 0;
29573 mask |= ipar[i] << i;
29574 }
29575 for (i = 2; i < 4; ++i)
29576 {
29577 if (ipar[i] < 2)
29578 return 0;
29579 mask |= (ipar[i] - 2) << i;
29580 }
29581 break;
29582
29583 case V8SFmode:
29584 /* In the 256-bit SFmode case, we have full freedom of movement
29585 within the low 128-bit lane, but the high 128-bit lane must
29586 mirror the exact same pattern. */
29587 for (i = 0; i < 4; ++i)
29588 if (ipar[i] + 4 != ipar[i + 4])
29589 return 0;
29590 nelt = 4;
29591 /* FALLTHRU */
29592
29593 case V2DFmode:
29594 case V4SFmode:
29595 /* In the 128-bit case, we have full freedom in the placement of
29596 the elements from the source operand. */
29597 for (i = 0; i < nelt; ++i)
29598 mask |= ipar[i] << (i * (nelt / 2));
29599 break;
29600
29601 default:
29602 gcc_unreachable ();
29603 }
29604
29605 /* Make sure success has a non-zero value by adding one. */
29606 return mask + 1;
29607 }
29608
29609 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
29610 the expansion functions to turn the parallel back into a mask.
29611 The return value is 0 for no match and the imm8+1 for a match. */
29612
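/* For example, a V4DFmode parallel selecting elements (0 1 4 5), i.e. the
   low 128-bit lane of each source operand, encodes as imm8 0x20, so this
   function returns 0x21.  */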
29613 int
29614 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
29615 {
29616 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
29617 unsigned mask = 0;
29618 unsigned char ipar[8];
29619
29620 if (XVECLEN (par, 0) != (int) nelt)
29621 return 0;
29622
29623 /* Validate that all of the elements are constants, and not totally
29624 out of range. Copy the data into an integral array to make the
29625 subsequent checks easier. */
29626 for (i = 0; i < nelt; ++i)
29627 {
29628 rtx er = XVECEXP (par, 0, i);
29629 unsigned HOST_WIDE_INT ei;
29630
29631 if (!CONST_INT_P (er))
29632 return 0;
29633 ei = INTVAL (er);
29634 if (ei >= 2 * nelt)
29635 return 0;
29636 ipar[i] = ei;
29637 }
29638
29639 /* Validate that each half of the permute consists of consecutive elements. */
29640 for (i = 0; i < nelt2 - 1; ++i)
29641 if (ipar[i] + 1 != ipar[i + 1])
29642 return 0;
29643 for (i = nelt2; i < nelt - 1; ++i)
29644 if (ipar[i] + 1 != ipar[i + 1])
29645 return 0;
29646
29647 /* Reconstruct the mask. */
29648 for (i = 0; i < 2; ++i)
29649 {
29650 unsigned e = ipar[i * nelt2];
29651 if (e % nelt2)
29652 return 0;
29653 e /= nelt2;
29654 mask |= e << (i * 4);
29655 }
29656
29657 /* Make sure success has a non-zero value by adding one. */
29658 return mask + 1;
29659 }
29660 \f
29661
29662 /* Store OPERAND to memory after reload is completed. This means
29663 that we can't easily use assign_stack_local. */
29664 rtx
29665 ix86_force_to_memory (enum machine_mode mode, rtx operand)
29666 {
29667 rtx result;
29668
29669 gcc_assert (reload_completed);
29670 if (ix86_using_red_zone ())
29671 {
29672 result = gen_rtx_MEM (mode,
29673 gen_rtx_PLUS (Pmode,
29674 stack_pointer_rtx,
29675 GEN_INT (-RED_ZONE_SIZE)));
29676 emit_move_insn (result, operand);
29677 }
29678 else if (TARGET_64BIT)
29679 {
29680 switch (mode)
29681 {
29682 case HImode:
29683 case SImode:
29684 operand = gen_lowpart (DImode, operand);
29685 /* FALLTHRU */
29686 case DImode:
29687 emit_insn (
29688 gen_rtx_SET (VOIDmode,
29689 gen_rtx_MEM (DImode,
29690 gen_rtx_PRE_DEC (DImode,
29691 stack_pointer_rtx)),
29692 operand));
29693 break;
29694 default:
29695 gcc_unreachable ();
29696 }
29697 result = gen_rtx_MEM (mode, stack_pointer_rtx);
29698 }
29699 else
29700 {
29701 switch (mode)
29702 {
29703 case DImode:
29704 {
29705 rtx operands[2];
29706 split_double_mode (mode, &operand, 1, operands, operands + 1);
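/* Push the high word first so that the low word ends up at the new
   stack pointer and the value is laid out in memory as a DImode.  */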
29707 emit_insn (
29708 gen_rtx_SET (VOIDmode,
29709 gen_rtx_MEM (SImode,
29710 gen_rtx_PRE_DEC (Pmode,
29711 stack_pointer_rtx)),
29712 operands[1]));
29713 emit_insn (
29714 gen_rtx_SET (VOIDmode,
29715 gen_rtx_MEM (SImode,
29716 gen_rtx_PRE_DEC (Pmode,
29717 stack_pointer_rtx)),
29718 operands[0]));
29719 }
29720 break;
29721 case HImode:
29722 /* Store HImodes as SImodes. */
29723 operand = gen_lowpart (SImode, operand);
29724 /* FALLTHRU */
29725 case SImode:
29726 emit_insn (
29727 gen_rtx_SET (VOIDmode,
29728 gen_rtx_MEM (GET_MODE (operand),
29729 gen_rtx_PRE_DEC (SImode,
29730 stack_pointer_rtx)),
29731 operand));
29732 break;
29733 default:
29734 gcc_unreachable ();
29735 }
29736 result = gen_rtx_MEM (mode, stack_pointer_rtx);
29737 }
29738 return result;
29739 }
29740
29741 /* Free the operand from memory. */
29742 void
29743 ix86_free_from_memory (enum machine_mode mode)
29744 {
29745 if (!ix86_using_red_zone ())
29746 {
29747 int size;
29748
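/* Match the amount pushed by ix86_force_to_memory: 8 bytes for 64-bit
   targets or for a 32-bit DImode value, 4 bytes otherwise.  */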
29749 if (mode == DImode || TARGET_64BIT)
29750 size = 8;
29751 else
29752 size = 4;
29753 /* Use LEA to deallocate stack space. In peephole2 it will be converted
29754 to a pop or add instruction if registers are available. */
29755 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
29756 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
29757 GEN_INT (size))));
29758 }
29759 }
29760
29761 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
29762
29763 Put float CONST_DOUBLE in the constant pool instead of fp regs.
29764 QImode must go into class Q_REGS.
29765 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
29766 movdf to do mem-to-mem moves through integer regs. */
29767
29768 static reg_class_t
29769 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
29770 {
29771 enum machine_mode mode = GET_MODE (x);
29772
29773 /* We're only allowed to return a subclass of CLASS. Many of the
29774 following checks fail for NO_REGS, so eliminate that early. */
29775 if (regclass == NO_REGS)
29776 return NO_REGS;
29777
29778 /* All classes can load zeros. */
29779 if (x == CONST0_RTX (mode))
29780 return regclass;
29781
29782 /* Force constants into memory if we are loading a (nonzero) constant into
29783 an MMX or SSE register. This is because there are no MMX/SSE instructions
29784 to load from a constant. */
29785 if (CONSTANT_P (x)
29786 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
29787 return NO_REGS;
29788
29789 /* Prefer SSE regs only, if we can use them for math. */
29790 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
29791 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
29792
29793 /* Floating-point constants need more complex checks. */
29794 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
29795 {
29796 /* General regs can load everything. */
29797 if (reg_class_subset_p (regclass, GENERAL_REGS))
29798 return regclass;
29799
29800 /* Floats can load 0 and 1 plus some others. Note that we eliminated
29801 zero above. We only want to wind up preferring 80387 registers if
29802 we plan on doing computation with them. */
29803 if (TARGET_80387
29804 && standard_80387_constant_p (x) > 0)
29805 {
29806 /* Limit class to non-sse. */
29807 if (regclass == FLOAT_SSE_REGS)
29808 return FLOAT_REGS;
29809 if (regclass == FP_TOP_SSE_REGS)
29810 return FP_TOP_REG;
29811 if (regclass == FP_SECOND_SSE_REGS)
29812 return FP_SECOND_REG;
29813 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
29814 return regclass;
29815 }
29816
29817 return NO_REGS;
29818 }
29819
29820 /* Generally when we see PLUS here, it's the function invariant
29821 (plus soft-fp const_int), which can only be computed into general
29822 regs. */
29823 if (GET_CODE (x) == PLUS)
29824 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
29825
29826 /* QImode constants are easy to load, but non-constant QImode data
29827 must go into Q_REGS. */
29828 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
29829 {
29830 if (reg_class_subset_p (regclass, Q_REGS))
29831 return regclass;
29832 if (reg_class_subset_p (Q_REGS, regclass))
29833 return Q_REGS;
29834 return NO_REGS;
29835 }
29836
29837 return regclass;
29838 }
29839
29840 /* Discourage putting floating-point values in SSE registers unless
29841 SSE math is being used, and likewise for the 387 registers. */
29842 static reg_class_t
29843 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
29844 {
29845 enum machine_mode mode = GET_MODE (x);
29846
29847 /* Restrict the output reload class to the register bank that we are doing
29848 math on. If we would like not to return a subset of CLASS, reject this
29849 alternative: if reload cannot do this, it will still use its choice. */
29850 mode = GET_MODE (x);
29851 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
29852 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
29853
29854 if (X87_FLOAT_MODE_P (mode))
29855 {
29856 if (regclass == FP_TOP_SSE_REGS)
29857 return FP_TOP_REG;
29858 else if (regclass == FP_SECOND_SSE_REGS)
29859 return FP_SECOND_REG;
29860 else
29861 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
29862 }
29863
29864 return regclass;
29865 }
29866
29867 static reg_class_t
29868 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
29869 enum machine_mode mode, secondary_reload_info *sri)
29870 {
29871 /* Double-word spills from general registers to non-offsettable memory
29872 references (zero-extended addresses) require special handling. */
29873 if (TARGET_64BIT
29874 && MEM_P (x)
29875 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
29876 && rclass == GENERAL_REGS
29877 && !offsettable_memref_p (x))
29878 {
29879 sri->icode = (in_p
29880 ? CODE_FOR_reload_noff_load
29881 : CODE_FOR_reload_noff_store);
29882 /* Add the cost of moving address to a temporary. */
29883 sri->extra_cost = 1;
29884
29885 return NO_REGS;
29886 }
29887
29888 /* QImode spills from non-QI registers require an
29889 intermediate register on 32-bit targets. */
29890 if (!TARGET_64BIT
29891 && !in_p && mode == QImode
29892 && (rclass == GENERAL_REGS
29893 || rclass == LEGACY_REGS
29894 || rclass == INDEX_REGS))
29895 {
29896 int regno;
29897
29898 if (REG_P (x))
29899 regno = REGNO (x);
29900 else
29901 regno = -1;
29902
29903 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
29904 regno = true_regnum (x);
29905
29906 /* Return Q_REGS if the operand is in memory. */
29907 if (regno == -1)
29908 return Q_REGS;
29909 }
29910
29911 /* This condition handles the corner case where an expression involving
29912 pointers gets vectorized. We're trying to use the address of a
29913 stack slot as a vector initializer.
29914
29915 (set (reg:V2DI 74 [ vect_cst_.2 ])
29916 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
29917
29918 Eventually frame gets turned into sp+offset like this:
29919
29920 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
29921 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
29922 (const_int 392 [0x188]))))
29923
29924 That later gets turned into:
29925
29926 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
29927 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
29928 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
29929
29930 We'll have the following reload recorded:
29931
29932 Reload 0: reload_in (DI) =
29933 (plus:DI (reg/f:DI 7 sp)
29934 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
29935 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
29936 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
29937 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
29938 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
29939 reload_reg_rtx: (reg:V2DI 22 xmm1)
29940
29941 Which isn't going to work since SSE instructions can't handle scalar
29942 additions. Returning GENERAL_REGS forces the addition into integer
29943 register and reload can handle subsequent reloads without problems. */
29944
29945 if (in_p && GET_CODE (x) == PLUS
29946 && SSE_CLASS_P (rclass)
29947 && SCALAR_INT_MODE_P (mode))
29948 return GENERAL_REGS;
29949
29950 return NO_REGS;
29951 }
29952
29953 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
29954
29955 static bool
29956 ix86_class_likely_spilled_p (reg_class_t rclass)
29957 {
29958 switch (rclass)
29959 {
29960 case AREG:
29961 case DREG:
29962 case CREG:
29963 case BREG:
29964 case AD_REGS:
29965 case SIREG:
29966 case DIREG:
29967 case SSE_FIRST_REG:
29968 case FP_TOP_REG:
29969 case FP_SECOND_REG:
29970 return true;
29971
29972 default:
29973 break;
29974 }
29975
29976 return false;
29977 }
29978
29979 /* If we are copying between general and FP registers, we need a memory
29980 location. The same is true for SSE and MMX registers.
29981
29982 To optimize register_move_cost performance, allow inline variant.
29983
29984 The macro can't work reliably when one of the CLASSES is a class containing
29985 registers from multiple units (SSE, MMX, integer). We avoid this by never
29986 combining those units in a single alternative in the machine description.
29987 Ensure that this constraint holds to avoid unexpected surprises.
29988
29989 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
29990 enforce these sanity checks. */
29991
29992 static inline bool
29993 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
29994 enum machine_mode mode, int strict)
29995 {
29996 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
29997 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
29998 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
29999 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
30000 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
30001 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
30002 {
30003 gcc_assert (!strict);
30004 return true;
30005 }
30006
30007 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
30008 return true;
30009
30010 /* ??? This is a lie. We do have moves between mmx/general, and
30011 between mmx/sse2. But by saying we need secondary memory we discourage the
30012 register allocator from using the mmx registers unless needed. */
30013 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
30014 return true;
30015
30016 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
30017 {
30018 /* SSE1 doesn't have any direct moves from other classes. */
30019 if (!TARGET_SSE2)
30020 return true;
30021
30022 /* If the target says that inter-unit moves are more expensive
30023 than moving through memory, then don't generate them. */
30024 if (!TARGET_INTER_UNIT_MOVES)
30025 return true;
30026
30027 /* Between SSE and general, we have moves no larger than word size. */
30028 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
30029 return true;
30030 }
30031
30032 return false;
30033 }
30034
30035 bool
30036 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
30037 enum machine_mode mode, int strict)
30038 {
30039 return inline_secondary_memory_needed (class1, class2, mode, strict);
30040 }
30041
30042 /* Implement the TARGET_CLASS_MAX_NREGS hook.
30043
30044 On the 80386, this is the size of MODE in words,
30045 except in the FP regs, where a single reg is always enough. */
30046
30047 static unsigned char
30048 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
30049 {
30050 if (MAYBE_INTEGER_CLASS_P (rclass))
30051 {
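      /* XFmode occupies 12 bytes (three words) on ia32 but is padded to
	 16 bytes (two words) on x86_64; XCmode is twice that.  */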
30052 if (mode == XFmode)
30053 return (TARGET_64BIT ? 2 : 3);
30054 else if (mode == XCmode)
30055 return (TARGET_64BIT ? 4 : 6);
30056 else
30057 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
30058 }
30059 else
30060 {
30061 if (COMPLEX_MODE_P (mode))
30062 return 2;
30063 else
30064 return 1;
30065 }
30066 }
30067
30068 /* Return true if the registers in CLASS cannot represent the change from
30069 modes FROM to TO. */
30070
30071 bool
30072 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
30073 enum reg_class regclass)
30074 {
30075 if (from == to)
30076 return false;
30077
30078 /* x87 registers can't do subreg at all, as all values are reformatted
30079 to extended precision. */
30080 if (MAYBE_FLOAT_CLASS_P (regclass))
30081 return true;
30082
30083 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
30084 {
30085 /* Vector registers do not support QI or HImode loads. If we don't
30086 disallow a change to these modes, reload will assume it's ok to
30087 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
30088 the vec_dupv4hi pattern. */
30089 if (GET_MODE_SIZE (from) < 4)
30090 return true;
30091
30092 /* Vector registers do not support subreg with nonzero offsets, which
30093 are otherwise valid for integer registers. Since we can't see
30094 whether we have a nonzero offset from here, prohibit all
30095 nonparadoxical subregs changing size. */
30096 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
30097 return true;
30098 }
30099
30100 return false;
30101 }
30102
30103 /* Return the cost of moving data of mode M between a
30104 register and memory. A value of 2 is the default; this cost is
30105 relative to those in `REGISTER_MOVE_COST'.
30106
30107 This function is used extensively by register_move_cost that is used to
30108 build tables at startup. Make it inline in this case.
30109 When IN is 2, return the maximum of the in and out move costs.
30110
30111 If moving between registers and memory is more expensive than
30112 between two registers, you should define this macro to express the
30113 relative cost.
30114
30115 Also model the increased cost of moving QImode registers in
30116 non-Q_REGS classes.
30117 */
30118 static inline int
30119 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
30120 int in)
30121 {
30122 int cost;
30123 if (FLOAT_CLASS_P (regclass))
30124 {
30125 int index;
30126 switch (mode)
30127 {
30128 case SFmode:
30129 index = 0;
30130 break;
30131 case DFmode:
30132 index = 1;
30133 break;
30134 case XFmode:
30135 index = 2;
30136 break;
30137 default:
30138 return 100;
30139 }
30140 if (in == 2)
30141 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
30142 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
30143 }
30144 if (SSE_CLASS_P (regclass))
30145 {
30146 int index;
30147 switch (GET_MODE_SIZE (mode))
30148 {
30149 case 4:
30150 index = 0;
30151 break;
30152 case 8:
30153 index = 1;
30154 break;
30155 case 16:
30156 index = 2;
30157 break;
30158 default:
30159 return 100;
30160 }
30161 if (in == 2)
30162 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
30163 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
30164 }
30165 if (MMX_CLASS_P (regclass))
30166 {
30167 int index;
30168 switch (GET_MODE_SIZE (mode))
30169 {
30170 case 4:
30171 index = 0;
30172 break;
30173 case 8:
30174 index = 1;
30175 break;
30176 default:
30177 return 100;
30178 }
30179 if (in == 2)
30180 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
30181 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
30182 }
30183 switch (GET_MODE_SIZE (mode))
30184 {
30185 case 1:
30186 if (Q_CLASS_P (regclass) || TARGET_64BIT)
30187 {
30188 if (!in)
30189 return ix86_cost->int_store[0];
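	  /* When partial register dependencies matter, byte loads are done
	     with movzbl rather than movb, so use that cost for loads.  */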
30190 if (TARGET_PARTIAL_REG_DEPENDENCY
30191 && optimize_function_for_speed_p (cfun))
30192 cost = ix86_cost->movzbl_load;
30193 else
30194 cost = ix86_cost->int_load[0];
30195 if (in == 2)
30196 return MAX (cost, ix86_cost->int_store[0]);
30197 return cost;
30198 }
30199 else
30200 {
30201 if (in == 2)
30202 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
30203 if (in)
30204 return ix86_cost->movzbl_load;
30205 else
30206 return ix86_cost->int_store[0] + 4;
30207 }
30208 break;
30209 case 2:
30210 if (in == 2)
30211 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
30212 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
30213 default:
30214 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
30215 if (mode == TFmode)
30216 mode = XFmode;
30217 if (in == 2)
30218 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
30219 else if (in)
30220 cost = ix86_cost->int_load[2];
30221 else
30222 cost = ix86_cost->int_store[2];
30223 return (cost * (((int) GET_MODE_SIZE (mode)
30224 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
30225 }
30226 }
30227
30228 static int
30229 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
30230 bool in)
30231 {
30232 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
30233 }
30234
30235
30236 /* Return the cost of moving data from a register in class CLASS1 to
30237 one in class CLASS2.
30238
30239 It is not required that the cost always equal 2 when FROM is the same as TO;
30240 on some machines it is expensive to move between registers if they are not
30241 general registers. */
30242
30243 static int
30244 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
30245 reg_class_t class2_i)
30246 {
30247 enum reg_class class1 = (enum reg_class) class1_i;
30248 enum reg_class class2 = (enum reg_class) class2_i;
30249
30250 /* In case we require secondary memory, compute cost of the store followed
30251 by load. In order to avoid bad register allocation choices, we need
30252 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
30253
30254 if (inline_secondary_memory_needed (class1, class2, mode, 0))
30255 {
30256 int cost = 1;
30257
30258 cost += inline_memory_move_cost (mode, class1, 2);
30259 cost += inline_memory_move_cost (mode, class2, 2);
30260
30261 /* In case of copying from a general purpose register we may emit multiple
30262 stores followed by a single load, causing a memory size mismatch stall.
30263 Count this as an arbitrarily high cost of 20. */
30264 if (targetm.class_max_nregs (class1, mode)
30265 > targetm.class_max_nregs (class2, mode))
30266 cost += 20;
30267
30268 /* In the case of FP/MMX moves, the registers actually overlap, and we
30269 have to switch modes in order to treat them differently. */
30270 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
30271 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
30272 cost += 20;
30273
30274 return cost;
30275 }
30276
30277 /* Moves between SSE/MMX and integer unit are expensive. */
30278 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
30279 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
30280
30281 /* ??? By keeping the returned value relatively high, we limit the number
30282 of moves between integer and MMX/SSE registers for all targets.
30283 Additionally, the high value prevents problems with x86_modes_tieable_p(),
30284 where integer modes in MMX/SSE registers are not tieable
30285 because of missing QImode and HImode moves to, from or between
30286 MMX/SSE registers. */
30287 return MAX (8, ix86_cost->mmxsse_to_integer);
30288
30289 if (MAYBE_FLOAT_CLASS_P (class1))
30290 return ix86_cost->fp_move;
30291 if (MAYBE_SSE_CLASS_P (class1))
30292 return ix86_cost->sse_move;
30293 if (MAYBE_MMX_CLASS_P (class1))
30294 return ix86_cost->mmx_move;
30295 return 2;
30296 }
30297
30298 /* Return TRUE if hard register REGNO can hold a value of machine-mode
30299 MODE. */
30300
30301 bool
30302 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
30303 {
30304 /* The flags register, and only the flags register, can hold CCmode values. */
30305 if (CC_REGNO_P (regno))
30306 return GET_MODE_CLASS (mode) == MODE_CC;
30307 if (GET_MODE_CLASS (mode) == MODE_CC
30308 || GET_MODE_CLASS (mode) == MODE_RANDOM
30309 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
30310 return false;
30311 if (FP_REGNO_P (regno))
30312 return VALID_FP_MODE_P (mode);
30313 if (SSE_REGNO_P (regno))
30314 {
30315 /* We implement the move patterns for all vector modes into and
30316 out of SSE registers, even when no operation instructions
30317 are available. OImode move is available only when AVX is
30318 enabled. */
30319 return ((TARGET_AVX && mode == OImode)
30320 || VALID_AVX256_REG_MODE (mode)
30321 || VALID_SSE_REG_MODE (mode)
30322 || VALID_SSE2_REG_MODE (mode)
30323 || VALID_MMX_REG_MODE (mode)
30324 || VALID_MMX_REG_MODE_3DNOW (mode));
30325 }
30326 if (MMX_REGNO_P (regno))
30327 {
30328 /* We implement the move patterns for 3DNOW modes even in MMX mode,
30329 so if the register is available at all, then we can move data of
30330 the given mode into or out of it. */
30331 return (VALID_MMX_REG_MODE (mode)
30332 || VALID_MMX_REG_MODE_3DNOW (mode));
30333 }
30334
30335 if (mode == QImode)
30336 {
30337 /* Take care with QImode values - they can be in non-QI regs,
30338 but then they do cause partial register stalls. */
30339 if (regno <= BX_REG || TARGET_64BIT)
30340 return true;
30341 if (!TARGET_PARTIAL_REG_STALL)
30342 return true;
30343 return !can_create_pseudo_p ();
30344 }
30345 /* We handle both integer and floats in the general purpose registers. */
30346 else if (VALID_INT_MODE_P (mode))
30347 return true;
30348 else if (VALID_FP_MODE_P (mode))
30349 return true;
30350 else if (VALID_DFP_MODE_P (mode))
30351 return true;
30352 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
30353 on to use that value in smaller contexts, this can easily force a
30354 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
30355 supporting DImode, allow it. */
30356 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
30357 return true;
30358
30359 return false;
30360 }
30361
30362 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
30363 tieable integer mode. */
30364
30365 static bool
30366 ix86_tieable_integer_mode_p (enum machine_mode mode)
30367 {
30368 switch (mode)
30369 {
30370 case HImode:
30371 case SImode:
30372 return true;
30373
30374 case QImode:
30375 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
30376
30377 case DImode:
30378 return TARGET_64BIT;
30379
30380 default:
30381 return false;
30382 }
30383 }
30384
30385 /* Return true if MODE1 is accessible in a register that can hold MODE2
30386 without copying. That is, all register classes that can hold MODE2
30387 can also hold MODE1. */
30388
30389 bool
30390 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
30391 {
30392 if (mode1 == mode2)
30393 return true;
30394
30395 if (ix86_tieable_integer_mode_p (mode1)
30396 && ix86_tieable_integer_mode_p (mode2))
30397 return true;
30398
30399 /* MODE2 being XFmode implies fp stack or general regs, which means we
30400 can tie any smaller floating point modes to it. Note that we do not
30401 tie this with TFmode. */
30402 if (mode2 == XFmode)
30403 return mode1 == SFmode || mode1 == DFmode;
30404
30405 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
30406 that we can tie it with SFmode. */
30407 if (mode2 == DFmode)
30408 return mode1 == SFmode;
30409
30410 /* If MODE2 is only appropriate for an SSE register, then tie with
30411 any other mode acceptable to SSE registers. */
30412 if (GET_MODE_SIZE (mode2) == 16
30413 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
30414 return (GET_MODE_SIZE (mode1) == 16
30415 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
30416
30417 /* If MODE2 is appropriate for an MMX register, then tie
30418 with any other mode acceptable to MMX registers. */
30419 if (GET_MODE_SIZE (mode2) == 8
30420 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
30421 return (GET_MODE_SIZE (mode1) == 8
30422 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
30423
30424 return false;
30425 }
30426
30427 /* Compute a (partial) cost for rtx X. Return true if the complete
30428 cost has been computed, and false if subexpressions should be
30429 scanned. In either case, *TOTAL contains the cost result. */
30430
30431 static bool
30432 ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
30433 bool speed)
30434 {
30435 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
30436 enum machine_mode mode = GET_MODE (x);
30437 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
30438
30439 switch (code)
30440 {
30441 case CONST_INT:
30442 case CONST:
30443 case LABEL_REF:
30444 case SYMBOL_REF:
30445 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
30446 *total = 3;
30447 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
30448 *total = 2;
30449 else if (flag_pic && SYMBOLIC_CONST (x)
30450 && (!TARGET_64BIT
30451 || (GET_CODE (x) != LABEL_REF
30452 && (GET_CODE (x) != SYMBOL_REF
30453 || !SYMBOL_REF_LOCAL_P (x)))))
30454 *total = 1;
30455 else
30456 *total = 0;
30457 return true;
30458
30459 case CONST_DOUBLE:
30460 if (mode == VOIDmode)
30461 *total = 0;
30462 else
30463 switch (standard_80387_constant_p (x))
30464 {
30465 case 1: /* 0.0 */
30466 *total = 1;
30467 break;
30468 default: /* Other constants */
30469 *total = 2;
30470 break;
30471 case 0:
30472 case -1:
30473 /* Start with (MEM (SYMBOL_REF)), since that's where
30474 it'll probably end up. Add a penalty for size. */
30475 *total = (COSTS_N_INSNS (1)
30476 + (flag_pic != 0 && !TARGET_64BIT)
30477 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
30478 break;
30479 }
30480 return true;
30481
30482 case ZERO_EXTEND:
30483 /* The zero extension is often completely free on x86_64, so make
30484 it as cheap as possible. */
30485 if (TARGET_64BIT && mode == DImode
30486 && GET_MODE (XEXP (x, 0)) == SImode)
30487 *total = 1;
30488 else if (TARGET_ZERO_EXTEND_WITH_AND)
30489 *total = cost->add;
30490 else
30491 *total = cost->movzx;
30492 return false;
30493
30494 case SIGN_EXTEND:
30495 *total = cost->movsx;
30496 return false;
30497
30498 case ASHIFT:
30499 if (CONST_INT_P (XEXP (x, 1))
30500 && (GET_MODE (XEXP (x, 0)) != DImode || TARGET_64BIT))
30501 {
30502 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
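	  /* A shift by one is as cheap as an add (add reg,reg); shifts by
	     two or three can be done with a single lea using the *4 or *8
	     scale, when lea is no more expensive than a constant shift.  */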
30503 if (value == 1)
30504 {
30505 *total = cost->add;
30506 return false;
30507 }
30508 if ((value == 2 || value == 3)
30509 && cost->lea <= cost->shift_const)
30510 {
30511 *total = cost->lea;
30512 return false;
30513 }
30514 }
30515 /* FALLTHRU */
30516
30517 case ROTATE:
30518 case ASHIFTRT:
30519 case LSHIFTRT:
30520 case ROTATERT:
30521 if (!TARGET_64BIT && GET_MODE (XEXP (x, 0)) == DImode)
30522 {
30523 if (CONST_INT_P (XEXP (x, 1)))
30524 {
30525 if (INTVAL (XEXP (x, 1)) > 32)
30526 *total = cost->shift_const + COSTS_N_INSNS (2);
30527 else
30528 *total = cost->shift_const * 2;
30529 }
30530 else
30531 {
30532 if (GET_CODE (XEXP (x, 1)) == AND)
30533 *total = cost->shift_var * 2;
30534 else
30535 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
30536 }
30537 }
30538 else
30539 {
30540 if (CONST_INT_P (XEXP (x, 1)))
30541 *total = cost->shift_const;
30542 else
30543 *total = cost->shift_var;
30544 }
30545 return false;
30546
30547 case FMA:
30548 {
30549 rtx sub;
30550
30551 gcc_assert (FLOAT_MODE_P (mode));
30552 gcc_assert (TARGET_FMA || TARGET_FMA4);
30553
30554 /* ??? SSE scalar/vector cost should be used here. */
30555 /* ??? Bald assumption that fma has the same cost as fmul. */
30556 *total = cost->fmul;
30557 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
30558
30559 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
30560 sub = XEXP (x, 0);
30561 if (GET_CODE (sub) == NEG)
30562 sub = XEXP (sub, 0);
30563 *total += rtx_cost (sub, FMA, 0, speed);
30564
30565 sub = XEXP (x, 2);
30566 if (GET_CODE (sub) == NEG)
30567 sub = XEXP (sub, 0);
30568 *total += rtx_cost (sub, FMA, 2, speed);
30569 return true;
30570 }
30571
30572 case MULT:
30573 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
30574 {
30575 /* ??? SSE scalar cost should be used here. */
30576 *total = cost->fmul;
30577 return false;
30578 }
30579 else if (X87_FLOAT_MODE_P (mode))
30580 {
30581 *total = cost->fmul;
30582 return false;
30583 }
30584 else if (FLOAT_MODE_P (mode))
30585 {
30586 /* ??? SSE vector cost should be used here. */
30587 *total = cost->fmul;
30588 return false;
30589 }
30590 else
30591 {
30592 rtx op0 = XEXP (x, 0);
30593 rtx op1 = XEXP (x, 1);
30594 int nbits;
30595 if (CONST_INT_P (XEXP (x, 1)))
30596 {
30597 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
30598 for (nbits = 0; value != 0; value &= value - 1)
30599 nbits++;
30600 }
30601 else
30602 /* This is arbitrary. */
30603 nbits = 7;
30604
30605 /* Compute costs correctly for widening multiplication. */
30606 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
30607 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
30608 == GET_MODE_SIZE (mode))
30609 {
30610 int is_mulwiden = 0;
30611 enum machine_mode inner_mode = GET_MODE (op0);
30612
30613 if (GET_CODE (op0) == GET_CODE (op1))
30614 is_mulwiden = 1, op1 = XEXP (op1, 0);
30615 else if (CONST_INT_P (op1))
30616 {
30617 if (GET_CODE (op0) == SIGN_EXTEND)
30618 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
30619 == INTVAL (op1);
30620 else
30621 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
30622 }
30623
30624 if (is_mulwiden)
30625 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
30626 }
30627
30628 *total = (cost->mult_init[MODE_INDEX (mode)]
30629 + nbits * cost->mult_bit
30630 + rtx_cost (op0, outer_code, opno, speed)
30631 + rtx_cost (op1, outer_code, opno, speed));
30632
30633 return true;
30634 }
30635
30636 case DIV:
30637 case UDIV:
30638 case MOD:
30639 case UMOD:
30640 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
30641 /* ??? SSE cost should be used here. */
30642 *total = cost->fdiv;
30643 else if (X87_FLOAT_MODE_P (mode))
30644 *total = cost->fdiv;
30645 else if (FLOAT_MODE_P (mode))
30646 /* ??? SSE vector cost should be used here. */
30647 *total = cost->fdiv;
30648 else
30649 *total = cost->divide[MODE_INDEX (mode)];
30650 return false;
30651
30652 case PLUS:
30653 if (GET_MODE_CLASS (mode) == MODE_INT
30654 && GET_MODE_BITSIZE (mode) <= GET_MODE_BITSIZE (Pmode))
30655 {
30656 if (GET_CODE (XEXP (x, 0)) == PLUS
30657 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
30658 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
30659 && CONSTANT_P (XEXP (x, 1)))
30660 {
30661 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
30662 if (val == 2 || val == 4 || val == 8)
30663 {
30664 *total = cost->lea;
30665 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
30666 outer_code, opno, speed);
30667 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
30668 outer_code, opno, speed);
30669 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
30670 return true;
30671 }
30672 }
30673 else if (GET_CODE (XEXP (x, 0)) == MULT
30674 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
30675 {
30676 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
30677 if (val == 2 || val == 4 || val == 8)
30678 {
30679 *total = cost->lea;
30680 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
30681 outer_code, opno, speed);
30682 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
30683 return true;
30684 }
30685 }
30686 else if (GET_CODE (XEXP (x, 0)) == PLUS)
30687 {
30688 *total = cost->lea;
30689 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
30690 outer_code, opno, speed);
30691 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
30692 outer_code, opno, speed);
30693 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
30694 return true;
30695 }
30696 }
30697 /* FALLTHRU */
30698
30699 case MINUS:
30700 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
30701 {
30702 /* ??? SSE cost should be used here. */
30703 *total = cost->fadd;
30704 return false;
30705 }
30706 else if (X87_FLOAT_MODE_P (mode))
30707 {
30708 *total = cost->fadd;
30709 return false;
30710 }
30711 else if (FLOAT_MODE_P (mode))
30712 {
30713 /* ??? SSE vector cost should be used here. */
30714 *total = cost->fadd;
30715 return false;
30716 }
30717 /* FALLTHRU */
30718
30719 case AND:
30720 case IOR:
30721 case XOR:
30722 if (!TARGET_64BIT && mode == DImode)
30723 {
30724 *total = (cost->add * 2
30725 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
30726 << (GET_MODE (XEXP (x, 0)) != DImode))
30727 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
30728 << (GET_MODE (XEXP (x, 1)) != DImode)));
30729 return true;
30730 }
30731 /* FALLTHRU */
30732
30733 case NEG:
30734 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
30735 {
30736 /* ??? SSE cost should be used here. */
30737 *total = cost->fchs;
30738 return false;
30739 }
30740 else if (X87_FLOAT_MODE_P (mode))
30741 {
30742 *total = cost->fchs;
30743 return false;
30744 }
30745 else if (FLOAT_MODE_P (mode))
30746 {
30747 /* ??? SSE vector cost should be used here. */
30748 *total = cost->fchs;
30749 return false;
30750 }
30751 /* FALLTHRU */
30752
30753 case NOT:
30754 if (!TARGET_64BIT && mode == DImode)
30755 *total = cost->add * 2;
30756 else
30757 *total = cost->add;
30758 return false;
30759
30760 case COMPARE:
30761 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
30762 && XEXP (XEXP (x, 0), 1) == const1_rtx
30763 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
30764 && XEXP (x, 1) == const0_rtx)
30765 {
30766 /* This kind of construct is implemented using test[bwl].
30767 Treat it as if we had an AND. */
30768 *total = (cost->add
30769 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
30770 + rtx_cost (const1_rtx, outer_code, opno, speed));
30771 return true;
30772 }
30773 return false;
30774
30775 case FLOAT_EXTEND:
30776 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
30777 *total = 0;
30778 return false;
30779
30780 case ABS:
30781 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
30782 /* ??? SSE cost should be used here. */
30783 *total = cost->fabs;
30784 else if (X87_FLOAT_MODE_P (mode))
30785 *total = cost->fabs;
30786 else if (FLOAT_MODE_P (mode))
30787 /* ??? SSE vector cost should be used here. */
30788 *total = cost->fabs;
30789 return false;
30790
30791 case SQRT:
30792 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
30793 /* ??? SSE cost should be used here. */
30794 *total = cost->fsqrt;
30795 else if (X87_FLOAT_MODE_P (mode))
30796 *total = cost->fsqrt;
30797 else if (FLOAT_MODE_P (mode))
30798 /* ??? SSE vector cost should be used here. */
30799 *total = cost->fsqrt;
30800 return false;
30801
30802 case UNSPEC:
30803 if (XINT (x, 1) == UNSPEC_TP)
30804 *total = 0;
30805 return false;
30806
30807 case VEC_SELECT:
30808 case VEC_CONCAT:
30809 case VEC_MERGE:
30810 case VEC_DUPLICATE:
30811 /* ??? Assume all of these vector manipulation patterns are
30812 recognizable, in which case they all pretty much have the
30813 same cost. */
30814 *total = COSTS_N_INSNS (1);
30815 return true;
30816
30817 default:
30818 return false;
30819 }
30820 }
30821
30822 #if TARGET_MACHO
30823
30824 static int current_machopic_label_num;
30825
30826 /* Given a symbol name and its associated stub, write out the
30827 definition of the stub. */
30828
30829 void
30830 machopic_output_stub (FILE *file, const char *symb, const char *stub)
30831 {
30832 unsigned int length;
30833 char *binder_name, *symbol_name, lazy_ptr_name[32];
30834 int label = ++current_machopic_label_num;
30835
30836 /* For 64-bit we shouldn't get here. */
30837 gcc_assert (!TARGET_64BIT);
30838
30839 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
30840 symb = targetm.strip_name_encoding (symb);
30841
30842 length = strlen (stub);
30843 binder_name = XALLOCAVEC (char, length + 32);
30844 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
30845
30846 length = strlen (symb);
30847 symbol_name = XALLOCAVEC (char, length + 32);
30848 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
30849
30850 sprintf (lazy_ptr_name, "L%d$lz", label);
30851
30852 if (MACHOPIC_ATT_STUB)
30853 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
30854 else if (MACHOPIC_PURE)
30855 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
30856 else
30857 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
30858
30859 fprintf (file, "%s:\n", stub);
30860 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
30861
30862 if (MACHOPIC_ATT_STUB)
30863 {
30864 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
30865 }
30866 else if (MACHOPIC_PURE)
30867 {
30868 /* PIC stub. */
30869 /* 25-byte PIC stub using "CALL get_pc_thunk". */
30870 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
30871 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
30872 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
30873 label, lazy_ptr_name, label);
30874 fprintf (file, "\tjmp\t*%%ecx\n");
30875 }
30876 else
30877 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
30878
30879 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
30880 it needs no stub-binding-helper. */
30881 if (MACHOPIC_ATT_STUB)
30882 return;
30883
30884 fprintf (file, "%s:\n", binder_name);
30885
30886 if (MACHOPIC_PURE)
30887 {
30888 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
30889 fprintf (file, "\tpushl\t%%ecx\n");
30890 }
30891 else
30892 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
30893
30894 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
30895
30896 /* N.B. Keep the correspondence of these
30897 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
30898 old-pic/new-pic/non-pic stubs; altering this will break
30899 compatibility with existing dylibs. */
30900 if (MACHOPIC_PURE)
30901 {
30902 /* 25-byte PIC stub using "CALL get_pc_thunk". */
30903 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
30904 }
30905 else
30906 /* 16-byte -mdynamic-no-pic stub. */
30907 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr3_section]);
30908
30909 fprintf (file, "%s:\n", lazy_ptr_name);
30910 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
30911 fprintf (file, ASM_LONG "%s\n", binder_name);
30912 }
30913 #endif /* TARGET_MACHO */
30914
30915 /* Order the registers for register allocator. */
30916
30917 void
30918 x86_order_regs_for_local_alloc (void)
30919 {
30920 int pos = 0;
30921 int i;
30922
30923 /* First allocate the local general purpose registers. */
30924 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
30925 if (GENERAL_REGNO_P (i) && call_used_regs[i])
30926 reg_alloc_order [pos++] = i;
30927
30928 /* Global general purpose registers. */
30929 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
30930 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
30931 reg_alloc_order [pos++] = i;
30932
30933 /* x87 registers come first in case we are doing FP math
30934 using them. */
30935 if (!TARGET_SSE_MATH)
30936 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
30937 reg_alloc_order [pos++] = i;
30938
30939 /* SSE registers. */
30940 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
30941 reg_alloc_order [pos++] = i;
30942 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
30943 reg_alloc_order [pos++] = i;
30944
30945 /* x87 registers. */
30946 if (TARGET_SSE_MATH)
30947 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
30948 reg_alloc_order [pos++] = i;
30949
30950 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
30951 reg_alloc_order [pos++] = i;
30952
30953 /* Initialize the rest of the array, as we do not allocate some registers
30954 at all. */
30955 while (pos < FIRST_PSEUDO_REGISTER)
30956 reg_alloc_order [pos++] = 0;
30957 }
30958
30959 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
30960 in struct attribute_spec handler. */
30961 static tree
30962 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
30963 tree args,
30964 int flags ATTRIBUTE_UNUSED,
30965 bool *no_add_attrs)
30966 {
30967 if (TREE_CODE (*node) != FUNCTION_TYPE
30968 && TREE_CODE (*node) != METHOD_TYPE
30969 && TREE_CODE (*node) != FIELD_DECL
30970 && TREE_CODE (*node) != TYPE_DECL)
30971 {
30972 warning (OPT_Wattributes, "%qE attribute only applies to functions",
30973 name);
30974 *no_add_attrs = true;
30975 return NULL_TREE;
30976 }
30977 if (TARGET_64BIT)
30978 {
30979 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
30980 name);
30981 *no_add_attrs = true;
30982 return NULL_TREE;
30983 }
30984 if (is_attribute_p ("callee_pop_aggregate_return", name))
30985 {
30986 tree cst;
30987
30988 cst = TREE_VALUE (args);
30989 if (TREE_CODE (cst) != INTEGER_CST)
30990 {
30991 warning (OPT_Wattributes,
30992 "%qE attribute requires an integer constant argument",
30993 name);
30994 *no_add_attrs = true;
30995 }
30996 else if (compare_tree_int (cst, 0) != 0
30997 && compare_tree_int (cst, 1) != 0)
30998 {
30999 warning (OPT_Wattributes,
31000 "argument to %qE attribute is neither zero, nor one",
31001 name);
31002 *no_add_attrs = true;
31003 }
31004
31005 return NULL_TREE;
31006 }
31007
31008 return NULL_TREE;
31009 }
31010
31011 /* Handle an "ms_abi" or "sysv_abi" attribute; arguments as in
31012 struct attribute_spec.handler. */
31013 static tree
31014 ix86_handle_abi_attribute (tree *node, tree name,
31015 tree args ATTRIBUTE_UNUSED,
31016 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
31017 {
31018 if (TREE_CODE (*node) != FUNCTION_TYPE
31019 && TREE_CODE (*node) != METHOD_TYPE
31020 && TREE_CODE (*node) != FIELD_DECL
31021 && TREE_CODE (*node) != TYPE_DECL)
31022 {
31023 warning (OPT_Wattributes, "%qE attribute only applies to functions",
31024 name);
31025 *no_add_attrs = true;
31026 return NULL_TREE;
31027 }
31028
31029 /* Can combine regparm with all attributes but fastcall. */
31030 if (is_attribute_p ("ms_abi", name))
31031 {
31032 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
31033 {
31034 error ("ms_abi and sysv_abi attributes are not compatible");
31035 }
31036
31037 return NULL_TREE;
31038 }
31039 else if (is_attribute_p ("sysv_abi", name))
31040 {
31041 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
31042 {
31043 error ("ms_abi and sysv_abi attributes are not compatible");
31044 }
31045
31046 return NULL_TREE;
31047 }
31048
31049 return NULL_TREE;
31050 }
31051
31052 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
31053 struct attribute_spec.handler. */
31054 static tree
31055 ix86_handle_struct_attribute (tree *node, tree name,
31056 tree args ATTRIBUTE_UNUSED,
31057 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
31058 {
31059 tree *type = NULL;
31060 if (DECL_P (*node))
31061 {
31062 if (TREE_CODE (*node) == TYPE_DECL)
31063 type = &TREE_TYPE (*node);
31064 }
31065 else
31066 type = node;
31067
31068 if (!(type && (TREE_CODE (*type) == RECORD_TYPE
31069 || TREE_CODE (*type) == UNION_TYPE)))
31070 {
31071 warning (OPT_Wattributes, "%qE attribute ignored",
31072 name);
31073 *no_add_attrs = true;
31074 }
31075
31076 else if ((is_attribute_p ("ms_struct", name)
31077 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
31078 || ((is_attribute_p ("gcc_struct", name)
31079 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
31080 {
31081 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
31082 name);
31083 *no_add_attrs = true;
31084 }
31085
31086 return NULL_TREE;
31087 }
31088
31089 static tree
31090 ix86_handle_fndecl_attribute (tree *node, tree name,
31091 tree args ATTRIBUTE_UNUSED,
31092 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
31093 {
31094 if (TREE_CODE (*node) != FUNCTION_DECL)
31095 {
31096 warning (OPT_Wattributes, "%qE attribute only applies to functions",
31097 name);
31098 *no_add_attrs = true;
31099 }
31100 return NULL_TREE;
31101 }
31102
31103 static bool
31104 ix86_ms_bitfield_layout_p (const_tree record_type)
31105 {
31106 return ((TARGET_MS_BITFIELD_LAYOUT
31107 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
31108 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
31109 }
31110
31111 /* Returns an expression indicating where the this parameter is
31112 located on entry to the FUNCTION. */
31113
31114 static rtx
31115 x86_this_parameter (tree function)
31116 {
31117 tree type = TREE_TYPE (function);
31118 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
31119 int nregs;
31120
31121 if (TARGET_64BIT)
31122 {
31123 const int *parm_regs;
31124
31125 if (ix86_function_type_abi (type) == MS_ABI)
31126 parm_regs = x86_64_ms_abi_int_parameter_registers;
31127 else
31128 parm_regs = x86_64_int_parameter_registers;
31129 return gen_rtx_REG (DImode, parm_regs[aggr]);
31130 }
31131
31132 nregs = ix86_function_regparm (type, function);
31133
31134 if (nregs > 0 && !stdarg_p (type))
31135 {
31136 int regno;
31137 unsigned int ccvt = ix86_get_callcvt (type);
31138
31139 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
31140 regno = aggr ? DX_REG : CX_REG;
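	/* With fastcall, the hidden aggregate-return pointer takes %ecx,
	   pushing the "this" pointer over to %edx.  */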
31141 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
31142 {
31143 regno = CX_REG;
31144 if (aggr)
31145 return gen_rtx_MEM (SImode,
31146 plus_constant (stack_pointer_rtx, 4));
31147 }
31148 else
31149 {
31150 regno = AX_REG;
31151 if (aggr)
31152 {
31153 regno = DX_REG;
31154 if (nregs == 1)
31155 return gen_rtx_MEM (SImode,
31156 plus_constant (stack_pointer_rtx, 4));
31157 }
31158 }
31159 return gen_rtx_REG (SImode, regno);
31160 }
31161
31162 return gen_rtx_MEM (SImode, plus_constant (stack_pointer_rtx, aggr ? 8 : 4));
31163 }
31164
31165 /* Determine whether x86_output_mi_thunk can succeed. */
31166
31167 static bool
31168 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
31169 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
31170 HOST_WIDE_INT vcall_offset, const_tree function)
31171 {
31172 /* 64-bit can handle anything. */
31173 if (TARGET_64BIT)
31174 return true;
31175
31176 /* For 32-bit, everything's fine if we have one free register. */
31177 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
31178 return true;
31179
31180 /* Need a free register for vcall_offset. */
31181 if (vcall_offset)
31182 return false;
31183
31184 /* Need a free register for GOT references. */
31185 if (flag_pic && !targetm.binds_local_p (function))
31186 return false;
31187
31188 /* Otherwise ok. */
31189 return true;
31190 }
31191
31192 /* Output the assembler code for a thunk function. THUNK_DECL is the
31193 declaration for the thunk function itself, FUNCTION is the decl for
31194 the target function. DELTA is an immediate constant offset to be
31195 added to THIS. If VCALL_OFFSET is nonzero, the word at
31196 *(*this + vcall_offset) should be added to THIS. */
31197
31198 static void
31199 x86_output_mi_thunk (FILE *file,
31200 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
31201 HOST_WIDE_INT vcall_offset, tree function)
31202 {
31203 rtx this_param = x86_this_parameter (function);
31204 rtx this_reg, tmp, fnaddr;
31205
31206 emit_note (NOTE_INSN_PROLOGUE_END);
31207
31208 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
31209 pull it in now and let DELTA benefit. */
31210 if (REG_P (this_param))
31211 this_reg = this_param;
31212 else if (vcall_offset)
31213 {
31214 /* Put the this parameter into %eax. */
31215 this_reg = gen_rtx_REG (Pmode, AX_REG);
31216 emit_move_insn (this_reg, this_param);
31217 }
31218 else
31219 this_reg = NULL_RTX;
31220
31221 /* Adjust the this parameter by a fixed constant. */
31222 if (delta)
31223 {
31224 rtx delta_rtx = GEN_INT (delta);
31225 rtx delta_dst = this_reg ? this_reg : this_param;
31226
31227 if (TARGET_64BIT)
31228 {
31229 if (!x86_64_general_operand (delta_rtx, Pmode))
31230 {
31231 tmp = gen_rtx_REG (Pmode, R10_REG);
31232 emit_move_insn (tmp, delta_rtx);
31233 delta_rtx = tmp;
31234 }
31235 }
31236
31237 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
31238 }
31239
31240 /* Adjust the this parameter by a value stored in the vtable. */
31241 if (vcall_offset)
31242 {
31243 rtx vcall_addr, vcall_mem, this_mem;
31244 unsigned int tmp_regno;
31245
31246 if (TARGET_64BIT)
31247 tmp_regno = R10_REG;
31248 else
31249 {
31250 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
31251 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
31252 tmp_regno = AX_REG;
31253 else
31254 tmp_regno = CX_REG;
31255 }
31256 tmp = gen_rtx_REG (Pmode, tmp_regno);
31257
31258 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
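      /* If Pmode is wider than ptr_mode (e.g. the x32 ABI), the pointer
	 value loaded from memory must be zero-extended to Pmode.  */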
31259 if (Pmode != ptr_mode)
31260 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
31261 emit_move_insn (tmp, this_mem);
31262
31263 /* Adjust the this parameter. */
31264 vcall_addr = plus_constant (tmp, vcall_offset);
31265 if (TARGET_64BIT
31266 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
31267 {
31268 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
31269 emit_move_insn (tmp2, GEN_INT (vcall_offset));
31270 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
31271 }
31272
31273 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
31274 if (Pmode != ptr_mode)
31275 emit_insn (gen_addsi_1_zext (this_reg,
31276 gen_rtx_REG (ptr_mode,
31277 REGNO (this_reg)),
31278 vcall_mem));
31279 else
31280 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
31281 }
31282
31283 /* If necessary, drop THIS back to its stack slot. */
31284 if (this_reg && this_reg != this_param)
31285 emit_move_insn (this_param, this_reg);
31286
31287 fnaddr = XEXP (DECL_RTL (function), 0);
31288 if (TARGET_64BIT)
31289 {
31290 if (!flag_pic || targetm.binds_local_p (function)
31291 || cfun->machine->call_abi == MS_ABI)
31292 ;
31293 else
31294 {
31295 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
31296 tmp = gen_rtx_CONST (Pmode, tmp);
31297 fnaddr = gen_rtx_MEM (Pmode, tmp);
31298 }
31299 }
31300 else
31301 {
31302 if (!flag_pic || targetm.binds_local_p (function))
31303 ;
31304 #if TARGET_MACHO
31305 else if (TARGET_MACHO)
31306 {
31307 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
31308 fnaddr = XEXP (fnaddr, 0);
31309 }
31310 #endif /* TARGET_MACHO */
31311 else
31312 {
31313 tmp = gen_rtx_REG (Pmode, CX_REG);
31314 output_set_got (tmp, NULL_RTX);
31315
31316 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
31317 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
31318 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
31319 }
31320 }
31321
31322 /* Our sibling call patterns do not allow memories, because we have no
31323 predicate that can distinguish between frame and non-frame memory.
31324 For our purposes here, we can get away with (ab)using a jump pattern,
31325 because we're going to do no optimization. */
31326 if (MEM_P (fnaddr))
31327 emit_jump_insn (gen_indirect_jump (fnaddr));
31328 else
31329 {
31330 tmp = gen_rtx_MEM (QImode, fnaddr);
31331 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
31332 tmp = emit_call_insn (tmp);
31333 SIBLING_CALL_P (tmp) = 1;
31334 }
31335 emit_barrier ();
31336
31337 /* Emit just enough of rest_of_compilation to get the insns emitted.
31338 Note that use_thunk calls assemble_start_function et al. */
31339 tmp = get_insns ();
31340 insn_locators_alloc ();
31341 shorten_branches (tmp);
31342 final_start_function (tmp, file, 1);
31343 final (tmp, file, 1);
31344 final_end_function ();
31345 }
31346
31347 static void
31348 x86_file_start (void)
31349 {
31350 default_file_start ();
31351 #if TARGET_MACHO
31352 darwin_file_start ();
31353 #endif
31354 if (X86_FILE_START_VERSION_DIRECTIVE)
31355 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
31356 if (X86_FILE_START_FLTUSED)
31357 fputs ("\t.global\t__fltused\n", asm_out_file);
31358 if (ix86_asm_dialect == ASM_INTEL)
31359 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
31360 }
31361
31362 int
31363 x86_field_alignment (tree field, int computed)
31364 {
31365 enum machine_mode mode;
31366 tree type = TREE_TYPE (field);
31367
31368 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
31369 return computed;
31370 mode = TYPE_MODE (strip_array_types (type));
31371 if (mode == DFmode || mode == DCmode
31372 || GET_MODE_CLASS (mode) == MODE_INT
31373 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
31374 return MIN (32, computed);
31375 return computed;
31376 }
31377
31378 /* Output assembler code to FILE to increment profiler label # LABELNO
31379 for profiling a function entry. */
31380 void
31381 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
31382 {
31383 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
31384 : MCOUNT_NAME);
31385
31386 if (TARGET_64BIT)
31387 {
31388 #ifndef NO_PROFILE_COUNTERS
31389 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
31390 #endif
31391
31392 if (DEFAULT_ABI == SYSV_ABI && flag_pic)
31393 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
31394 else
31395 fprintf (file, "\tcall\t%s\n", mcount_name);
31396 }
31397 else if (flag_pic)
31398 {
31399 #ifndef NO_PROFILE_COUNTERS
31400 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
31401 LPREFIX, labelno);
31402 #endif
31403 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
31404 }
31405 else
31406 {
31407 #ifndef NO_PROFILE_COUNTERS
31408 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
31409 LPREFIX, labelno);
31410 #endif
31411 fprintf (file, "\tcall\t%s\n", mcount_name);
31412 }
31413 }
31414
31415 /* We don't have exact information about the insn sizes, but we may quite
31416 safely assume that we are informed about all 1 byte insns and memory
31417 address sizes. This is enough to eliminate unnecessary padding in
31418 99% of cases. */
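/* For instance, a non-sibling call that mentions a symbol is counted as
   5 bytes below (opcode plus rel32), and, where get_attr_length cannot be
   trusted, an insn whose pattern mentions a symbol is assumed to need at
   least a 4 byte address.  */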
31419
31420 static int
31421 min_insn_size (rtx insn)
31422 {
31423 int l = 0, len;
31424
31425 if (!INSN_P (insn) || !active_insn_p (insn))
31426 return 0;
31427
31428 /* Discard alignments we've emitted, and jump table data. */
31429 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
31430 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
31431 return 0;
31432 if (JUMP_TABLE_DATA_P (insn))
31433 return 0;
31434
31435 /* Important case - calls are always 5 bytes.
31436 It is common to have many calls in a row. */
31437 if (CALL_P (insn)
31438 && symbolic_reference_mentioned_p (PATTERN (insn))
31439 && !SIBLING_CALL_P (insn))
31440 return 5;
31441 len = get_attr_length (insn);
31442 if (len <= 1)
31443 return 1;
31444
31445 /* For normal instructions we rely on get_attr_length being exact,
31446 with a few exceptions. */
31447 if (!JUMP_P (insn))
31448 {
31449 enum attr_type type = get_attr_type (insn);
31450
31451 switch (type)
31452 {
31453 case TYPE_MULTI:
31454 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
31455 || asm_noperands (PATTERN (insn)) >= 0)
31456 return 0;
31457 break;
31458 case TYPE_OTHER:
31459 case TYPE_FCMP:
31460 break;
31461 default:
31462 /* Otherwise trust get_attr_length. */
31463 return len;
31464 }
31465
31466 l = get_attr_length_address (insn);
31467 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
31468 l = 4;
31469 }
31470 if (l)
31471 return 1+l;
31472 else
31473 return 2;
31474 }
31475
31476 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
31477
31478 /* The AMD K8 core mispredicts jumps when there are more than 3 jumps in a
31479 16 byte window. */
31480
31481 static void
31482 ix86_avoid_jump_mispredicts (void)
31483 {
31484 rtx insn, start = get_insns ();
31485 int nbytes = 0, njumps = 0;
31486 int isjump = 0;
31487
31488 /* Look for all minimal intervals of instructions containing 4 jumps.
31489 The intervals are bounded by START and INSN. NBYTES is the total
31490 size of the instructions in the interval, including INSN but not
31491 including START. When NBYTES is smaller than 16 bytes, it is
31492 possible that START and INSN end up in the same 16 byte page.
31493
31494 The smallest offset in the page at which INSN can start is the case
31495 where START ends at offset 0. The offset of INSN is then NBYTES -
31496 sizeof (INSN). We add a p2align to the 16 byte window with maxskip
31497 15 - NBYTES + sizeof (INSN). */
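  /* Illustrative example: if START ends at offset 0 of a 16 byte page and
     the interval holds NBYTES = 12 with sizeof (INSN) = 2, then INSN starts
     at offset 10 and the p2align uses maxskip 15 - 12 + 2 = 5.  */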
31498 for (insn = start; insn; insn = NEXT_INSN (insn))
31499 {
31500 int min_size;
31501
31502 if (LABEL_P (insn))
31503 {
31504 int align = label_to_alignment (insn);
31505 int max_skip = label_to_max_skip (insn);
31506
31507 if (max_skip > 15)
31508 max_skip = 15;
31509 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
31510 already in the current 16 byte page, because otherwise
31511 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
31512 bytes to reach 16 byte boundary. */
31513 if (align <= 0
31514 || (align <= 3 && max_skip != (1 << align) - 1))
31515 max_skip = 0;
31516 if (dump_file)
31517 fprintf (dump_file, "Label %i with max_skip %i\n",
31518 INSN_UID (insn), max_skip);
31519 if (max_skip)
31520 {
31521 while (nbytes + max_skip >= 16)
31522 {
31523 start = NEXT_INSN (start);
31524 if ((JUMP_P (start)
31525 && GET_CODE (PATTERN (start)) != ADDR_VEC
31526 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
31527 || CALL_P (start))
31528 njumps--, isjump = 1;
31529 else
31530 isjump = 0;
31531 nbytes -= min_insn_size (start);
31532 }
31533 }
31534 continue;
31535 }
31536
31537 min_size = min_insn_size (insn);
31538 nbytes += min_size;
31539 if (dump_file)
31540 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
31541 INSN_UID (insn), min_size);
31542 if ((JUMP_P (insn)
31543 && GET_CODE (PATTERN (insn)) != ADDR_VEC
31544 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
31545 || CALL_P (insn))
31546 njumps++;
31547 else
31548 continue;
31549
31550 while (njumps > 3)
31551 {
31552 start = NEXT_INSN (start);
31553 if ((JUMP_P (start)
31554 && GET_CODE (PATTERN (start)) != ADDR_VEC
31555 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
31556 || CALL_P (start))
31557 njumps--, isjump = 1;
31558 else
31559 isjump = 0;
31560 nbytes -= min_insn_size (start);
31561 }
31562 gcc_assert (njumps >= 0);
31563 if (dump_file)
31564 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
31565 INSN_UID (start), INSN_UID (insn), nbytes);
31566
31567 if (njumps == 3 && isjump && nbytes < 16)
31568 {
31569 int padsize = 15 - nbytes + min_insn_size (insn);
31570
31571 if (dump_file)
31572 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
31573 INSN_UID (insn), padsize);
31574 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
31575 }
31576 }
31577 }
31578 #endif
31579
31580 /* The AMD Athlon works faster
31581 when RET is not the destination of a conditional jump or directly preceded
31582 by another jump instruction. We avoid the penalty by inserting a NOP just
31583 before the RET instruction in such cases. */
31584 static void
31585 ix86_pad_returns (void)
31586 {
31587 edge e;
31588 edge_iterator ei;
31589
31590 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
31591 {
31592 basic_block bb = e->src;
31593 rtx ret = BB_END (bb);
31594 rtx prev;
31595 bool replace = false;
31596
31597 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
31598 || optimize_bb_for_size_p (bb))
31599 continue;
31600 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
31601 if (active_insn_p (prev) || LABEL_P (prev))
31602 break;
31603 if (prev && LABEL_P (prev))
31604 {
31605 edge e;
31606 edge_iterator ei;
31607
31608 FOR_EACH_EDGE (e, ei, bb->preds)
31609 if (EDGE_FREQUENCY (e) && e->src->index >= 0
31610 && !(e->flags & EDGE_FALLTHRU))
31611 replace = true;
31612 }
31613 if (!replace)
31614 {
31615 prev = prev_active_insn (ret);
31616 if (prev
31617 && ((JUMP_P (prev) && any_condjump_p (prev))
31618 || CALL_P (prev)))
31619 replace = true;
31620 /* Empty functions get a branch mispredict even when
31621 the jump destination is not visible to us. */
31622 if (!prev && !optimize_function_for_size_p (cfun))
31623 replace = true;
31624 }
31625 if (replace)
31626 {
31627 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
31628 delete_insn (ret);
31629 }
31630 }
31631 }
31632
31633 /* Count the minimum number of instructions in BB. Return 4 if the
31634 number of instructions >= 4. */
31635
31636 static int
31637 ix86_count_insn_bb (basic_block bb)
31638 {
31639 rtx insn;
31640 int insn_count = 0;
31641
31642 /* Count number of instructions in this block. Return 4 if the number
31643 of instructions >= 4. */
31644 FOR_BB_INSNS (bb, insn)
31645 {
31646 /* This only happens in exit blocks. */
31647 if (JUMP_P (insn)
31648 && ANY_RETURN_P (PATTERN (insn)))
31649 break;
31650
31651 if (NONDEBUG_INSN_P (insn)
31652 && GET_CODE (PATTERN (insn)) != USE
31653 && GET_CODE (PATTERN (insn)) != CLOBBER)
31654 {
31655 insn_count++;
31656 if (insn_count >= 4)
31657 return insn_count;
31658 }
31659 }
31660
31661 return insn_count;
31662 }
31663
31664
31665 /* Count the minimum number of instructions in a code path through BB.
31666 Return 4 if the number of instructions >= 4. */
31667
31668 static int
31669 ix86_count_insn (basic_block bb)
31670 {
31671 edge e;
31672 edge_iterator ei;
31673 int min_prev_count;
31674
31675 /* Only bother counting instructions along paths with no
31676 more than 2 basic blocks between entry and exit. Given
31677 that BB has an edge to exit, determine if a predecessor
31678 of BB has an edge from entry. If so, compute the number
31679 of instructions in the predecessor block. If there
31680 happen to be multiple such blocks, compute the minimum. */
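  /* E.g. for a path entry -> P -> BB -> exit this computes
     insns (P) + insns (BB), where each block's count saturates at 4;
     if BB is reached directly from entry, only BB itself is counted.  */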
31681 min_prev_count = 4;
31682 FOR_EACH_EDGE (e, ei, bb->preds)
31683 {
31684 edge prev_e;
31685 edge_iterator prev_ei;
31686
31687 if (e->src == ENTRY_BLOCK_PTR)
31688 {
31689 min_prev_count = 0;
31690 break;
31691 }
31692 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
31693 {
31694 if (prev_e->src == ENTRY_BLOCK_PTR)
31695 {
31696 int count = ix86_count_insn_bb (e->src);
31697 if (count < min_prev_count)
31698 min_prev_count = count;
31699 break;
31700 }
31701 }
31702 }
31703
31704 if (min_prev_count < 4)
31705 min_prev_count += ix86_count_insn_bb (bb);
31706
31707 return min_prev_count;
31708 }
31709
31710 /* Pad short functions to 4 instructions. */
31711
31712 static void
31713 ix86_pad_short_function (void)
31714 {
31715 edge e;
31716 edge_iterator ei;
31717
31718 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
31719 {
31720 rtx ret = BB_END (e->src);
31721 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
31722 {
31723 int insn_count = ix86_count_insn (e->src);
31724
31725 /* Pad short function. */
31726 if (insn_count < 4)
31727 {
31728 rtx insn = ret;
31729
31730 /* Find epilogue. */
31731 while (insn
31732 && (!NOTE_P (insn)
31733 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
31734 insn = PREV_INSN (insn);
31735
31736 if (!insn)
31737 insn = ret;
31738
31739 /* Two NOPs count as one instruction. */
31740 insn_count = 2 * (4 - insn_count);
31741 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
31742 }
31743 }
31744 }
31745 }
31746
31747 /* Implement machine specific optimizations. We implement padding of returns
31748 for K8 CPUs and a pass to avoid 4 jumps in a single 16 byte window. */
31749 static void
31750 ix86_reorg (void)
31751 {
31752 /* We are freeing block_for_insn in the toplev to keep compatibility
31753 with old MDEP_REORGS that are not CFG based. Recompute it now. */
31754 compute_bb_for_insn ();
31755
31756 /* Run the vzeroupper optimization if needed. */
31757 if (TARGET_VZEROUPPER)
31758 move_or_delete_vzeroupper ();
31759
31760 if (optimize && optimize_function_for_speed_p (cfun))
31761 {
31762 if (TARGET_PAD_SHORT_FUNCTION)
31763 ix86_pad_short_function ();
31764 else if (TARGET_PAD_RETURNS)
31765 ix86_pad_returns ();
31766 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
31767 if (TARGET_FOUR_JUMP_LIMIT)
31768 ix86_avoid_jump_mispredicts ();
31769 #endif
31770 }
31771 }
31772
31773 /* Return nonzero when a QImode register that must be represented via a REX
31774 prefix is used. */
31775 bool
31776 x86_extended_QIreg_mentioned_p (rtx insn)
31777 {
31778 int i;
31779 extract_insn_cached (insn);
31780 for (i = 0; i < recog_data.n_operands; i++)
31781 if (REG_P (recog_data.operand[i])
31782 && REGNO (recog_data.operand[i]) > BX_REG)
31783 return true;
31784 return false;
31785 }
31786
31787 /* Return nonzero when P points to a register encoded via a REX prefix.
31788 Called via for_each_rtx. */
31789 static int
31790 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
31791 {
31792 unsigned int regno;
31793 if (!REG_P (*p))
31794 return 0;
31795 regno = REGNO (*p);
31796 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
31797 }
31798
31799 /* Return true when INSN mentions a register that must be encoded using a
31800 REX prefix. */
31801 bool
31802 x86_extended_reg_mentioned_p (rtx insn)
31803 {
31804 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
31805 extended_reg_mentioned_1, NULL);
31806 }
31807
31808 /* If profitable, negate (without causing overflow) integer constant
31809 of mode MODE at location LOC. Return true in this case. */
31810 bool
31811 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
31812 {
31813 HOST_WIDE_INT val;
31814
31815 if (!CONST_INT_P (*loc))
31816 return false;
31817
31818 switch (mode)
31819 {
31820 case DImode:
31821 /* DImode x86_64 constants must fit in 32 bits. */
31822 gcc_assert (x86_64_immediate_operand (*loc, mode));
31823
31824 mode = SImode;
31825 break;
31826
31827 case SImode:
31828 case HImode:
31829 case QImode:
31830 break;
31831
31832 default:
31833 gcc_unreachable ();
31834 }
31835
31836 /* Avoid overflows. */
31837 if (mode_signbit_p (mode, *loc))
31838 return false;
31839
31840 val = INTVAL (*loc);
31841
31842 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
31843 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
31844 if ((val < 0 && val != -128)
31845 || val == 128)
31846 {
31847 *loc = GEN_INT (-val);
31848 return true;
31849 }
31850
31851 return false;
31852 }
31853
31854 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
31855 optabs would emit if we didn't have TFmode patterns. */
31856
31857 void
31858 x86_emit_floatuns (rtx operands[2])
31859 {
31860 rtx neglab, donelab, i0, i1, f0, in, out;
31861 enum machine_mode mode, inmode;
31862
31863 inmode = GET_MODE (operands[1]);
31864 gcc_assert (inmode == SImode || inmode == DImode);
31865
31866 out = operands[0];
31867 in = force_reg (inmode, operands[1]);
31868 mode = GET_MODE (out);
31869 neglab = gen_label_rtx ();
31870 donelab = gen_label_rtx ();
31871 f0 = gen_reg_rtx (mode);
31872
31873 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
31874
31875 expand_float (out, in, 0);
31876
31877 emit_jump_insn (gen_jump (donelab));
31878 emit_barrier ();
31879
31880 emit_label (neglab);
31881
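  /* The value has its sign bit set when viewed as signed.  Halve it,
     folding the discarded low bit back in so that the final rounding is
     unaffected, convert the halved value, and then double the result.  */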
31882 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
31883 1, OPTAB_DIRECT);
31884 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
31885 1, OPTAB_DIRECT);
31886 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
31887
31888 expand_float (f0, i0, 0);
31889
31890 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
31891
31892 emit_label (donelab);
31893 }
31894 \f
31895 /* AVX2 supports 32-byte integer vector operations,
31896 so the longest vector we are faced with is V32QImode. */
31897 #define MAX_VECT_LEN 32
31898
31899 struct expand_vec_perm_d
31900 {
31901 rtx target, op0, op1;
31902 unsigned char perm[MAX_VECT_LEN];
31903 enum machine_mode vmode;
31904 unsigned char nelt;
31905 bool testing_p;
31906 };
31907
31908 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
31909 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
31910
31911 /* Get a vector mode of the same size as the original but with elements
31912 twice as wide. This is only guaranteed to apply to integral vectors. */
31913
31914 static inline enum machine_mode
31915 get_mode_wider_vector (enum machine_mode o)
31916 {
31917 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
31918 enum machine_mode n = GET_MODE_WIDER_MODE (o);
31919 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
31920 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
31921 return n;
31922 }
31923
31924 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
31925 with all elements equal to VAR. Return true if successful. */
31926
31927 static bool
31928 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
31929 rtx target, rtx val)
31930 {
31931 bool ok;
31932
31933 switch (mode)
31934 {
31935 case V2SImode:
31936 case V2SFmode:
31937 if (!mmx_ok)
31938 return false;
31939 /* FALLTHRU */
31940
31941 case V4DFmode:
31942 case V4DImode:
31943 case V8SFmode:
31944 case V8SImode:
31945 case V2DFmode:
31946 case V2DImode:
31947 case V4SFmode:
31948 case V4SImode:
31949 {
31950 rtx insn, dup;
31951
31952 /* First attempt to recognize VAL as-is. */
31953 dup = gen_rtx_VEC_DUPLICATE (mode, val);
31954 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
31955 if (recog_memoized (insn) < 0)
31956 {
31957 rtx seq;
31958 /* If that fails, force VAL into a register. */
31959
31960 start_sequence ();
31961 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
31962 seq = get_insns ();
31963 end_sequence ();
31964 if (seq)
31965 emit_insn_before (seq, insn);
31966
31967 ok = recog_memoized (insn) >= 0;
31968 gcc_assert (ok);
31969 }
31970 }
31971 return true;
31972
31973 case V4HImode:
31974 if (!mmx_ok)
31975 return false;
31976 if (TARGET_SSE || TARGET_3DNOW_A)
31977 {
31978 rtx x;
31979
31980 val = gen_lowpart (SImode, val);
31981 x = gen_rtx_TRUNCATE (HImode, val);
31982 x = gen_rtx_VEC_DUPLICATE (mode, x);
31983 emit_insn (gen_rtx_SET (VOIDmode, target, x));
31984 return true;
31985 }
31986 goto widen;
31987
31988 case V8QImode:
31989 if (!mmx_ok)
31990 return false;
31991 goto widen;
31992
31993 case V8HImode:
31994 if (TARGET_SSE2)
31995 {
31996 struct expand_vec_perm_d dperm;
31997 rtx tmp1, tmp2;
31998
31999 permute:
32000 memset (&dperm, 0, sizeof (dperm));
32001 dperm.target = target;
32002 dperm.vmode = mode;
32003 dperm.nelt = GET_MODE_NUNITS (mode);
32004 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
32005
32006 /* Extend to SImode using a paradoxical SUBREG. */
32007 tmp1 = gen_reg_rtx (SImode);
32008 emit_move_insn (tmp1, gen_lowpart (SImode, val));
32009
32010 /* Insert the SImode value as low element of a V4SImode vector. */
32011 tmp2 = gen_lowpart (V4SImode, dperm.op0);
32012 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
32013
32014 ok = (expand_vec_perm_1 (&dperm)
32015 || expand_vec_perm_broadcast_1 (&dperm));
32016 gcc_assert (ok);
32017 return ok;
32018 }
32019 goto widen;
32020
32021 case V16QImode:
32022 if (TARGET_SSE2)
32023 goto permute;
32024 goto widen;
32025
32026 widen:
32027 /* Replicate the value once into the next wider mode and recurse. */
32028 {
32029 enum machine_mode smode, wsmode, wvmode;
32030 rtx x;
32031
32032 smode = GET_MODE_INNER (mode);
32033 wvmode = get_mode_wider_vector (mode);
32034 wsmode = GET_MODE_INNER (wvmode);
32035
32036 val = convert_modes (wsmode, smode, val, true);
32037 x = expand_simple_binop (wsmode, ASHIFT, val,
32038 GEN_INT (GET_MODE_BITSIZE (smode)),
32039 NULL_RTX, 1, OPTAB_LIB_WIDEN);
32040 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
32041
32042 x = gen_lowpart (wvmode, target);
32043 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
32044 gcc_assert (ok);
32045 return ok;
32046 }
32047
32048 case V16HImode:
32049 case V32QImode:
32050 {
32051 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
32052 rtx x = gen_reg_rtx (hvmode);
32053
32054 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
32055 gcc_assert (ok);
32056
32057 x = gen_rtx_VEC_CONCAT (mode, x, x);
32058 emit_insn (gen_rtx_SET (VOIDmode, target, x));
32059 }
32060 return true;
32061
32062 default:
32063 return false;
32064 }
32065 }
32066
32067 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
32068 whose ONE_VAR element is VAR, and other elements are zero. Return true
32069 if successful. */
32070
32071 static bool
32072 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
32073 rtx target, rtx var, int one_var)
32074 {
32075 enum machine_mode vsimode;
32076 rtx new_target;
32077 rtx x, tmp;
32078 bool use_vector_set = false;
32079
32080 switch (mode)
32081 {
32082 case V2DImode:
32083 /* For SSE4.1, we normally use vector set. But if the second
32084 element is zero and inter-unit moves are OK, we use movq
32085 instead. */
32086 use_vector_set = (TARGET_64BIT
32087 && TARGET_SSE4_1
32088 && !(TARGET_INTER_UNIT_MOVES
32089 && one_var == 0));
32090 break;
32091 case V16QImode:
32092 case V4SImode:
32093 case V4SFmode:
32094 use_vector_set = TARGET_SSE4_1;
32095 break;
32096 case V8HImode:
32097 use_vector_set = TARGET_SSE2;
32098 break;
32099 case V4HImode:
32100 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
32101 break;
32102 case V32QImode:
32103 case V16HImode:
32104 case V8SImode:
32105 case V8SFmode:
32106 case V4DFmode:
32107 use_vector_set = TARGET_AVX;
32108 break;
32109 case V4DImode:
32110 /* Use ix86_expand_vector_set in 64bit mode only. */
32111 use_vector_set = TARGET_AVX && TARGET_64BIT;
32112 break;
32113 default:
32114 break;
32115 }
32116
32117 if (use_vector_set)
32118 {
32119 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
32120 var = force_reg (GET_MODE_INNER (mode), var);
32121 ix86_expand_vector_set (mmx_ok, target, var, one_var);
32122 return true;
32123 }
32124
32125 switch (mode)
32126 {
32127 case V2SFmode:
32128 case V2SImode:
32129 if (!mmx_ok)
32130 return false;
32131 /* FALLTHRU */
32132
32133 case V2DFmode:
32134 case V2DImode:
32135 if (one_var != 0)
32136 return false;
32137 var = force_reg (GET_MODE_INNER (mode), var);
32138 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
32139 emit_insn (gen_rtx_SET (VOIDmode, target, x));
32140 return true;
32141
32142 case V4SFmode:
32143 case V4SImode:
32144 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
32145 new_target = gen_reg_rtx (mode);
32146 else
32147 new_target = target;
32148 var = force_reg (GET_MODE_INNER (mode), var);
32149 x = gen_rtx_VEC_DUPLICATE (mode, var);
32150 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
32151 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
32152 if (one_var != 0)
32153 {
32154 /* We need to shuffle the value to the correct position, so
32155 create a new pseudo to store the intermediate result. */
32156
32157 /* With SSE2, we can use the integer shuffle insns. */
32158 if (mode != V4SFmode && TARGET_SSE2)
32159 {
32160 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
32161 const1_rtx,
32162 GEN_INT (one_var == 1 ? 0 : 1),
32163 GEN_INT (one_var == 2 ? 0 : 1),
32164 GEN_INT (one_var == 3 ? 0 : 1)));
32165 if (target != new_target)
32166 emit_move_insn (target, new_target);
32167 return true;
32168 }
32169
32170 /* Otherwise convert the intermediate result to V4SFmode and
32171 use the SSE1 shuffle instructions. */
32172 if (mode != V4SFmode)
32173 {
32174 tmp = gen_reg_rtx (V4SFmode);
32175 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
32176 }
32177 else
32178 tmp = new_target;
32179
32180 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
32181 const1_rtx,
32182 GEN_INT (one_var == 1 ? 0 : 1),
32183 GEN_INT (one_var == 2 ? 0+4 : 1+4),
32184 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
32185
32186 if (mode != V4SFmode)
32187 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
32188 else if (tmp != target)
32189 emit_move_insn (target, tmp);
32190 }
32191 else if (target != new_target)
32192 emit_move_insn (target, new_target);
32193 return true;
32194
32195 case V8HImode:
32196 case V16QImode:
32197 vsimode = V4SImode;
32198 goto widen;
32199 case V4HImode:
32200 case V8QImode:
32201 if (!mmx_ok)
32202 return false;
32203 vsimode = V2SImode;
32204 goto widen;
32205 widen:
32206 if (one_var != 0)
32207 return false;
32208
32209 /* Zero extend the variable element to SImode and recurse. */
32210 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
32211
32212 x = gen_reg_rtx (vsimode);
32213 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
32214 var, one_var))
32215 gcc_unreachable ();
32216
32217 emit_move_insn (target, gen_lowpart (mode, x));
32218 return true;
32219
32220 default:
32221 return false;
32222 }
32223 }
32224
32225 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
32226 consisting of the values in VALS. It is known that all elements
32227 except ONE_VAR are constants. Return true if successful. */
32228
32229 static bool
32230 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
32231 rtx target, rtx vals, int one_var)
32232 {
32233 rtx var = XVECEXP (vals, 0, one_var);
32234 enum machine_mode wmode;
32235 rtx const_vec, x;
32236
32237 const_vec = copy_rtx (vals);
32238 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
32239 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
32240
32241 switch (mode)
32242 {
32243 case V2DFmode:
32244 case V2DImode:
32245 case V2SFmode:
32246 case V2SImode:
32247 /* For the two element vectors, it's just as easy to use
32248 the general case. */
32249 return false;
32250
32251 case V4DImode:
32252 /* Use ix86_expand_vector_set in 64bit mode only. */
32253 if (!TARGET_64BIT)
32254 return false;
32255 case V4DFmode:
32256 case V8SFmode:
32257 case V8SImode:
32258 case V16HImode:
32259 case V32QImode:
32260 case V4SFmode:
32261 case V4SImode:
32262 case V8HImode:
32263 case V4HImode:
32264 break;
32265
32266 case V16QImode:
32267 if (TARGET_SSE4_1)
32268 break;
32269 wmode = V8HImode;
32270 goto widen;
32271 case V8QImode:
32272 wmode = V4HImode;
32273 goto widen;
32274 widen:
32275 /* There's no way to set one QImode entry easily. Combine
32276 the variable value with its adjacent constant value, and
32277 promote to an HImode set. */
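      /* For example, setting byte 5 of a V16QImode vector combines VAR with
	 the constant in byte 4 into a single HImode value, which is then
	 set as element 2 of the V8HImode view of the vector.  */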
32278 x = XVECEXP (vals, 0, one_var ^ 1);
32279 if (one_var & 1)
32280 {
32281 var = convert_modes (HImode, QImode, var, true);
32282 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
32283 NULL_RTX, 1, OPTAB_LIB_WIDEN);
32284 x = GEN_INT (INTVAL (x) & 0xff);
32285 }
32286 else
32287 {
32288 var = convert_modes (HImode, QImode, var, true);
32289 x = gen_int_mode (INTVAL (x) << 8, HImode);
32290 }
32291 if (x != const0_rtx)
32292 var = expand_simple_binop (HImode, IOR, var, x, var,
32293 1, OPTAB_LIB_WIDEN);
32294
32295 x = gen_reg_rtx (wmode);
32296 emit_move_insn (x, gen_lowpart (wmode, const_vec));
32297 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
32298
32299 emit_move_insn (target, gen_lowpart (mode, x));
32300 return true;
32301
32302 default:
32303 return false;
32304 }
32305
32306 emit_move_insn (target, const_vec);
32307 ix86_expand_vector_set (mmx_ok, target, var, one_var);
32308 return true;
32309 }
32310
32311 /* A subroutine of ix86_expand_vector_init_general. Use vector
32312 concatenate to handle the most general case: all values variable,
32313 and none identical. */
32314
32315 static void
32316 ix86_expand_vector_init_concat (enum machine_mode mode,
32317 rtx target, rtx *ops, int n)
32318 {
32319 enum machine_mode cmode, hmode = VOIDmode;
32320 rtx first[8], second[4];
32321 rtvec v;
32322 int i, j;
32323
32324 switch (n)
32325 {
32326 case 2:
32327 switch (mode)
32328 {
32329 case V8SImode:
32330 cmode = V4SImode;
32331 break;
32332 case V8SFmode:
32333 cmode = V4SFmode;
32334 break;
32335 case V4DImode:
32336 cmode = V2DImode;
32337 break;
32338 case V4DFmode:
32339 cmode = V2DFmode;
32340 break;
32341 case V4SImode:
32342 cmode = V2SImode;
32343 break;
32344 case V4SFmode:
32345 cmode = V2SFmode;
32346 break;
32347 case V2DImode:
32348 cmode = DImode;
32349 break;
32350 case V2SImode:
32351 cmode = SImode;
32352 break;
32353 case V2DFmode:
32354 cmode = DFmode;
32355 break;
32356 case V2SFmode:
32357 cmode = SFmode;
32358 break;
32359 default:
32360 gcc_unreachable ();
32361 }
32362
32363 if (!register_operand (ops[1], cmode))
32364 ops[1] = force_reg (cmode, ops[1]);
32365 if (!register_operand (ops[0], cmode))
32366 ops[0] = force_reg (cmode, ops[0]);
32367 emit_insn (gen_rtx_SET (VOIDmode, target,
32368 gen_rtx_VEC_CONCAT (mode, ops[0],
32369 ops[1])));
32370 break;
32371
32372 case 4:
32373 switch (mode)
32374 {
32375 case V4DImode:
32376 cmode = V2DImode;
32377 break;
32378 case V4DFmode:
32379 cmode = V2DFmode;
32380 break;
32381 case V4SImode:
32382 cmode = V2SImode;
32383 break;
32384 case V4SFmode:
32385 cmode = V2SFmode;
32386 break;
32387 default:
32388 gcc_unreachable ();
32389 }
32390 goto half;
32391
32392 case 8:
32393 switch (mode)
32394 {
32395 case V8SImode:
32396 cmode = V2SImode;
32397 hmode = V4SImode;
32398 break;
32399 case V8SFmode:
32400 cmode = V2SFmode;
32401 hmode = V4SFmode;
32402 break;
32403 default:
32404 gcc_unreachable ();
32405 }
32406 goto half;
32407
32408 half:
32409 /* FIXME: We process inputs backward to help RA. PR 36222. */
32410 i = n - 1;
32411 j = (n >> 1) - 1;
32412 for (; i > 0; i -= 2, j--)
32413 {
32414 first[j] = gen_reg_rtx (cmode);
32415 v = gen_rtvec (2, ops[i - 1], ops[i]);
32416 ix86_expand_vector_init (false, first[j],
32417 gen_rtx_PARALLEL (cmode, v));
32418 }
32419
32420 n >>= 1;
32421 if (n > 2)
32422 {
32423 gcc_assert (hmode != VOIDmode);
32424 for (i = j = 0; i < n; i += 2, j++)
32425 {
32426 second[j] = gen_reg_rtx (hmode);
32427 ix86_expand_vector_init_concat (hmode, second [j],
32428 &first [i], 2);
32429 }
32430 n >>= 1;
32431 ix86_expand_vector_init_concat (mode, target, second, n);
32432 }
32433 else
32434 ix86_expand_vector_init_concat (mode, target, first, n);
32435 break;
32436
32437 default:
32438 gcc_unreachable ();
32439 }
32440 }
32441
32442 /* A subroutine of ix86_expand_vector_init_general. Use vector
32443 interleave to handle the most general case: all values variable,
32444 and none identical. */
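/* In outline: each pair of adjacent input elements is first packed into
   the two low elements of one vector register, and the partial vectors are
   then merged with successively wider interleave-low operations until the
   full vector is built.  */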
32445
32446 static void
32447 ix86_expand_vector_init_interleave (enum machine_mode mode,
32448 rtx target, rtx *ops, int n)
32449 {
32450 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
32451 int i, j;
32452 rtx op0, op1;
32453 rtx (*gen_load_even) (rtx, rtx, rtx);
32454 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
32455 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
32456
32457 switch (mode)
32458 {
32459 case V8HImode:
32460 gen_load_even = gen_vec_setv8hi;
32461 gen_interleave_first_low = gen_vec_interleave_lowv4si;
32462 gen_interleave_second_low = gen_vec_interleave_lowv2di;
32463 inner_mode = HImode;
32464 first_imode = V4SImode;
32465 second_imode = V2DImode;
32466 third_imode = VOIDmode;
32467 break;
32468 case V16QImode:
32469 gen_load_even = gen_vec_setv16qi;
32470 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
32471 gen_interleave_second_low = gen_vec_interleave_lowv4si;
32472 inner_mode = QImode;
32473 first_imode = V8HImode;
32474 second_imode = V4SImode;
32475 third_imode = V2DImode;
32476 break;
32477 default:
32478 gcc_unreachable ();
32479 }
32480
32481 for (i = 0; i < n; i++)
32482 {
32483 /* Extend the odd element to SImode using a paradoxical SUBREG. */
32484 op0 = gen_reg_rtx (SImode);
32485 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
32486
32487 /* Insert the SImode value as low element of V4SImode vector. */
32488 op1 = gen_reg_rtx (V4SImode);
32489 op0 = gen_rtx_VEC_MERGE (V4SImode,
32490 gen_rtx_VEC_DUPLICATE (V4SImode,
32491 op0),
32492 CONST0_RTX (V4SImode),
32493 const1_rtx);
32494 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
32495
32496 /* Cast the V4SImode vector back to a vector in the original mode. */
32497 op0 = gen_reg_rtx (mode);
32498 emit_move_insn (op0, gen_lowpart (mode, op1));
32499
32500 /* Load even elements into the second position. */
32501 emit_insn (gen_load_even (op0,
32502 force_reg (inner_mode,
32503 ops [i + i + 1]),
32504 const1_rtx));
32505
32506 /* Cast vector to FIRST_IMODE vector. */
32507 ops[i] = gen_reg_rtx (first_imode);
32508 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
32509 }
32510
32511 /* Interleave low FIRST_IMODE vectors. */
32512 for (i = j = 0; i < n; i += 2, j++)
32513 {
32514 op0 = gen_reg_rtx (first_imode);
32515 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
32516
32517 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
32518 ops[j] = gen_reg_rtx (second_imode);
32519 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
32520 }
32521
32522 /* Interleave low SECOND_IMODE vectors. */
32523 switch (second_imode)
32524 {
32525 case V4SImode:
32526 for (i = j = 0; i < n / 2; i += 2, j++)
32527 {
32528 op0 = gen_reg_rtx (second_imode);
32529 emit_insn (gen_interleave_second_low (op0, ops[i],
32530 ops[i + 1]));
32531
32532 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
32533 vector. */
32534 ops[j] = gen_reg_rtx (third_imode);
32535 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
32536 }
32537 second_imode = V2DImode;
32538 gen_interleave_second_low = gen_vec_interleave_lowv2di;
32539 /* FALLTHRU */
32540
32541 case V2DImode:
32542 op0 = gen_reg_rtx (second_imode);
32543 emit_insn (gen_interleave_second_low (op0, ops[0],
32544 ops[1]));
32545
32546 /* Cast the SECOND_IMODE vector back to a vector in the original
32547 mode. */
32548 emit_insn (gen_rtx_SET (VOIDmode, target,
32549 gen_lowpart (mode, op0)));
32550 break;
32551
32552 default:
32553 gcc_unreachable ();
32554 }
32555 }
32556
32557 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
32558 all values variable, and none identical. */
32559
32560 static void
32561 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
32562 rtx target, rtx vals)
32563 {
32564 rtx ops[32], op0, op1;
32565 enum machine_mode half_mode = VOIDmode;
32566 int n, i;
32567
32568 switch (mode)
32569 {
32570 case V2SFmode:
32571 case V2SImode:
32572 if (!mmx_ok && !TARGET_SSE)
32573 break;
32574 /* FALLTHRU */
32575
32576 case V8SFmode:
32577 case V8SImode:
32578 case V4DFmode:
32579 case V4DImode:
32580 case V4SFmode:
32581 case V4SImode:
32582 case V2DFmode:
32583 case V2DImode:
32584 n = GET_MODE_NUNITS (mode);
32585 for (i = 0; i < n; i++)
32586 ops[i] = XVECEXP (vals, 0, i);
32587 ix86_expand_vector_init_concat (mode, target, ops, n);
32588 return;
32589
32590 case V32QImode:
32591 half_mode = V16QImode;
32592 goto half;
32593
32594 case V16HImode:
32595 half_mode = V8HImode;
32596 goto half;
32597
32598 half:
32599 n = GET_MODE_NUNITS (mode);
32600 for (i = 0; i < n; i++)
32601 ops[i] = XVECEXP (vals, 0, i);
32602 op0 = gen_reg_rtx (half_mode);
32603 op1 = gen_reg_rtx (half_mode);
32604 ix86_expand_vector_init_interleave (half_mode, op0, ops,
32605 n >> 2);
32606 ix86_expand_vector_init_interleave (half_mode, op1,
32607 &ops [n >> 1], n >> 2);
32608 emit_insn (gen_rtx_SET (VOIDmode, target,
32609 gen_rtx_VEC_CONCAT (mode, op0, op1)));
32610 return;
32611
32612 case V16QImode:
32613 if (!TARGET_SSE4_1)
32614 break;
32615 /* FALLTHRU */
32616
32617 case V8HImode:
32618 if (!TARGET_SSE2)
32619 break;
32620
32621 /* Don't use ix86_expand_vector_init_interleave if we can't
32622 move from GPR to SSE register directly. */
32623 if (!TARGET_INTER_UNIT_MOVES)
32624 break;
32625
32626 n = GET_MODE_NUNITS (mode);
32627 for (i = 0; i < n; i++)
32628 ops[i] = XVECEXP (vals, 0, i);
32629 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
32630 return;
32631
32632 case V4HImode:
32633 case V8QImode:
32634 break;
32635
32636 default:
32637 gcc_unreachable ();
32638 }
32639
32640 {
32641 int i, j, n_elts, n_words, n_elt_per_word;
32642 enum machine_mode inner_mode;
32643 rtx words[4], shift;
32644
32645 inner_mode = GET_MODE_INNER (mode);
32646 n_elts = GET_MODE_NUNITS (mode);
32647 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
32648 n_elt_per_word = n_elts / n_words;
32649 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
32650
32651 for (i = 0; i < n_words; ++i)
32652 {
32653 rtx word = NULL_RTX;
32654
32655 for (j = 0; j < n_elt_per_word; ++j)
32656 {
32657 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
32658 elt = convert_modes (word_mode, inner_mode, elt, true);
32659
32660 if (j == 0)
32661 word = elt;
32662 else
32663 {
32664 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
32665 word, 1, OPTAB_LIB_WIDEN);
32666 word = expand_simple_binop (word_mode, IOR, word, elt,
32667 word, 1, OPTAB_LIB_WIDEN);
32668 }
32669 }
32670
32671 words[i] = word;
32672 }
32673
32674 if (n_words == 1)
32675 emit_move_insn (target, gen_lowpart (mode, words[0]));
32676 else if (n_words == 2)
32677 {
32678 rtx tmp = gen_reg_rtx (mode);
32679 emit_clobber (tmp);
32680 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
32681 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
32682 emit_move_insn (target, tmp);
32683 }
32684 else if (n_words == 4)
32685 {
32686 rtx tmp = gen_reg_rtx (V4SImode);
32687 gcc_assert (word_mode == SImode);
32688 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
32689 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
32690 emit_move_insn (target, gen_lowpart (mode, tmp));
32691 }
32692 else
32693 gcc_unreachable ();
32694 }
32695 }
32696
32697 /* Initialize vector TARGET via VALS. Suppress the use of MMX
32698 instructions unless MMX_OK is true. */
32699
32700 void
32701 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
32702 {
32703 enum machine_mode mode = GET_MODE (target);
32704 enum machine_mode inner_mode = GET_MODE_INNER (mode);
32705 int n_elts = GET_MODE_NUNITS (mode);
32706 int n_var = 0, one_var = -1;
32707 bool all_same = true, all_const_zero = true;
32708 int i;
32709 rtx x;
32710
32711 for (i = 0; i < n_elts; ++i)
32712 {
32713 x = XVECEXP (vals, 0, i);
32714 if (!(CONST_INT_P (x)
32715 || GET_CODE (x) == CONST_DOUBLE
32716 || GET_CODE (x) == CONST_FIXED))
32717 n_var++, one_var = i;
32718 else if (x != CONST0_RTX (inner_mode))
32719 all_const_zero = false;
32720 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
32721 all_same = false;
32722 }
32723
32724 /* Constants are best loaded from the constant pool. */
32725 if (n_var == 0)
32726 {
32727 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
32728 return;
32729 }
32730
32731 /* If all values are identical, broadcast the value. */
32732 if (all_same
32733 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
32734 XVECEXP (vals, 0, 0)))
32735 return;
32736
32737 /* Values where only one field is non-constant are best loaded from
32738 the pool and overwritten via move later. */
32739 if (n_var == 1)
32740 {
32741 if (all_const_zero
32742 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
32743 XVECEXP (vals, 0, one_var),
32744 one_var))
32745 return;
32746
32747 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
32748 return;
32749 }
32750
32751 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
32752 }
32753
32754 void
32755 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
32756 {
32757 enum machine_mode mode = GET_MODE (target);
32758 enum machine_mode inner_mode = GET_MODE_INNER (mode);
32759 enum machine_mode half_mode;
32760 bool use_vec_merge = false;
32761 rtx tmp;
32762 static rtx (*gen_extract[6][2]) (rtx, rtx)
32763 = {
32764 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
32765 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
32766 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
32767 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
32768 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
32769 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
32770 };
32771 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
32772 = {
32773 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
32774 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
32775 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
32776 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
32777 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
32778 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
32779 };
32780 int i, j, n;
32781
32782 switch (mode)
32783 {
32784 case V2SFmode:
32785 case V2SImode:
32786 if (mmx_ok)
32787 {
32788 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
32789 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
32790 if (elt == 0)
32791 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
32792 else
32793 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
32794 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
32795 return;
32796 }
32797 break;
32798
32799 case V2DImode:
32800 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
32801 if (use_vec_merge)
32802 break;
32803
32804 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
32805 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
32806 if (elt == 0)
32807 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
32808 else
32809 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
32810 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
32811 return;
32812
32813 case V2DFmode:
32814 {
32815 rtx op0, op1;
32816
32817 /* For the two element vectors, we implement a VEC_CONCAT with
32818 the extraction of the other element. */
32819
32820 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
32821 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
32822
32823 if (elt == 0)
32824 op0 = val, op1 = tmp;
32825 else
32826 op0 = tmp, op1 = val;
32827
32828 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
32829 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
32830 }
32831 return;
32832
32833 case V4SFmode:
32834 use_vec_merge = TARGET_SSE4_1;
32835 if (use_vec_merge)
32836 break;
32837
32838 switch (elt)
32839 {
32840 case 0:
32841 use_vec_merge = true;
32842 break;
32843
32844 case 1:
32845 /* tmp = target = A B C D */
32846 tmp = copy_to_reg (target);
32847 /* target = A A B B */
32848 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
32849 /* target = X A B B */
32850 ix86_expand_vector_set (false, target, val, 0);
32851 /* target = A X C D */
32852 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
32853 const1_rtx, const0_rtx,
32854 GEN_INT (2+4), GEN_INT (3+4)));
32855 return;
32856
32857 case 2:
32858 /* tmp = target = A B C D */
32859 tmp = copy_to_reg (target);
32860 /* tmp = X B C D */
32861 ix86_expand_vector_set (false, tmp, val, 0);
32862 /* target = A B X D */
32863 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
32864 const0_rtx, const1_rtx,
32865 GEN_INT (0+4), GEN_INT (3+4)));
32866 return;
32867
32868 case 3:
32869 /* tmp = target = A B C D */
32870 tmp = copy_to_reg (target);
32871 /* tmp = X B C D */
32872 ix86_expand_vector_set (false, tmp, val, 0);
32873 /* target = A B C X */
32874 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
32875 const0_rtx, const1_rtx,
32876 GEN_INT (2+4), GEN_INT (0+4)));
32877 return;
32878
32879 default:
32880 gcc_unreachable ();
32881 }
32882 break;
32883
32884 case V4SImode:
32885 use_vec_merge = TARGET_SSE4_1;
32886 if (use_vec_merge)
32887 break;
32888
32889 /* Element 0 handled by vec_merge below. */
32890 if (elt == 0)
32891 {
32892 use_vec_merge = true;
32893 break;
32894 }
32895
32896 if (TARGET_SSE2)
32897 {
32898 /* With SSE2, use integer shuffles to swap element 0 and ELT,
32899 store into element 0, then shuffle them back. */
32900
32901 rtx order[4];
32902
32903 order[0] = GEN_INT (elt);
32904 order[1] = const1_rtx;
32905 order[2] = const2_rtx;
32906 order[3] = GEN_INT (3);
32907 order[elt] = const0_rtx;
32908
32909 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
32910 order[1], order[2], order[3]));
32911
32912 ix86_expand_vector_set (false, target, val, 0);
32913
32914 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
32915 order[1], order[2], order[3]));
32916 }
32917 else
32918 {
32919 /* For SSE1, we have to reuse the V4SF code. */
32920 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
32921 gen_lowpart (SFmode, val), elt);
32922 }
32923 return;
32924
32925 case V8HImode:
32926 use_vec_merge = TARGET_SSE2;
32927 break;
32928 case V4HImode:
32929 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
32930 break;
32931
32932 case V16QImode:
32933 use_vec_merge = TARGET_SSE4_1;
32934 break;
32935
32936 case V8QImode:
32937 break;
32938
32939 case V32QImode:
32940 half_mode = V16QImode;
32941 j = 0;
32942 n = 16;
32943 goto half;
32944
32945 case V16HImode:
32946 half_mode = V8HImode;
32947 j = 1;
32948 n = 8;
32949 goto half;
32950
32951 case V8SImode:
32952 half_mode = V4SImode;
32953 j = 2;
32954 n = 4;
32955 goto half;
32956
32957 case V4DImode:
32958 half_mode = V2DImode;
32959 j = 3;
32960 n = 2;
32961 goto half;
32962
32963 case V8SFmode:
32964 half_mode = V4SFmode;
32965 j = 4;
32966 n = 4;
32967 goto half;
32968
32969 case V4DFmode:
32970 half_mode = V2DFmode;
32971 j = 5;
32972 n = 2;
32973 goto half;
32974
32975 half:
32976 /* Compute offset. */
32977 i = elt / n;
32978 elt %= n;
32979
32980 gcc_assert (i <= 1);
32981
32982 /* Extract the half. */
32983 tmp = gen_reg_rtx (half_mode);
32984 emit_insn (gen_extract[j][i] (tmp, target));
32985
32986 /* Put val in tmp at elt. */
32987 ix86_expand_vector_set (false, tmp, val, elt);
32988
32989 /* Put it back. */
32990 emit_insn (gen_insert[j][i] (target, target, tmp));
32991 return;
32992
32993 default:
32994 break;
32995 }
32996
32997 if (use_vec_merge)
32998 {
32999 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
33000 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
33001 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33002 }
33003 else
33004 {
33005 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
33006
33007 emit_move_insn (mem, target);
33008
33009 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
33010 emit_move_insn (tmp, val);
33011
33012 emit_move_insn (target, mem);
33013 }
33014 }
33015
33016 void
33017 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
33018 {
33019 enum machine_mode mode = GET_MODE (vec);
33020 enum machine_mode inner_mode = GET_MODE_INNER (mode);
33021 bool use_vec_extr = false;
33022 rtx tmp;
33023
33024 switch (mode)
33025 {
33026 case V2SImode:
33027 case V2SFmode:
33028 if (!mmx_ok)
33029 break;
33030 /* FALLTHRU */
33031
33032 case V2DFmode:
33033 case V2DImode:
33034 use_vec_extr = true;
33035 break;
33036
33037 case V4SFmode:
33038 use_vec_extr = TARGET_SSE4_1;
33039 if (use_vec_extr)
33040 break;
33041
33042 switch (elt)
33043 {
33044 case 0:
33045 tmp = vec;
33046 break;
33047
33048 case 1:
33049 case 3:
33050 tmp = gen_reg_rtx (mode);
33051 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
33052 GEN_INT (elt), GEN_INT (elt),
33053 GEN_INT (elt+4), GEN_INT (elt+4)));
33054 break;
33055
33056 case 2:
33057 tmp = gen_reg_rtx (mode);
33058 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
33059 break;
33060
33061 default:
33062 gcc_unreachable ();
33063 }
33064 vec = tmp;
33065 use_vec_extr = true;
33066 elt = 0;
33067 break;
33068
33069 case V4SImode:
33070 use_vec_extr = TARGET_SSE4_1;
33071 if (use_vec_extr)
33072 break;
33073
33074 if (TARGET_SSE2)
33075 {
33076 switch (elt)
33077 {
33078 case 0:
33079 tmp = vec;
33080 break;
33081
33082 case 1:
33083 case 3:
33084 tmp = gen_reg_rtx (mode);
33085 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
33086 GEN_INT (elt), GEN_INT (elt),
33087 GEN_INT (elt), GEN_INT (elt)));
33088 break;
33089
33090 case 2:
33091 tmp = gen_reg_rtx (mode);
33092 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
33093 break;
33094
33095 default:
33096 gcc_unreachable ();
33097 }
33098 vec = tmp;
33099 use_vec_extr = true;
33100 elt = 0;
33101 }
33102 else
33103 {
33104 /* For SSE1, we have to reuse the V4SF code. */
33105 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
33106 gen_lowpart (V4SFmode, vec), elt);
33107 return;
33108 }
33109 break;
33110
33111 case V8HImode:
33112 use_vec_extr = TARGET_SSE2;
33113 break;
33114 case V4HImode:
33115 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
33116 break;
33117
33118 case V16QImode:
33119 use_vec_extr = TARGET_SSE4_1;
33120 break;
33121
33122 case V8SFmode:
33123 if (TARGET_AVX)
33124 {
33125 tmp = gen_reg_rtx (V4SFmode);
33126 if (elt < 4)
33127 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
33128 else
33129 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
33130 ix86_expand_vector_extract (false, target, tmp, elt & 3);
33131 return;
33132 }
33133 break;
33134
33135 case V4DFmode:
33136 if (TARGET_AVX)
33137 {
33138 tmp = gen_reg_rtx (V2DFmode);
33139 if (elt < 2)
33140 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
33141 else
33142 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
33143 ix86_expand_vector_extract (false, target, tmp, elt & 1);
33144 return;
33145 }
33146 break;
33147
33148 case V32QImode:
33149 if (TARGET_AVX)
33150 {
33151 tmp = gen_reg_rtx (V16QImode);
33152 if (elt < 16)
33153 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
33154 else
33155 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
33156 ix86_expand_vector_extract (false, target, tmp, elt & 15);
33157 return;
33158 }
33159 break;
33160
33161 case V16HImode:
33162 if (TARGET_AVX)
33163 {
33164 tmp = gen_reg_rtx (V8HImode);
33165 if (elt < 8)
33166 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
33167 else
33168 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
33169 ix86_expand_vector_extract (false, target, tmp, elt & 7);
33170 return;
33171 }
33172 break;
33173
33174 case V8SImode:
33175 if (TARGET_AVX)
33176 {
33177 tmp = gen_reg_rtx (V4SImode);
33178 if (elt < 4)
33179 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
33180 else
33181 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
33182 ix86_expand_vector_extract (false, target, tmp, elt & 3);
33183 return;
33184 }
33185 break;
33186
33187 case V4DImode:
33188 if (TARGET_AVX)
33189 {
33190 tmp = gen_reg_rtx (V2DImode);
33191 if (elt < 2)
33192 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
33193 else
33194 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
33195 ix86_expand_vector_extract (false, target, tmp, elt & 1);
33196 return;
33197 }
33198 break;
33199
33200 case V8QImode:
33201 /* ??? Could extract the appropriate HImode element and shift. */
33202 default:
33203 break;
33204 }
33205
33206 if (use_vec_extr)
33207 {
33208 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
33209 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
33210
33211 /* Let the rtl optimizers know about the zero extension performed. */
33212 if (inner_mode == QImode || inner_mode == HImode)
33213 {
33214 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
33215 target = gen_lowpart (SImode, target);
33216 }
33217
33218 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33219 }
33220 else
33221 {
33222 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
33223
33224 emit_move_insn (mem, vec);
33225
33226 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
33227 emit_move_insn (target, tmp);
33228 }
33229 }
33230
33231 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
33232 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
33233 The upper bits of DEST are undefined, though they shouldn't cause
33234 exceptions (some bits from src or all zeros are ok). */
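/* For example, with a V4SImode SRC and I == 128, elements 2 and 3 of SRC
   end up in elements 0 and 1 of DEST; with I == 64, element 1 of SRC ends
   up in element 0 of DEST.  */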
33235
33236 static void
33237 emit_reduc_half (rtx dest, rtx src, int i)
33238 {
33239 rtx tem;
33240 switch (GET_MODE (src))
33241 {
33242 case V4SFmode:
33243 if (i == 128)
33244 tem = gen_sse_movhlps (dest, src, src);
33245 else
33246 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
33247 GEN_INT (1 + 4), GEN_INT (1 + 4));
33248 break;
33249 case V2DFmode:
33250 tem = gen_vec_interleave_highv2df (dest, src, src);
33251 break;
33252 case V16QImode:
33253 case V8HImode:
33254 case V4SImode:
33255 case V2DImode:
33256 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
33257 gen_lowpart (V1TImode, src),
33258 GEN_INT (i / 2));
33259 break;
33260 case V8SFmode:
33261 if (i == 256)
33262 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
33263 else
33264 tem = gen_avx_shufps256 (dest, src, src,
33265 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
33266 break;
33267 case V4DFmode:
33268 if (i == 256)
33269 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
33270 else
33271 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
33272 break;
33273 case V32QImode:
33274 case V16HImode:
33275 case V8SImode:
33276 case V4DImode:
33277 if (i == 256)
33278 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
33279 gen_lowpart (V4DImode, src),
33280 gen_lowpart (V4DImode, src),
33281 const1_rtx);
33282 else
33283 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
33284 gen_lowpart (V2TImode, src),
33285 GEN_INT (i / 2));
33286 break;
33287 default:
33288 gcc_unreachable ();
33289 }
33290 emit_insn (tem);
33291 }
33292
33293 /* Expand a vector reduction. FN is the binary pattern to reduce;
33294 DEST is the destination; IN is the input vector. */
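/* The reduction proceeds by repeated halving: e.g. for a V4SImode input
   the upper half is first combined with the lower half using FN, then the
   two remaining live elements are combined, leaving the reduction result
   in element 0 of DEST.  */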
33295
33296 void
33297 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
33298 {
33299 rtx half, dst, vec = in;
33300 enum machine_mode mode = GET_MODE (in);
33301 int i;
33302
33303 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
33304 if (TARGET_SSE4_1
33305 && mode == V8HImode
33306 && fn == gen_uminv8hi3)
33307 {
33308 emit_insn (gen_sse4_1_phminposuw (dest, in));
33309 return;
33310 }
33311
33312 for (i = GET_MODE_BITSIZE (mode);
33313 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
33314 i >>= 1)
33315 {
33316 half = gen_reg_rtx (mode);
33317 emit_reduc_half (half, vec, i);
33318 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
33319 dst = dest;
33320 else
33321 dst = gen_reg_rtx (mode);
33322 emit_insn (fn (dst, half, vec));
33323 vec = dst;
33324 }
33325 }
33326 \f
33327 /* Target hook for scalar_mode_supported_p. */
33328 static bool
33329 ix86_scalar_mode_supported_p (enum machine_mode mode)
33330 {
33331 if (DECIMAL_FLOAT_MODE_P (mode))
33332 return default_decimal_float_supported_p ();
33333 else if (mode == TFmode)
33334 return true;
33335 else
33336 return default_scalar_mode_supported_p (mode);
33337 }
33338
33339 /* Implements target hook vector_mode_supported_p. */
33340 static bool
33341 ix86_vector_mode_supported_p (enum machine_mode mode)
33342 {
33343 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
33344 return true;
33345 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
33346 return true;
33347 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
33348 return true;
33349 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
33350 return true;
33351 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
33352 return true;
33353 return false;
33354 }
33355
33356 /* Target hook for c_mode_for_suffix. */
33357 static enum machine_mode
33358 ix86_c_mode_for_suffix (char suffix)
33359 {
33360 if (suffix == 'q')
33361 return TFmode;
33362 if (suffix == 'w')
33363 return XFmode;
33364
33365 return VOIDmode;
33366 }
33367
33368 /* Worker function for TARGET_MD_ASM_CLOBBERS.
33369
33370 We do this in the new i386 backend to maintain source compatibility
33371 with the old cc0-based compiler. */
33372
33373 static tree
33374 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
33375 tree inputs ATTRIBUTE_UNUSED,
33376 tree clobbers)
33377 {
33378 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
33379 clobbers);
33380 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
33381 clobbers);
33382 return clobbers;
33383 }
33384
33385 /* Implements the targetm.asm.encode_section_info target hook. */
33386
33387 static void ATTRIBUTE_UNUSED
33388 ix86_encode_section_info (tree decl, rtx rtl, int first)
33389 {
33390 default_encode_section_info (decl, rtl, first);
33391
33392 if (TREE_CODE (decl) == VAR_DECL
33393 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
33394 && ix86_in_large_data_p (decl))
33395 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
33396 }
33397
33398 /* Worker function for REVERSE_CONDITION. */
33399
33400 enum rtx_code
33401 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
33402 {
33403 return (mode != CCFPmode && mode != CCFPUmode
33404 ? reverse_condition (code)
33405 : reverse_condition_maybe_unordered (code));
33406 }
33407
33408 /* Output code to perform an x87 FP register move, from OPERANDS[1]
33409 to OPERANDS[0]. */
33410
33411 const char *
33412 output_387_reg_move (rtx insn, rtx *operands)
33413 {
33414 if (REG_P (operands[0]))
33415 {
33416 if (REG_P (operands[1])
33417 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
33418 {
33419 if (REGNO (operands[0]) == FIRST_STACK_REG)
33420 return output_387_ffreep (operands, 0);
33421 return "fstp\t%y0";
33422 }
33423 if (STACK_TOP_P (operands[0]))
33424 return "fld%Z1\t%y1";
33425 return "fst\t%y0";
33426 }
33427 else if (MEM_P (operands[0]))
33428 {
33429 gcc_assert (REG_P (operands[1]));
33430 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
33431 return "fstp%Z0\t%y0";
33432 else
33433 {
33434 /* There is no non-popping store to memory for XFmode.
33435 So if we need one, follow the store with a load. */
33436 if (GET_MODE (operands[0]) == XFmode)
33437 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
33438 else
33439 return "fst%Z0\t%y0";
33440 }
33441 }
33442 else
33443 gcc_unreachable ();
33444 }
33445
33446 /* Output code to perform a conditional jump to LABEL, if C2 flag in
33447 FP status register is set. */
33448
33449 void
33450 ix86_emit_fp_unordered_jump (rtx label)
33451 {
33452 rtx reg = gen_reg_rtx (HImode);
33453 rtx temp;
33454
33455 emit_insn (gen_x86_fnstsw_1 (reg));
33456
33457 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
33458 {
33459 emit_insn (gen_x86_sahf_1 (reg));
33460
33461 temp = gen_rtx_REG (CCmode, FLAGS_REG);
33462 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
33463 }
33464 else
33465 {
33466 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
33467
33468 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
33469 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
33470 }
33471
33472 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
33473 gen_rtx_LABEL_REF (VOIDmode, label),
33474 pc_rtx);
33475 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
33476
33477 emit_jump_insn (temp);
33478 predict_jump (REG_BR_PROB_BASE * 10 / 100);
33479 }
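/* A rough sketch of the two sequences the function above can emit (the
   exact assembly comes from the insn patterns, so this is illustrative
   only):

     fnstsw  %ax                     fnstsw  %ax
     sahf                            testb   $0x04, %ah   # C2 bit
     jp      label                   jne     label

   With sahf, the C2 bit of the FP status word lands in the parity flag,
   so the unordered test becomes a jp.  */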
33480
33481 /* Output code to perform a log1p XFmode calculation. */
33482
33483 void ix86_emit_i387_log1p (rtx op0, rtx op1)
33484 {
33485 rtx label1 = gen_label_rtx ();
33486 rtx label2 = gen_label_rtx ();
33487
33488 rtx tmp = gen_reg_rtx (XFmode);
33489 rtx tmp2 = gen_reg_rtx (XFmode);
33490 rtx test;
33491
33492 emit_insn (gen_absxf2 (tmp, op1));
33493 test = gen_rtx_GE (VOIDmode, tmp,
33494 CONST_DOUBLE_FROM_REAL_VALUE (
33495 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
33496 XFmode));
33497 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
33498
33499 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
33500 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
33501 emit_jump (label2);
33502
33503 emit_label (label1);
33504 emit_move_insn (tmp, CONST1_RTX (XFmode));
33505 emit_insn (gen_addxf3 (tmp, op1, tmp));
33506 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
33507 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
33508
33509 emit_label (label2);
33510 }
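/* Illustrative summary of the sequence above: the constant
   0.29289321881... is 1 - sqrt(2)/2, the bound below which fyl2xp1 may be
   used; within that range the code computes ln2 * log2 (1 + x) directly
   via fyl2xp1, otherwise it forms 1 + x explicitly and uses fyl2x.  */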
33511
33512 /* Emit code for round calculation. */
33513 void ix86_emit_i387_round (rtx op0, rtx op1)
33514 {
33515 enum machine_mode inmode = GET_MODE (op1);
33516 enum machine_mode outmode = GET_MODE (op0);
33517 rtx e1, e2, res, tmp, tmp1, half;
33518 rtx scratch = gen_reg_rtx (HImode);
33519 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
33520 rtx jump_label = gen_label_rtx ();
33521 rtx insn;
33522 rtx (*gen_abs) (rtx, rtx);
33523 rtx (*gen_neg) (rtx, rtx);
33524
33525 switch (inmode)
33526 {
33527 case SFmode:
33528 gen_abs = gen_abssf2;
33529 break;
33530 case DFmode:
33531 gen_abs = gen_absdf2;
33532 break;
33533 case XFmode:
33534 gen_abs = gen_absxf2;
33535 break;
33536 default:
33537 gcc_unreachable ();
33538 }
33539
33540 switch (outmode)
33541 {
33542 case SFmode:
33543 gen_neg = gen_negsf2;
33544 break;
33545 case DFmode:
33546 gen_neg = gen_negdf2;
33547 break;
33548 case XFmode:
33549 gen_neg = gen_negxf2;
33550 break;
33551 case HImode:
33552 gen_neg = gen_neghi2;
33553 break;
33554 case SImode:
33555 gen_neg = gen_negsi2;
33556 break;
33557 case DImode:
33558 gen_neg = gen_negdi2;
33559 break;
33560 default:
33561 gcc_unreachable ();
33562 }
33563
33564 e1 = gen_reg_rtx (inmode);
33565 e2 = gen_reg_rtx (inmode);
33566 res = gen_reg_rtx (outmode);
33567
33568 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
33569
33570 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
33571
33572 /* scratch = fxam(op1) */
33573 emit_insn (gen_rtx_SET (VOIDmode, scratch,
33574 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
33575 UNSPEC_FXAM)));
33576 /* e1 = fabs(op1) */
33577 emit_insn (gen_abs (e1, op1));
33578
33579 /* e2 = e1 + 0.5 */
33580 half = force_reg (inmode, half);
33581 emit_insn (gen_rtx_SET (VOIDmode, e2,
33582 gen_rtx_PLUS (inmode, e1, half)));
33583
33584 /* res = floor(e2) */
33585 if (inmode != XFmode)
33586 {
33587 tmp1 = gen_reg_rtx (XFmode);
33588
33589 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
33590 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
33591 }
33592 else
33593 tmp1 = e2;
33594
33595 switch (outmode)
33596 {
33597 case SFmode:
33598 case DFmode:
33599 {
33600 rtx tmp0 = gen_reg_rtx (XFmode);
33601
33602 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
33603
33604 emit_insn (gen_rtx_SET (VOIDmode, res,
33605 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
33606 UNSPEC_TRUNC_NOOP)));
33607 }
33608 break;
33609 case XFmode:
33610 emit_insn (gen_frndintxf2_floor (res, tmp1));
33611 break;
33612 case HImode:
33613 emit_insn (gen_lfloorxfhi2 (res, tmp1));
33614 break;
33615 case SImode:
33616 emit_insn (gen_lfloorxfsi2 (res, tmp1));
33617 break;
33618 case DImode:
33619 emit_insn (gen_lfloorxfdi2 (res, tmp1));
33620 break;
33621 default:
33622 gcc_unreachable ();
33623 }
33624
33625 /* flags = signbit(a) */
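  /* (Illustrative note: the 0x02 mask below tests bit 1 of the high byte
     of the fxam status word held in SCRATCH, i.e. the C1 condition code,
     which fxam sets to the sign of its operand.)  */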
33626 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
33627
33628 /* if (flags) then res = -res */
33629 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
33630 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
33631 gen_rtx_LABEL_REF (VOIDmode, jump_label),
33632 pc_rtx);
33633 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
33634 predict_jump (REG_BR_PROB_BASE * 50 / 100);
33635 JUMP_LABEL (insn) = jump_label;
33636
33637 emit_insn (gen_neg (res, res));
33638
33639 emit_label (jump_label);
33640 LABEL_NUSES (jump_label) = 1;
33641
33642 emit_move_insn (op0, res);
33643 }
33644
33645 /* Output code to perform a Newton-Raphson approximation of a single precision
33646 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
33647
33648 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
33649 {
33650 rtx x0, x1, e0, e1;
33651
33652 x0 = gen_reg_rtx (mode);
33653 e0 = gen_reg_rtx (mode);
33654 e1 = gen_reg_rtx (mode);
33655 x1 = gen_reg_rtx (mode);
33656
33657 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
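/* (Illustrative derivation: one Newton-Raphson step for 1/b starting from
   the estimate x0 = rcp(b) is x1 = x0 * (2 - b * x0), which rearranges to
   (x0 + x0) - b * x0 * x0, exactly the e1 - e0 computed below; the result
   is then multiplied by a.)  */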
33658
33659 b = force_reg (mode, b);
33660
33661 /* x0 = rcp(b) estimate */
33662 emit_insn (gen_rtx_SET (VOIDmode, x0,
33663 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
33664 UNSPEC_RCP)));
33665 /* e0 = x0 * b */
33666 emit_insn (gen_rtx_SET (VOIDmode, e0,
33667 gen_rtx_MULT (mode, x0, b)));
33668
33669 /* e0 = x0 * e0 */
33670 emit_insn (gen_rtx_SET (VOIDmode, e0,
33671 gen_rtx_MULT (mode, x0, e0)));
33672
33673 /* e1 = x0 + x0 */
33674 emit_insn (gen_rtx_SET (VOIDmode, e1,
33675 gen_rtx_PLUS (mode, x0, x0)));
33676
33677 /* x1 = e1 - e0 */
33678 emit_insn (gen_rtx_SET (VOIDmode, x1,
33679 gen_rtx_MINUS (mode, e1, e0)));
33680
33681 /* res = a * x1 */
33682 emit_insn (gen_rtx_SET (VOIDmode, res,
33683 gen_rtx_MULT (mode, a, x1)));
33684 }
33685
33686 /* Output code to perform a Newton-Raphson approximation of a
33687 single precision floating point [reciprocal] square root. */
33688
33689 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
33690 bool recip)
33691 {
33692 rtx x0, e0, e1, e2, e3, mthree, mhalf;
33693 REAL_VALUE_TYPE r;
33694
33695 x0 = gen_reg_rtx (mode);
33696 e0 = gen_reg_rtx (mode);
33697 e1 = gen_reg_rtx (mode);
33698 e2 = gen_reg_rtx (mode);
33699 e3 = gen_reg_rtx (mode);
33700
33701 real_from_integer (&r, VOIDmode, -3, -1, 0);
33702 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
33703
33704 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
33705 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
33706
33707 if (VECTOR_MODE_P (mode))
33708 {
33709 mthree = ix86_build_const_vector (mode, true, mthree);
33710 mhalf = ix86_build_const_vector (mode, true, mhalf);
33711 }
33712
33713 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
33714 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
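/* (Illustrative derivation: one Newton-Raphson step for 1/sqrt(a) starting
   from x0 = rsqrtss(a) is x1 = x0 * (3 - a * x0 * x0) / 2, which equals
   -0.5 * x0 * (a * x0 * x0 - 3); using a * x0 instead of x0 in the final
   multiply gives the sqrt variant, since a * rsqrt(a) == sqrt(a).)  */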
33715
33716 a = force_reg (mode, a);
33717
33718 /* x0 = rsqrt(a) estimate */
33719 emit_insn (gen_rtx_SET (VOIDmode, x0,
33720 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
33721 UNSPEC_RSQRT)));
33722
33723 /* If a == 0.0, filter out the infinite rsqrt estimate to prevent NaN for sqrt (0.0). */
33724 if (!recip)
33725 {
33726 rtx zero, mask;
33727
33728 zero = gen_reg_rtx (mode);
33729 mask = gen_reg_rtx (mode);
33730
33731 zero = force_reg (mode, CONST0_RTX(mode));
33732 emit_insn (gen_rtx_SET (VOIDmode, mask,
33733 gen_rtx_NE (mode, zero, a)));
33734
33735 emit_insn (gen_rtx_SET (VOIDmode, x0,
33736 gen_rtx_AND (mode, x0, mask)));
33737 }
33738
33739 /* e0 = x0 * a */
33740 emit_insn (gen_rtx_SET (VOIDmode, e0,
33741 gen_rtx_MULT (mode, x0, a)));
33742 /* e1 = e0 * x0 */
33743 emit_insn (gen_rtx_SET (VOIDmode, e1,
33744 gen_rtx_MULT (mode, e0, x0)));
33745
33746 /* e2 = e1 - 3. */
33747 mthree = force_reg (mode, mthree);
33748 emit_insn (gen_rtx_SET (VOIDmode, e2,
33749 gen_rtx_PLUS (mode, e1, mthree)));
33750
33751 mhalf = force_reg (mode, mhalf);
33752 if (recip)
33753 /* e3 = -.5 * x0 */
33754 emit_insn (gen_rtx_SET (VOIDmode, e3,
33755 gen_rtx_MULT (mode, x0, mhalf)));
33756 else
33757 /* e3 = -.5 * e0 */
33758 emit_insn (gen_rtx_SET (VOIDmode, e3,
33759 gen_rtx_MULT (mode, e0, mhalf)));
33760 /* ret = e2 * e3 */
33761 emit_insn (gen_rtx_SET (VOIDmode, res,
33762 gen_rtx_MULT (mode, e2, e3)));
33763 }
33764
33765 #ifdef TARGET_SOLARIS
33766 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
33767
33768 static void
33769 i386_solaris_elf_named_section (const char *name, unsigned int flags,
33770 tree decl)
33771 {
33772 /* With Binutils 2.15, the "@unwind" marker must be specified on
33773 every occurrence of the ".eh_frame" section, not just the first
33774 one. */
33775 if (TARGET_64BIT
33776 && strcmp (name, ".eh_frame") == 0)
33777 {
33778 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
33779 flags & SECTION_WRITE ? "aw" : "a");
33780 return;
33781 }
33782
33783 #ifndef USE_GAS
33784 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
33785 {
33786 solaris_elf_asm_comdat_section (name, flags, decl);
33787 return;
33788 }
33789 #endif
33790
33791 default_elf_asm_named_section (name, flags, decl);
33792 }
33793 #endif /* TARGET_SOLARIS */
33794
33795 /* Return the mangling of TYPE if it is an extended fundamental type. */
33796
33797 static const char *
33798 ix86_mangle_type (const_tree type)
33799 {
33800 type = TYPE_MAIN_VARIANT (type);
33801
33802 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
33803 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
33804 return NULL;
33805
33806 switch (TYPE_MODE (type))
33807 {
33808 case TFmode:
33809 /* __float128 is "g". */
33810 return "g";
33811 case XFmode:
33812 /* "long double" or __float80 is "e". */
33813 return "e";
33814 default:
33815 return NULL;
33816 }
33817 }
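/* Illustrative example (hypothetical declarations, Itanium C++ ABI): a C++
   function "void f (__float128)" mangles as _Z1fg, while
   "void f (long double)" mangles as _Z1fe.  */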
33818
33819 /* For 32-bit code we can save PIC register setup by using
33820 __stack_chk_fail_local hidden function instead of calling
33821 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
33822 register, so it is better to call __stack_chk_fail directly. */
33823
33824 static tree ATTRIBUTE_UNUSED
33825 ix86_stack_protect_fail (void)
33826 {
33827 return TARGET_64BIT
33828 ? default_external_stack_protect_fail ()
33829 : default_hidden_stack_protect_fail ();
33830 }
33831
33832 /* Select a format to encode pointers in exception handling data. CODE
33833 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
33834 true if the symbol may be affected by dynamic relocations.
33835
33836 ??? All x86 object file formats are capable of representing this.
33837 After all, the relocation needed is the same as for the call insn.
33838 Whether or not a particular assembler allows us to enter such, I
33839 guess we'll have to see. */
33840 int
33841 asm_preferred_eh_data_format (int code, int global)
33842 {
33843 if (flag_pic)
33844 {
33845 int type = DW_EH_PE_sdata8;
33846 if (!TARGET_64BIT
33847 || ix86_cmodel == CM_SMALL_PIC
33848 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
33849 type = DW_EH_PE_sdata4;
33850 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
33851 }
33852 if (ix86_cmodel == CM_SMALL
33853 || (ix86_cmodel == CM_MEDIUM && code))
33854 return DW_EH_PE_udata4;
33855 return DW_EH_PE_absptr;
33856 }
33857 \f
33858 /* Expand copysign from SIGN to the positive value ABS_VALUE
33859 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
33860 the sign-bit. */
33861 static void
33862 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
33863 {
33864 enum machine_mode mode = GET_MODE (sign);
33865 rtx sgn = gen_reg_rtx (mode);
33866 if (mask == NULL_RTX)
33867 {
33868 enum machine_mode vmode;
33869
33870 if (mode == SFmode)
33871 vmode = V4SFmode;
33872 else if (mode == DFmode)
33873 vmode = V2DFmode;
33874 else
33875 vmode = mode;
33876
33877 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
33878 if (!VECTOR_MODE_P (mode))
33879 {
33880 /* We need to generate a scalar mode mask in this case. */
33881 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
33882 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
33883 mask = gen_reg_rtx (mode);
33884 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
33885 }
33886 }
33887 else
33888 mask = gen_rtx_NOT (mode, mask);
33889 emit_insn (gen_rtx_SET (VOIDmode, sgn,
33890 gen_rtx_AND (mode, mask, sign)));
33891 emit_insn (gen_rtx_SET (VOIDmode, result,
33892 gen_rtx_IOR (mode, abs_value, sgn)));
33893 }
33894
33895 /* Expand fabs (OP0) and return a new rtx that holds the result. The
33896 mask for masking out the sign-bit is stored in *SMASK, if that is
33897 non-null. */
33898 static rtx
33899 ix86_expand_sse_fabs (rtx op0, rtx *smask)
33900 {
33901 enum machine_mode vmode, mode = GET_MODE (op0);
33902 rtx xa, mask;
33903
33904 xa = gen_reg_rtx (mode);
33905 if (mode == SFmode)
33906 vmode = V4SFmode;
33907 else if (mode == DFmode)
33908 vmode = V2DFmode;
33909 else
33910 vmode = mode;
33911 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
33912 if (!VECTOR_MODE_P (mode))
33913 {
33914 /* We need to generate a scalar mode mask in this case. */
33915 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
33916 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
33917 mask = gen_reg_rtx (mode);
33918 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
33919 }
33920 emit_insn (gen_rtx_SET (VOIDmode, xa,
33921 gen_rtx_AND (mode, op0, mask)));
33922
33923 if (smask)
33924 *smask = mask;
33925
33926 return xa;
33927 }
33928
33929 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
33930 swapping the operands if SWAP_OPERANDS is true. The expanded
33931 code is a forward jump to a newly created label in case the
33932 comparison is true. The generated label rtx is returned. */
33933 static rtx
33934 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
33935 bool swap_operands)
33936 {
33937 rtx label, tmp;
33938
33939 if (swap_operands)
33940 {
33941 tmp = op0;
33942 op0 = op1;
33943 op1 = tmp;
33944 }
33945
33946 label = gen_label_rtx ();
33947 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
33948 emit_insn (gen_rtx_SET (VOIDmode, tmp,
33949 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
33950 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
33951 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
33952 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
33953 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
33954 JUMP_LABEL (tmp) = label;
33955
33956 return label;
33957 }
33958
33959 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
33960 using comparison code CODE. Operands are swapped for the comparison if
33961 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
33962 static rtx
33963 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
33964 bool swap_operands)
33965 {
33966 rtx (*insn)(rtx, rtx, rtx, rtx);
33967 enum machine_mode mode = GET_MODE (op0);
33968 rtx mask = gen_reg_rtx (mode);
33969
33970 if (swap_operands)
33971 {
33972 rtx tmp = op0;
33973 op0 = op1;
33974 op1 = tmp;
33975 }
33976
33977 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
33978
33979 emit_insn (insn (mask, op0, op1,
33980 gen_rtx_fmt_ee (code, mode, op0, op1)));
33981 return mask;
33982 }
33983
33984 /* Generate and return a rtx of mode MODE for 2**n where n is the number
33985 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
33986 static rtx
33987 ix86_gen_TWO52 (enum machine_mode mode)
33988 {
33989 REAL_VALUE_TYPE TWO52r;
33990 rtx TWO52;
33991
33992 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
33993 TWO52 = const_double_from_real_value (TWO52r, mode);
33994 TWO52 = force_reg (mode, TWO52);
33995
33996 return TWO52;
33997 }
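/* Illustrative note: adding and then subtracting 2**52 (DFmode; 2**23 for
   SFmode) rounds any nonnegative value below 2**52 to an integer in the
   current rounding mode, because after the addition no fraction bits fit
   in the mantissa.  E.g. with round-to-nearest, 3.3 + 2**52 rounds to
   2**52 + 3.0, and subtracting 2**52 again leaves 3.0.  */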
33998
33999 /* Expand SSE sequence for computing lround from OP1 storing
34000 into OP0. */
34001 void
34002 ix86_expand_lround (rtx op0, rtx op1)
34003 {
34004 /* C code for the stuff we're doing below:
34005 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
34006 return (long)tmp;
34007 */
34008 enum machine_mode mode = GET_MODE (op1);
34009 const struct real_format *fmt;
34010 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
34011 rtx adj;
34012
34013 /* load nextafter (0.5, 0.0) */
34014 fmt = REAL_MODE_FORMAT (mode);
34015 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
34016 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
34017
34018 /* adj = copysign (0.5, op1) */
34019 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
34020 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
34021
34022 /* adj = op1 + adj */
34023 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
34024
34025 /* op0 = (imode)adj */
34026 expand_fix (op0, adj, 0);
34027 }
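/* Illustrative note: nextafter (0.5, 0.0) is used rather than 0.5 itself
   because for the largest double below 0.5 (0.49999999999999994) the sum
   x + 0.5 would round up to 1.0 and the truncation would return 1 instead
   of 0; with the slightly smaller constant the sum stays below 1.0.  */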
34028
34029 /* Expand SSE2 sequence for computing lfloor or lceil from OP1 storing
34030 into OP0. */
34031 void
34032 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
34033 {
34034 /* C code for the stuff we're doing below (for do_floor):
34035 xi = (long)op1;
34036 xi -= (double)xi > op1 ? 1 : 0;
34037 return xi;
34038 */
34039 enum machine_mode fmode = GET_MODE (op1);
34040 enum machine_mode imode = GET_MODE (op0);
34041 rtx ireg, freg, label, tmp;
34042
34043 /* reg = (long)op1 */
34044 ireg = gen_reg_rtx (imode);
34045 expand_fix (ireg, op1, 0);
34046
34047 /* freg = (double)reg */
34048 freg = gen_reg_rtx (fmode);
34049 expand_float (freg, ireg, 0);
34050
34051 /* ireg = (freg > op1) ? ireg - 1 : ireg */
34052 label = ix86_expand_sse_compare_and_jump (UNLE,
34053 freg, op1, !do_floor);
34054 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
34055 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
34056 emit_move_insn (ireg, tmp);
34057
34058 emit_label (label);
34059 LABEL_NUSES (label) = 1;
34060
34061 emit_move_insn (op0, ireg);
34062 }
34063
34064 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
34065 result in OPERAND0. */
34066 void
34067 ix86_expand_rint (rtx operand0, rtx operand1)
34068 {
34069 /* C code for the stuff we're doing below:
34070 xa = fabs (operand1);
34071 if (!isless (xa, 2**52))
34072 return operand1;
34073 xa = xa + 2**52 - 2**52;
34074 return copysign (xa, operand1);
34075 */
34076 enum machine_mode mode = GET_MODE (operand0);
34077 rtx res, xa, label, TWO52, mask;
34078
34079 res = gen_reg_rtx (mode);
34080 emit_move_insn (res, operand1);
34081
34082 /* xa = abs (operand1) */
34083 xa = ix86_expand_sse_fabs (res, &mask);
34084
34085 /* if (!isless (xa, TWO52)) goto label; */
34086 TWO52 = ix86_gen_TWO52 (mode);
34087 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34088
34089 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
34090 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
34091
34092 ix86_sse_copysign_to_positive (res, xa, res, mask);
34093
34094 emit_label (label);
34095 LABEL_NUSES (label) = 1;
34096
34097 emit_move_insn (operand0, res);
34098 }
34099
34100 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
34101 into OPERAND0. */
34102 void
34103 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
34104 {
34105 /* C code for the stuff we expand below.
34106 double xa = fabs (x), x2;
34107 if (!isless (xa, TWO52))
34108 return x;
34109 xa = xa + TWO52 - TWO52;
34110 x2 = copysign (xa, x);
34111 Compensate. Floor:
34112 if (x2 > x)
34113 x2 -= 1;
34114 Compensate. Ceil:
34115 if (x2 < x)
34116 x2 -= -1;
34117 return x2;
34118 */
34119 enum machine_mode mode = GET_MODE (operand0);
34120 rtx xa, TWO52, tmp, label, one, res, mask;
34121
34122 TWO52 = ix86_gen_TWO52 (mode);
34123
34124 /* Temporary for holding the result, initialized to the input
34125 operand to ease control flow. */
34126 res = gen_reg_rtx (mode);
34127 emit_move_insn (res, operand1);
34128
34129 /* xa = abs (operand1) */
34130 xa = ix86_expand_sse_fabs (res, &mask);
34131
34132 /* if (!isless (xa, TWO52)) goto label; */
34133 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34134
34135 /* xa = xa + TWO52 - TWO52; */
34136 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
34137 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
34138
34139 /* xa = copysign (xa, operand1) */
34140 ix86_sse_copysign_to_positive (xa, xa, res, mask);
34141
34142 /* generate 1.0 or -1.0 */
34143 one = force_reg (mode,
34144 const_double_from_real_value (do_floor
34145 ? dconst1 : dconstm1, mode));
34146
34147 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
34148 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
34149 emit_insn (gen_rtx_SET (VOIDmode, tmp,
34150 gen_rtx_AND (mode, one, tmp)));
34151 /* We always need to subtract here to preserve signed zero. */
34152 tmp = expand_simple_binop (mode, MINUS,
34153 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
34154 emit_move_insn (res, tmp);
34155
34156 emit_label (label);
34157 LABEL_NUSES (label) = 1;
34158
34159 emit_move_insn (operand0, res);
34160 }
34161
34162 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
34163 into OPERAND0. */
34164 void
34165 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
34166 {
34167 /* C code for the stuff we expand below.
34168 double xa = fabs (x), x2;
34169 if (!isless (xa, TWO52))
34170 return x;
34171 x2 = (double)(long)x;
34172 Compensate. Floor:
34173 if (x2 > x)
34174 x2 -= 1;
34175 Compensate. Ceil:
34176 if (x2 < x)
34177 x2 += 1;
34178 if (HONOR_SIGNED_ZEROS (mode))
34179 return copysign (x2, x);
34180 return x2;
34181 */
34182 enum machine_mode mode = GET_MODE (operand0);
34183 rtx xa, xi, TWO52, tmp, label, one, res, mask;
34184
34185 TWO52 = ix86_gen_TWO52 (mode);
34186
34187 /* Temporary for holding the result, initialized to the input
34188 operand to ease control flow. */
34189 res = gen_reg_rtx (mode);
34190 emit_move_insn (res, operand1);
34191
34192 /* xa = abs (operand1) */
34193 xa = ix86_expand_sse_fabs (res, &mask);
34194
34195 /* if (!isless (xa, TWO52)) goto label; */
34196 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34197
34198 /* xa = (double)(long)x */
34199 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
34200 expand_fix (xi, res, 0);
34201 expand_float (xa, xi, 0);
34202
34203 /* generate 1.0 */
34204 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
34205
34206 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
34207 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
34208 emit_insn (gen_rtx_SET (VOIDmode, tmp,
34209 gen_rtx_AND (mode, one, tmp)));
34210 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
34211 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
34212 emit_move_insn (res, tmp);
34213
34214 if (HONOR_SIGNED_ZEROS (mode))
34215 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
34216
34217 emit_label (label);
34218 LABEL_NUSES (label) = 1;
34219
34220 emit_move_insn (operand0, res);
34221 }
34222
34223 /* Expand SSE sequence for computing round from OPERAND1 storing
34224 into OPERAND0. Sequence that works without relying on DImode truncation
34225 via cvttsd2siq, which is only available on 64-bit targets. */
34226 void
34227 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
34228 {
34229 /* C code for the stuff we expand below.
34230 double xa = fabs (x), xa2, x2;
34231 if (!isless (xa, TWO52))
34232 return x;
34233 Using the absolute value and copying back sign makes
34234 -0.0 -> -0.0 correct.
34235 xa2 = xa + TWO52 - TWO52;
34236 Compensate.
34237 dxa = xa2 - xa;
34238 if (dxa <= -0.5)
34239 xa2 += 1;
34240 else if (dxa > 0.5)
34241 xa2 -= 1;
34242 x2 = copysign (xa2, x);
34243 return x2;
34244 */
34245 enum machine_mode mode = GET_MODE (operand0);
34246 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
34247
34248 TWO52 = ix86_gen_TWO52 (mode);
34249
34250 /* Temporary for holding the result, initialized to the input
34251 operand to ease control flow. */
34252 res = gen_reg_rtx (mode);
34253 emit_move_insn (res, operand1);
34254
34255 /* xa = abs (operand1) */
34256 xa = ix86_expand_sse_fabs (res, &mask);
34257
34258 /* if (!isless (xa, TWO52)) goto label; */
34259 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34260
34261 /* xa2 = xa + TWO52 - TWO52; */
34262 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
34263 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
34264
34265 /* dxa = xa2 - xa; */
34266 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
34267
34268 /* generate 0.5, 1.0 and -0.5 */
34269 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
34270 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
34271 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
34272 0, OPTAB_DIRECT);
34273
34274 /* Compensate. */
34275 tmp = gen_reg_rtx (mode);
34276 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
34277 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
34278 emit_insn (gen_rtx_SET (VOIDmode, tmp,
34279 gen_rtx_AND (mode, one, tmp)));
34280 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
34281 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
34282 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
34283 emit_insn (gen_rtx_SET (VOIDmode, tmp,
34284 gen_rtx_AND (mode, one, tmp)));
34285 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
34286
34287 /* res = copysign (xa2, operand1) */
34288 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
34289
34290 emit_label (label);
34291 LABEL_NUSES (label) = 1;
34292
34293 emit_move_insn (operand0, res);
34294 }
34295
34296 /* Expand SSE sequence for computing trunc from OPERAND1 storing
34297 into OPERAND0. */
34298 void
34299 ix86_expand_trunc (rtx operand0, rtx operand1)
34300 {
34301 /* C code for SSE variant we expand below.
34302 double xa = fabs (x), x2;
34303 if (!isless (xa, TWO52))
34304 return x;
34305 x2 = (double)(long)x;
34306 if (HONOR_SIGNED_ZEROS (mode))
34307 return copysign (x2, x);
34308 return x2;
34309 */
34310 enum machine_mode mode = GET_MODE (operand0);
34311 rtx xa, xi, TWO52, label, res, mask;
34312
34313 TWO52 = ix86_gen_TWO52 (mode);
34314
34315 /* Temporary for holding the result, initialized to the input
34316 operand to ease control flow. */
34317 res = gen_reg_rtx (mode);
34318 emit_move_insn (res, operand1);
34319
34320 /* xa = abs (operand1) */
34321 xa = ix86_expand_sse_fabs (res, &mask);
34322
34323 /* if (!isless (xa, TWO52)) goto label; */
34324 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34325
34326 /* x = (double)(long)x */
34327 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
34328 expand_fix (xi, res, 0);
34329 expand_float (res, xi, 0);
34330
34331 if (HONOR_SIGNED_ZEROS (mode))
34332 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
34333
34334 emit_label (label);
34335 LABEL_NUSES (label) = 1;
34336
34337 emit_move_insn (operand0, res);
34338 }
34339
34340 /* Expand SSE sequence for computing trunc from OPERAND1 storing
34341 into OPERAND0. */
34342 void
34343 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
34344 {
34345 enum machine_mode mode = GET_MODE (operand0);
34346 rtx xa, mask, TWO52, label, one, res, smask, tmp;
34347
34348 /* C code for SSE variant we expand below.
34349 double xa = fabs (x), xa2, x2;
34350 if (!isless (xa, TWO52))
34351 return x;
34352 xa2 = xa + TWO52 - TWO52;
34353 Compensate:
34354 if (xa2 > xa)
34355 xa2 -= 1.0;
34356 x2 = copysign (xa2, x);
34357 return x2;
34358 */
34359
34360 TWO52 = ix86_gen_TWO52 (mode);
34361
34362 /* Temporary for holding the result, initialized to the input
34363 operand to ease control flow. */
34364 res = gen_reg_rtx (mode);
34365 emit_move_insn (res, operand1);
34366
34367 /* xa = abs (operand1) */
34368 xa = ix86_expand_sse_fabs (res, &smask);
34369
34370 /* if (!isless (xa, TWO52)) goto label; */
34371 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34372
34373 /* res = xa + TWO52 - TWO52; */
34374 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
34375 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
34376 emit_move_insn (res, tmp);
34377
34378 /* generate 1.0 */
34379 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
34380
34381 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
34382 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
34383 emit_insn (gen_rtx_SET (VOIDmode, mask,
34384 gen_rtx_AND (mode, mask, one)));
34385 tmp = expand_simple_binop (mode, MINUS,
34386 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
34387 emit_move_insn (res, tmp);
34388
34389 /* res = copysign (res, operand1) */
34390 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
34391
34392 emit_label (label);
34393 LABEL_NUSES (label) = 1;
34394
34395 emit_move_insn (operand0, res);
34396 }
34397
34398 /* Expand SSE sequence for computing round from OPERAND1 storing
34399 into OPERAND0. */
34400 void
34401 ix86_expand_round (rtx operand0, rtx operand1)
34402 {
34403 /* C code for the stuff we're doing below:
34404 double xa = fabs (x);
34405 if (!isless (xa, TWO52))
34406 return x;
34407 xa = (double)(long)(xa + nextafter (0.5, 0.0));
34408 return copysign (xa, x);
34409 */
34410 enum machine_mode mode = GET_MODE (operand0);
34411 rtx res, TWO52, xa, label, xi, half, mask;
34412 const struct real_format *fmt;
34413 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
34414
34415 /* Temporary for holding the result, initialized to the input
34416 operand to ease control flow. */
34417 res = gen_reg_rtx (mode);
34418 emit_move_insn (res, operand1);
34419
34420 TWO52 = ix86_gen_TWO52 (mode);
34421 xa = ix86_expand_sse_fabs (res, &mask);
34422 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34423
34424 /* load nextafter (0.5, 0.0) */
34425 fmt = REAL_MODE_FORMAT (mode);
34426 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
34427 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
34428
34429 /* xa = xa + 0.5 */
34430 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
34431 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
34432
34433 /* xa = (double)(int64_t)xa */
34434 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
34435 expand_fix (xi, xa, 0);
34436 expand_float (xa, xi, 0);
34437
34438 /* res = copysign (xa, operand1) */
34439 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
34440
34441 emit_label (label);
34442 LABEL_NUSES (label) = 1;
34443
34444 emit_move_insn (operand0, res);
34445 }
34446
34447 /* Expand SSE sequence for computing round
34448 from OP1 storing into OP0 using sse4 round insn. */
34449 void
34450 ix86_expand_round_sse4 (rtx op0, rtx op1)
34451 {
34452 enum machine_mode mode = GET_MODE (op0);
34453 rtx e1, e2, res, half;
34454 const struct real_format *fmt;
34455 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
34456 rtx (*gen_copysign) (rtx, rtx, rtx);
34457 rtx (*gen_round) (rtx, rtx, rtx);
34458
34459 switch (mode)
34460 {
34461 case SFmode:
34462 gen_copysign = gen_copysignsf3;
34463 gen_round = gen_sse4_1_roundsf2;
34464 break;
34465 case DFmode:
34466 gen_copysign = gen_copysigndf3;
34467 gen_round = gen_sse4_1_rounddf2;
34468 break;
34469 default:
34470 gcc_unreachable ();
34471 }
34472
34473 /* round (a) = trunc (a + copysign (0.5, a)) */
34474
34475 /* load nextafter (0.5, 0.0) */
34476 fmt = REAL_MODE_FORMAT (mode);
34477 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
34478 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
34479 half = const_double_from_real_value (pred_half, mode);
34480
34481 /* e1 = copysign (0.5, op1) */
34482 e1 = gen_reg_rtx (mode);
34483 emit_insn (gen_copysign (e1, half, op1));
34484
34485 /* e2 = op1 + e1 */
34486 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
34487
34488 /* res = trunc (e2) */
34489 res = gen_reg_rtx (mode);
34490 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
34491
34492 emit_move_insn (op0, res);
34493 }
34494 \f
34495
34496 /* Table of valid machine attributes. */
34497 static const struct attribute_spec ix86_attribute_table[] =
34498 {
34499 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
34500 affects_type_identity } */
34501 /* Stdcall attribute says callee is responsible for popping arguments
34502 if they are not variable. */
34503 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
34504 true },
34505 /* Fastcall attribute says callee is responsible for popping arguments
34506 if they are not variable. */
34507 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
34508 true },
34509 /* Thiscall attribute says callee is responsible for popping arguments
34510 if they are not variable. */
34511 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
34512 true },
34513 /* Cdecl attribute says the callee is a normal C declaration */
34514 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
34515 true },
34516 /* Regparm attribute specifies how many integer arguments are to be
34517 passed in registers. */
34518 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
34519 true },
34520 /* Sseregparm attribute says we are using x86_64 calling conventions
34521 for FP arguments. */
34522 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
34523 true },
34524 /* force_align_arg_pointer says this function realigns the stack at entry. */
34525 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
34526 false, true, true, ix86_handle_cconv_attribute, false },
34527 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
34528 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
34529 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
34530 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
34531 false },
34532 #endif
34533 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
34534 false },
34535 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
34536 false },
34537 #ifdef SUBTARGET_ATTRIBUTE_TABLE
34538 SUBTARGET_ATTRIBUTE_TABLE,
34539 #endif
34540 /* ms_abi and sysv_abi calling convention function attributes. */
34541 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
34542 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
34543 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
34544 false },
34545 { "callee_pop_aggregate_return", 1, 1, false, true, true,
34546 ix86_handle_callee_pop_aggregate_return, true },
34547 /* End element. */
34548 { NULL, 0, 0, false, false, false, NULL, false }
34549 };
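/* Illustrative, hypothetical user-level uses of some of the attributes in
   the table above:

     int  __attribute__ ((fastcall)) f (int a, int b);
     int  __attribute__ ((regparm (3))) g (int a, int b, int c);
     void __attribute__ ((ms_abi)) h (void);
     struct s { char c; int i; } __attribute__ ((ms_struct));
*/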
34550
34551 /* Implement targetm.vectorize.builtin_vectorization_cost. */
34552 static int
34553 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
34554 tree vectype ATTRIBUTE_UNUSED,
34555 int misalign ATTRIBUTE_UNUSED)
34556 {
34557 switch (type_of_cost)
34558 {
34559 case scalar_stmt:
34560 return ix86_cost->scalar_stmt_cost;
34561
34562 case scalar_load:
34563 return ix86_cost->scalar_load_cost;
34564
34565 case scalar_store:
34566 return ix86_cost->scalar_store_cost;
34567
34568 case vector_stmt:
34569 return ix86_cost->vec_stmt_cost;
34570
34571 case vector_load:
34572 return ix86_cost->vec_align_load_cost;
34573
34574 case vector_store:
34575 return ix86_cost->vec_store_cost;
34576
34577 case vec_to_scalar:
34578 return ix86_cost->vec_to_scalar_cost;
34579
34580 case scalar_to_vec:
34581 return ix86_cost->scalar_to_vec_cost;
34582
34583 case unaligned_load:
34584 case unaligned_store:
34585 return ix86_cost->vec_unalign_load_cost;
34586
34587 case cond_branch_taken:
34588 return ix86_cost->cond_taken_branch_cost;
34589
34590 case cond_branch_not_taken:
34591 return ix86_cost->cond_not_taken_branch_cost;
34592
34593 case vec_perm:
34594 return 1;
34595
34596 default:
34597 gcc_unreachable ();
34598 }
34599 }
34600
34601
34602 /* Return a vector mode with twice as many elements as VMODE. */
34603 /* ??? Consider moving this to a table generated by genmodes.c. */
34604
34605 static enum machine_mode
34606 doublesize_vector_mode (enum machine_mode vmode)
34607 {
34608 switch (vmode)
34609 {
34610 case V2SFmode: return V4SFmode;
34611 case V1DImode: return V2DImode;
34612 case V2SImode: return V4SImode;
34613 case V4HImode: return V8HImode;
34614 case V8QImode: return V16QImode;
34615
34616 case V2DFmode: return V4DFmode;
34617 case V4SFmode: return V8SFmode;
34618 case V2DImode: return V4DImode;
34619 case V4SImode: return V8SImode;
34620 case V8HImode: return V16HImode;
34621 case V16QImode: return V32QImode;
34622
34623 case V4DFmode: return V8DFmode;
34624 case V8SFmode: return V16SFmode;
34625 case V4DImode: return V8DImode;
34626 case V8SImode: return V16SImode;
34627 case V16HImode: return V32HImode;
34628 case V32QImode: return V64QImode;
34629
34630 default:
34631 gcc_unreachable ();
34632 }
34633 }
34634
34635 /* Construct (set target (vec_select op0 (parallel perm))) and
34636 return true if that's a valid instruction in the active ISA. */
34637
34638 static bool
34639 expand_vselect (rtx target, rtx op0, const unsigned char *perm, unsigned nelt)
34640 {
34641 rtx rperm[MAX_VECT_LEN], x;
34642 unsigned i;
34643
34644 for (i = 0; i < nelt; ++i)
34645 rperm[i] = GEN_INT (perm[i]);
34646
34647 x = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (nelt, rperm));
34648 x = gen_rtx_VEC_SELECT (GET_MODE (target), op0, x);
34649 x = gen_rtx_SET (VOIDmode, target, x);
34650
34651 x = emit_insn (x);
34652 if (recog_memoized (x) < 0)
34653 {
34654 remove_insn (x);
34655 return false;
34656 }
34657 return true;
34658 }
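/* Illustrative example: a permutation reversing a V4SFmode vector would be
   tried by the function above as
     (set target (vec_select:V4SF op0 (parallel [3 2 1 0])))
   and is kept only if recog finds an enabled pattern for it; otherwise the
   insn is removed again and false is returned.  */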
34659
34660 /* Similar, but generate a vec_concat from op0 and op1 as well. */
34661
34662 static bool
34663 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
34664 const unsigned char *perm, unsigned nelt)
34665 {
34666 enum machine_mode v2mode;
34667 rtx x;
34668
34669 v2mode = doublesize_vector_mode (GET_MODE (op0));
34670 x = gen_rtx_VEC_CONCAT (v2mode, op0, op1);
34671 return expand_vselect (target, x, perm, nelt);
34672 }
34673
34674 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
34675 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
34676
34677 static bool
34678 expand_vec_perm_blend (struct expand_vec_perm_d *d)
34679 {
34680 enum machine_mode vmode = d->vmode;
34681 unsigned i, mask, nelt = d->nelt;
34682 rtx target, op0, op1, x;
34683 rtx rperm[32], vperm;
34684
34685 if (d->op0 == d->op1)
34686 return false;
34687 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
34688 ;
34689 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
34690 ;
34691 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
34692 ;
34693 else
34694 return false;
34695
34696 /* This is a blend, not a permute. Elements must stay in their
34697 respective lanes. */
34698 for (i = 0; i < nelt; ++i)
34699 {
34700 unsigned e = d->perm[i];
34701 if (!(e == i || e == i + nelt))
34702 return false;
34703 }
34704
34705 if (d->testing_p)
34706 return true;
34707
34708 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
34709 decision should be extracted elsewhere, so that we only try that
34710 sequence once all budget==3 options have been tried. */
34711 target = d->target;
34712 op0 = d->op0;
34713 op1 = d->op1;
34714 mask = 0;
34715
34716 switch (vmode)
34717 {
34718 case V4DFmode:
34719 case V8SFmode:
34720 case V2DFmode:
34721 case V4SFmode:
34722 case V8HImode:
34723 case V8SImode:
34724 for (i = 0; i < nelt; ++i)
34725 mask |= (d->perm[i] >= nelt) << i;
34726 break;
34727
34728 case V2DImode:
34729 for (i = 0; i < 2; ++i)
34730 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
34731 vmode = V8HImode;
34732 goto do_subreg;
34733
34734 case V4SImode:
34735 for (i = 0; i < 4; ++i)
34736 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
34737 vmode = V8HImode;
34738 goto do_subreg;
34739
34740 case V16QImode:
34741 /* See if bytes move in pairs so we can use pblendw with
34742 an immediate argument, rather than pblendvb with a vector
34743 argument. */
34744 for (i = 0; i < 16; i += 2)
34745 if (d->perm[i] + 1 != d->perm[i + 1])
34746 {
34747 use_pblendvb:
34748 for (i = 0; i < nelt; ++i)
34749 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
34750
34751 finish_pblendvb:
34752 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
34753 vperm = force_reg (vmode, vperm);
34754
34755 if (GET_MODE_SIZE (vmode) == 16)
34756 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
34757 else
34758 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
34759 return true;
34760 }
34761
34762 for (i = 0; i < 8; ++i)
34763 mask |= (d->perm[i * 2] >= 16) << i;
34764 vmode = V8HImode;
34765 /* FALLTHRU */
34766
34767 do_subreg:
34768 target = gen_lowpart (vmode, target);
34769 op0 = gen_lowpart (vmode, op0);
34770 op1 = gen_lowpart (vmode, op1);
34771 break;
34772
34773 case V32QImode:
34774 /* See if bytes move in pairs. If not, vpblendvb must be used. */
34775 for (i = 0; i < 32; i += 2)
34776 if (d->perm[i] + 1 != d->perm[i + 1])
34777 goto use_pblendvb;
34778 /* See if bytes move in quadruplets. If yes, vpblendd
34779 with immediate can be used. */
34780 for (i = 0; i < 32; i += 4)
34781 if (d->perm[i] + 2 != d->perm[i + 2])
34782 break;
34783 if (i < 32)
34784 {
34785 /* See if bytes move the same in both lanes. If yes,
34786 vpblendw with immediate can be used. */
34787 for (i = 0; i < 16; i += 2)
34788 if (d->perm[i] + 16 != d->perm[i + 16])
34789 goto use_pblendvb;
34790
34791 /* Use vpblendw. */
34792 for (i = 0; i < 16; ++i)
34793 mask |= (d->perm[i * 2] >= 32) << i;
34794 vmode = V16HImode;
34795 goto do_subreg;
34796 }
34797
34798 /* Use vpblendd. */
34799 for (i = 0; i < 8; ++i)
34800 mask |= (d->perm[i * 4] >= 32) << i;
34801 vmode = V8SImode;
34802 goto do_subreg;
34803
34804 case V16HImode:
34805 /* See if words move in pairs. If yes, vpblendd can be used. */
34806 for (i = 0; i < 16; i += 2)
34807 if (d->perm[i] + 1 != d->perm[i + 1])
34808 break;
34809 if (i < 16)
34810 {
34811 /* See if words move the same in both lanes. If not,
34812 vpblendvb must be used. */
34813 for (i = 0; i < 8; i++)
34814 if (d->perm[i] + 8 != d->perm[i + 8])
34815 {
34816 /* Use vpblendvb. */
34817 for (i = 0; i < 32; ++i)
34818 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
34819
34820 vmode = V32QImode;
34821 nelt = 32;
34822 target = gen_lowpart (vmode, target);
34823 op0 = gen_lowpart (vmode, op0);
34824 op1 = gen_lowpart (vmode, op1);
34825 goto finish_pblendvb;
34826 }
34827
34828 /* Use vpblendw. */
34829 for (i = 0; i < 16; ++i)
34830 mask |= (d->perm[i] >= 16) << i;
34831 break;
34832 }
34833
34834 /* Use vpblendd. */
34835 for (i = 0; i < 8; ++i)
34836 mask |= (d->perm[i * 2] >= 16) << i;
34837 vmode = V8SImode;
34838 goto do_subreg;
34839
34840 case V4DImode:
34841 /* Use vpblendd. */
34842 for (i = 0; i < 4; ++i)
34843 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
34844 vmode = V8SImode;
34845 goto do_subreg;
34846
34847 default:
34848 gcc_unreachable ();
34849 }
34850
34851 /* This matches five different patterns with the different modes. */
34852 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
34853 x = gen_rtx_SET (VOIDmode, target, x);
34854 emit_insn (x);
34855
34856 return true;
34857 }
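/* Illustrative example: for a V4SFmode blend with d->perm = { 0, 5, 2, 7 }
   (elements 1 and 3 taken from op1), the loop above builds mask = 0b1010,
   and the vec_merge emitted at the end corresponds roughly to a blendps
   with immediate 0x0a.  */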
34858
34859 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
34860 in terms of the variable form of vpermilps.
34861
34862 Note that we will have already failed the immediate input vpermilps,
34863 which requires that the high and low part shuffle be identical; the
34864 variable form doesn't require that. */
34865
34866 static bool
34867 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
34868 {
34869 rtx rperm[8], vperm;
34870 unsigned i;
34871
34872 if (!TARGET_AVX || d->vmode != V8SFmode || d->op0 != d->op1)
34873 return false;
34874
34875 /* We can only permute within the 128-bit lane. */
34876 for (i = 0; i < 8; ++i)
34877 {
34878 unsigned e = d->perm[i];
34879 if (i < 4 ? e >= 4 : e < 4)
34880 return false;
34881 }
34882
34883 if (d->testing_p)
34884 return true;
34885
34886 for (i = 0; i < 8; ++i)
34887 {
34888 unsigned e = d->perm[i];
34889
34890 /* Within each 128-bit lane, the elements of op0 are numbered
34891 from 0 and the elements of op1 are numbered from 4. */
34892 if (e >= 8 + 4)
34893 e -= 8;
34894 else if (e >= 4)
34895 e -= 4;
34896
34897 rperm[i] = GEN_INT (e);
34898 }
34899
34900 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
34901 vperm = force_reg (V8SImode, vperm);
34902 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
34903
34904 return true;
34905 }
34906
34907 /* Return true if permutation D can be performed as VMODE permutation
34908 instead. */
34909
34910 static bool
34911 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
34912 {
34913 unsigned int i, j, chunk;
34914
34915 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
34916 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
34917 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
34918 return false;
34919
34920 if (GET_MODE_NUNITS (vmode) >= d->nelt)
34921 return true;
34922
34923 chunk = d->nelt / GET_MODE_NUNITS (vmode);
34924 for (i = 0; i < d->nelt; i += chunk)
34925 if (d->perm[i] & (chunk - 1))
34926 return false;
34927 else
34928 for (j = 1; j < chunk; ++j)
34929 if (d->perm[i] + j != d->perm[i + j])
34930 return false;
34931
34932 return true;
34933 }
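/* Illustrative example: a V16QImode permutation whose indices come in
   aligned groups of four consecutive bytes, e.g.
     { 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 },
   passes this test for V4SImode and can therefore be carried out as a
   four-element V4SImode permutation.  */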
34934
34935 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
34936 in terms of pshufb, vpperm, vpermq, vpermd or vperm2i128. */
34937
34938 static bool
34939 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
34940 {
34941 unsigned i, nelt, eltsz, mask;
34942 unsigned char perm[32];
34943 enum machine_mode vmode = V16QImode;
34944 rtx rperm[32], vperm, target, op0, op1;
34945
34946 nelt = d->nelt;
34947
34948 if (d->op0 != d->op1)
34949 {
34950 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
34951 {
34952 if (TARGET_AVX2
34953 && valid_perm_using_mode_p (V2TImode, d))
34954 {
34955 if (d->testing_p)
34956 return true;
34957
34958 /* Use vperm2i128 insn. The pattern uses
34959 V4DImode instead of V2TImode. */
34960 target = gen_lowpart (V4DImode, d->target);
34961 op0 = gen_lowpart (V4DImode, d->op0);
34962 op1 = gen_lowpart (V4DImode, d->op1);
34963 rperm[0]
34964 = GEN_INT ((d->perm[0] / (nelt / 2))
34965 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
34966 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
34967 return true;
34968 }
34969 return false;
34970 }
34971 }
34972 else
34973 {
34974 if (GET_MODE_SIZE (d->vmode) == 16)
34975 {
34976 if (!TARGET_SSSE3)
34977 return false;
34978 }
34979 else if (GET_MODE_SIZE (d->vmode) == 32)
34980 {
34981 if (!TARGET_AVX2)
34982 return false;
34983
34984 /* V4DImode should be already handled through
34985 expand_vselect by vpermq instruction. */
34986 gcc_assert (d->vmode != V4DImode);
34987
34988 vmode = V32QImode;
34989 if (d->vmode == V8SImode
34990 || d->vmode == V16HImode
34991 || d->vmode == V32QImode)
34992 {
34993 /* First see if vpermq can be used for
34994 V8SImode/V16HImode/V32QImode. */
34995 if (valid_perm_using_mode_p (V4DImode, d))
34996 {
34997 for (i = 0; i < 4; i++)
34998 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
34999 if (d->testing_p)
35000 return true;
35001 return expand_vselect (gen_lowpart (V4DImode, d->target),
35002 gen_lowpart (V4DImode, d->op0),
35003 perm, 4);
35004 }
35005
35006 /* Next see if vpermd can be used. */
35007 if (valid_perm_using_mode_p (V8SImode, d))
35008 vmode = V8SImode;
35009 }
35010
35011 if (vmode == V32QImode)
35012 {
35013 /* vpshufb only works within 128-bit lanes, it is not
35014 possible to shuffle bytes across lanes. */
35015 for (i = 0; i < nelt; ++i)
35016 if ((d->perm[i] ^ i) & (nelt / 2))
35017 return false;
35018 }
35019 }
35020 else
35021 return false;
35022 }
35023
35024 if (d->testing_p)
35025 return true;
35026
35027 if (vmode == V8SImode)
35028 for (i = 0; i < 8; ++i)
35029 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
35030 else
35031 {
35032 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
35033 if (d->op0 != d->op1)
35034 mask = 2 * nelt - 1;
35035 else if (vmode == V16QImode)
35036 mask = nelt - 1;
35037 else
35038 mask = nelt / 2 - 1;
35039
35040 for (i = 0; i < nelt; ++i)
35041 {
35042 unsigned j, e = d->perm[i] & mask;
35043 for (j = 0; j < eltsz; ++j)
35044 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
35045 }
35046 }
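  /* (Illustrative example: for a single-operand V8HImode permutation,
     eltsz == 2, so a halfword index e expands to the byte selectors
     2*e and 2*e + 1 in the pshufb control vector assembled from RPERM
     just below.)  */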
35047
35048 vperm = gen_rtx_CONST_VECTOR (vmode,
35049 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
35050 vperm = force_reg (vmode, vperm);
35051
35052 target = gen_lowpart (vmode, d->target);
35053 op0 = gen_lowpart (vmode, d->op0);
35054 if (d->op0 == d->op1)
35055 {
35056 if (vmode == V16QImode)
35057 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
35058 else if (vmode == V32QImode)
35059 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
35060 else
35061 emit_insn (gen_avx2_permvarv8si (target, vperm, op0));
35062 }
35063 else
35064 {
35065 op1 = gen_lowpart (vmode, d->op1);
35066 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
35067 }
35068
35069 return true;
35070 }
35071
35072 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
35073 in a single instruction. */
35074
35075 static bool
35076 expand_vec_perm_1 (struct expand_vec_perm_d *d)
35077 {
35078 unsigned i, nelt = d->nelt;
35079 unsigned char perm2[MAX_VECT_LEN];
35080
35081 /* Check plain VEC_SELECT first, because AVX has instructions that could
35082 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
35083 input where SEL+CONCAT may not. */
35084 if (d->op0 == d->op1)
35085 {
35086 int mask = nelt - 1;
35087 bool identity_perm = true;
35088 bool broadcast_perm = true;
35089
35090 for (i = 0; i < nelt; i++)
35091 {
35092 perm2[i] = d->perm[i] & mask;
35093 if (perm2[i] != i)
35094 identity_perm = false;
35095 if (perm2[i])
35096 broadcast_perm = false;
35097 }
35098
35099 if (identity_perm)
35100 {
35101 if (!d->testing_p)
35102 emit_move_insn (d->target, d->op0);
35103 return true;
35104 }
35105 else if (broadcast_perm && TARGET_AVX2)
35106 {
35107 /* Use vpbroadcast{b,w,d}. */
35108 rtx op = d->op0, (*gen) (rtx, rtx) = NULL;
35109 switch (d->vmode)
35110 {
35111 case V32QImode:
35112 op = gen_lowpart (V16QImode, op);
35113 gen = gen_avx2_pbroadcastv32qi;
35114 break;
35115 case V16HImode:
35116 op = gen_lowpart (V8HImode, op);
35117 gen = gen_avx2_pbroadcastv16hi;
35118 break;
35119 case V8SImode:
35120 op = gen_lowpart (V4SImode, op);
35121 gen = gen_avx2_pbroadcastv8si;
35122 break;
35123 case V16QImode:
35124 gen = gen_avx2_pbroadcastv16qi;
35125 break;
35126 case V8HImode:
35127 gen = gen_avx2_pbroadcastv8hi;
35128 break;
35129 /* For other modes prefer other shuffles this function creates. */
35130 default: break;
35131 }
35132 if (gen != NULL)
35133 {
35134 if (!d->testing_p)
35135 emit_insn (gen (d->target, op));
35136 return true;
35137 }
35138 }
35139
35140 if (expand_vselect (d->target, d->op0, perm2, nelt))
35141 return true;
35142
35143 /* There are plenty of patterns in sse.md that are written for
35144 SEL+CONCAT and are not replicated for a single op. Perhaps
35145 that should be changed, to avoid the nastiness here. */
35146
35147 /* Recognize interleave style patterns, which means incrementing
35148 every other permutation operand. */
35149 for (i = 0; i < nelt; i += 2)
35150 {
35151 perm2[i] = d->perm[i] & mask;
35152 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
35153 }
35154 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
35155 return true;
35156
35157 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
35158 if (nelt >= 4)
35159 {
35160 for (i = 0; i < nelt; i += 4)
35161 {
35162 perm2[i + 0] = d->perm[i + 0] & mask;
35163 perm2[i + 1] = d->perm[i + 1] & mask;
35164 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
35165 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
35166 }
35167
35168 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
35169 return true;
35170 }
35171 }
35172
35173 /* Finally, try the fully general two operand permute. */
35174 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt))
35175 return true;
35176
35177 /* Recognize interleave style patterns with reversed operands. */
35178 if (d->op0 != d->op1)
35179 {
35180 for (i = 0; i < nelt; ++i)
35181 {
35182 unsigned e = d->perm[i];
35183 if (e >= nelt)
35184 e -= nelt;
35185 else
35186 e += nelt;
35187 perm2[i] = e;
35188 }
35189
35190 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt))
35191 return true;
35192 }
35193
35194 /* Try the SSE4.1 blend variable merge instructions. */
35195 if (expand_vec_perm_blend (d))
35196 return true;
35197
35198 /* Try one of the AVX vpermil variable permutations. */
35199 if (expand_vec_perm_vpermil (d))
35200 return true;
35201
35202 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
35203 vpshufb, vpermd or vpermq variable permutation. */
35204 if (expand_vec_perm_pshufb (d))
35205 return true;
35206
35207 return false;
35208 }
35209
35210 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35211 in terms of a pair of pshuflw + pshufhw instructions. */
35212
35213 static bool
35214 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
35215 {
35216 unsigned char perm2[MAX_VECT_LEN];
35217 unsigned i;
35218 bool ok;
35219
35220 if (d->vmode != V8HImode || d->op0 != d->op1)
35221 return false;
35222
35223 /* The two permutations only operate in 64-bit lanes. */
35224 for (i = 0; i < 4; ++i)
35225 if (d->perm[i] >= 4)
35226 return false;
35227 for (i = 4; i < 8; ++i)
35228 if (d->perm[i] < 4)
35229 return false;
35230
35231 if (d->testing_p)
35232 return true;
35233
35234 /* Emit the pshuflw. */
35235 memcpy (perm2, d->perm, 4);
35236 for (i = 4; i < 8; ++i)
35237 perm2[i] = i;
35238 ok = expand_vselect (d->target, d->op0, perm2, 8);
35239 gcc_assert (ok);
35240
35241 /* Emit the pshufhw. */
35242 memcpy (perm2 + 4, d->perm + 4, 4);
35243 for (i = 0; i < 4; ++i)
35244 perm2[i] = i;
35245 ok = expand_vselect (d->target, d->target, perm2, 8);
35246 gcc_assert (ok);
35247
35248 return true;
35249 }
35250
35251 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
35252 the permutation using the SSSE3 palignr instruction. This succeeds
35253 when all of the elements in PERM fit within one vector and we merely
35254 need to shift them down so that a single vector permutation has a
35255 chance to succeed. */
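/* For example, for V8HImode with d->perm = { 3, 4, 5, 6, 7, 8, 9, 10 },
   min is 3, so the palignr below shifts the op1:op0 concatenation down by
   3 * 16 bits; the remaining single-operand permutation is then the
   identity and we are done. */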
35256
35257 static bool
35258 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
35259 {
35260 unsigned i, nelt = d->nelt;
35261 unsigned min, max;
35262 bool in_order, ok;
35263 rtx shift;
35264
35265 /* Even with AVX, palignr only operates on 128-bit vectors. */
35266 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
35267 return false;
35268
35269 min = nelt, max = 0;
35270 for (i = 0; i < nelt; ++i)
35271 {
35272 unsigned e = d->perm[i];
35273 if (e < min)
35274 min = e;
35275 if (e > max)
35276 max = e;
35277 }
35278 if (min == 0 || max - min >= nelt)
35279 return false;
35280
35281 /* Given that we have SSSE3, we know we'll be able to implement the
35282 single operand permutation after the palignr with pshufb. */
35283 if (d->testing_p)
35284 return true;
35285
35286 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
35287 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
35288 gen_lowpart (TImode, d->op1),
35289 gen_lowpart (TImode, d->op0), shift));
35290
35291 d->op0 = d->op1 = d->target;
35292
35293 in_order = true;
35294 for (i = 0; i < nelt; ++i)
35295 {
35296 unsigned e = d->perm[i] - min;
35297 if (e != i)
35298 in_order = false;
35299 d->perm[i] = e;
35300 }
35301
35302 /* Test for the degenerate case where the alignment by itself
35303 produces the desired permutation. */
35304 if (in_order)
35305 return true;
35306
35307 ok = expand_vec_perm_1 (d);
35308 gcc_assert (ok);
35309
35310 return ok;
35311 }
35312
35313 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
35314 a two vector permutation into a single vector permutation by using
35315 an interleave operation to merge the vectors. */
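/* For example, for V4SImode with d->perm = { 1, 4, 0, 5 } all elements come
   from the low halves of the two inputs, so a punpckldq first merges them
   into { op0[0], op1[0], op0[1], op1[1] }, and a single pshufd { 2, 1, 0, 3 }
   of that temporary then yields the requested vector. */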
35316
35317 static bool
35318 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
35319 {
35320 struct expand_vec_perm_d dremap, dfinal;
35321 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
35322 unsigned HOST_WIDE_INT contents;
35323 unsigned char remap[2 * MAX_VECT_LEN];
35324 rtx seq;
35325 bool ok, same_halves = false;
35326
35327 if (GET_MODE_SIZE (d->vmode) == 16)
35328 {
35329 if (d->op0 == d->op1)
35330 return false;
35331 }
35332 else if (GET_MODE_SIZE (d->vmode) == 32)
35333 {
35334 if (!TARGET_AVX)
35335 return false;
35336 /* For 32-byte modes allow even d->op0 == d->op1.
35337 The lack of cross-lane shuffling in some instructions
35338 might prevent a single insn shuffle. */
35339 }
35340 else
35341 return false;
35342
35343 /* Examine from whence the elements come. */
35344 contents = 0;
35345 for (i = 0; i < nelt; ++i)
35346 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
35347
35348 memset (remap, 0xff, sizeof (remap));
35349 dremap = *d;
35350
35351 if (GET_MODE_SIZE (d->vmode) == 16)
35352 {
35353 unsigned HOST_WIDE_INT h1, h2, h3, h4;
35354
35355 /* Split the two input vectors into 4 halves. */
35356 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
35357 h2 = h1 << nelt2;
35358 h3 = h2 << nelt2;
35359 h4 = h3 << nelt2;
35360
35361 /* If the elements come from the low halves, use interleave low; if from
35362 the high halves, use interleave high. If the elements are from
35363 mismatched halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
35364 if ((contents & (h1 | h3)) == contents)
35365 {
35366 /* punpckl* */
35367 for (i = 0; i < nelt2; ++i)
35368 {
35369 remap[i] = i * 2;
35370 remap[i + nelt] = i * 2 + 1;
35371 dremap.perm[i * 2] = i;
35372 dremap.perm[i * 2 + 1] = i + nelt;
35373 }
35374 }
35375 else if ((contents & (h2 | h4)) == contents)
35376 {
35377 /* punpckh* */
35378 for (i = 0; i < nelt2; ++i)
35379 {
35380 remap[i + nelt2] = i * 2;
35381 remap[i + nelt + nelt2] = i * 2 + 1;
35382 dremap.perm[i * 2] = i + nelt2;
35383 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
35384 }
35385 }
35386 else if ((contents & (h1 | h4)) == contents)
35387 {
35388 /* shufps */
35389 for (i = 0; i < nelt2; ++i)
35390 {
35391 remap[i] = i;
35392 remap[i + nelt + nelt2] = i + nelt2;
35393 dremap.perm[i] = i;
35394 dremap.perm[i + nelt2] = i + nelt + nelt2;
35395 }
35396 if (nelt != 4)
35397 {
35398 /* shufpd */
35399 dremap.vmode = V2DImode;
35400 dremap.nelt = 2;
35401 dremap.perm[0] = 0;
35402 dremap.perm[1] = 3;
35403 }
35404 }
35405 else if ((contents & (h2 | h3)) == contents)
35406 {
35407 /* shufps */
35408 for (i = 0; i < nelt2; ++i)
35409 {
35410 remap[i + nelt2] = i;
35411 remap[i + nelt] = i + nelt2;
35412 dremap.perm[i] = i + nelt2;
35413 dremap.perm[i + nelt2] = i + nelt;
35414 }
35415 if (nelt != 4)
35416 {
35417 /* shufpd */
35418 dremap.vmode = V2DImode;
35419 dremap.nelt = 2;
35420 dremap.perm[0] = 1;
35421 dremap.perm[1] = 2;
35422 }
35423 }
35424 else
35425 return false;
35426 }
35427 else
35428 {
35429 unsigned int nelt4 = nelt / 4, nzcnt = 0;
35430 unsigned HOST_WIDE_INT q[8];
35431 unsigned int nonzero_halves[4];
35432
35433 /* Split the two input vectors into 8 quarters. */
35434 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
35435 for (i = 1; i < 8; ++i)
35436 q[i] = q[0] << (nelt4 * i);
35437 for (i = 0; i < 4; ++i)
35438 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
35439 {
35440 nonzero_halves[nzcnt] = i;
35441 ++nzcnt;
35442 }
35443
35444 if (nzcnt == 1)
35445 {
35446 gcc_assert (d->op0 == d->op1);
35447 nonzero_halves[1] = nonzero_halves[0];
35448 same_halves = true;
35449 }
35450 else if (d->op0 == d->op1)
35451 {
35452 gcc_assert (nonzero_halves[0] == 0);
35453 gcc_assert (nonzero_halves[1] == 1);
35454 }
35455
35456 if (nzcnt <= 2)
35457 {
35458 if (d->perm[0] / nelt2 == nonzero_halves[1])
35459 {
35460 /* Attempt to increase the likelihood that the dfinal
35461 shuffle will be intra-lane. */
35462 char tmph = nonzero_halves[0];
35463 nonzero_halves[0] = nonzero_halves[1];
35464 nonzero_halves[1] = tmph;
35465 }
35466
35467 /* vperm2f128 or vperm2i128. */
35468 for (i = 0; i < nelt2; ++i)
35469 {
35470 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
35471 remap[i + nonzero_halves[0] * nelt2] = i;
35472 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
35473 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
35474 }
35475
35476 if (d->vmode != V8SFmode
35477 && d->vmode != V4DFmode
35478 && d->vmode != V8SImode)
35479 {
35480 dremap.vmode = V8SImode;
35481 dremap.nelt = 8;
35482 for (i = 0; i < 4; ++i)
35483 {
35484 dremap.perm[i] = i + nonzero_halves[0] * 4;
35485 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
35486 }
35487 }
35488 }
35489 else if (d->op0 == d->op1)
35490 return false;
35491 else if (TARGET_AVX2
35492 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
35493 {
35494 /* vpunpckl* */
35495 for (i = 0; i < nelt4; ++i)
35496 {
35497 remap[i] = i * 2;
35498 remap[i + nelt] = i * 2 + 1;
35499 remap[i + nelt2] = i * 2 + nelt2;
35500 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
35501 dremap.perm[i * 2] = i;
35502 dremap.perm[i * 2 + 1] = i + nelt;
35503 dremap.perm[i * 2 + nelt2] = i + nelt2;
35504 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
35505 }
35506 }
35507 else if (TARGET_AVX2
35508 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
35509 {
35510 /* vpunpckh* */
35511 for (i = 0; i < nelt4; ++i)
35512 {
35513 remap[i + nelt4] = i * 2;
35514 remap[i + nelt + nelt4] = i * 2 + 1;
35515 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
35516 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
35517 dremap.perm[i * 2] = i + nelt4;
35518 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
35519 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
35520 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
35521 }
35522 }
35523 else
35524 return false;
35525 }
35526
35527 /* Use the remapping array set up above to move the elements from their
35528 swizzled locations into their final destinations. */
35529 dfinal = *d;
35530 for (i = 0; i < nelt; ++i)
35531 {
35532 unsigned e = remap[d->perm[i]];
35533 gcc_assert (e < nelt);
35534 /* If same_halves is true, both halves of the remapped vector are the
35535 same. Avoid cross-lane accesses if possible. */
35536 if (same_halves && i >= nelt2)
35537 {
35538 gcc_assert (e < nelt2);
35539 dfinal.perm[i] = e + nelt2;
35540 }
35541 else
35542 dfinal.perm[i] = e;
35543 }
35544 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
35545 dfinal.op1 = dfinal.op0;
35546 dremap.target = dfinal.op0;
35547
35548 /* Test if the final remap can be done with a single insn. For V4SFmode or
35549 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
35550 start_sequence ();
35551 ok = expand_vec_perm_1 (&dfinal);
35552 seq = get_insns ();
35553 end_sequence ();
35554
35555 if (!ok)
35556 return false;
35557
35558 if (d->testing_p)
35559 return true;
35560
35561 if (dremap.vmode != dfinal.vmode)
35562 {
35563 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
35564 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
35565 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
35566 }
35567
35568 ok = expand_vec_perm_1 (&dremap);
35569 gcc_assert (ok);
35570
35571 emit_insn (seq);
35572 return true;
35573 }
35574
35575 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
35576 a single vector cross-lane permutation into vpermq followed
35577 by any of the single insn permutations. */
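/* E.g. a V16HImode shuffle of a single operand in which each half of the
   result only reads two of the four 64-bit quarters of the input can first
   gather those quarters with a vpermq and then finish with an intra-lane
   shuffle. */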
35578
35579 static bool
35580 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
35581 {
35582 struct expand_vec_perm_d dremap, dfinal;
35583 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
35584 unsigned contents[2];
35585 bool ok;
35586
35587 if (!(TARGET_AVX2
35588 && (d->vmode == V32QImode || d->vmode == V16HImode)
35589 && d->op0 == d->op1))
35590 return false;
35591
35592 contents[0] = 0;
35593 contents[1] = 0;
35594 for (i = 0; i < nelt2; ++i)
35595 {
35596 contents[0] |= 1u << (d->perm[i] / nelt4);
35597 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
35598 }
35599
35600 for (i = 0; i < 2; ++i)
35601 {
35602 unsigned int cnt = 0;
35603 for (j = 0; j < 4; ++j)
35604 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
35605 return false;
35606 }
35607
35608 if (d->testing_p)
35609 return true;
35610
35611 dremap = *d;
35612 dremap.vmode = V4DImode;
35613 dremap.nelt = 4;
35614 dremap.target = gen_reg_rtx (V4DImode);
35615 dremap.op0 = gen_lowpart (V4DImode, d->op0);
35616 dremap.op1 = dremap.op0;
35617 for (i = 0; i < 2; ++i)
35618 {
35619 unsigned int cnt = 0;
35620 for (j = 0; j < 4; ++j)
35621 if ((contents[i] & (1u << j)) != 0)
35622 dremap.perm[2 * i + cnt++] = j;
35623 for (; cnt < 2; ++cnt)
35624 dremap.perm[2 * i + cnt] = 0;
35625 }
35626
35627 dfinal = *d;
35628 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
35629 dfinal.op1 = dfinal.op0;
35630 for (i = 0, j = 0; i < nelt; ++i)
35631 {
35632 if (i == nelt2)
35633 j = 2;
35634 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
35635 if ((d->perm[i] / nelt4) == dremap.perm[j])
35636 ;
35637 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
35638 dfinal.perm[i] |= nelt4;
35639 else
35640 gcc_unreachable ();
35641 }
35642
35643 ok = expand_vec_perm_1 (&dremap);
35644 gcc_assert (ok);
35645
35646 ok = expand_vec_perm_1 (&dfinal);
35647 gcc_assert (ok);
35648
35649 return true;
35650 }
35651
35652 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
35653 a two vector permutation using 2 intra-lane interleave insns
35654 and cross-lane shuffle for 32-byte vectors. */
35655
35656 static bool
35657 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
35658 {
35659 unsigned i, nelt;
35660 rtx (*gen) (rtx, rtx, rtx);
35661
35662 if (d->op0 == d->op1)
35663 return false;
35664 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
35665 ;
35666 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
35667 ;
35668 else
35669 return false;
35670
35671 nelt = d->nelt;
35672 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
35673 return false;
35674 for (i = 0; i < nelt; i += 2)
35675 if (d->perm[i] != d->perm[0] + i / 2
35676 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
35677 return false;
35678
35679 if (d->testing_p)
35680 return true;
35681
35682 switch (d->vmode)
35683 {
35684 case V32QImode:
35685 if (d->perm[0])
35686 gen = gen_vec_interleave_highv32qi;
35687 else
35688 gen = gen_vec_interleave_lowv32qi;
35689 break;
35690 case V16HImode:
35691 if (d->perm[0])
35692 gen = gen_vec_interleave_highv16hi;
35693 else
35694 gen = gen_vec_interleave_lowv16hi;
35695 break;
35696 case V8SImode:
35697 if (d->perm[0])
35698 gen = gen_vec_interleave_highv8si;
35699 else
35700 gen = gen_vec_interleave_lowv8si;
35701 break;
35702 case V4DImode:
35703 if (d->perm[0])
35704 gen = gen_vec_interleave_highv4di;
35705 else
35706 gen = gen_vec_interleave_lowv4di;
35707 break;
35708 case V8SFmode:
35709 if (d->perm[0])
35710 gen = gen_vec_interleave_highv8sf;
35711 else
35712 gen = gen_vec_interleave_lowv8sf;
35713 break;
35714 case V4DFmode:
35715 if (d->perm[0])
35716 gen = gen_vec_interleave_highv4df;
35717 else
35718 gen = gen_vec_interleave_lowv4df;
35719 break;
35720 default:
35721 gcc_unreachable ();
35722 }
35723
35724 emit_insn (gen (d->target, d->op0, d->op1));
35725 return true;
35726 }
35727
35728 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
35729 permutation with two pshufb insns and an ior. We should have already
35730 failed all two instruction sequences. */
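/* For example, for a V8HImode extract-even of two operands
   (d->perm = { 0, 2, 4, 6, 8, 10, 12, 14 }) the first byte mask is
   { 0, 1, 4, 5, 8, 9, 12, 13 } followed by eight -128s and is applied to
   op0, while the second mask is eight -128s followed by
   { 0, 1, 4, 5, 8, 9, 12, 13 } and is applied to op1; or-ing the two
   pshufb results yields the requested vector. */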
35731
35732 static bool
35733 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
35734 {
35735 rtx rperm[2][16], vperm, l, h, op, m128;
35736 unsigned int i, nelt, eltsz;
35737
35738 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
35739 return false;
35740 gcc_assert (d->op0 != d->op1);
35741
35742 nelt = d->nelt;
35743 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
35744
35745 /* Generate two permutation masks. If the required element is within
35746 the given vector it is shuffled into the proper lane. If the required
35747 element is in the other vector, force a zero into the lane by setting
35748 bit 7 in the permutation mask. */
35749 m128 = GEN_INT (-128);
35750 for (i = 0; i < nelt; ++i)
35751 {
35752 unsigned j, e = d->perm[i];
35753 unsigned which = (e >= nelt);
35754 if (e >= nelt)
35755 e -= nelt;
35756
35757 for (j = 0; j < eltsz; ++j)
35758 {
35759 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
35760 rperm[1-which][i*eltsz + j] = m128;
35761 }
35762 }
35763
35764 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
35765 vperm = force_reg (V16QImode, vperm);
35766
35767 l = gen_reg_rtx (V16QImode);
35768 op = gen_lowpart (V16QImode, d->op0);
35769 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
35770
35771 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
35772 vperm = force_reg (V16QImode, vperm);
35773
35774 h = gen_reg_rtx (V16QImode);
35775 op = gen_lowpart (V16QImode, d->op1);
35776 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
35777
35778 op = gen_lowpart (V16QImode, d->target);
35779 emit_insn (gen_iorv16qi3 (op, l, h));
35780
35781 return true;
35782 }
35783
35784 /* Implement arbitrary permutation of a single V32QImode or V16HImode operand
35785 with two vpshufb insns, vpermq and vpor. We should have already failed
35786 all two or three instruction sequences. */
35787
35788 static bool
35789 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
35790 {
35791 rtx rperm[2][32], vperm, l, h, hp, op, m128;
35792 unsigned int i, nelt, eltsz;
35793
35794 if (!TARGET_AVX2
35795 || d->op0 != d->op1
35796 || (d->vmode != V32QImode && d->vmode != V16HImode))
35797 return false;
35798
35799 if (d->testing_p)
35800 return true;
35801
35802 nelt = d->nelt;
35803 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
35804
35805 /* Generate two permutation masks. If the required element is within
35806 the same lane, it is shuffled in. If the required element is from
35807 the other lane, force a zero by setting bit 7 in the permutation mask.
35808 The other mask has non-negative elements where the element is
35809 requested from the other lane; there it is also moved to the other
35810 lane, so that the result of vpshufb can have the two V2TImode halves
35811 swapped. */
35812 m128 = GEN_INT (-128);
35813 for (i = 0; i < nelt; ++i)
35814 {
35815 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
35816 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
35817
35818 for (j = 0; j < eltsz; ++j)
35819 {
35820 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
35821 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
35822 }
35823 }
35824
35825 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
35826 vperm = force_reg (V32QImode, vperm);
35827
35828 h = gen_reg_rtx (V32QImode);
35829 op = gen_lowpart (V32QImode, d->op0);
35830 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
35831
35832 /* Swap the 128-bit lanes of h into hp. */
35833 hp = gen_reg_rtx (V4DImode);
35834 op = gen_lowpart (V4DImode, h);
35835 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
35836 const1_rtx));
35837
35838 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
35839 vperm = force_reg (V32QImode, vperm);
35840
35841 l = gen_reg_rtx (V32QImode);
35842 op = gen_lowpart (V32QImode, d->op0);
35843 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
35844
35845 op = gen_lowpart (V32QImode, d->target);
35846 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
35847
35848 return true;
35849 }
35850
35851 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
35852 and extract-odd permutations of two V32QImode or V16HImode operands
35853 with two vpshufb insns, vpor and vpermq. We should have already
35854 failed all two or three instruction sequences. */
35855
35856 static bool
35857 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
35858 {
35859 rtx rperm[2][32], vperm, l, h, ior, op, m128;
35860 unsigned int i, nelt, eltsz;
35861
35862 if (!TARGET_AVX2
35863 || d->op0 == d->op1
35864 || (d->vmode != V32QImode && d->vmode != V16HImode))
35865 return false;
35866
35867 for (i = 0; i < d->nelt; ++i)
35868 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
35869 return false;
35870
35871 if (d->testing_p)
35872 return true;
35873
35874 nelt = d->nelt;
35875 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
35876
35877 /* Generate two permutation masks. In the first permutation mask
35878 the first quarter will contain indexes for the first half
35879 of the op0, the second quarter will contain bit 7 set, third quarter
35880 will contain indexes for the second half of the op0 and the
35881 last quarter bit 7 set. In the second permutation mask
35882 the first quarter will contain bit 7 set, the second quarter
35883 indexes for the first half of the op1, the third quarter bit 7 set
35884 and last quarter indexes for the second half of the op1.
35885 I.e. the first mask e.g. for V32QImode extract even will be:
35886 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
35887 (all values masked with 0xf except for -128) and second mask
35888 for extract even will be
35889 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
35890 m128 = GEN_INT (-128);
35891 for (i = 0; i < nelt; ++i)
35892 {
35893 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
35894 unsigned which = d->perm[i] >= nelt;
35895 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
35896
35897 for (j = 0; j < eltsz; ++j)
35898 {
35899 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
35900 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
35901 }
35902 }
35903
35904 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
35905 vperm = force_reg (V32QImode, vperm);
35906
35907 l = gen_reg_rtx (V32QImode);
35908 op = gen_lowpart (V32QImode, d->op0);
35909 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
35910
35911 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
35912 vperm = force_reg (V32QImode, vperm);
35913
35914 h = gen_reg_rtx (V32QImode);
35915 op = gen_lowpart (V32QImode, d->op1);
35916 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
35917
35918 ior = gen_reg_rtx (V32QImode);
35919 emit_insn (gen_iorv32qi3 (ior, l, h));
35920
35921 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
35922 op = gen_lowpart (V4DImode, d->target);
35923 ior = gen_lowpart (V4DImode, ior);
35924 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
35925 const1_rtx, GEN_INT (3)));
35926
35927 return true;
35928 }
35929
35930 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
35931 and extract-odd permutations. */
35932
35933 static bool
35934 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
35935 {
35936 rtx t1, t2, t3;
35937
35938 switch (d->vmode)
35939 {
35940 case V4DFmode:
35941 t1 = gen_reg_rtx (V4DFmode);
35942 t2 = gen_reg_rtx (V4DFmode);
35943
35944 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
35945 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
35946 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
35947
35948 /* Now an unpck[lh]pd will produce the result required. */
35949 if (odd)
35950 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
35951 else
35952 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
35953 emit_insn (t3);
35954 break;
35955
35956 case V8SFmode:
35957 {
35958 int mask = odd ? 0xdd : 0x88;
35959
35960 t1 = gen_reg_rtx (V8SFmode);
35961 t2 = gen_reg_rtx (V8SFmode);
35962 t3 = gen_reg_rtx (V8SFmode);
35963
35964 /* Shuffle within the 128-bit lanes to produce:
35965 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
35966 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
35967 GEN_INT (mask)));
35968
35969 /* Shuffle the lanes around to produce:
35970 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
35971 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
35972 GEN_INT (0x3)));
35973
35974 /* Shuffle within the 128-bit lanes to produce:
35975 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
35976 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
35977
35978 /* Shuffle within the 128-bit lanes to produce:
35979 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
35980 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
35981
35982 /* Shuffle the lanes around to produce:
35983 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
35984 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
35985 GEN_INT (0x20)));
35986 }
35987 break;
35988
35989 case V2DFmode:
35990 case V4SFmode:
35991 case V2DImode:
35992 case V4SImode:
35993 /* These are always directly implementable by expand_vec_perm_1. */
35994 gcc_unreachable ();
35995
35996 case V8HImode:
35997 if (TARGET_SSSE3)
35998 return expand_vec_perm_pshufb2 (d);
35999 else
36000 {
36001 /* We need 2*log2(N)-1 operations to achieve odd/even
36002 with interleave. */
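/* For V8HImode that is 2 * 3 - 1 = 5 interleave insns: two rounds of
   low/high interleaves followed by a final interleave that picks the even
   or odd elements. */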
36003 t1 = gen_reg_rtx (V8HImode);
36004 t2 = gen_reg_rtx (V8HImode);
36005 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
36006 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
36007 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
36008 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
36009 if (odd)
36010 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
36011 else
36012 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
36013 emit_insn (t3);
36014 }
36015 break;
36016
36017 case V16QImode:
36018 if (TARGET_SSSE3)
36019 return expand_vec_perm_pshufb2 (d);
36020 else
36021 {
36022 t1 = gen_reg_rtx (V16QImode);
36023 t2 = gen_reg_rtx (V16QImode);
36024 t3 = gen_reg_rtx (V16QImode);
36025 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
36026 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
36027 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
36028 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
36029 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
36030 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
36031 if (odd)
36032 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
36033 else
36034 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
36035 emit_insn (t3);
36036 }
36037 break;
36038
36039 case V16HImode:
36040 case V32QImode:
36041 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
36042
36043 case V4DImode:
36044 if (!TARGET_AVX2)
36045 {
36046 struct expand_vec_perm_d d_copy = *d;
36047 d_copy.vmode = V4DFmode;
36048 d_copy.target = gen_lowpart (V4DFmode, d->target);
36049 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
36050 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
36051 return expand_vec_perm_even_odd_1 (&d_copy, odd);
36052 }
36053
36054 t1 = gen_reg_rtx (V4DImode);
36055 t2 = gen_reg_rtx (V4DImode);
36056
36057 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
36058 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
36059 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
36060
36061 /* Now a vpunpck[lh]qdq will produce the result required. */
36062 if (odd)
36063 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
36064 else
36065 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
36066 emit_insn (t3);
36067 break;
36068
36069 case V8SImode:
36070 if (!TARGET_AVX2)
36071 {
36072 struct expand_vec_perm_d d_copy = *d;
36073 d_copy.vmode = V8SFmode;
36074 d_copy.target = gen_lowpart (V8SFmode, d->target);
36075 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
36076 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
36077 return expand_vec_perm_even_odd_1 (&d_copy, odd);
36078 }
36079
36080 t1 = gen_reg_rtx (V8SImode);
36081 t2 = gen_reg_rtx (V8SImode);
36082
36083 /* Shuffle the lanes around into
36084 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
36085 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
36086 gen_lowpart (V4DImode, d->op0),
36087 gen_lowpart (V4DImode, d->op1),
36088 GEN_INT (0x20)));
36089 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
36090 gen_lowpart (V4DImode, d->op0),
36091 gen_lowpart (V4DImode, d->op1),
36092 GEN_INT (0x31)));
36093
36094 /* Swap the 2nd and 3rd position in each lane into
36095 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
36096 emit_insn (gen_avx2_pshufdv3 (t1, t1,
36097 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
36098 emit_insn (gen_avx2_pshufdv3 (t2, t2,
36099 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
36100
36101 /* Now a vpunpck[lh]qdq will produce
36102 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
36103 if (odd)
36104 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
36105 gen_lowpart (V4DImode, t1),
36106 gen_lowpart (V4DImode, t2));
36107 else
36108 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
36109 gen_lowpart (V4DImode, t1),
36110 gen_lowpart (V4DImode, t2));
36111 emit_insn (t3);
36112 break;
36113
36114 default:
36115 gcc_unreachable ();
36116 }
36117
36118 return true;
36119 }
36120
36121 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
36122 extract-even and extract-odd permutations. */
36123
36124 static bool
36125 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
36126 {
36127 unsigned i, odd, nelt = d->nelt;
36128
36129 odd = d->perm[0];
36130 if (odd != 0 && odd != 1)
36131 return false;
36132
36133 for (i = 1; i < nelt; ++i)
36134 if (d->perm[i] != 2 * i + odd)
36135 return false;
36136
36137 return expand_vec_perm_even_odd_1 (d, odd);
36138 }
36139
36140 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
36141 permutations. We assume that expand_vec_perm_1 has already failed. */
36142
36143 static bool
36144 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
36145 {
36146 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
36147 enum machine_mode vmode = d->vmode;
36148 unsigned char perm2[4];
36149 rtx op0 = d->op0;
36150 bool ok;
36151
36152 switch (vmode)
36153 {
36154 case V4DFmode:
36155 case V8SFmode:
36156 /* These are special-cased in sse.md so that we can optionally
36157 use the vbroadcast instruction. They expand to two insns
36158 if the input happens to be in a register. */
36159 gcc_unreachable ();
36160
36161 case V2DFmode:
36162 case V2DImode:
36163 case V4SFmode:
36164 case V4SImode:
36165 /* These are always implementable using standard shuffle patterns. */
36166 gcc_unreachable ();
36167
36168 case V8HImode:
36169 case V16QImode:
36170 /* These can be implemented via interleave. We save one insn by
36171 stopping once we have promoted to V4SImode and then use pshufd. */
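/* For example, broadcasting element 5 of a V8HImode vector: the element is
   in the high half, so one interleave-high pairs it up into a V4SImode
   element, and the final pshufd with all four fields equal to the element's
   new index (1) replicates it across the register. */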
36172 do
36173 {
36174 optab otab = vec_interleave_low_optab;
36175
36176 if (elt >= nelt2)
36177 {
36178 otab = vec_interleave_high_optab;
36179 elt -= nelt2;
36180 }
36181 nelt2 /= 2;
36182
36183 op0 = expand_binop (vmode, otab, op0, op0, NULL, 0, OPTAB_DIRECT);
36184 vmode = get_mode_wider_vector (vmode);
36185 op0 = gen_lowpart (vmode, op0);
36186 }
36187 while (vmode != V4SImode);
36188
36189 memset (perm2, elt, 4);
36190 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4);
36191 gcc_assert (ok);
36192 return true;
36193
36194 case V32QImode:
36195 case V16HImode:
36196 case V8SImode:
36197 case V4DImode:
36198 /* For AVX2, broadcasts of the first element should already have been
36199 handled by expand_vec_perm_1 using vpbroadcast* or vpermq. */
36200 gcc_assert (!TARGET_AVX2 || d->perm[0]);
36201 return false;
36202
36203 default:
36204 gcc_unreachable ();
36205 }
36206 }
36207
36208 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
36209 broadcast permutations. */
36210
36211 static bool
36212 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
36213 {
36214 unsigned i, elt, nelt = d->nelt;
36215
36216 if (d->op0 != d->op1)
36217 return false;
36218
36219 elt = d->perm[0];
36220 for (i = 1; i < nelt; ++i)
36221 if (d->perm[i] != elt)
36222 return false;
36223
36224 return expand_vec_perm_broadcast_1 (d);
36225 }
36226
36227 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
36228 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
36229 all the shorter instruction sequences. */
36230
36231 static bool
36232 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
36233 {
36234 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
36235 unsigned int i, nelt, eltsz;
36236 bool used[4];
36237
36238 if (!TARGET_AVX2
36239 || d->op0 == d->op1
36240 || (d->vmode != V32QImode && d->vmode != V16HImode))
36241 return false;
36242
36243 if (d->testing_p)
36244 return true;
36245
36246 nelt = d->nelt;
36247 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36248
36249 /* Generate 4 permutation masks. If the required element is within
36250 the same lane, it is shuffled in. If the required element is from
36251 the other lane, force a zero by setting bit 7 in the permutation mask.
36252 The other masks have non-negative elements where the element is
36253 requested from the other lane; there it is also moved to the other
36254 lane, so that the result of vpshufb can have the two V2TImode halves
36255 swapped. */
36256 m128 = GEN_INT (-128);
36257 for (i = 0; i < 32; ++i)
36258 {
36259 rperm[0][i] = m128;
36260 rperm[1][i] = m128;
36261 rperm[2][i] = m128;
36262 rperm[3][i] = m128;
36263 }
36264 used[0] = false;
36265 used[1] = false;
36266 used[2] = false;
36267 used[3] = false;
36268 for (i = 0; i < nelt; ++i)
36269 {
36270 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
36271 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
36272 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
36273
36274 for (j = 0; j < eltsz; ++j)
36275 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
36276 used[which] = true;
36277 }
36278
36279 for (i = 0; i < 2; ++i)
36280 {
36281 if (!used[2 * i + 1])
36282 {
36283 h[i] = NULL_RTX;
36284 continue;
36285 }
36286 vperm = gen_rtx_CONST_VECTOR (V32QImode,
36287 gen_rtvec_v (32, rperm[2 * i + 1]));
36288 vperm = force_reg (V32QImode, vperm);
36289 h[i] = gen_reg_rtx (V32QImode);
36290 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
36291 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
36292 }
36293
36294 /* Swap the 128-bit lanes of h[X]. */
36295 for (i = 0; i < 2; ++i)
36296 {
36297 if (h[i] == NULL_RTX)
36298 continue;
36299 op = gen_reg_rtx (V4DImode);
36300 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
36301 const2_rtx, GEN_INT (3), const0_rtx,
36302 const1_rtx));
36303 h[i] = gen_lowpart (V32QImode, op);
36304 }
36305
36306 for (i = 0; i < 2; ++i)
36307 {
36308 if (!used[2 * i])
36309 {
36310 l[i] = NULL_RTX;
36311 continue;
36312 }
36313 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
36314 vperm = force_reg (V32QImode, vperm);
36315 l[i] = gen_reg_rtx (V32QImode);
36316 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
36317 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
36318 }
36319
36320 for (i = 0; i < 2; ++i)
36321 {
36322 if (h[i] && l[i])
36323 {
36324 op = gen_reg_rtx (V32QImode);
36325 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
36326 l[i] = op;
36327 }
36328 else if (h[i])
36329 l[i] = h[i];
36330 }
36331
36332 gcc_assert (l[0] && l[1]);
36333 op = gen_lowpart (V32QImode, d->target);
36334 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
36335 return true;
36336 }
36337
36338 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
36339 With all of the interface bits taken care of, perform the expansion
36340 in D and return true on success. */
36341
36342 static bool
36343 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
36344 {
36345 /* Try a single instruction expansion. */
36346 if (expand_vec_perm_1 (d))
36347 return true;
36348
36349 /* Try sequences of two instructions. */
36350
36351 if (expand_vec_perm_pshuflw_pshufhw (d))
36352 return true;
36353
36354 if (expand_vec_perm_palignr (d))
36355 return true;
36356
36357 if (expand_vec_perm_interleave2 (d))
36358 return true;
36359
36360 if (expand_vec_perm_broadcast (d))
36361 return true;
36362
36363 if (expand_vec_perm_vpermq_perm_1 (d))
36364 return true;
36365
36366 /* Try sequences of three instructions. */
36367
36368 if (expand_vec_perm_pshufb2 (d))
36369 return true;
36370
36371 if (expand_vec_perm_interleave3 (d))
36372 return true;
36373
36374 /* Try sequences of four instructions. */
36375
36376 if (expand_vec_perm_vpshufb2_vpermq (d))
36377 return true;
36378
36379 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
36380 return true;
36381
36382 /* ??? Look for narrow permutations whose element orderings would
36383 allow the promotion to a wider mode. */
36384
36385 /* ??? Look for sequences of interleave or a wider permute that place
36386 the data into the correct lanes for a half-vector shuffle like
36387 pshuf[lh]w or vpermilps. */
36388
36389 /* ??? Look for sequences of interleave that produce the desired results.
36390 The combinatorics of punpck[lh] get pretty ugly... */
36391
36392 if (expand_vec_perm_even_odd (d))
36393 return true;
36394
36395 /* Even longer sequences. */
36396 if (expand_vec_perm_vpshufb4_vpermq2 (d))
36397 return true;
36398
36399 return false;
36400 }
36401
36402 bool
36403 ix86_expand_vec_perm_const (rtx operands[4])
36404 {
36405 struct expand_vec_perm_d d;
36406 unsigned char perm[MAX_VECT_LEN];
36407 int i, nelt, which;
36408 rtx sel;
36409
36410 d.target = operands[0];
36411 d.op0 = operands[1];
36412 d.op1 = operands[2];
36413 sel = operands[3];
36414
36415 d.vmode = GET_MODE (d.target);
36416 gcc_assert (VECTOR_MODE_P (d.vmode));
36417 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
36418 d.testing_p = false;
36419
36420 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
36421 gcc_assert (XVECLEN (sel, 0) == nelt);
36422 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
36423
36424 for (i = which = 0; i < nelt; ++i)
36425 {
36426 rtx e = XVECEXP (sel, 0, i);
36427 int ei = INTVAL (e) & (2 * nelt - 1);
36428
36429 which |= (ei < nelt ? 1 : 2);
36430 d.perm[i] = ei;
36431 perm[i] = ei;
36432 }
36433
36434 switch (which)
36435 {
36436 default:
36437 gcc_unreachable ();
36438
36439 case 3:
36440 if (!rtx_equal_p (d.op0, d.op1))
36441 break;
36442
36443 /* The elements of PERM do not suggest that only the first operand
36444 is used, but both operands are identical. Allow easier matching
36445 of the permutation by folding the permutation into the single
36446 input vector. */
36447 for (i = 0; i < nelt; ++i)
36448 if (d.perm[i] >= nelt)
36449 d.perm[i] -= nelt;
36450 /* FALLTHRU */
36451
36452 case 1:
36453 d.op1 = d.op0;
36454 break;
36455
36456 case 2:
36457 for (i = 0; i < nelt; ++i)
36458 d.perm[i] -= nelt;
36459 d.op0 = d.op1;
36460 break;
36461 }
36462
36463 if (ix86_expand_vec_perm_const_1 (&d))
36464 return true;
36465
36466 /* If the mask says both arguments are needed, but they are the same,
36467 the above tried to expand with d.op0 == d.op1. If that didn't work,
36468 retry with d.op0 != d.op1 as that is what testing has been done with. */
36469 if (which == 3 && d.op0 == d.op1)
36470 {
36471 rtx seq;
36472 bool ok;
36473
36474 memcpy (d.perm, perm, sizeof (perm));
36475 d.op1 = gen_reg_rtx (d.vmode);
36476 start_sequence ();
36477 ok = ix86_expand_vec_perm_const_1 (&d);
36478 seq = get_insns ();
36479 end_sequence ();
36480 if (ok)
36481 {
36482 emit_move_insn (d.op1, d.op0);
36483 emit_insn (seq);
36484 return true;
36485 }
36486 }
36487
36488 return false;
36489 }
36490
36491 /* Implement targetm.vectorize.vec_perm_const_ok. */
36492
36493 static bool
36494 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
36495 const unsigned char *sel)
36496 {
36497 struct expand_vec_perm_d d;
36498 unsigned int i, nelt, which;
36499 bool ret, one_vec;
36500
36501 d.vmode = vmode;
36502 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
36503 d.testing_p = true;
36504
36505 /* Given sufficient ISA support we can just return true here
36506 for selected vector modes. */
36507 if (GET_MODE_SIZE (d.vmode) == 16)
36508 {
36509 /* All implementable with a single vpperm insn. */
36510 if (TARGET_XOP)
36511 return true;
36512 /* All implementable with 2 pshufb + 1 ior. */
36513 if (TARGET_SSSE3)
36514 return true;
36515 /* All implementable with shufpd or unpck[lh]pd. */
36516 if (d.nelt == 2)
36517 return true;
36518 }
36519
36520 /* Extract the values from the vector CST into the permutation
36521 array in D. */
36522 memcpy (d.perm, sel, nelt);
36523 for (i = which = 0; i < nelt; ++i)
36524 {
36525 unsigned char e = d.perm[i];
36526 gcc_assert (e < 2 * nelt);
36527 which |= (e < nelt ? 1 : 2);
36528 }
36529
36530 /* If all elements come from the second vector, fold them to the first. */
36531 if (which == 2)
36532 for (i = 0; i < nelt; ++i)
36533 d.perm[i] -= nelt;
36534
36535 /* Check whether the mask can be applied to the vector type. */
36536 one_vec = (which != 3);
36537
36538 /* Implementable with shufps or pshufd. */
36539 if (one_vec && (d.vmode == V4SFmode || d.vmode == V4SImode))
36540 return true;
36541
36542 /* Otherwise we have to go through the motions and see if we can
36543 figure out how to generate the requested permutation. */
36544 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
36545 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
36546 if (!one_vec)
36547 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
36548
36549 start_sequence ();
36550 ret = ix86_expand_vec_perm_const_1 (&d);
36551 end_sequence ();
36552
36553 return ret;
36554 }
36555
36556 void
36557 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
36558 {
36559 struct expand_vec_perm_d d;
36560 unsigned i, nelt;
36561
36562 d.target = targ;
36563 d.op0 = op0;
36564 d.op1 = op1;
36565 d.vmode = GET_MODE (targ);
36566 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
36567 d.testing_p = false;
36568
36569 for (i = 0; i < nelt; ++i)
36570 d.perm[i] = i * 2 + odd;
36571
36572 /* We'll either be able to implement the permutation directly... */
36573 if (expand_vec_perm_1 (&d))
36574 return;
36575
36576 /* ... or we use the special-case patterns. */
36577 expand_vec_perm_even_odd_1 (&d, odd);
36578 }
36579
36580 /* Expand an insert into a vector register through pinsr insn.
36581 Return true if successful. */
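/* For example, inserting a 16-bit value at bit position 32 of a V8HImode
   destination maps to the sse2_pinsrw pattern with selector
   GEN_INT (1 << 2), i.e. element 2; byte, dword and qword inserts require
   SSE4.1 and use the pinsrb/pinsrd/pinsrq patterns instead. */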
36582
36583 bool
36584 ix86_expand_pinsr (rtx *operands)
36585 {
36586 rtx dst = operands[0];
36587 rtx src = operands[3];
36588
36589 unsigned int size = INTVAL (operands[1]);
36590 unsigned int pos = INTVAL (operands[2]);
36591
36592 if (GET_CODE (dst) == SUBREG)
36593 {
36594 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
36595 dst = SUBREG_REG (dst);
36596 }
36597
36598 if (GET_CODE (src) == SUBREG)
36599 src = SUBREG_REG (src);
36600
36601 switch (GET_MODE (dst))
36602 {
36603 case V16QImode:
36604 case V8HImode:
36605 case V4SImode:
36606 case V2DImode:
36607 {
36608 enum machine_mode srcmode, dstmode;
36609 rtx (*pinsr)(rtx, rtx, rtx, rtx);
36610
36611 srcmode = mode_for_size (size, MODE_INT, 0);
36612
36613 switch (srcmode)
36614 {
36615 case QImode:
36616 if (!TARGET_SSE4_1)
36617 return false;
36618 dstmode = V16QImode;
36619 pinsr = gen_sse4_1_pinsrb;
36620 break;
36621
36622 case HImode:
36623 if (!TARGET_SSE2)
36624 return false;
36625 dstmode = V8HImode;
36626 pinsr = gen_sse2_pinsrw;
36627 break;
36628
36629 case SImode:
36630 if (!TARGET_SSE4_1)
36631 return false;
36632 dstmode = V4SImode;
36633 pinsr = gen_sse4_1_pinsrd;
36634 break;
36635
36636 case DImode:
36637 gcc_assert (TARGET_64BIT);
36638 if (!TARGET_SSE4_1)
36639 return false;
36640 dstmode = V2DImode;
36641 pinsr = gen_sse4_1_pinsrq;
36642 break;
36643
36644 default:
36645 return false;
36646 }
36647
36648 dst = gen_lowpart (dstmode, dst);
36649 src = gen_lowpart (srcmode, src);
36650
36651 pos /= size;
36652
36653 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
36654 return true;
36655 }
36656
36657 default:
36658 return false;
36659 }
36660 }
36661 \f
36662 /* This function returns the calling-ABI-specific va_list type node.
36663 It returns the FNDECL-specific va_list type. */
36664
36665 static tree
36666 ix86_fn_abi_va_list (tree fndecl)
36667 {
36668 if (!TARGET_64BIT)
36669 return va_list_type_node;
36670 gcc_assert (fndecl != NULL_TREE);
36671
36672 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
36673 return ms_va_list_type_node;
36674 else
36675 return sysv_va_list_type_node;
36676 }
36677
36678 /* Returns the canonical va_list type specified by TYPE. If there
36679 is no valid TYPE provided, it returns NULL_TREE. */
36680
36681 static tree
36682 ix86_canonical_va_list_type (tree type)
36683 {
36684 tree wtype, htype;
36685
36686 /* Resolve references and pointers to va_list type. */
36687 if (TREE_CODE (type) == MEM_REF)
36688 type = TREE_TYPE (type);
36689 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
36690 type = TREE_TYPE (type);
36691 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
36692 type = TREE_TYPE (type);
36693
36694 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
36695 {
36696 wtype = va_list_type_node;
36697 gcc_assert (wtype != NULL_TREE);
36698 htype = type;
36699 if (TREE_CODE (wtype) == ARRAY_TYPE)
36700 {
36701 /* If va_list is an array type, the argument may have decayed
36702 to a pointer type, e.g. by being passed to another function.
36703 In that case, unwrap both types so that we can compare the
36704 underlying records. */
36705 if (TREE_CODE (htype) == ARRAY_TYPE
36706 || POINTER_TYPE_P (htype))
36707 {
36708 wtype = TREE_TYPE (wtype);
36709 htype = TREE_TYPE (htype);
36710 }
36711 }
36712 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
36713 return va_list_type_node;
36714 wtype = sysv_va_list_type_node;
36715 gcc_assert (wtype != NULL_TREE);
36716 htype = type;
36717 if (TREE_CODE (wtype) == ARRAY_TYPE)
36718 {
36719 /* If va_list is an array type, the argument may have decayed
36720 to a pointer type, e.g. by being passed to another function.
36721 In that case, unwrap both types so that we can compare the
36722 underlying records. */
36723 if (TREE_CODE (htype) == ARRAY_TYPE
36724 || POINTER_TYPE_P (htype))
36725 {
36726 wtype = TREE_TYPE (wtype);
36727 htype = TREE_TYPE (htype);
36728 }
36729 }
36730 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
36731 return sysv_va_list_type_node;
36732 wtype = ms_va_list_type_node;
36733 gcc_assert (wtype != NULL_TREE);
36734 htype = type;
36735 if (TREE_CODE (wtype) == ARRAY_TYPE)
36736 {
36737 /* If va_list is an array type, the argument may have decayed
36738 to a pointer type, e.g. by being passed to another function.
36739 In that case, unwrap both types so that we can compare the
36740 underlying records. */
36741 if (TREE_CODE (htype) == ARRAY_TYPE
36742 || POINTER_TYPE_P (htype))
36743 {
36744 wtype = TREE_TYPE (wtype);
36745 htype = TREE_TYPE (htype);
36746 }
36747 }
36748 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
36749 return ms_va_list_type_node;
36750 return NULL_TREE;
36751 }
36752 return std_canonical_va_list_type (type);
36753 }
36754
36755 /* Iterate through the target-specific builtin types for va_list.
36756 IDX denotes the iterator, *PTREE is set to the result type of
36757 the va_list builtin, and *PNAME to its internal type.
36758 Returns zero if there is no element for this index, otherwise
36759 IDX should be increased upon the next call.
36760 Note, do not iterate a base builtin's name like __builtin_va_list.
36761 Used from c_common_nodes_and_builtins. */
36762
36763 static int
36764 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
36765 {
36766 if (TARGET_64BIT)
36767 {
36768 switch (idx)
36769 {
36770 default:
36771 break;
36772
36773 case 0:
36774 *ptree = ms_va_list_type_node;
36775 *pname = "__builtin_ms_va_list";
36776 return 1;
36777
36778 case 1:
36779 *ptree = sysv_va_list_type_node;
36780 *pname = "__builtin_sysv_va_list";
36781 return 1;
36782 }
36783 }
36784
36785 return 0;
36786 }
36787
36788 #undef TARGET_SCHED_DISPATCH
36789 #define TARGET_SCHED_DISPATCH has_dispatch
36790 #undef TARGET_SCHED_DISPATCH_DO
36791 #define TARGET_SCHED_DISPATCH_DO do_dispatch
36792 #undef TARGET_SCHED_REASSOCIATION_WIDTH
36793 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
36794
36795 /* The size of the dispatch window is the total number of bytes of
36796 object code allowed in a window. */
36797 #define DISPATCH_WINDOW_SIZE 16
36798
36799 /* Number of dispatch windows considered for scheduling. */
36800 #define MAX_DISPATCH_WINDOWS 3
36801
36802 /* Maximum number of instructions in a window. */
36803 #define MAX_INSN 4
36804
36805 /* Maximum number of immediate operands in a window. */
36806 #define MAX_IMM 4
36807
36808 /* Maximum number of immediate bits allowed in a window. */
36809 #define MAX_IMM_SIZE 128
36810
36811 /* Maximum number of 32 bit immediates allowed in a window. */
36812 #define MAX_IMM_32 4
36813
36814 /* Maximum number of 64 bit immediates allowed in a window. */
36815 #define MAX_IMM_64 2
36816
36817 /* Maximum total of loads or prefetches allowed in a window. */
36818 #define MAX_LOAD 2
36819
36820 /* Maximum total of stores allowed in a window. */
36821 #define MAX_STORE 1
36822
36823 #undef BIG
36824 #define BIG 100
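/* Taken together, the limits above say a window may hold at most 4 insns
   in at most 16 bytes of code, with at most 2 loads, 1 store, 4 immediate
   operands and 128 bits of immediate data (at most four 32-bit or two
   64-bit immediates). */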
36825
36826
36827 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
36828 enum dispatch_group {
36829 disp_no_group = 0,
36830 disp_load,
36831 disp_store,
36832 disp_load_store,
36833 disp_prefetch,
36834 disp_imm,
36835 disp_imm_32,
36836 disp_imm_64,
36837 disp_branch,
36838 disp_cmp,
36839 disp_jcc,
36840 disp_last
36841 };
36842
36843 /* Number of allowable groups in a dispatch window. It is an array
36844 indexed by the dispatch_group enum. 100 is used as a big number,
36845 because the count of these kinds of operations has no effect on
36846 the dispatch window, but we still need entries for them in
36847 the table. */
36848 static unsigned int num_allowable_groups[disp_last] = {
36849 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
36850 };
36851
36852 char group_name[disp_last + 1][16] = {
36853 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
36854 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
36855 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
36856 };
36857
36858 /* Instruction path. */
36859 enum insn_path {
36860 no_path = 0,
36861 path_single, /* Single micro op. */
36862 path_double, /* Double micro op. */
36863 path_multi, /* Instructions with more than 2 micro ops. */
36864 last_path
36865 };
36866
36867 /* sched_insn_info describes one entry in the window of instructions
36868 scheduled in the basic block: the scheduled insn together with its
36869 dispatch group, decode path and size information.
36870
36871 Windows are allocated for each basic block and are linked
36872 together. */
36873 typedef struct sched_insn_info_s {
36874 rtx insn;
36875 enum dispatch_group group;
36876 enum insn_path path;
36877 int byte_len;
36878 int imm_bytes;
36879 } sched_insn_info;
36880
36881 /* Linked list of dispatch windows. This is a two way list of
36882 dispatch windows of a basic block. It contains information about
36883 the number of uops in the window and the total number of
36884 instructions and of bytes in the object code for this dispatch
36885 window. */
36886 typedef struct dispatch_windows_s {
36887 int num_insn; /* Number of insn in the window. */
36888 int num_uops; /* Number of uops in the window. */
36889 int window_size; /* Number of bytes in the window. */
36890 int window_num; /* Window number, either 0 or 1. */
36891 int num_imm; /* Number of immediates in an insn. */
36892 int num_imm_32; /* Number of 32 bit immediates in an insn. */
36893 int num_imm_64; /* Number of 64 bit immediates in an insn. */
36894 int imm_size; /* Total immediates in the window. */
36895 int num_loads; /* Total memory loads in the window. */
36896 int num_stores; /* Total memory stores in the window. */
36897 int violation; /* Violation exists in window. */
36898 sched_insn_info *window; /* Pointer to the window. */
36899 struct dispatch_windows_s *next;
36900 struct dispatch_windows_s *prev;
36901 } dispatch_windows;
36902
36903 /* Immediate values used in an insn. */
36904 typedef struct imm_info_s
36905 {
36906 int imm;
36907 int imm32;
36908 int imm64;
36909 } imm_info;
36910
36911 static dispatch_windows *dispatch_window_list;
36912 static dispatch_windows *dispatch_window_list1;
36913
36914 /* Get dispatch group of insn. */
36915
36916 static enum dispatch_group
36917 get_mem_group (rtx insn)
36918 {
36919 enum attr_memory memory;
36920
36921 if (INSN_CODE (insn) < 0)
36922 return disp_no_group;
36923 memory = get_attr_memory (insn);
36924 if (memory == MEMORY_STORE)
36925 return disp_store;
36926
36927 if (memory == MEMORY_LOAD)
36928 return disp_load;
36929
36930 if (memory == MEMORY_BOTH)
36931 return disp_load_store;
36932
36933 return disp_no_group;
36934 }
36935
36936 /* Return true if insn is a compare instruction. */
36937
36938 static bool
36939 is_cmp (rtx insn)
36940 {
36941 enum attr_type type;
36942
36943 type = get_attr_type (insn);
36944 return (type == TYPE_TEST
36945 || type == TYPE_ICMP
36946 || type == TYPE_FCMP
36947 || GET_CODE (PATTERN (insn)) == COMPARE);
36948 }
36949
36950 /* Return true if a dispatch violation was encountered. */
36951
36952 static bool
36953 dispatch_violation (void)
36954 {
36955 if (dispatch_window_list->next)
36956 return dispatch_window_list->next->violation;
36957 return dispatch_window_list->violation;
36958 }
36959
36960 /* Return true if insn is a branch instruction. */
36961
36962 static bool
36963 is_branch (rtx insn)
36964 {
36965 return (CALL_P (insn) || JUMP_P (insn));
36966 }
36967
36968 /* Return true if insn is a prefetch instruction. */
36969
36970 static bool
36971 is_prefetch (rtx insn)
36972 {
36973 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
36974 }
36975
36976 /* This function initializes a dispatch window and the list container holding a
36977 pointer to the window. */
36978
36979 static void
36980 init_window (int window_num)
36981 {
36982 int i;
36983 dispatch_windows *new_list;
36984
36985 if (window_num == 0)
36986 new_list = dispatch_window_list;
36987 else
36988 new_list = dispatch_window_list1;
36989
36990 new_list->num_insn = 0;
36991 new_list->num_uops = 0;
36992 new_list->window_size = 0;
36993 new_list->next = NULL;
36994 new_list->prev = NULL;
36995 new_list->window_num = window_num;
36996 new_list->num_imm = 0;
36997 new_list->num_imm_32 = 0;
36998 new_list->num_imm_64 = 0;
36999 new_list->imm_size = 0;
37000 new_list->num_loads = 0;
37001 new_list->num_stores = 0;
37002 new_list->violation = false;
37003
37004 for (i = 0; i < MAX_INSN; i++)
37005 {
37006 new_list->window[i].insn = NULL;
37007 new_list->window[i].group = disp_no_group;
37008 new_list->window[i].path = no_path;
37009 new_list->window[i].byte_len = 0;
37010 new_list->window[i].imm_bytes = 0;
37011 }
37012 return;
37013 }
37014
37015 /* This function allocates and initializes a dispatch window and the
37016 list container holding a pointer to the window. */
37017
37018 static dispatch_windows *
37019 allocate_window (void)
37020 {
37021 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
37022 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
37023
37024 return new_list;
37025 }
37026
37027 /* This routine initializes the dispatch scheduling information. It
37028 initiates building dispatch scheduler tables and constructs the
37029 first dispatch window. */
37030
37031 static void
37032 init_dispatch_sched (void)
37033 {
37034 /* Allocate a dispatch list and a window. */
37035 dispatch_window_list = allocate_window ();
37036 dispatch_window_list1 = allocate_window ();
37037 init_window (0);
37038 init_window (1);
37039 }
37040
37041 /* This function returns true if a branch is detected. End of a basic block
37042 does not have to be a branch, but here we assume only branches end a
37043 window. */
37044
37045 static bool
37046 is_end_basic_block (enum dispatch_group group)
37047 {
37048 return group == disp_branch;
37049 }
37050
37051 /* This function is called when the end of a window processing is reached. */
37052
37053 static void
37054 process_end_window (void)
37055 {
37056 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
37057 if (dispatch_window_list->next)
37058 {
37059 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
37060 gcc_assert (dispatch_window_list->window_size
37061 + dispatch_window_list1->window_size <= 48);
37062 init_window (1);
37063 }
37064 init_window (0);
37065 }
37066
37067 /* Allocate a new dispatch window and add it to the global list of
37068 windows. WINDOW_NUM is either 0 or 1. A maximum of two windows are
37069 generated for 48 bytes of instructions. Note that these windows are
37070 not dispatch windows whose size is DISPATCH_WINDOW_SIZE. */
37071
37072 static dispatch_windows *
37073 allocate_next_window (int window_num)
37074 {
37075 if (window_num == 0)
37076 {
37077 if (dispatch_window_list->next)
37078 init_window (1);
37079 init_window (0);
37080 return dispatch_window_list;
37081 }
37082
37083 dispatch_window_list->next = dispatch_window_list1;
37084 dispatch_window_list1->prev = dispatch_window_list;
37085
37086 return dispatch_window_list1;
37087 }
37088
37089 /* Count the immediate operand at *IN_RTX, if any, in IMM_VALUES. */
37090
37091 static int
37092 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
37093 {
37094 if (*in_rtx == 0)
37095 return 0;
37096
37097 switch (GET_CODE (*in_rtx))
37098 {
37099 case CONST:
37100 case SYMBOL_REF:
37101 case CONST_INT:
37102 (imm_values->imm)++;
37103 if (x86_64_immediate_operand (*in_rtx, SImode))
37104 (imm_values->imm32)++;
37105 else
37106 (imm_values->imm64)++;
37107 break;
37108
37109 case CONST_DOUBLE:
37110 (imm_values->imm)++;
37111 (imm_values->imm64)++;
37112 break;
37113
37114 case CODE_LABEL:
37115 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
37116 {
37117 (imm_values->imm)++;
37118 (imm_values->imm32)++;
37119 }
37120 break;
37121
37122 default:
37123 break;
37124 }
37125
37126 return 0;
37127 }
37128
37129 /* Compute number of immediate operands of an instruction. */
37130
37131 static void
37132 find_constant (rtx in_rtx, imm_info *imm_values)
37133 {
37134 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
37135 (rtx_function) find_constant_1, (void *) imm_values);
37136 }
37137
37138 /* Return the total size of the immediate operands of an instruction
37139 along with the number of corresponding immediate operands. The counts
37140 are initialized to zero before calling FIND_CONSTANT.
37141 INSN is the input instruction. IMM is the total number of immediates.
37142 IMM32 is the number of 32-bit immediates. IMM64 is the number of
37143 64-bit immediates. */
37144
37145 static int
37146 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
37147 {
37148 imm_info imm_values = {0, 0, 0};
37149
37150 find_constant (insn, &imm_values);
37151 *imm = imm_values.imm;
37152 *imm32 = imm_values.imm32;
37153 *imm64 = imm_values.imm64;
37154 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
37155 }
37156
37157 /* This function indicates whether INSN has at least one immediate
37158 operand. */
37159
37160 static bool
37161 has_immediate (rtx insn)
37162 {
37163 int num_imm_operand;
37164 int num_imm32_operand;
37165 int num_imm64_operand;
37166
37167 if (insn)
37168 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
37169 &num_imm64_operand);
37170 return false;
37171 }
37172
37173 /* Return the decode path (single, double or multi) for INSN. */
37174
37175 static enum insn_path
37176 get_insn_path (rtx insn)
37177 {
37178 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
37179
37180 if ((int)path == 0)
37181 return path_single;
37182
37183 if ((int)path == 1)
37184 return path_double;
37185
37186 return path_multi;
37187 }
37188
37189 /* Return insn dispatch group. */
37190
37191 static enum dispatch_group
37192 get_insn_group (rtx insn)
37193 {
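/* Memory groups (load, store, load_store) take precedence, followed by
branch, compare, immediate and prefetch classification. */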
37194 enum dispatch_group group = get_mem_group (insn);
37195 if (group)
37196 return group;
37197
37198 if (is_branch (insn))
37199 return disp_branch;
37200
37201 if (is_cmp (insn))
37202 return disp_cmp;
37203
37204 if (has_immediate (insn))
37205 return disp_imm;
37206
37207 if (is_prefetch (insn))
37208 return disp_prefetch;
37209
37210 return disp_no_group;
37211 }
37212
37213 /* Return 0 if INSN is not in a restricted dispatch group, BIG if adding
37214 it to window WINDOW_LIST would exceed a limit of its group, and 1 otherwise. */
37215
37216 static int
37217 count_num_restricted (rtx insn, dispatch_windows *window_list)
37218 {
37219 enum dispatch_group group = get_insn_group (insn);
37220 int imm_size;
37221 int num_imm_operand;
37222 int num_imm32_operand;
37223 int num_imm64_operand;
37224
37225 if (group == disp_no_group)
37226 return 0;
37227
37228 if (group == disp_imm)
37229 {
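/* Check the immediate budget of the window: the total number of
immediates, the 32-bit and 64-bit immediate slots (a 64-bit immediate
occupies two 32-bit slots) and the total immediate size in bytes must
all stay within their limits; otherwise BIG is returned. */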
37230 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
37231 &num_imm64_operand);
37232 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
37233 || num_imm_operand + window_list->num_imm > MAX_IMM
37234 || (num_imm32_operand > 0
37235 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
37236 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
37237 || (num_imm64_operand > 0
37238 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
37239 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
37240 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
37241 && num_imm64_operand > 0
37242 && ((window_list->num_imm_64 > 0
37243 && window_list->num_insn >= 2)
37244 || window_list->num_insn >= 3)))
37245 return BIG;
37246
37247 return 1;
37248 }
37249
37250 if ((group == disp_load_store
37251 && (window_list->num_loads >= MAX_LOAD
37252 || window_list->num_stores >= MAX_STORE))
37253 || ((group == disp_load
37254 || group == disp_prefetch)
37255 && window_list->num_loads >= MAX_LOAD)
37256 || (group == disp_store
37257 && window_list->num_stores >= MAX_STORE))
37258 return BIG;
37259
37260 return 1;
37261 }
37262
37263 /* This function returns true if insn satisfies dispatch rules on the
37264 last window scheduled. */
37265
37266 static bool
37267 fits_dispatch_window (rtx insn)
37268 {
37269 dispatch_windows *window_list = dispatch_window_list;
37270 dispatch_windows *window_list_next = dispatch_window_list->next;
37271 unsigned int num_restrict;
37272 enum dispatch_group group = get_insn_group (insn);
37273 enum insn_path path = get_insn_path (insn);
37274 int sum;
37275
37276 /* Make disp_cmp and disp_jcc get scheduled as late as possible. These
37277 instructions should be given the lowest priority in the Haifa
37278 scheduler so that they end up in the same dispatch window as the
37279 instructions that reference them. */
37280 if (group == disp_jcc || group == disp_cmp)
37281 return false;
37282
37283 /* Check nonrestricted. */
37284 if (group == disp_no_group || group == disp_branch)
37285 return true;
37286
37287 /* Get last dispatch window. */
37288 if (window_list_next)
37289 window_list = window_list_next;
37290
37291 if (window_list->window_num == 1)
37292 {
37293 sum = window_list->prev->window_size + window_list->window_size;
37294
37295 if (sum == 32
37296 || (min_insn_size (insn) + sum) >= 48)
37297 /* Window 1 is full. Go for next window. */
37298 return true;
37299 }
37300
37301 num_restrict = count_num_restricted (insn, window_list);
37302
37303 if (num_restrict > num_allowable_groups[group])
37304 return false;
37305
37306 /* See if it fits in the first window. */
37307 if (window_list->window_num == 0)
37308 {
37309 /* The first window should have only single- and double-path
37310 uops. */
37311 if (path == path_double
37312 && (window_list->num_uops + 2) > MAX_INSN)
37313 return false;
37314 else if (path != path_single)
37315 return false;
37316 }
37317 return true;
37318 }
37319
37320 /* Add an instruction INSN with NUM_UOPS micro-operations to the
37321 dispatch window WINDOW_LIST. */
37322
37323 static void
37324 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
37325 {
37326 int byte_len = min_insn_size (insn);
37327 int num_insn = window_list->num_insn;
37328 int imm_size;
37329 sched_insn_info *window = window_list->window;
37330 enum dispatch_group group = get_insn_group (insn);
37331 enum insn_path path = get_insn_path (insn);
37332 int num_imm_operand;
37333 int num_imm32_operand;
37334 int num_imm64_operand;
37335
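/* Record a dispatch violation if the insn does not fit the window rules.
disp_cmp insns are excluded because fits_dispatch_window rejects them
unconditionally. */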
37336 if (!window_list->violation && group != disp_cmp
37337 && !fits_dispatch_window (insn))
37338 window_list->violation = true;
37339
37340 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
37341 &num_imm64_operand);
37342
37343 /* Initialize window with new instruction. */
37344 window[num_insn].insn = insn;
37345 window[num_insn].byte_len = byte_len;
37346 window[num_insn].group = group;
37347 window[num_insn].path = path;
37348 window[num_insn].imm_bytes = imm_size;
37349
37350 window_list->window_size += byte_len;
37351 window_list->num_insn = num_insn + 1;
37352 window_list->num_uops = window_list->num_uops + num_uops;
37353 window_list->imm_size += imm_size;
37354 window_list->num_imm += num_imm_operand;
37355 window_list->num_imm_32 += num_imm32_operand;
37356 window_list->num_imm_64 += num_imm64_operand;
37357
37358 if (group == disp_store)
37359 window_list->num_stores += 1;
37360 else if (group == disp_load
37361 || group == disp_prefetch)
37362 window_list->num_loads += 1;
37363 else if (group == disp_load_store)
37364 {
37365 window_list->num_stores += 1;
37366 window_list->num_loads += 1;
37367 }
37368 }
37369
37370 /* Add a scheduled instruction, INSN, to the current dispatch window.
37371 If the total bytes or the number of instructions in the window would
37372 exceed the allowable limits, a new window is started. */
37373
37374 static void
37375 add_to_dispatch_window (rtx insn)
37376 {
37377 int byte_len;
37378 dispatch_windows *window_list;
37379 dispatch_windows *next_list;
37380 dispatch_windows *window0_list;
37381 enum insn_path path;
37382 enum dispatch_group insn_group;
37383 bool insn_fits;
37384 int num_insn;
37385 int num_uops;
37386 int window_num;
37387 int insn_num_uops;
37388 int sum;
37389
37390 if (INSN_CODE (insn) < 0)
37391 return;
37392
37393 byte_len = min_insn_size (insn);
37394 window_list = dispatch_window_list;
37395 next_list = window_list->next;
37396 path = get_insn_path (insn);
37397 insn_group = get_insn_group (insn);
37398
37399 /* Get the last dispatch window. */
37400 if (next_list)
37401 window_list = dispatch_window_list->next;
37402
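/* Derive the uop count from the decode path: one uop for a single-path
insn, two for a double-path insn, otherwise fall back to the numeric
path value. */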
37403 if (path == path_single)
37404 insn_num_uops = 1;
37405 else if (path == path_double)
37406 insn_num_uops = 2;
37407 else
37408 insn_num_uops = (int) path;
37409
37410 /* If the current window is full, get a new window.
37411 Window number zero is full if MAX_INSN uops have been scheduled in it.
37412 Window number one is full if the bytes of window zero plus window one
37413 add up to 32, if adding the bytes of the new instruction would push
37414 the total to 48 or more, or if it already holds MAX_INSN
37415 instructions. */
37416 num_insn = window_list->num_insn;
37417 num_uops = window_list->num_uops;
37418 window_num = window_list->window_num;
37419 insn_fits = fits_dispatch_window (insn);
37420
37421 if (num_insn >= MAX_INSN
37422 || num_uops + insn_num_uops > MAX_INSN
37423 || !(insn_fits))
37424 {
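/* The current window is full or the insn does not fit; switch to the
other window (0 <-> 1) and set it up. */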
37425 window_num = ~window_num & 1;
37426 window_list = allocate_next_window (window_num);
37427 }
37428
37429 if (window_num == 0)
37430 {
37431 add_insn_window (insn, window_list, insn_num_uops);
37432 if (window_list->num_insn >= MAX_INSN
37433 && insn_group == disp_branch)
37434 {
37435 process_end_window ();
37436 return;
37437 }
37438 }
37439 else if (window_num == 1)
37440 {
37441 window0_list = window_list->prev;
37442 sum = window0_list->window_size + window_list->window_size;
37443 if (sum == 32
37444 || (byte_len + sum) >= 48)
37445 {
37446 process_end_window ();
37447 window_list = dispatch_window_list;
37448 }
37449
37450 add_insn_window (insn, window_list, insn_num_uops);
37451 }
37452 else
37453 gcc_unreachable ();
37454
37455 if (is_end_basic_block (insn_group))
37456 {
37457 /* The end of a basic block has been reached; do the end-of-basic-block processing. */
37458 process_end_window ();
37459 return;
37460 }
37461 }
37462
37463 /* Print the dispatch window, WINDOW_NUM, to FILE. */
37464
37465 DEBUG_FUNCTION static void
37466 debug_dispatch_window_file (FILE *file, int window_num)
37467 {
37468 dispatch_windows *list;
37469 int i;
37470
37471 if (window_num == 0)
37472 list = dispatch_window_list;
37473 else
37474 list = dispatch_window_list1;
37475
37476 fprintf (file, "Window #%d:\n", list->window_num);
37477 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
37478 list->num_insn, list->num_uops, list->window_size);
37479 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
37480 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
37481
37482 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
37483 list->num_stores);
37484 fprintf (file, " insn info:\n");
37485
37486 for (i = 0; i < MAX_INSN; i++)
37487 {
37488 if (!list->window[i].insn)
37489 break;
37490 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
37491 i, group_name[list->window[i].group],
37492 i, (void *)list->window[i].insn,
37493 i, list->window[i].path,
37494 i, list->window[i].byte_len,
37495 i, list->window[i].imm_bytes);
37496 }
37497 }
37498
37499 /* Print to stdout a dispatch window. */
37500
37501 DEBUG_FUNCTION void
37502 debug_dispatch_window (int window_num)
37503 {
37504 debug_dispatch_window_file (stdout, window_num);
37505 }
37506
37507 /* Print INSN dispatch information to FILE. */
37508
37509 DEBUG_FUNCTION static void
37510 debug_insn_dispatch_info_file (FILE *file, rtx insn)
37511 {
37512 int byte_len;
37513 enum insn_path path;
37514 enum dispatch_group group;
37515 int imm_size;
37516 int num_imm_operand;
37517 int num_imm32_operand;
37518 int num_imm64_operand;
37519
37520 if (INSN_CODE (insn) < 0)
37521 return;
37522
37523 byte_len = min_insn_size (insn);
37524 path = get_insn_path (insn);
37525 group = get_insn_group (insn);
37526 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
37527 &num_imm64_operand);
37528
37529 fprintf (file, " insn info:\n");
37530 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
37531 group_name[group], path, byte_len);
37532 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
37533 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
37534 }
37535
37536 /* Print to stdout the status of the ready list with respect to
37537 dispatch windows. */
37538
37539 DEBUG_FUNCTION void
37540 debug_ready_dispatch (void)
37541 {
37542 int i;
37543 int no_ready = number_in_ready ();
37544
37545 fprintf (stdout, "Number of ready: %d\n", no_ready);
37546
37547 for (i = 0; i < no_ready; i++)
37548 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
37549 }
37550
37551 /* This routine is the driver of the dispatch scheduler. */
37552
37553 static void
37554 do_dispatch (rtx insn, int mode)
37555 {
37556 if (mode == DISPATCH_INIT)
37557 init_dispatch_sched ();
37558 else if (mode == ADD_TO_DISPATCH_WINDOW)
37559 add_to_dispatch_window (insn);
37560 }
37561
37562 /* Answer the dispatch-scheduling query ACTION for INSN; return FALSE if dispatch scheduling is not enabled. */
37563
37564 static bool
37565 has_dispatch (rtx insn, int action)
37566 {
37567 if ((ix86_tune == PROCESSOR_BDVER1 || ix86_tune == PROCESSOR_BDVER2)
37568 && flag_dispatch_scheduler)
37569 switch (action)
37570 {
37571 default:
37572 return false;
37573
37574 case IS_DISPATCH_ON:
37575 return true;
37577
37578 case IS_CMP:
37579 return is_cmp (insn);
37580
37581 case DISPATCH_VIOLATION:
37582 return dispatch_violation ();
37583
37584 case FITS_DISPATCH_WINDOW:
37585 return fits_dispatch_window (insn);
37586 }
37587
37588 return false;
37589 }
37590
37591 /* Implementation of the reassociation_width target hook, used by the
37592 reassoc phase to identify the level of parallelism in a reassociated
37593 tree. The statement's tree_code is passed in OPC and the type of
37594 its arguments in MODE.
37595
37596 Currently parallel reassociation is enabled only for Atom
37597 processors, and we set the reassociation width to 2 because Atom
37598 may issue up to 2 instructions per cycle.
37599
37600 The return value should be adjusted if parallel reassociation is
37601 enabled for other processors. */
37602
37603 static int
37604 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
37605 enum machine_mode mode)
37606 {
37607 int res = 1;
37608
37609 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
37610 res = 2;
37611 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
37612 res = 2;
37613
37614 return res;
37615 }
37616
37617 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
37618 place emms and femms instructions. */
37619
37620 static enum machine_mode
37621 ix86_preferred_simd_mode (enum machine_mode mode)
37622 {
37623 if (!TARGET_SSE)
37624 return word_mode;
37625
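/* Choose the widest vector mode available for the scalar MODE: 256-bit
vectors with AVX2 (or AVX for float modes) when 128-bit AVX is not
preferred, otherwise 128-bit SSE vectors. */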
37626 switch (mode)
37627 {
37628 case QImode:
37629 return TARGET_AVX2 ? V32QImode : V16QImode;
37630 case HImode:
37631 return TARGET_AVX2 ? V16HImode : V8HImode;
37632 case SImode:
37633 return TARGET_AVX2 ? V8SImode : V4SImode;
37634 case DImode:
37635 return TARGET_AVX2 ? V4DImode : V2DImode;
37636
37637 case SFmode:
37638 if (TARGET_AVX && !TARGET_PREFER_AVX128)
37639 return V8SFmode;
37640 else
37641 return V4SFmode;
37642
37643 case DFmode:
37644 if (!TARGET_VECTORIZE_DOUBLE)
37645 return word_mode;
37646 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
37647 return V4DFmode;
37648 else if (TARGET_SSE2)
37649 return V2DFmode;
37650 /* FALLTHRU */
37651
37652 default:
37653 return word_mode;
37654 }
37655 }
37656
37657 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
37658 vectors. */
37659
37660 static unsigned int
37661 ix86_autovectorize_vector_sizes (void)
37662 {
37663 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
37664 }
37665
37666 /* Initialize the GCC target structure. */
37667 #undef TARGET_RETURN_IN_MEMORY
37668 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
37669
37670 #undef TARGET_LEGITIMIZE_ADDRESS
37671 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
37672
37673 #undef TARGET_ATTRIBUTE_TABLE
37674 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
37675 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
37676 # undef TARGET_MERGE_DECL_ATTRIBUTES
37677 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
37678 #endif
37679
37680 #undef TARGET_COMP_TYPE_ATTRIBUTES
37681 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
37682
37683 #undef TARGET_INIT_BUILTINS
37684 #define TARGET_INIT_BUILTINS ix86_init_builtins
37685 #undef TARGET_BUILTIN_DECL
37686 #define TARGET_BUILTIN_DECL ix86_builtin_decl
37687 #undef TARGET_EXPAND_BUILTIN
37688 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
37689
37690 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
37691 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
37692 ix86_builtin_vectorized_function
37693
37694 #undef TARGET_BUILTIN_RECIPROCAL
37695 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
37696
37697 #undef TARGET_ASM_FUNCTION_EPILOGUE
37698 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
37699
37700 #undef TARGET_ENCODE_SECTION_INFO
37701 #ifndef SUBTARGET_ENCODE_SECTION_INFO
37702 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
37703 #else
37704 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
37705 #endif
37706
37707 #undef TARGET_ASM_OPEN_PAREN
37708 #define TARGET_ASM_OPEN_PAREN ""
37709 #undef TARGET_ASM_CLOSE_PAREN
37710 #define TARGET_ASM_CLOSE_PAREN ""
37711
37712 #undef TARGET_ASM_BYTE_OP
37713 #define TARGET_ASM_BYTE_OP ASM_BYTE
37714
37715 #undef TARGET_ASM_ALIGNED_HI_OP
37716 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
37717 #undef TARGET_ASM_ALIGNED_SI_OP
37718 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
37719 #ifdef ASM_QUAD
37720 #undef TARGET_ASM_ALIGNED_DI_OP
37721 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
37722 #endif
37723
37724 #undef TARGET_PROFILE_BEFORE_PROLOGUE
37725 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
37726
37727 #undef TARGET_ASM_UNALIGNED_HI_OP
37728 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
37729 #undef TARGET_ASM_UNALIGNED_SI_OP
37730 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
37731 #undef TARGET_ASM_UNALIGNED_DI_OP
37732 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
37733
37734 #undef TARGET_PRINT_OPERAND
37735 #define TARGET_PRINT_OPERAND ix86_print_operand
37736 #undef TARGET_PRINT_OPERAND_ADDRESS
37737 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
37738 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
37739 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
37740 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
37741 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
37742
37743 #undef TARGET_SCHED_INIT_GLOBAL
37744 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
37745 #undef TARGET_SCHED_ADJUST_COST
37746 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
37747 #undef TARGET_SCHED_ISSUE_RATE
37748 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
37749 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
37750 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
37751 ia32_multipass_dfa_lookahead
37752
37753 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
37754 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
37755
37756 #ifdef HAVE_AS_TLS
37757 #undef TARGET_HAVE_TLS
37758 #define TARGET_HAVE_TLS true
37759 #endif
37760 #undef TARGET_CANNOT_FORCE_CONST_MEM
37761 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
37762 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
37763 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
37764
37765 #undef TARGET_DELEGITIMIZE_ADDRESS
37766 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
37767
37768 #undef TARGET_MS_BITFIELD_LAYOUT_P
37769 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
37770
37771 #if TARGET_MACHO
37772 #undef TARGET_BINDS_LOCAL_P
37773 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
37774 #endif
37775 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
37776 #undef TARGET_BINDS_LOCAL_P
37777 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
37778 #endif
37779
37780 #undef TARGET_ASM_OUTPUT_MI_THUNK
37781 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
37782 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
37783 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
37784
37785 #undef TARGET_ASM_FILE_START
37786 #define TARGET_ASM_FILE_START x86_file_start
37787
37788 #undef TARGET_OPTION_OVERRIDE
37789 #define TARGET_OPTION_OVERRIDE ix86_option_override
37790
37791 #undef TARGET_REGISTER_MOVE_COST
37792 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
37793 #undef TARGET_MEMORY_MOVE_COST
37794 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
37795 #undef TARGET_RTX_COSTS
37796 #define TARGET_RTX_COSTS ix86_rtx_costs
37797 #undef TARGET_ADDRESS_COST
37798 #define TARGET_ADDRESS_COST ix86_address_cost
37799
37800 #undef TARGET_FIXED_CONDITION_CODE_REGS
37801 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
37802 #undef TARGET_CC_MODES_COMPATIBLE
37803 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
37804
37805 #undef TARGET_MACHINE_DEPENDENT_REORG
37806 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
37807
37808 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
37809 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
37810
37811 #undef TARGET_BUILD_BUILTIN_VA_LIST
37812 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
37813
37814 #undef TARGET_ENUM_VA_LIST_P
37815 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
37816
37817 #undef TARGET_FN_ABI_VA_LIST
37818 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
37819
37820 #undef TARGET_CANONICAL_VA_LIST_TYPE
37821 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
37822
37823 #undef TARGET_EXPAND_BUILTIN_VA_START
37824 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
37825
37826 #undef TARGET_MD_ASM_CLOBBERS
37827 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
37828
37829 #undef TARGET_PROMOTE_PROTOTYPES
37830 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
37831 #undef TARGET_STRUCT_VALUE_RTX
37832 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
37833 #undef TARGET_SETUP_INCOMING_VARARGS
37834 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
37835 #undef TARGET_MUST_PASS_IN_STACK
37836 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
37837 #undef TARGET_FUNCTION_ARG_ADVANCE
37838 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
37839 #undef TARGET_FUNCTION_ARG
37840 #define TARGET_FUNCTION_ARG ix86_function_arg
37841 #undef TARGET_FUNCTION_ARG_BOUNDARY
37842 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
37843 #undef TARGET_PASS_BY_REFERENCE
37844 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
37845 #undef TARGET_INTERNAL_ARG_POINTER
37846 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
37847 #undef TARGET_UPDATE_STACK_BOUNDARY
37848 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
37849 #undef TARGET_GET_DRAP_RTX
37850 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
37851 #undef TARGET_STRICT_ARGUMENT_NAMING
37852 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
37853 #undef TARGET_STATIC_CHAIN
37854 #define TARGET_STATIC_CHAIN ix86_static_chain
37855 #undef TARGET_TRAMPOLINE_INIT
37856 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
37857 #undef TARGET_RETURN_POPS_ARGS
37858 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
37859
37860 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
37861 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
37862
37863 #undef TARGET_SCALAR_MODE_SUPPORTED_P
37864 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
37865
37866 #undef TARGET_VECTOR_MODE_SUPPORTED_P
37867 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
37868
37869 #undef TARGET_C_MODE_FOR_SUFFIX
37870 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
37871
37872 #ifdef HAVE_AS_TLS
37873 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
37874 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
37875 #endif
37876
37877 #ifdef SUBTARGET_INSERT_ATTRIBUTES
37878 #undef TARGET_INSERT_ATTRIBUTES
37879 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
37880 #endif
37881
37882 #undef TARGET_MANGLE_TYPE
37883 #define TARGET_MANGLE_TYPE ix86_mangle_type
37884
37885 #ifndef TARGET_MACHO
37886 #undef TARGET_STACK_PROTECT_FAIL
37887 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
37888 #endif
37889
37890 #undef TARGET_FUNCTION_VALUE
37891 #define TARGET_FUNCTION_VALUE ix86_function_value
37892
37893 #undef TARGET_FUNCTION_VALUE_REGNO_P
37894 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
37895
37896 #undef TARGET_PROMOTE_FUNCTION_MODE
37897 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
37898
37899 #undef TARGET_SECONDARY_RELOAD
37900 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
37901
37902 #undef TARGET_CLASS_MAX_NREGS
37903 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
37904
37905 #undef TARGET_PREFERRED_RELOAD_CLASS
37906 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
37907 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
37908 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
37909 #undef TARGET_CLASS_LIKELY_SPILLED_P
37910 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
37911
37912 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
37913 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
37914 ix86_builtin_vectorization_cost
37915 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
37916 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
37917 ix86_vectorize_vec_perm_const_ok
37918 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
37919 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
37920 ix86_preferred_simd_mode
37921 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
37922 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
37923 ix86_autovectorize_vector_sizes
37924
37925 #undef TARGET_SET_CURRENT_FUNCTION
37926 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
37927
37928 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
37929 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
37930
37931 #undef TARGET_OPTION_SAVE
37932 #define TARGET_OPTION_SAVE ix86_function_specific_save
37933
37934 #undef TARGET_OPTION_RESTORE
37935 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
37936
37937 #undef TARGET_OPTION_PRINT
37938 #define TARGET_OPTION_PRINT ix86_function_specific_print
37939
37940 #undef TARGET_CAN_INLINE_P
37941 #define TARGET_CAN_INLINE_P ix86_can_inline_p
37942
37943 #undef TARGET_EXPAND_TO_RTL_HOOK
37944 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
37945
37946 #undef TARGET_LEGITIMATE_ADDRESS_P
37947 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
37948
37949 #undef TARGET_LEGITIMATE_CONSTANT_P
37950 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
37951
37952 #undef TARGET_FRAME_POINTER_REQUIRED
37953 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
37954
37955 #undef TARGET_CAN_ELIMINATE
37956 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
37957
37958 #undef TARGET_EXTRA_LIVE_ON_ENTRY
37959 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
37960
37961 #undef TARGET_ASM_CODE_END
37962 #define TARGET_ASM_CODE_END ix86_code_end
37963
37964 #undef TARGET_CONDITIONAL_REGISTER_USAGE
37965 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
37966
37967 #if TARGET_MACHO
37968 #undef TARGET_INIT_LIBFUNCS
37969 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
37970 #endif
37971
37972 struct gcc_target targetm = TARGET_INITIALIZER;
37973 \f
37974 #include "gt-i386.h"