gcc/config/i386/i386.c
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
3 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
4 Free Software Foundation, Inc.
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
12
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "tm.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "tm_p.h"
29 #include "regs.h"
30 #include "hard-reg-set.h"
31 #include "insn-config.h"
32 #include "conditions.h"
33 #include "output.h"
34 #include "insn-codes.h"
35 #include "insn-attr.h"
36 #include "flags.h"
37 #include "except.h"
38 #include "function.h"
39 #include "recog.h"
40 #include "expr.h"
41 #include "optabs.h"
42 #include "diagnostic-core.h"
43 #include "toplev.h"
44 #include "basic-block.h"
45 #include "ggc.h"
46 #include "target.h"
47 #include "target-def.h"
48 #include "common/common-target.h"
49 #include "langhooks.h"
50 #include "cgraph.h"
51 #include "gimple.h"
52 #include "dwarf2.h"
53 #include "df.h"
54 #include "tm-constrs.h"
55 #include "params.h"
56 #include "cselib.h"
57 #include "debug.h"
58 #include "sched-int.h"
59 #include "sbitmap.h"
60 #include "fibheap.h"
61 #include "opts.h"
62 #include "diagnostic.h"
63
64 enum upper_128bits_state
65 {
66 unknown = 0,
67 unused,
68 used
69 };
70
71 typedef struct block_info_def
72 {
73 /* State of the upper 128bits of AVX registers at exit. */
74 enum upper_128bits_state state;
75 /* TRUE if state of the upper 128bits of AVX registers is unchanged
76 in this block. */
77 bool unchanged;
78 /* TRUE if block has been processed. */
79 bool processed;
80 /* TRUE if block has been scanned. */
81 bool scanned;
82 /* Previous state of the upper 128bits of AVX registers at entry. */
83 enum upper_128bits_state prev;
84 } *block_info;
85
86 #define BLOCK_INFO(B) ((block_info) (B)->aux)
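/* BLOCK_INFO relies on the per-block aux pointer, which is only valid
   between the alloc_aux_for_blocks and free_aux_for_blocks calls made in
   move_or_delete_vzeroupper below.  */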
87
88 enum call_avx256_state
89 {
90 /* Callee returns 256bit AVX register. */
91 callee_return_avx256 = -1,
92 /* Callee returns and passes 256bit AVX register. */
93 callee_return_pass_avx256,
94 /* Callee passes 256bit AVX register. */
95 callee_pass_avx256,
 96 /* Callee neither returns nor passes a 256bit AVX register, or no
 97 256bit AVX register is used in the function return. */
98 call_no_avx256,
99 /* vzeroupper intrinsic. */
100 vzeroupper_intrinsic
101 };
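/* A vzeroupper insn is expected to carry one of the call_avx256_state
   values above as the constant operand of its UNSPEC_VOLATILE pattern;
   move_or_delete_vzeroupper_2 reads it back with
   INTVAL (XVECEXP (pat, 0, 0)) to decide whether the insn is redundant
   or must be removed.  */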
102
 103 /* Check if a 256bit AVX register is referenced as the destination or source of a store. */
104
105 static void
106 check_avx256_stores (rtx dest, const_rtx set, void *data)
107 {
108 if ((REG_P (dest)
109 && VALID_AVX256_REG_MODE (GET_MODE (dest)))
110 || (GET_CODE (set) == SET
111 && REG_P (SET_SRC (set))
112 && VALID_AVX256_REG_MODE (GET_MODE (SET_SRC (set)))))
113 {
114 enum upper_128bits_state *state
115 = (enum upper_128bits_state *) data;
116 *state = used;
117 }
118 }
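/* Illustrative use of the callback above, matching the call made in
   move_or_delete_vzeroupper_2 below:

     enum upper_128bits_state state = unused;
     note_stores (PATTERN (insn), check_avx256_stores, &state);
     if (state == used)
       ... the insn references a 256bit AVX register ...

   note_stores walks every store expression in the pattern and invokes
   the callback for each destination.  */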
119
120 /* Helper function for move_or_delete_vzeroupper_1. Look for vzeroupper
121 in basic block BB. Delete it if upper 128bit AVX registers are
122 unused. If it isn't deleted, move it to just before a jump insn.
123
124 STATE is state of the upper 128bits of AVX registers at entry. */
125
126 static void
127 move_or_delete_vzeroupper_2 (basic_block bb,
128 enum upper_128bits_state state)
129 {
130 rtx insn, bb_end;
131 rtx vzeroupper_insn = NULL_RTX;
132 rtx pat;
133 int avx256;
134 bool unchanged;
135
136 if (BLOCK_INFO (bb)->unchanged)
137 {
138 if (dump_file)
139 fprintf (dump_file, " [bb %i] unchanged: upper 128bits: %d\n",
140 bb->index, state);
141
142 BLOCK_INFO (bb)->state = state;
143 return;
144 }
145
146 if (BLOCK_INFO (bb)->scanned && BLOCK_INFO (bb)->prev == state)
147 {
148 if (dump_file)
149 fprintf (dump_file, " [bb %i] scanned: upper 128bits: %d\n",
150 bb->index, BLOCK_INFO (bb)->state);
151 return;
152 }
153
154 BLOCK_INFO (bb)->prev = state;
155
156 if (dump_file)
157 fprintf (dump_file, " [bb %i] entry: upper 128bits: %d\n",
158 bb->index, state);
159
160 unchanged = true;
161
 162 /* BB_END (bb) changes when the insn at the end of the block is deleted, so cache it. */
163 bb_end = BB_END (bb);
164 insn = BB_HEAD (bb);
165 while (insn != bb_end)
166 {
167 insn = NEXT_INSN (insn);
168
169 if (!NONDEBUG_INSN_P (insn))
170 continue;
171
172 /* Move vzeroupper before jump/call. */
173 if (JUMP_P (insn) || CALL_P (insn))
174 {
175 if (!vzeroupper_insn)
176 continue;
177
178 if (PREV_INSN (insn) != vzeroupper_insn)
179 {
180 if (dump_file)
181 {
182 fprintf (dump_file, "Move vzeroupper after:\n");
183 print_rtl_single (dump_file, PREV_INSN (insn));
184 fprintf (dump_file, "before:\n");
185 print_rtl_single (dump_file, insn);
186 }
187 reorder_insns_nobb (vzeroupper_insn, vzeroupper_insn,
188 PREV_INSN (insn));
189 }
190 vzeroupper_insn = NULL_RTX;
191 continue;
192 }
193
194 pat = PATTERN (insn);
195
196 /* Check insn for vzeroupper intrinsic. */
197 if (GET_CODE (pat) == UNSPEC_VOLATILE
198 && XINT (pat, 1) == UNSPECV_VZEROUPPER)
199 {
200 if (dump_file)
201 {
202 /* Found vzeroupper intrinsic. */
203 fprintf (dump_file, "Found vzeroupper:\n");
204 print_rtl_single (dump_file, insn);
205 }
206 }
207 else
208 {
209 /* Check insn for vzeroall intrinsic. */
210 if (GET_CODE (pat) == PARALLEL
211 && GET_CODE (XVECEXP (pat, 0, 0)) == UNSPEC_VOLATILE
212 && XINT (XVECEXP (pat, 0, 0), 1) == UNSPECV_VZEROALL)
213 {
214 state = unused;
215 unchanged = false;
216
217 /* Delete pending vzeroupper insertion. */
218 if (vzeroupper_insn)
219 {
220 delete_insn (vzeroupper_insn);
221 vzeroupper_insn = NULL_RTX;
222 }
223 }
224 else if (state != used)
225 {
226 note_stores (pat, check_avx256_stores, &state);
227 if (state == used)
228 unchanged = false;
229 }
230 continue;
231 }
232
233 /* Process vzeroupper intrinsic. */
234 avx256 = INTVAL (XVECEXP (pat, 0, 0));
235
236 if (state == unused)
237 {
238 /* Since the upper 128bits are cleared, callee must not pass
239 256bit AVX register. We only need to check if callee
240 returns 256bit AVX register. */
241 if (avx256 == callee_return_avx256)
242 {
243 state = used;
244 unchanged = false;
245 }
246
247 /* Remove unnecessary vzeroupper since upper 128bits are
248 cleared. */
249 if (dump_file)
250 {
251 fprintf (dump_file, "Delete redundant vzeroupper:\n");
252 print_rtl_single (dump_file, insn);
253 }
254 delete_insn (insn);
255 }
256 else
257 {
258 /* Set state to UNUSED if callee doesn't return 256bit AVX
259 register. */
260 if (avx256 != callee_return_pass_avx256)
261 state = unused;
262
263 if (avx256 == callee_return_pass_avx256
264 || avx256 == callee_pass_avx256)
265 {
266 /* Must remove vzeroupper since callee passes in 256bit
267 AVX register. */
268 if (dump_file)
269 {
270 fprintf (dump_file, "Delete callee pass vzeroupper:\n");
271 print_rtl_single (dump_file, insn);
272 }
273 delete_insn (insn);
274 }
275 else
276 {
277 vzeroupper_insn = insn;
278 unchanged = false;
279 }
280 }
281 }
282
283 BLOCK_INFO (bb)->state = state;
284 BLOCK_INFO (bb)->unchanged = unchanged;
285 BLOCK_INFO (bb)->scanned = true;
286
287 if (dump_file)
288 fprintf (dump_file, " [bb %i] exit: %s: upper 128bits: %d\n",
289 bb->index, unchanged ? "unchanged" : "changed",
290 state);
291 }
292
293 /* Helper function for move_or_delete_vzeroupper. Process vzeroupper
294 in BLOCK and check its predecessor blocks. Treat UNKNOWN state
295 as USED if UNKNOWN_IS_UNUSED is true. Return TRUE if the exit
296 state is changed. */
297
298 static bool
299 move_or_delete_vzeroupper_1 (basic_block block, bool unknown_is_unused)
300 {
301 edge e;
302 edge_iterator ei;
303 enum upper_128bits_state state, old_state, new_state;
304 bool seen_unknown;
305
306 if (dump_file)
307 fprintf (dump_file, " Process [bb %i]: status: %d\n",
308 block->index, BLOCK_INFO (block)->processed);
309
310 if (BLOCK_INFO (block)->processed)
311 return false;
312
313 state = unused;
314
315 /* Check all predecessor edges of this block. */
316 seen_unknown = false;
317 FOR_EACH_EDGE (e, ei, block->preds)
318 {
319 if (e->src == block)
320 continue;
321 switch (BLOCK_INFO (e->src)->state)
322 {
323 case unknown:
324 if (!unknown_is_unused)
 325 seen_unknown = true; /* FALLTHRU */
326 case unused:
327 break;
328 case used:
329 state = used;
330 goto done;
331 }
332 }
333
334 if (seen_unknown)
335 state = unknown;
336
337 done:
338 old_state = BLOCK_INFO (block)->state;
339 move_or_delete_vzeroupper_2 (block, state);
340 new_state = BLOCK_INFO (block)->state;
341
342 if (state != unknown || new_state == used)
343 BLOCK_INFO (block)->processed = true;
344
345 /* Need to rescan if the upper 128bits of AVX registers are changed
346 to USED at exit. */
347 if (new_state != old_state)
348 {
349 if (new_state == used)
350 cfun->machine->rescan_vzeroupper_p = 1;
351 return true;
352 }
353 else
354 return false;
355 }
356
357 /* Go through the instruction stream looking for vzeroupper. Delete
358 it if upper 128bit AVX registers are unused. If it isn't deleted,
359 move it to just before a jump insn. */
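/* Overall structure of the pass, as implemented below: successor blocks of
   the entry point are seeded with USED or UNUSED depending on whether the
   caller passes a 256bit AVX register; the remaining blocks are then
   iterated with a worklist/pending pair of fibonacci heaps keyed by reverse
   completion order until no block requests a rescan; finally all blocks are
   processed once more with any remaining UNKNOWN predecessor state treated
   as UNUSED.  */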
360
361 static void
362 move_or_delete_vzeroupper (void)
363 {
364 edge e;
365 edge_iterator ei;
366 basic_block bb;
367 fibheap_t worklist, pending, fibheap_swap;
368 sbitmap visited, in_worklist, in_pending, sbitmap_swap;
369 int *bb_order;
370 int *rc_order;
371 int i;
372
373 /* Set up block info for each basic block. */
374 alloc_aux_for_blocks (sizeof (struct block_info_def));
375
376 /* Process outgoing edges of entry point. */
377 if (dump_file)
378 fprintf (dump_file, "Process outgoing edges of entry point\n");
379
380 FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR->succs)
381 {
382 move_or_delete_vzeroupper_2 (e->dest,
383 cfun->machine->caller_pass_avx256_p
384 ? used : unused);
385 BLOCK_INFO (e->dest)->processed = true;
386 }
387
388 /* Compute reverse completion order of depth first search of the CFG
389 so that the data-flow runs faster. */
390 rc_order = XNEWVEC (int, n_basic_blocks - NUM_FIXED_BLOCKS);
391 bb_order = XNEWVEC (int, last_basic_block);
392 pre_and_rev_post_order_compute (NULL, rc_order, false);
393 for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; i++)
394 bb_order[rc_order[i]] = i;
395 free (rc_order);
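/* At this point bb_order[] maps a basic block index to its position in the
   reverse completion order; it is used as the fibheap key below so that
   blocks are extracted in roughly topological order.  */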
396
397 worklist = fibheap_new ();
398 pending = fibheap_new ();
399 visited = sbitmap_alloc (last_basic_block);
400 in_worklist = sbitmap_alloc (last_basic_block);
401 in_pending = sbitmap_alloc (last_basic_block);
402 sbitmap_zero (in_worklist);
403
404 /* Don't check outgoing edges of entry point. */
405 sbitmap_ones (in_pending);
406 FOR_EACH_BB (bb)
407 if (BLOCK_INFO (bb)->processed)
408 RESET_BIT (in_pending, bb->index);
409 else
410 {
411 move_or_delete_vzeroupper_1 (bb, false);
412 fibheap_insert (pending, bb_order[bb->index], bb);
413 }
414
415 if (dump_file)
416 fprintf (dump_file, "Check remaining basic blocks\n");
417
418 while (!fibheap_empty (pending))
419 {
420 fibheap_swap = pending;
421 pending = worklist;
422 worklist = fibheap_swap;
423 sbitmap_swap = in_pending;
424 in_pending = in_worklist;
425 in_worklist = sbitmap_swap;
426
427 sbitmap_zero (visited);
428
429 cfun->machine->rescan_vzeroupper_p = 0;
430
431 while (!fibheap_empty (worklist))
432 {
433 bb = (basic_block) fibheap_extract_min (worklist);
434 RESET_BIT (in_worklist, bb->index);
435 gcc_assert (!TEST_BIT (visited, bb->index));
436 if (!TEST_BIT (visited, bb->index))
437 {
438 edge_iterator ei;
439
440 SET_BIT (visited, bb->index);
441
442 if (move_or_delete_vzeroupper_1 (bb, false))
443 FOR_EACH_EDGE (e, ei, bb->succs)
444 {
445 if (e->dest == EXIT_BLOCK_PTR
446 || BLOCK_INFO (e->dest)->processed)
447 continue;
448
449 if (TEST_BIT (visited, e->dest->index))
450 {
451 if (!TEST_BIT (in_pending, e->dest->index))
452 {
453 /* Send E->DEST to next round. */
454 SET_BIT (in_pending, e->dest->index);
455 fibheap_insert (pending,
456 bb_order[e->dest->index],
457 e->dest);
458 }
459 }
460 else if (!TEST_BIT (in_worklist, e->dest->index))
461 {
462 /* Add E->DEST to current round. */
463 SET_BIT (in_worklist, e->dest->index);
464 fibheap_insert (worklist, bb_order[e->dest->index],
465 e->dest);
466 }
467 }
468 }
469 }
470
471 if (!cfun->machine->rescan_vzeroupper_p)
472 break;
473 }
474
475 free (bb_order);
476 fibheap_delete (worklist);
477 fibheap_delete (pending);
478 sbitmap_free (visited);
479 sbitmap_free (in_worklist);
480 sbitmap_free (in_pending);
481
482 if (dump_file)
483 fprintf (dump_file, "Process remaining basic blocks\n");
484
485 FOR_EACH_BB (bb)
486 move_or_delete_vzeroupper_1 (bb, true);
487
488 free_aux_for_blocks ();
489 }
490
491 static rtx legitimize_dllimport_symbol (rtx, bool);
492
493 #ifndef CHECK_STACK_LIMIT
494 #define CHECK_STACK_LIMIT (-1)
495 #endif
496
497 /* Return index of given mode in mult and division cost tables. */
498 #define MODE_INDEX(mode) \
499 ((mode) == QImode ? 0 \
500 : (mode) == HImode ? 1 \
501 : (mode) == SImode ? 2 \
502 : (mode) == DImode ? 3 \
503 : 4)
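/* For example, MODE_INDEX (SImode) is 2, so the SImode entry of a per-mode
   cost array is selected with something like

     cost->mult_init[MODE_INDEX (mode)]

   (mult_init being the multiply-cost field of struct processor_costs; the
   exact field name is assumed here for illustration).  */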
504
505 /* Processor costs (relative to an add) */
506 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
507 #define COSTS_N_BYTES(N) ((N) * 2)
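/* Under that assumption, COSTS_N_BYTES (2) == 4 == COSTS_N_INSNS (1), i.e.
   a two-byte addition costs exactly one "add" unit, which keeps the
   size-based costs below on the same scale as the cycle-based costs used
   by the other tables.  */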
508
509 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
510
511 const
512 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
513 COSTS_N_BYTES (2), /* cost of an add instruction */
514 COSTS_N_BYTES (3), /* cost of a lea instruction */
515 COSTS_N_BYTES (2), /* variable shift costs */
516 COSTS_N_BYTES (3), /* constant shift costs */
517 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
518 COSTS_N_BYTES (3), /* HI */
519 COSTS_N_BYTES (3), /* SI */
520 COSTS_N_BYTES (3), /* DI */
521 COSTS_N_BYTES (5)}, /* other */
522 0, /* cost of multiply per each bit set */
523 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
524 COSTS_N_BYTES (3), /* HI */
525 COSTS_N_BYTES (3), /* SI */
526 COSTS_N_BYTES (3), /* DI */
527 COSTS_N_BYTES (5)}, /* other */
528 COSTS_N_BYTES (3), /* cost of movsx */
529 COSTS_N_BYTES (3), /* cost of movzx */
530 0, /* "large" insn */
531 2, /* MOVE_RATIO */
532 2, /* cost for loading QImode using movzbl */
533 {2, 2, 2}, /* cost of loading integer registers
534 in QImode, HImode and SImode.
535 Relative to reg-reg move (2). */
536 {2, 2, 2}, /* cost of storing integer registers */
537 2, /* cost of reg,reg fld/fst */
538 {2, 2, 2}, /* cost of loading fp registers
539 in SFmode, DFmode and XFmode */
540 {2, 2, 2}, /* cost of storing fp registers
541 in SFmode, DFmode and XFmode */
542 3, /* cost of moving MMX register */
543 {3, 3}, /* cost of loading MMX registers
544 in SImode and DImode */
545 {3, 3}, /* cost of storing MMX registers
546 in SImode and DImode */
547 3, /* cost of moving SSE register */
548 {3, 3, 3}, /* cost of loading SSE registers
549 in SImode, DImode and TImode */
550 {3, 3, 3}, /* cost of storing SSE registers
551 in SImode, DImode and TImode */
552 3, /* MMX or SSE register to integer */
553 0, /* size of l1 cache */
554 0, /* size of l2 cache */
555 0, /* size of prefetch block */
556 0, /* number of parallel prefetches */
557 2, /* Branch cost */
558 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
559 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
560 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
561 COSTS_N_BYTES (2), /* cost of FABS instruction. */
562 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
563 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
564 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
565 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
566 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
567 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
568 1, /* scalar_stmt_cost. */
569 1, /* scalar load_cost. */
570 1, /* scalar_store_cost. */
571 1, /* vec_stmt_cost. */
572 1, /* vec_to_scalar_cost. */
573 1, /* scalar_to_vec_cost. */
574 1, /* vec_align_load_cost. */
575 1, /* vec_unalign_load_cost. */
576 1, /* vec_store_cost. */
577 1, /* cond_taken_branch_cost. */
578 1, /* cond_not_taken_branch_cost. */
579 };
580
581 /* Processor costs (relative to an add) */
582 static const
583 struct processor_costs i386_cost = { /* 386 specific costs */
584 COSTS_N_INSNS (1), /* cost of an add instruction */
585 COSTS_N_INSNS (1), /* cost of a lea instruction */
586 COSTS_N_INSNS (3), /* variable shift costs */
587 COSTS_N_INSNS (2), /* constant shift costs */
588 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
589 COSTS_N_INSNS (6), /* HI */
590 COSTS_N_INSNS (6), /* SI */
591 COSTS_N_INSNS (6), /* DI */
592 COSTS_N_INSNS (6)}, /* other */
593 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
594 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
595 COSTS_N_INSNS (23), /* HI */
596 COSTS_N_INSNS (23), /* SI */
597 COSTS_N_INSNS (23), /* DI */
598 COSTS_N_INSNS (23)}, /* other */
599 COSTS_N_INSNS (3), /* cost of movsx */
600 COSTS_N_INSNS (2), /* cost of movzx */
601 15, /* "large" insn */
602 3, /* MOVE_RATIO */
603 4, /* cost for loading QImode using movzbl */
604 {2, 4, 2}, /* cost of loading integer registers
605 in QImode, HImode and SImode.
606 Relative to reg-reg move (2). */
607 {2, 4, 2}, /* cost of storing integer registers */
608 2, /* cost of reg,reg fld/fst */
609 {8, 8, 8}, /* cost of loading fp registers
610 in SFmode, DFmode and XFmode */
611 {8, 8, 8}, /* cost of storing fp registers
612 in SFmode, DFmode and XFmode */
613 2, /* cost of moving MMX register */
614 {4, 8}, /* cost of loading MMX registers
615 in SImode and DImode */
616 {4, 8}, /* cost of storing MMX registers
617 in SImode and DImode */
618 2, /* cost of moving SSE register */
619 {4, 8, 16}, /* cost of loading SSE registers
620 in SImode, DImode and TImode */
621 {4, 8, 16}, /* cost of storing SSE registers
622 in SImode, DImode and TImode */
623 3, /* MMX or SSE register to integer */
624 0, /* size of l1 cache */
625 0, /* size of l2 cache */
626 0, /* size of prefetch block */
627 0, /* number of parallel prefetches */
628 1, /* Branch cost */
629 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
630 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
631 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
632 COSTS_N_INSNS (22), /* cost of FABS instruction. */
633 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
634 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
635 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
636 DUMMY_STRINGOP_ALGS},
637 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
638 DUMMY_STRINGOP_ALGS},
639 1, /* scalar_stmt_cost. */
640 1, /* scalar load_cost. */
641 1, /* scalar_store_cost. */
642 1, /* vec_stmt_cost. */
643 1, /* vec_to_scalar_cost. */
644 1, /* scalar_to_vec_cost. */
645 1, /* vec_align_load_cost. */
646 2, /* vec_unalign_load_cost. */
647 1, /* vec_store_cost. */
648 3, /* cond_taken_branch_cost. */
649 1, /* cond_not_taken_branch_cost. */
650 };
651
652 static const
653 struct processor_costs i486_cost = { /* 486 specific costs */
654 COSTS_N_INSNS (1), /* cost of an add instruction */
655 COSTS_N_INSNS (1), /* cost of a lea instruction */
656 COSTS_N_INSNS (3), /* variable shift costs */
657 COSTS_N_INSNS (2), /* constant shift costs */
658 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
659 COSTS_N_INSNS (12), /* HI */
660 COSTS_N_INSNS (12), /* SI */
661 COSTS_N_INSNS (12), /* DI */
662 COSTS_N_INSNS (12)}, /* other */
663 1, /* cost of multiply per each bit set */
664 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
665 COSTS_N_INSNS (40), /* HI */
666 COSTS_N_INSNS (40), /* SI */
667 COSTS_N_INSNS (40), /* DI */
668 COSTS_N_INSNS (40)}, /* other */
669 COSTS_N_INSNS (3), /* cost of movsx */
670 COSTS_N_INSNS (2), /* cost of movzx */
671 15, /* "large" insn */
672 3, /* MOVE_RATIO */
673 4, /* cost for loading QImode using movzbl */
674 {2, 4, 2}, /* cost of loading integer registers
675 in QImode, HImode and SImode.
676 Relative to reg-reg move (2). */
677 {2, 4, 2}, /* cost of storing integer registers */
678 2, /* cost of reg,reg fld/fst */
679 {8, 8, 8}, /* cost of loading fp registers
680 in SFmode, DFmode and XFmode */
681 {8, 8, 8}, /* cost of storing fp registers
682 in SFmode, DFmode and XFmode */
683 2, /* cost of moving MMX register */
684 {4, 8}, /* cost of loading MMX registers
685 in SImode and DImode */
686 {4, 8}, /* cost of storing MMX registers
687 in SImode and DImode */
688 2, /* cost of moving SSE register */
689 {4, 8, 16}, /* cost of loading SSE registers
690 in SImode, DImode and TImode */
691 {4, 8, 16}, /* cost of storing SSE registers
692 in SImode, DImode and TImode */
693 3, /* MMX or SSE register to integer */
694 4, /* size of l1 cache. 486 has 8kB cache
695 shared for code and data, so 4kB is
696 not really precise. */
697 4, /* size of l2 cache */
698 0, /* size of prefetch block */
699 0, /* number of parallel prefetches */
700 1, /* Branch cost */
701 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
702 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
703 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
704 COSTS_N_INSNS (3), /* cost of FABS instruction. */
705 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
706 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
707 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
708 DUMMY_STRINGOP_ALGS},
709 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
710 DUMMY_STRINGOP_ALGS},
711 1, /* scalar_stmt_cost. */
712 1, /* scalar load_cost. */
713 1, /* scalar_store_cost. */
714 1, /* vec_stmt_cost. */
715 1, /* vec_to_scalar_cost. */
716 1, /* scalar_to_vec_cost. */
717 1, /* vec_align_load_cost. */
718 2, /* vec_unalign_load_cost. */
719 1, /* vec_store_cost. */
720 3, /* cond_taken_branch_cost. */
721 1, /* cond_not_taken_branch_cost. */
722 };
723
724 static const
725 struct processor_costs pentium_cost = {
726 COSTS_N_INSNS (1), /* cost of an add instruction */
727 COSTS_N_INSNS (1), /* cost of a lea instruction */
728 COSTS_N_INSNS (4), /* variable shift costs */
729 COSTS_N_INSNS (1), /* constant shift costs */
730 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
731 COSTS_N_INSNS (11), /* HI */
732 COSTS_N_INSNS (11), /* SI */
733 COSTS_N_INSNS (11), /* DI */
734 COSTS_N_INSNS (11)}, /* other */
735 0, /* cost of multiply per each bit set */
736 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
737 COSTS_N_INSNS (25), /* HI */
738 COSTS_N_INSNS (25), /* SI */
739 COSTS_N_INSNS (25), /* DI */
740 COSTS_N_INSNS (25)}, /* other */
741 COSTS_N_INSNS (3), /* cost of movsx */
742 COSTS_N_INSNS (2), /* cost of movzx */
743 8, /* "large" insn */
744 6, /* MOVE_RATIO */
745 6, /* cost for loading QImode using movzbl */
746 {2, 4, 2}, /* cost of loading integer registers
747 in QImode, HImode and SImode.
748 Relative to reg-reg move (2). */
749 {2, 4, 2}, /* cost of storing integer registers */
750 2, /* cost of reg,reg fld/fst */
751 {2, 2, 6}, /* cost of loading fp registers
752 in SFmode, DFmode and XFmode */
753 {4, 4, 6}, /* cost of storing fp registers
754 in SFmode, DFmode and XFmode */
755 8, /* cost of moving MMX register */
756 {8, 8}, /* cost of loading MMX registers
757 in SImode and DImode */
758 {8, 8}, /* cost of storing MMX registers
759 in SImode and DImode */
760 2, /* cost of moving SSE register */
761 {4, 8, 16}, /* cost of loading SSE registers
762 in SImode, DImode and TImode */
763 {4, 8, 16}, /* cost of storing SSE registers
764 in SImode, DImode and TImode */
765 3, /* MMX or SSE register to integer */
766 8, /* size of l1 cache. */
767 8, /* size of l2 cache */
768 0, /* size of prefetch block */
769 0, /* number of parallel prefetches */
770 2, /* Branch cost */
771 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
772 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
773 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
774 COSTS_N_INSNS (1), /* cost of FABS instruction. */
775 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
776 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
777 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
778 DUMMY_STRINGOP_ALGS},
779 {{libcall, {{-1, rep_prefix_4_byte}}},
780 DUMMY_STRINGOP_ALGS},
781 1, /* scalar_stmt_cost. */
782 1, /* scalar load_cost. */
783 1, /* scalar_store_cost. */
784 1, /* vec_stmt_cost. */
785 1, /* vec_to_scalar_cost. */
786 1, /* scalar_to_vec_cost. */
787 1, /* vec_align_load_cost. */
788 2, /* vec_unalign_load_cost. */
789 1, /* vec_store_cost. */
790 3, /* cond_taken_branch_cost. */
791 1, /* cond_not_taken_branch_cost. */
792 };
793
794 static const
795 struct processor_costs pentiumpro_cost = {
796 COSTS_N_INSNS (1), /* cost of an add instruction */
797 COSTS_N_INSNS (1), /* cost of a lea instruction */
798 COSTS_N_INSNS (1), /* variable shift costs */
799 COSTS_N_INSNS (1), /* constant shift costs */
800 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
801 COSTS_N_INSNS (4), /* HI */
802 COSTS_N_INSNS (4), /* SI */
803 COSTS_N_INSNS (4), /* DI */
804 COSTS_N_INSNS (4)}, /* other */
805 0, /* cost of multiply per each bit set */
806 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
807 COSTS_N_INSNS (17), /* HI */
808 COSTS_N_INSNS (17), /* SI */
809 COSTS_N_INSNS (17), /* DI */
810 COSTS_N_INSNS (17)}, /* other */
811 COSTS_N_INSNS (1), /* cost of movsx */
812 COSTS_N_INSNS (1), /* cost of movzx */
813 8, /* "large" insn */
814 6, /* MOVE_RATIO */
815 2, /* cost for loading QImode using movzbl */
816 {4, 4, 4}, /* cost of loading integer registers
817 in QImode, HImode and SImode.
818 Relative to reg-reg move (2). */
819 {2, 2, 2}, /* cost of storing integer registers */
820 2, /* cost of reg,reg fld/fst */
821 {2, 2, 6}, /* cost of loading fp registers
822 in SFmode, DFmode and XFmode */
823 {4, 4, 6}, /* cost of storing fp registers
824 in SFmode, DFmode and XFmode */
825 2, /* cost of moving MMX register */
826 {2, 2}, /* cost of loading MMX registers
827 in SImode and DImode */
828 {2, 2}, /* cost of storing MMX registers
829 in SImode and DImode */
830 2, /* cost of moving SSE register */
831 {2, 2, 8}, /* cost of loading SSE registers
832 in SImode, DImode and TImode */
833 {2, 2, 8}, /* cost of storing SSE registers
834 in SImode, DImode and TImode */
835 3, /* MMX or SSE register to integer */
836 8, /* size of l1 cache. */
837 256, /* size of l2 cache */
838 32, /* size of prefetch block */
839 6, /* number of parallel prefetches */
840 2, /* Branch cost */
841 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
842 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
843 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
844 COSTS_N_INSNS (2), /* cost of FABS instruction. */
845 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
846 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
 847 /* PentiumPro has optimized rep instructions for blocks aligned to 8 bytes
 848 (we ensure the alignment). For small blocks an inline loop is still a
 849 noticeable win, for bigger blocks either rep movsl or rep movsb is the
 850 way to go. Rep movsb apparently has a more expensive startup time in the
 851 CPU, but after 4K the difference is down in the noise. */
852 {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
853 {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
854 DUMMY_STRINGOP_ALGS},
855 {{rep_prefix_4_byte, {{1024, unrolled_loop},
856 {8192, rep_prefix_4_byte}, {-1, libcall}}},
857 DUMMY_STRINGOP_ALGS},
858 1, /* scalar_stmt_cost. */
859 1, /* scalar load_cost. */
860 1, /* scalar_store_cost. */
861 1, /* vec_stmt_cost. */
862 1, /* vec_to_scalar_cost. */
863 1, /* scalar_to_vec_cost. */
864 1, /* vec_align_load_cost. */
865 2, /* vec_unalign_load_cost. */
866 1, /* vec_store_cost. */
867 3, /* cond_taken_branch_cost. */
868 1, /* cond_not_taken_branch_cost. */
869 };
870
871 static const
872 struct processor_costs geode_cost = {
873 COSTS_N_INSNS (1), /* cost of an add instruction */
874 COSTS_N_INSNS (1), /* cost of a lea instruction */
875 COSTS_N_INSNS (2), /* variable shift costs */
876 COSTS_N_INSNS (1), /* constant shift costs */
877 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
878 COSTS_N_INSNS (4), /* HI */
879 COSTS_N_INSNS (7), /* SI */
880 COSTS_N_INSNS (7), /* DI */
881 COSTS_N_INSNS (7)}, /* other */
882 0, /* cost of multiply per each bit set */
883 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
884 COSTS_N_INSNS (23), /* HI */
885 COSTS_N_INSNS (39), /* SI */
886 COSTS_N_INSNS (39), /* DI */
887 COSTS_N_INSNS (39)}, /* other */
888 COSTS_N_INSNS (1), /* cost of movsx */
889 COSTS_N_INSNS (1), /* cost of movzx */
890 8, /* "large" insn */
891 4, /* MOVE_RATIO */
892 1, /* cost for loading QImode using movzbl */
893 {1, 1, 1}, /* cost of loading integer registers
894 in QImode, HImode and SImode.
895 Relative to reg-reg move (2). */
896 {1, 1, 1}, /* cost of storing integer registers */
897 1, /* cost of reg,reg fld/fst */
898 {1, 1, 1}, /* cost of loading fp registers
899 in SFmode, DFmode and XFmode */
900 {4, 6, 6}, /* cost of storing fp registers
901 in SFmode, DFmode and XFmode */
902
903 1, /* cost of moving MMX register */
904 {1, 1}, /* cost of loading MMX registers
905 in SImode and DImode */
906 {1, 1}, /* cost of storing MMX registers
907 in SImode and DImode */
908 1, /* cost of moving SSE register */
909 {1, 1, 1}, /* cost of loading SSE registers
910 in SImode, DImode and TImode */
911 {1, 1, 1}, /* cost of storing SSE registers
912 in SImode, DImode and TImode */
913 1, /* MMX or SSE register to integer */
914 64, /* size of l1 cache. */
915 128, /* size of l2 cache. */
916 32, /* size of prefetch block */
917 1, /* number of parallel prefetches */
918 1, /* Branch cost */
919 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
920 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
921 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
922 COSTS_N_INSNS (1), /* cost of FABS instruction. */
923 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
924 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
925 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
926 DUMMY_STRINGOP_ALGS},
927 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
928 DUMMY_STRINGOP_ALGS},
929 1, /* scalar_stmt_cost. */
930 1, /* scalar load_cost. */
931 1, /* scalar_store_cost. */
932 1, /* vec_stmt_cost. */
933 1, /* vec_to_scalar_cost. */
934 1, /* scalar_to_vec_cost. */
935 1, /* vec_align_load_cost. */
936 2, /* vec_unalign_load_cost. */
937 1, /* vec_store_cost. */
938 3, /* cond_taken_branch_cost. */
939 1, /* cond_not_taken_branch_cost. */
940 };
941
942 static const
943 struct processor_costs k6_cost = {
944 COSTS_N_INSNS (1), /* cost of an add instruction */
945 COSTS_N_INSNS (2), /* cost of a lea instruction */
946 COSTS_N_INSNS (1), /* variable shift costs */
947 COSTS_N_INSNS (1), /* constant shift costs */
948 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
949 COSTS_N_INSNS (3), /* HI */
950 COSTS_N_INSNS (3), /* SI */
951 COSTS_N_INSNS (3), /* DI */
952 COSTS_N_INSNS (3)}, /* other */
953 0, /* cost of multiply per each bit set */
954 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
955 COSTS_N_INSNS (18), /* HI */
956 COSTS_N_INSNS (18), /* SI */
957 COSTS_N_INSNS (18), /* DI */
958 COSTS_N_INSNS (18)}, /* other */
959 COSTS_N_INSNS (2), /* cost of movsx */
960 COSTS_N_INSNS (2), /* cost of movzx */
961 8, /* "large" insn */
962 4, /* MOVE_RATIO */
963 3, /* cost for loading QImode using movzbl */
964 {4, 5, 4}, /* cost of loading integer registers
965 in QImode, HImode and SImode.
966 Relative to reg-reg move (2). */
967 {2, 3, 2}, /* cost of storing integer registers */
968 4, /* cost of reg,reg fld/fst */
969 {6, 6, 6}, /* cost of loading fp registers
970 in SFmode, DFmode and XFmode */
971 {4, 4, 4}, /* cost of storing fp registers
972 in SFmode, DFmode and XFmode */
973 2, /* cost of moving MMX register */
974 {2, 2}, /* cost of loading MMX registers
975 in SImode and DImode */
976 {2, 2}, /* cost of storing MMX registers
977 in SImode and DImode */
978 2, /* cost of moving SSE register */
979 {2, 2, 8}, /* cost of loading SSE registers
980 in SImode, DImode and TImode */
981 {2, 2, 8}, /* cost of storing SSE registers
982 in SImode, DImode and TImode */
983 6, /* MMX or SSE register to integer */
984 32, /* size of l1 cache. */
985 32, /* size of l2 cache. Some models
986 have integrated l2 cache, but
987 optimizing for k6 is not important
988 enough to worry about that. */
989 32, /* size of prefetch block */
990 1, /* number of parallel prefetches */
991 1, /* Branch cost */
992 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
993 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
994 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
995 COSTS_N_INSNS (2), /* cost of FABS instruction. */
996 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
997 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
998 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
999 DUMMY_STRINGOP_ALGS},
1000 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1001 DUMMY_STRINGOP_ALGS},
1002 1, /* scalar_stmt_cost. */
1003 1, /* scalar load_cost. */
1004 1, /* scalar_store_cost. */
1005 1, /* vec_stmt_cost. */
1006 1, /* vec_to_scalar_cost. */
1007 1, /* scalar_to_vec_cost. */
1008 1, /* vec_align_load_cost. */
1009 2, /* vec_unalign_load_cost. */
1010 1, /* vec_store_cost. */
1011 3, /* cond_taken_branch_cost. */
1012 1, /* cond_not_taken_branch_cost. */
1013 };
1014
1015 static const
1016 struct processor_costs athlon_cost = {
1017 COSTS_N_INSNS (1), /* cost of an add instruction */
1018 COSTS_N_INSNS (2), /* cost of a lea instruction */
1019 COSTS_N_INSNS (1), /* variable shift costs */
1020 COSTS_N_INSNS (1), /* constant shift costs */
1021 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
1022 COSTS_N_INSNS (5), /* HI */
1023 COSTS_N_INSNS (5), /* SI */
1024 COSTS_N_INSNS (5), /* DI */
1025 COSTS_N_INSNS (5)}, /* other */
1026 0, /* cost of multiply per each bit set */
1027 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1028 COSTS_N_INSNS (26), /* HI */
1029 COSTS_N_INSNS (42), /* SI */
1030 COSTS_N_INSNS (74), /* DI */
1031 COSTS_N_INSNS (74)}, /* other */
1032 COSTS_N_INSNS (1), /* cost of movsx */
1033 COSTS_N_INSNS (1), /* cost of movzx */
1034 8, /* "large" insn */
1035 9, /* MOVE_RATIO */
1036 4, /* cost for loading QImode using movzbl */
1037 {3, 4, 3}, /* cost of loading integer registers
1038 in QImode, HImode and SImode.
1039 Relative to reg-reg move (2). */
1040 {3, 4, 3}, /* cost of storing integer registers */
1041 4, /* cost of reg,reg fld/fst */
1042 {4, 4, 12}, /* cost of loading fp registers
1043 in SFmode, DFmode and XFmode */
1044 {6, 6, 8}, /* cost of storing fp registers
1045 in SFmode, DFmode and XFmode */
1046 2, /* cost of moving MMX register */
1047 {4, 4}, /* cost of loading MMX registers
1048 in SImode and DImode */
1049 {4, 4}, /* cost of storing MMX registers
1050 in SImode and DImode */
1051 2, /* cost of moving SSE register */
1052 {4, 4, 6}, /* cost of loading SSE registers
1053 in SImode, DImode and TImode */
1054 {4, 4, 5}, /* cost of storing SSE registers
1055 in SImode, DImode and TImode */
1056 5, /* MMX or SSE register to integer */
1057 64, /* size of l1 cache. */
1058 256, /* size of l2 cache. */
1059 64, /* size of prefetch block */
1060 6, /* number of parallel prefetches */
1061 5, /* Branch cost */
1062 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1063 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1064 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
1065 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1066 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1067 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1068 /* For some reason, Athlon deals better with REP prefix (relative to loops)
1069 compared to K8. Alignment becomes important after 8 bytes for memcpy and
1070 128 bytes for memset. */
1071 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1072 DUMMY_STRINGOP_ALGS},
1073 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1074 DUMMY_STRINGOP_ALGS},
1075 1, /* scalar_stmt_cost. */
1076 1, /* scalar load_cost. */
1077 1, /* scalar_store_cost. */
1078 1, /* vec_stmt_cost. */
1079 1, /* vec_to_scalar_cost. */
1080 1, /* scalar_to_vec_cost. */
1081 1, /* vec_align_load_cost. */
1082 2, /* vec_unalign_load_cost. */
1083 1, /* vec_store_cost. */
1084 3, /* cond_taken_branch_cost. */
1085 1, /* cond_not_taken_branch_cost. */
1086 };
1087
1088 static const
1089 struct processor_costs k8_cost = {
1090 COSTS_N_INSNS (1), /* cost of an add instruction */
1091 COSTS_N_INSNS (2), /* cost of a lea instruction */
1092 COSTS_N_INSNS (1), /* variable shift costs */
1093 COSTS_N_INSNS (1), /* constant shift costs */
1094 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1095 COSTS_N_INSNS (4), /* HI */
1096 COSTS_N_INSNS (3), /* SI */
1097 COSTS_N_INSNS (4), /* DI */
1098 COSTS_N_INSNS (5)}, /* other */
1099 0, /* cost of multiply per each bit set */
1100 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1101 COSTS_N_INSNS (26), /* HI */
1102 COSTS_N_INSNS (42), /* SI */
1103 COSTS_N_INSNS (74), /* DI */
1104 COSTS_N_INSNS (74)}, /* other */
1105 COSTS_N_INSNS (1), /* cost of movsx */
1106 COSTS_N_INSNS (1), /* cost of movzx */
1107 8, /* "large" insn */
1108 9, /* MOVE_RATIO */
1109 4, /* cost for loading QImode using movzbl */
1110 {3, 4, 3}, /* cost of loading integer registers
1111 in QImode, HImode and SImode.
1112 Relative to reg-reg move (2). */
1113 {3, 4, 3}, /* cost of storing integer registers */
1114 4, /* cost of reg,reg fld/fst */
1115 {4, 4, 12}, /* cost of loading fp registers
1116 in SFmode, DFmode and XFmode */
1117 {6, 6, 8}, /* cost of storing fp registers
1118 in SFmode, DFmode and XFmode */
1119 2, /* cost of moving MMX register */
1120 {3, 3}, /* cost of loading MMX registers
1121 in SImode and DImode */
1122 {4, 4}, /* cost of storing MMX registers
1123 in SImode and DImode */
1124 2, /* cost of moving SSE register */
1125 {4, 3, 6}, /* cost of loading SSE registers
1126 in SImode, DImode and TImode */
1127 {4, 4, 5}, /* cost of storing SSE registers
1128 in SImode, DImode and TImode */
1129 5, /* MMX or SSE register to integer */
1130 64, /* size of l1 cache. */
1131 512, /* size of l2 cache. */
1132 64, /* size of prefetch block */
 1133 /* New AMD processors never drop prefetches; if they cannot be performed
 1134 immediately, they are queued. We set the number of simultaneous prefetches
 1135 to a large constant to reflect this (leaving the number of prefetches
 1136 completely unlimited is probably not a good idea either, as their execution
 1137 also takes some time). */
1138 100, /* number of parallel prefetches */
1139 3, /* Branch cost */
1140 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1141 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1142 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1143 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1144 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1145 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
 1146 /* K8 has an optimized REP instruction for medium sized blocks, but for very
 1147 small blocks it is better to use a loop. For large blocks, a libcall can
 1148 do nontemporal accesses and beat the inline code considerably. */
1149 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1150 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1151 {{libcall, {{8, loop}, {24, unrolled_loop},
1152 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1153 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1154 4, /* scalar_stmt_cost. */
1155 2, /* scalar load_cost. */
1156 2, /* scalar_store_cost. */
1157 5, /* vec_stmt_cost. */
1158 0, /* vec_to_scalar_cost. */
1159 2, /* scalar_to_vec_cost. */
1160 2, /* vec_align_load_cost. */
1161 3, /* vec_unalign_load_cost. */
1162 3, /* vec_store_cost. */
1163 3, /* cond_taken_branch_cost. */
1164 2, /* cond_not_taken_branch_cost. */
1165 };
1166
1167 struct processor_costs amdfam10_cost = {
1168 COSTS_N_INSNS (1), /* cost of an add instruction */
1169 COSTS_N_INSNS (2), /* cost of a lea instruction */
1170 COSTS_N_INSNS (1), /* variable shift costs */
1171 COSTS_N_INSNS (1), /* constant shift costs */
1172 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1173 COSTS_N_INSNS (4), /* HI */
1174 COSTS_N_INSNS (3), /* SI */
1175 COSTS_N_INSNS (4), /* DI */
1176 COSTS_N_INSNS (5)}, /* other */
1177 0, /* cost of multiply per each bit set */
1178 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1179 COSTS_N_INSNS (35), /* HI */
1180 COSTS_N_INSNS (51), /* SI */
1181 COSTS_N_INSNS (83), /* DI */
1182 COSTS_N_INSNS (83)}, /* other */
1183 COSTS_N_INSNS (1), /* cost of movsx */
1184 COSTS_N_INSNS (1), /* cost of movzx */
1185 8, /* "large" insn */
1186 9, /* MOVE_RATIO */
1187 4, /* cost for loading QImode using movzbl */
1188 {3, 4, 3}, /* cost of loading integer registers
1189 in QImode, HImode and SImode.
1190 Relative to reg-reg move (2). */
1191 {3, 4, 3}, /* cost of storing integer registers */
1192 4, /* cost of reg,reg fld/fst */
1193 {4, 4, 12}, /* cost of loading fp registers
1194 in SFmode, DFmode and XFmode */
1195 {6, 6, 8}, /* cost of storing fp registers
1196 in SFmode, DFmode and XFmode */
1197 2, /* cost of moving MMX register */
1198 {3, 3}, /* cost of loading MMX registers
1199 in SImode and DImode */
1200 {4, 4}, /* cost of storing MMX registers
1201 in SImode and DImode */
1202 2, /* cost of moving SSE register */
1203 {4, 4, 3}, /* cost of loading SSE registers
1204 in SImode, DImode and TImode */
1205 {4, 4, 5}, /* cost of storing SSE registers
1206 in SImode, DImode and TImode */
1207 3, /* MMX or SSE register to integer */
1208 /* On K8:
1209 MOVD reg64, xmmreg Double FSTORE 4
1210 MOVD reg32, xmmreg Double FSTORE 4
1211 On AMDFAM10:
1212 MOVD reg64, xmmreg Double FADD 3
1213 1/1 1/1
1214 MOVD reg32, xmmreg Double FADD 3
1215 1/1 1/1 */
1216 64, /* size of l1 cache. */
1217 512, /* size of l2 cache. */
1218 64, /* size of prefetch block */
 1219 /* New AMD processors never drop prefetches; if they cannot be performed
 1220 immediately, they are queued. We set the number of simultaneous prefetches
 1221 to a large constant to reflect this (leaving the number of prefetches
 1222 completely unlimited is probably not a good idea either, as their execution
 1223 also takes some time). */
1224 100, /* number of parallel prefetches */
1225 2, /* Branch cost */
1226 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1227 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1228 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1229 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1230 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1231 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1232
 1233 /* AMDFAM10 has an optimized REP instruction for medium sized blocks, but for
 1234 very small blocks it is better to use a loop. For large blocks, a libcall can
 1235 do nontemporal accesses and beat the inline code considerably. */
1236 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1237 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1238 {{libcall, {{8, loop}, {24, unrolled_loop},
1239 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1240 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1241 4, /* scalar_stmt_cost. */
1242 2, /* scalar load_cost. */
1243 2, /* scalar_store_cost. */
1244 6, /* vec_stmt_cost. */
1245 0, /* vec_to_scalar_cost. */
1246 2, /* scalar_to_vec_cost. */
1247 2, /* vec_align_load_cost. */
1248 2, /* vec_unalign_load_cost. */
1249 2, /* vec_store_cost. */
1250 2, /* cond_taken_branch_cost. */
1251 1, /* cond_not_taken_branch_cost. */
1252 };
1253
1254 struct processor_costs bdver1_cost = {
1255 COSTS_N_INSNS (1), /* cost of an add instruction */
1256 COSTS_N_INSNS (1), /* cost of a lea instruction */
1257 COSTS_N_INSNS (1), /* variable shift costs */
1258 COSTS_N_INSNS (1), /* constant shift costs */
1259 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1260 COSTS_N_INSNS (4), /* HI */
1261 COSTS_N_INSNS (4), /* SI */
1262 COSTS_N_INSNS (6), /* DI */
1263 COSTS_N_INSNS (6)}, /* other */
1264 0, /* cost of multiply per each bit set */
1265 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1266 COSTS_N_INSNS (35), /* HI */
1267 COSTS_N_INSNS (51), /* SI */
1268 COSTS_N_INSNS (83), /* DI */
1269 COSTS_N_INSNS (83)}, /* other */
1270 COSTS_N_INSNS (1), /* cost of movsx */
1271 COSTS_N_INSNS (1), /* cost of movzx */
1272 8, /* "large" insn */
1273 9, /* MOVE_RATIO */
1274 4, /* cost for loading QImode using movzbl */
1275 {5, 5, 4}, /* cost of loading integer registers
1276 in QImode, HImode and SImode.
1277 Relative to reg-reg move (2). */
1278 {4, 4, 4}, /* cost of storing integer registers */
1279 2, /* cost of reg,reg fld/fst */
1280 {5, 5, 12}, /* cost of loading fp registers
1281 in SFmode, DFmode and XFmode */
1282 {4, 4, 8}, /* cost of storing fp registers
1283 in SFmode, DFmode and XFmode */
1284 2, /* cost of moving MMX register */
1285 {4, 4}, /* cost of loading MMX registers
1286 in SImode and DImode */
1287 {4, 4}, /* cost of storing MMX registers
1288 in SImode and DImode */
1289 2, /* cost of moving SSE register */
1290 {4, 4, 4}, /* cost of loading SSE registers
1291 in SImode, DImode and TImode */
1292 {4, 4, 4}, /* cost of storing SSE registers
1293 in SImode, DImode and TImode */
1294 2, /* MMX or SSE register to integer */
1295 /* On K8:
1296 MOVD reg64, xmmreg Double FSTORE 4
1297 MOVD reg32, xmmreg Double FSTORE 4
1298 On AMDFAM10:
1299 MOVD reg64, xmmreg Double FADD 3
1300 1/1 1/1
1301 MOVD reg32, xmmreg Double FADD 3
1302 1/1 1/1 */
1303 16, /* size of l1 cache. */
1304 2048, /* size of l2 cache. */
1305 64, /* size of prefetch block */
 1306 /* New AMD processors never drop prefetches; if they cannot be performed
 1307 immediately, they are queued. We set the number of simultaneous prefetches
 1308 to a large constant to reflect this (leaving the number of prefetches
 1309 completely unlimited is probably not a good idea either, as their execution
 1310 also takes some time). */
1311 100, /* number of parallel prefetches */
1312 2, /* Branch cost */
1313 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1314 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1315 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1316 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1317 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1318 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1319
 1320 /* BDVER1 has an optimized REP instruction for medium sized blocks, but for
 1321 very small blocks it is better to use a loop. For large blocks, a libcall
 1322 can do nontemporal accesses and beat the inline code considerably. */
1323 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1324 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1325 {{libcall, {{8, loop}, {24, unrolled_loop},
1326 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1327 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1328 6, /* scalar_stmt_cost. */
1329 4, /* scalar load_cost. */
1330 4, /* scalar_store_cost. */
1331 6, /* vec_stmt_cost. */
1332 0, /* vec_to_scalar_cost. */
1333 2, /* scalar_to_vec_cost. */
1334 4, /* vec_align_load_cost. */
1335 4, /* vec_unalign_load_cost. */
1336 4, /* vec_store_cost. */
1337 2, /* cond_taken_branch_cost. */
1338 1, /* cond_not_taken_branch_cost. */
1339 };
1340
1341 struct processor_costs bdver2_cost = {
1342 COSTS_N_INSNS (1), /* cost of an add instruction */
1343 COSTS_N_INSNS (1), /* cost of a lea instruction */
1344 COSTS_N_INSNS (1), /* variable shift costs */
1345 COSTS_N_INSNS (1), /* constant shift costs */
1346 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1347 COSTS_N_INSNS (4), /* HI */
1348 COSTS_N_INSNS (4), /* SI */
1349 COSTS_N_INSNS (6), /* DI */
1350 COSTS_N_INSNS (6)}, /* other */
1351 0, /* cost of multiply per each bit set */
1352 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1353 COSTS_N_INSNS (35), /* HI */
1354 COSTS_N_INSNS (51), /* SI */
1355 COSTS_N_INSNS (83), /* DI */
1356 COSTS_N_INSNS (83)}, /* other */
1357 COSTS_N_INSNS (1), /* cost of movsx */
1358 COSTS_N_INSNS (1), /* cost of movzx */
1359 8, /* "large" insn */
1360 9, /* MOVE_RATIO */
1361 4, /* cost for loading QImode using movzbl */
1362 {5, 5, 4}, /* cost of loading integer registers
1363 in QImode, HImode and SImode.
1364 Relative to reg-reg move (2). */
1365 {4, 4, 4}, /* cost of storing integer registers */
1366 2, /* cost of reg,reg fld/fst */
1367 {5, 5, 12}, /* cost of loading fp registers
1368 in SFmode, DFmode and XFmode */
1369 {4, 4, 8}, /* cost of storing fp registers
1370 in SFmode, DFmode and XFmode */
1371 2, /* cost of moving MMX register */
1372 {4, 4}, /* cost of loading MMX registers
1373 in SImode and DImode */
1374 {4, 4}, /* cost of storing MMX registers
1375 in SImode and DImode */
1376 2, /* cost of moving SSE register */
1377 {4, 4, 4}, /* cost of loading SSE registers
1378 in SImode, DImode and TImode */
1379 {4, 4, 4}, /* cost of storing SSE registers
1380 in SImode, DImode and TImode */
1381 2, /* MMX or SSE register to integer */
1382 /* On K8:
1383 MOVD reg64, xmmreg Double FSTORE 4
1384 MOVD reg32, xmmreg Double FSTORE 4
1385 On AMDFAM10:
1386 MOVD reg64, xmmreg Double FADD 3
1387 1/1 1/1
1388 MOVD reg32, xmmreg Double FADD 3
1389 1/1 1/1 */
1390 16, /* size of l1 cache. */
1391 2048, /* size of l2 cache. */
1392 64, /* size of prefetch block */
 1393 /* New AMD processors never drop prefetches; if they cannot be performed
 1394 immediately, they are queued. We set the number of simultaneous prefetches
 1395 to a large constant to reflect this (leaving the number of prefetches
 1396 completely unlimited is probably not a good idea either, as their execution
 1397 also takes some time). */
1398 100, /* number of parallel prefetches */
1399 2, /* Branch cost */
1400 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1401 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1402 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1403 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1404 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1405 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1406
 1407 /* BDVER2 has an optimized REP instruction for medium sized blocks, but for
 1408 very small blocks it is better to use a loop. For large blocks, a libcall
 1409 can do nontemporal accesses and beat the inline code considerably. */
1410 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1411 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1412 {{libcall, {{8, loop}, {24, unrolled_loop},
1413 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1414 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1415 6, /* scalar_stmt_cost. */
1416 4, /* scalar load_cost. */
1417 4, /* scalar_store_cost. */
1418 6, /* vec_stmt_cost. */
1419 0, /* vec_to_scalar_cost. */
1420 2, /* scalar_to_vec_cost. */
1421 4, /* vec_align_load_cost. */
1422 4, /* vec_unalign_load_cost. */
1423 4, /* vec_store_cost. */
1424 2, /* cond_taken_branch_cost. */
1425 1, /* cond_not_taken_branch_cost. */
1426 };
1427
1428 struct processor_costs btver1_cost = {
1429 COSTS_N_INSNS (1), /* cost of an add instruction */
1430 COSTS_N_INSNS (2), /* cost of a lea instruction */
1431 COSTS_N_INSNS (1), /* variable shift costs */
1432 COSTS_N_INSNS (1), /* constant shift costs */
1433 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1434 COSTS_N_INSNS (4), /* HI */
1435 COSTS_N_INSNS (3), /* SI */
1436 COSTS_N_INSNS (4), /* DI */
1437 COSTS_N_INSNS (5)}, /* other */
1438 0, /* cost of multiply per each bit set */
1439 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1440 COSTS_N_INSNS (35), /* HI */
1441 COSTS_N_INSNS (51), /* SI */
1442 COSTS_N_INSNS (83), /* DI */
1443 COSTS_N_INSNS (83)}, /* other */
1444 COSTS_N_INSNS (1), /* cost of movsx */
1445 COSTS_N_INSNS (1), /* cost of movzx */
1446 8, /* "large" insn */
1447 9, /* MOVE_RATIO */
1448 4, /* cost for loading QImode using movzbl */
1449 {3, 4, 3}, /* cost of loading integer registers
1450 in QImode, HImode and SImode.
1451 Relative to reg-reg move (2). */
1452 {3, 4, 3}, /* cost of storing integer registers */
1453 4, /* cost of reg,reg fld/fst */
1454 {4, 4, 12}, /* cost of loading fp registers
1455 in SFmode, DFmode and XFmode */
1456 {6, 6, 8}, /* cost of storing fp registers
1457 in SFmode, DFmode and XFmode */
1458 2, /* cost of moving MMX register */
1459 {3, 3}, /* cost of loading MMX registers
1460 in SImode and DImode */
1461 {4, 4}, /* cost of storing MMX registers
1462 in SImode and DImode */
1463 2, /* cost of moving SSE register */
1464 {4, 4, 3}, /* cost of loading SSE registers
1465 in SImode, DImode and TImode */
1466 {4, 4, 5}, /* cost of storing SSE registers
1467 in SImode, DImode and TImode */
1468 3, /* MMX or SSE register to integer */
1469 /* On K8:
1470 MOVD reg64, xmmreg Double FSTORE 4
1471 MOVD reg32, xmmreg Double FSTORE 4
1472 On AMDFAM10:
1473 MOVD reg64, xmmreg Double FADD 3
1474 1/1 1/1
1475 MOVD reg32, xmmreg Double FADD 3
1476 1/1 1/1 */
1477 32, /* size of l1 cache. */
1478 512, /* size of l2 cache. */
1479 64, /* size of prefetch block */
1480 100, /* number of parallel prefetches */
1481 2, /* Branch cost */
1482 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1483 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1484 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1485 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1486 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1487 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1488
 1489 /* BTVER1 has an optimized REP instruction for medium sized blocks, but for
 1490 very small blocks it is better to use a loop. For large blocks, a libcall can
 1491 do nontemporal accesses and beat the inline code considerably. */
1492 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1493 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1494 {{libcall, {{8, loop}, {24, unrolled_loop},
1495 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1496 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1497 4, /* scalar_stmt_cost. */
1498 2, /* scalar load_cost. */
1499 2, /* scalar_store_cost. */
1500 6, /* vec_stmt_cost. */
1501 0, /* vec_to_scalar_cost. */
1502 2, /* scalar_to_vec_cost. */
1503 2, /* vec_align_load_cost. */
1504 2, /* vec_unalign_load_cost. */
1505 2, /* vec_store_cost. */
1506 2, /* cond_taken_branch_cost. */
1507 1, /* cond_not_taken_branch_cost. */
1508 };
1509
1510 static const
1511 struct processor_costs pentium4_cost = {
1512 COSTS_N_INSNS (1), /* cost of an add instruction */
1513 COSTS_N_INSNS (3), /* cost of a lea instruction */
1514 COSTS_N_INSNS (4), /* variable shift costs */
1515 COSTS_N_INSNS (4), /* constant shift costs */
1516 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1517 COSTS_N_INSNS (15), /* HI */
1518 COSTS_N_INSNS (15), /* SI */
1519 COSTS_N_INSNS (15), /* DI */
1520 COSTS_N_INSNS (15)}, /* other */
1521 0, /* cost of multiply per each bit set */
1522 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1523 COSTS_N_INSNS (56), /* HI */
1524 COSTS_N_INSNS (56), /* SI */
1525 COSTS_N_INSNS (56), /* DI */
1526 COSTS_N_INSNS (56)}, /* other */
1527 COSTS_N_INSNS (1), /* cost of movsx */
1528 COSTS_N_INSNS (1), /* cost of movzx */
1529 16, /* "large" insn */
1530 6, /* MOVE_RATIO */
1531 2, /* cost for loading QImode using movzbl */
1532 {4, 5, 4}, /* cost of loading integer registers
1533 in QImode, HImode and SImode.
1534 Relative to reg-reg move (2). */
1535 {2, 3, 2}, /* cost of storing integer registers */
1536 2, /* cost of reg,reg fld/fst */
1537 {2, 2, 6}, /* cost of loading fp registers
1538 in SFmode, DFmode and XFmode */
1539 {4, 4, 6}, /* cost of storing fp registers
1540 in SFmode, DFmode and XFmode */
1541 2, /* cost of moving MMX register */
1542 {2, 2}, /* cost of loading MMX registers
1543 in SImode and DImode */
1544 {2, 2}, /* cost of storing MMX registers
1545 in SImode and DImode */
1546 12, /* cost of moving SSE register */
1547 {12, 12, 12}, /* cost of loading SSE registers
1548 in SImode, DImode and TImode */
1549 {2, 2, 8}, /* cost of storing SSE registers
1550 in SImode, DImode and TImode */
1551 10, /* MMX or SSE register to integer */
1552 8, /* size of l1 cache. */
1553 256, /* size of l2 cache. */
1554 64, /* size of prefetch block */
1555 6, /* number of parallel prefetches */
1556 2, /* Branch cost */
1557 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1558 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1559 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1560 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1561 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1562 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1563 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1564 DUMMY_STRINGOP_ALGS},
1565 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1566 {-1, libcall}}},
1567 DUMMY_STRINGOP_ALGS},
1568 1, /* scalar_stmt_cost. */
1569 1, /* scalar load_cost. */
1570 1, /* scalar_store_cost. */
1571 1, /* vec_stmt_cost. */
1572 1, /* vec_to_scalar_cost. */
1573 1, /* scalar_to_vec_cost. */
1574 1, /* vec_align_load_cost. */
1575 2, /* vec_unalign_load_cost. */
1576 1, /* vec_store_cost. */
1577 3, /* cond_taken_branch_cost. */
1578 1, /* cond_not_taken_branch_cost. */
1579 };
1580
1581 static const
1582 struct processor_costs nocona_cost = {
1583 COSTS_N_INSNS (1), /* cost of an add instruction */
1584 COSTS_N_INSNS (1), /* cost of a lea instruction */
1585 COSTS_N_INSNS (1), /* variable shift costs */
1586 COSTS_N_INSNS (1), /* constant shift costs */
1587 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1588 COSTS_N_INSNS (10), /* HI */
1589 COSTS_N_INSNS (10), /* SI */
1590 COSTS_N_INSNS (10), /* DI */
1591 COSTS_N_INSNS (10)}, /* other */
1592 0, /* cost of multiply per each bit set */
1593 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1594 COSTS_N_INSNS (66), /* HI */
1595 COSTS_N_INSNS (66), /* SI */
1596 COSTS_N_INSNS (66), /* DI */
1597 COSTS_N_INSNS (66)}, /* other */
1598 COSTS_N_INSNS (1), /* cost of movsx */
1599 COSTS_N_INSNS (1), /* cost of movzx */
1600 16, /* "large" insn */
1601 17, /* MOVE_RATIO */
1602 4, /* cost for loading QImode using movzbl */
1603 {4, 4, 4}, /* cost of loading integer registers
1604 in QImode, HImode and SImode.
1605 Relative to reg-reg move (2). */
1606 {4, 4, 4}, /* cost of storing integer registers */
1607 3, /* cost of reg,reg fld/fst */
1608 {12, 12, 12}, /* cost of loading fp registers
1609 in SFmode, DFmode and XFmode */
1610 {4, 4, 4}, /* cost of storing fp registers
1611 in SFmode, DFmode and XFmode */
1612 6, /* cost of moving MMX register */
1613 {12, 12}, /* cost of loading MMX registers
1614 in SImode and DImode */
1615 {12, 12}, /* cost of storing MMX registers
1616 in SImode and DImode */
1617 6, /* cost of moving SSE register */
1618 {12, 12, 12}, /* cost of loading SSE registers
1619 in SImode, DImode and TImode */
1620 {12, 12, 12}, /* cost of storing SSE registers
1621 in SImode, DImode and TImode */
1622 8, /* MMX or SSE register to integer */
1623 8, /* size of l1 cache. */
1624 1024, /* size of l2 cache. */
1625 128, /* size of prefetch block */
1626 8, /* number of parallel prefetches */
1627 1, /* Branch cost */
1628 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1629 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1630 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1631 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1632 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1633 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1634 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1635 {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
1636 {100000, unrolled_loop}, {-1, libcall}}}},
1637 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1638 {-1, libcall}}},
1639 {libcall, {{24, loop}, {64, unrolled_loop},
1640 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1641 1, /* scalar_stmt_cost. */
1642 1, /* scalar load_cost. */
1643 1, /* scalar_store_cost. */
1644 1, /* vec_stmt_cost. */
1645 1, /* vec_to_scalar_cost. */
1646 1, /* scalar_to_vec_cost. */
1647 1, /* vec_align_load_cost. */
1648 2, /* vec_unalign_load_cost. */
1649 1, /* vec_store_cost. */
1650 3, /* cond_taken_branch_cost. */
1651 1, /* cond_not_taken_branch_cost. */
1652 };
1653
1654 static const
1655 struct processor_costs atom_cost = {
1656 COSTS_N_INSNS (1), /* cost of an add instruction */
1657 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1658 COSTS_N_INSNS (1), /* variable shift costs */
1659 COSTS_N_INSNS (1), /* constant shift costs */
1660 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1661 COSTS_N_INSNS (4), /* HI */
1662 COSTS_N_INSNS (3), /* SI */
1663 COSTS_N_INSNS (4), /* DI */
1664 COSTS_N_INSNS (2)}, /* other */
1665 0, /* cost of multiply per each bit set */
1666 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1667 COSTS_N_INSNS (26), /* HI */
1668 COSTS_N_INSNS (42), /* SI */
1669 COSTS_N_INSNS (74), /* DI */
1670 COSTS_N_INSNS (74)}, /* other */
1671 COSTS_N_INSNS (1), /* cost of movsx */
1672 COSTS_N_INSNS (1), /* cost of movzx */
1673 8, /* "large" insn */
1674 17, /* MOVE_RATIO */
1675 4, /* cost for loading QImode using movzbl */
1676 {4, 4, 4}, /* cost of loading integer registers
1677 in QImode, HImode and SImode.
1678 Relative to reg-reg move (2). */
1679 {4, 4, 4}, /* cost of storing integer registers */
1680 4, /* cost of reg,reg fld/fst */
1681 {12, 12, 12}, /* cost of loading fp registers
1682 in SFmode, DFmode and XFmode */
1683 {6, 6, 8}, /* cost of storing fp registers
1684 in SFmode, DFmode and XFmode */
1685 2, /* cost of moving MMX register */
1686 {8, 8}, /* cost of loading MMX registers
1687 in SImode and DImode */
1688 {8, 8}, /* cost of storing MMX registers
1689 in SImode and DImode */
1690 2, /* cost of moving SSE register */
1691 {8, 8, 8}, /* cost of loading SSE registers
1692 in SImode, DImode and TImode */
1693 {8, 8, 8}, /* cost of storing SSE registers
1694 in SImode, DImode and TImode */
1695 5, /* MMX or SSE register to integer */
1696 32, /* size of l1 cache. */
1697 256, /* size of l2 cache. */
1698 64, /* size of prefetch block */
1699 6, /* number of parallel prefetches */
1700 3, /* Branch cost */
1701 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1702 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1703 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1704 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1705 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1706 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1707 {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
1708 {libcall, {{32, loop}, {64, rep_prefix_4_byte},
1709 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1710 {{libcall, {{8, loop}, {15, unrolled_loop},
1711 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1712 {libcall, {{24, loop}, {32, unrolled_loop},
1713 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1714 1, /* scalar_stmt_cost. */
1715 1, /* scalar load_cost. */
1716 1, /* scalar_store_cost. */
1717 1, /* vec_stmt_cost. */
1718 1, /* vec_to_scalar_cost. */
1719 1, /* scalar_to_vec_cost. */
1720 1, /* vec_align_load_cost. */
1721 2, /* vec_unalign_load_cost. */
1722 1, /* vec_store_cost. */
1723 3, /* cond_taken_branch_cost. */
1724 1, /* cond_not_taken_branch_cost. */
1725 };
1726
1727 /* Generic64 should produce code tuned for Nocona and K8. */
1728 static const
1729 struct processor_costs generic64_cost = {
1730 COSTS_N_INSNS (1), /* cost of an add instruction */
1731 /* On all chips taken into consideration lea is 2 cycles or more. With
1732 this cost, however, our current implementation of synth_mult results in
1733 the use of unnecessary temporary registers, causing regressions on several
1734 SPECfp benchmarks. */
1735 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1736 COSTS_N_INSNS (1), /* variable shift costs */
1737 COSTS_N_INSNS (1), /* constant shift costs */
1738 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1739 COSTS_N_INSNS (4), /* HI */
1740 COSTS_N_INSNS (3), /* SI */
1741 COSTS_N_INSNS (4), /* DI */
1742 COSTS_N_INSNS (2)}, /* other */
1743 0, /* cost of multiply per each bit set */
1744 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1745 COSTS_N_INSNS (26), /* HI */
1746 COSTS_N_INSNS (42), /* SI */
1747 COSTS_N_INSNS (74), /* DI */
1748 COSTS_N_INSNS (74)}, /* other */
1749 COSTS_N_INSNS (1), /* cost of movsx */
1750 COSTS_N_INSNS (1), /* cost of movzx */
1751 8, /* "large" insn */
1752 17, /* MOVE_RATIO */
1753 4, /* cost for loading QImode using movzbl */
1754 {4, 4, 4}, /* cost of loading integer registers
1755 in QImode, HImode and SImode.
1756 Relative to reg-reg move (2). */
1757 {4, 4, 4}, /* cost of storing integer registers */
1758 4, /* cost of reg,reg fld/fst */
1759 {12, 12, 12}, /* cost of loading fp registers
1760 in SFmode, DFmode and XFmode */
1761 {6, 6, 8}, /* cost of storing fp registers
1762 in SFmode, DFmode and XFmode */
1763 2, /* cost of moving MMX register */
1764 {8, 8}, /* cost of loading MMX registers
1765 in SImode and DImode */
1766 {8, 8}, /* cost of storing MMX registers
1767 in SImode and DImode */
1768 2, /* cost of moving SSE register */
1769 {8, 8, 8}, /* cost of loading SSE registers
1770 in SImode, DImode and TImode */
1771 {8, 8, 8}, /* cost of storing SSE registers
1772 in SImode, DImode and TImode */
1773 5, /* MMX or SSE register to integer */
1774 32, /* size of l1 cache. */
1775 512, /* size of l2 cache. */
1776 64, /* size of prefetch block */
1777 6, /* number of parallel prefetches */
1778 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1779 value is increased to the perhaps more appropriate value of 5. */
1780 3, /* Branch cost */
1781 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1782 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1783 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1784 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1785 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1786 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1787 {DUMMY_STRINGOP_ALGS,
1788 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1789 {DUMMY_STRINGOP_ALGS,
1790 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1791 1, /* scalar_stmt_cost. */
1792 1, /* scalar load_cost. */
1793 1, /* scalar_store_cost. */
1794 1, /* vec_stmt_cost. */
1795 1, /* vec_to_scalar_cost. */
1796 1, /* scalar_to_vec_cost. */
1797 1, /* vec_align_load_cost. */
1798 2, /* vec_unalign_load_cost. */
1799 1, /* vec_store_cost. */
1800 3, /* cond_taken_branch_cost. */
1801 1, /* cond_not_taken_branch_cost. */
1802 };
1803
1804 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1805 Athlon and K8. */
1806 static const
1807 struct processor_costs generic32_cost = {
1808 COSTS_N_INSNS (1), /* cost of an add instruction */
1809 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1810 COSTS_N_INSNS (1), /* variable shift costs */
1811 COSTS_N_INSNS (1), /* constant shift costs */
1812 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1813 COSTS_N_INSNS (4), /* HI */
1814 COSTS_N_INSNS (3), /* SI */
1815 COSTS_N_INSNS (4), /* DI */
1816 COSTS_N_INSNS (2)}, /* other */
1817 0, /* cost of multiply per each bit set */
1818 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1819 COSTS_N_INSNS (26), /* HI */
1820 COSTS_N_INSNS (42), /* SI */
1821 COSTS_N_INSNS (74), /* DI */
1822 COSTS_N_INSNS (74)}, /* other */
1823 COSTS_N_INSNS (1), /* cost of movsx */
1824 COSTS_N_INSNS (1), /* cost of movzx */
1825 8, /* "large" insn */
1826 17, /* MOVE_RATIO */
1827 4, /* cost for loading QImode using movzbl */
1828 {4, 4, 4}, /* cost of loading integer registers
1829 in QImode, HImode and SImode.
1830 Relative to reg-reg move (2). */
1831 {4, 4, 4}, /* cost of storing integer registers */
1832 4, /* cost of reg,reg fld/fst */
1833 {12, 12, 12}, /* cost of loading fp registers
1834 in SFmode, DFmode and XFmode */
1835 {6, 6, 8}, /* cost of storing fp registers
1836 in SFmode, DFmode and XFmode */
1837 2, /* cost of moving MMX register */
1838 {8, 8}, /* cost of loading MMX registers
1839 in SImode and DImode */
1840 {8, 8}, /* cost of storing MMX registers
1841 in SImode and DImode */
1842 2, /* cost of moving SSE register */
1843 {8, 8, 8}, /* cost of loading SSE registers
1844 in SImode, DImode and TImode */
1845 {8, 8, 8}, /* cost of storing SSE registers
1846 in SImode, DImode and TImode */
1847 5, /* MMX or SSE register to integer */
1848 32, /* size of l1 cache. */
1849 256, /* size of l2 cache. */
1850 64, /* size of prefetch block */
1851 6, /* number of parallel prefetches */
1852 3, /* Branch cost */
1853 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1854 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1855 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1856 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1857 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1858 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1859 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1860 DUMMY_STRINGOP_ALGS},
1861 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1862 DUMMY_STRINGOP_ALGS},
1863 1, /* scalar_stmt_cost. */
1864 1, /* scalar load_cost. */
1865 1, /* scalar_store_cost. */
1866 1, /* vec_stmt_cost. */
1867 1, /* vec_to_scalar_cost. */
1868 1, /* scalar_to_vec_cost. */
1869 1, /* vec_align_load_cost. */
1870 2, /* vec_unalign_load_cost. */
1871 1, /* vec_store_cost. */
1872 3, /* cond_taken_branch_cost. */
1873 1, /* cond_not_taken_branch_cost. */
1874 };
1875
1876 const struct processor_costs *ix86_cost = &pentium_cost;
1877
1878 /* Processor feature/optimization bitmasks. */
1879 #define m_386 (1<<PROCESSOR_I386)
1880 #define m_486 (1<<PROCESSOR_I486)
1881 #define m_PENT (1<<PROCESSOR_PENTIUM)
1882 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1883 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1884 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1885 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1886 #define m_CORE2_32 (1<<PROCESSOR_CORE2_32)
1887 #define m_CORE2_64 (1<<PROCESSOR_CORE2_64)
1888 #define m_COREI7_32 (1<<PROCESSOR_COREI7_32)
1889 #define m_COREI7_64 (1<<PROCESSOR_COREI7_64)
1890 #define m_COREI7 (m_COREI7_32 | m_COREI7_64)
1891 #define m_CORE2I7_32 (m_CORE2_32 | m_COREI7_32)
1892 #define m_CORE2I7_64 (m_CORE2_64 | m_COREI7_64)
1893 #define m_CORE2I7 (m_CORE2I7_32 | m_CORE2I7_64)
1894 #define m_ATOM (1<<PROCESSOR_ATOM)
1895
1896 #define m_GEODE (1<<PROCESSOR_GEODE)
1897 #define m_K6 (1<<PROCESSOR_K6)
1898 #define m_K6_GEODE (m_K6 | m_GEODE)
1899 #define m_K8 (1<<PROCESSOR_K8)
1900 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1901 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1902 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1903 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1904 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1905 #define m_BDVER (m_BDVER1 | m_BDVER2)
1906 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1907 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1)
1908
1909 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1910 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1911
1912 /* Generic instruction choice should be a common subset of the supported CPUs
1913 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1914 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1915
1916 /* Feature tests against the various tunings. */
1917 unsigned char ix86_tune_features[X86_TUNE_LAST];
1918
1919 /* Feature tests against the various tunings used to create ix86_tune_features
1920 based on the processor mask. */
1921 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1922 /* X86_TUNE_USE_LEAVE: LEAVE does not affect Nocona SPEC2000 results
1923 negatively, so enabling it for Generic64 seems like a good code-size
1924 tradeoff. We can't enable it for 32-bit generic because it does not
1925 work well with PPro-based chips. */
1926 m_386 | m_CORE2I7_64 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
1927
1928 /* X86_TUNE_PUSH_MEMORY */
1929 m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1930
1931 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1932 m_486 | m_PENT,
1933
1934 /* X86_TUNE_UNROLL_STRLEN */
1935 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE2I7 | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
1936
1937 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were added to P4 based
1938 on simulation results. But after P4 shipped, no performance benefit
1939 was observed from branch hints; they also increase the code size.
1940 As a result, icc never generates branch hints. */
1941 0,
1942
1943 /* X86_TUNE_DOUBLE_WITH_ADD */
1944 ~m_386,
1945
1946 /* X86_TUNE_USE_SAHF */
1947 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC,
1948
1949 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1950 partial dependencies. */
1951 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1952
1953 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1954 register stalls on the Generic32 compilation setting as well. However,
1955 in the current implementation partial register stalls are not eliminated
1956 very well - they can be introduced via subregs synthesized by combine
1957 and can happen in caller/callee saving sequences. Because this option
1958 pays back little on PPro-based chips and conflicts with the partial-register
1959 dependencies used by Athlon/P4-based chips, it is better to leave it off
1960 for generic32 for now. */
1961 m_PPRO,
1962
1963 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1964 m_CORE2I7 | m_GENERIC,
1965
1966 /* X86_TUNE_USE_HIMODE_FIOP */
1967 m_386 | m_486 | m_K6_GEODE,
1968
1969 /* X86_TUNE_USE_SIMODE_FIOP */
1970 ~(m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
1971
1972 /* X86_TUNE_USE_MOV0 */
1973 m_K6,
1974
1975 /* X86_TUNE_USE_CLTD */
1976 ~(m_PENT | m_CORE2I7 | m_ATOM | m_K6 | m_GENERIC),
1977
1978 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1979 m_PENT4,
1980
1981 /* X86_TUNE_SPLIT_LONG_MOVES */
1982 m_PPRO,
1983
1984 /* X86_TUNE_READ_MODIFY_WRITE */
1985 ~m_PENT,
1986
1987 /* X86_TUNE_READ_MODIFY */
1988 ~(m_PENT | m_PPRO),
1989
1990 /* X86_TUNE_PROMOTE_QIMODE */
1991 m_386 | m_486 | m_PENT | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1992
1993 /* X86_TUNE_FAST_PREFIX */
1994 ~(m_386 | m_486 | m_PENT),
1995
1996 /* X86_TUNE_SINGLE_STRINGOP */
1997 m_386 | m_P4_NOCONA,
1998
1999 /* X86_TUNE_QIMODE_MATH */
2000 ~0,
2001
2002 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
2003 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
2004 might be considered for Generic32 if our scheme for avoiding partial
2005 stalls was more effective. */
2006 ~m_PPRO,
2007
2008 /* X86_TUNE_PROMOTE_QI_REGS */
2009 0,
2010
2011 /* X86_TUNE_PROMOTE_HI_REGS */
2012 m_PPRO,
2013
2014 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
2015 over esp addition. */
2016 m_386 | m_486 | m_PENT | m_PPRO,
2017
2018 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
2019 over esp addition. */
2020 m_PENT,
2021
2022 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
2023 over esp subtraction. */
2024 m_386 | m_486 | m_PENT | m_K6_GEODE,
2025
2026 /* X86_TUNE_DOUBLE_PUSH. Enable if double push insn is preferred
2027 over esp subtraction. */
2028 m_PENT | m_K6_GEODE,
2029
2030 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
2031 for DFmode copies */
2032 ~(m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
2033
2034 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
2035 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2036
2037 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
2038 conflict here between PPro/Pentium4-based chips that treat 128bit
2039 SSE registers as single units and K8-based chips that divide SSE
2040 registers into two 64bit halves. This knob promotes all store destinations
2041 to 128bit to allow register renaming on 128bit SSE units, but usually
2042 results in one extra micro-op on 64bit SSE units. Experimental results
2043 show that disabling this option on P4 brings over a 20% SPECfp regression,
2044 while enabling it on K8 brings roughly a 2.4% regression that can be partly
2045 masked by careful scheduling of moves. */
2046 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
2047
2048 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
2049 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER1,
2050
2051 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
2052 m_COREI7 | m_BDVER,
2053
2054 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
2055 m_BDVER,
2056
2057 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
2058 are resolved on SSE register parts instead of whole registers, so we may
2059 maintain just the lower part of scalar values in the proper format, leaving
2060 the upper part undefined. */
2061 m_ATHLON_K8,
2062
2063 /* X86_TUNE_SSE_TYPELESS_STORES */
2064 m_AMD_MULTIPLE,
2065
2066 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
2067 m_PPRO | m_P4_NOCONA,
2068
2069 /* X86_TUNE_MEMORY_MISMATCH_STALL */
2070 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2071
2072 /* X86_TUNE_PROLOGUE_USING_MOVE */
2073 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2074
2075 /* X86_TUNE_EPILOGUE_USING_MOVE */
2076 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2077
2078 /* X86_TUNE_SHIFT1 */
2079 ~m_486,
2080
2081 /* X86_TUNE_USE_FFREEP */
2082 m_AMD_MULTIPLE,
2083
2084 /* X86_TUNE_INTER_UNIT_MOVES */
2085 ~(m_AMD_MULTIPLE | m_GENERIC),
2086
2087 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
2088 ~(m_AMDFAM10 | m_BDVER),
2089
2090 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
2091 than 4 branch instructions in the 16 byte window. */
2092 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2093
2094 /* X86_TUNE_SCHEDULE */
2095 m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2096
2097 /* X86_TUNE_USE_BT */
2098 m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2099
2100 /* X86_TUNE_USE_INCDEC */
2101 ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC),
2102
2103 /* X86_TUNE_PAD_RETURNS */
2104 m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC,
2105
2106 /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short functions. */
2107 m_ATOM,
2108
2109 /* X86_TUNE_EXT_80387_CONSTANTS */
2110 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
2111
2112 /* X86_TUNE_SHORTEN_X87_SSE */
2113 ~m_K8,
2114
2115 /* X86_TUNE_AVOID_VECTOR_DECODE */
2116 m_CORE2I7_64 | m_K8 | m_GENERIC64,
2117
2118 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
2119 and SImode multiplies, but the 386 and 486 do HImode multiplies faster. */
2120 ~(m_386 | m_486),
2121
2122 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
2123 vector path on AMD machines. */
2124 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2125
2126 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
2127 machines. */
2128 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2129
2130 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
2131 than a MOV. */
2132 m_PENT,
2133
2134 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
2135 but one byte longer. */
2136 m_PENT,
2137
2138 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
2139 operand that cannot be represented using a modRM byte. The XOR
2140 replacement is long decoded, so this split helps here as well. */
2141 m_K6,
2142
2143 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
2144 from FP to FP. */
2145 m_CORE2I7 | m_AMDFAM10 | m_GENERIC,
2146
2147 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
2148 from integer to FP. */
2149 m_AMDFAM10,
2150
2151 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
2152 with a subsequent conditional jump instruction into a single
2153 compare-and-branch uop. */
2154 m_BDVER,
2155
2156 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2157 will impact LEA instruction selection. */
2158 m_ATOM,
2159
2160 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2161 instructions. */
2162 ~m_ATOM,
2163
2164 /* X86_SOFTARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2165 at -O3. For the moment, the prefetching seems badly tuned for Intel
2166 chips. */
2167 m_K6_GEODE | m_AMD_MULTIPLE,
2168
2169 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2170 the auto-vectorizer. */
2171 m_BDVER,
2172
2173 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2174 during reassociation of integer computation. */
2175 m_ATOM,
2176
2177 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2178 during reassociation of fp computation. */
2179 m_ATOM
2180 };
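/* Note: ix86_tune_features[] is filled from the table above in
   ix86_option_override_internal, by testing each entry against the bit
   (1 << ix86_tune) of the processor selected with -mtune.  */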
2181
2182 /* Feature tests against the various architecture variations. */
2183 unsigned char ix86_arch_features[X86_ARCH_LAST];
2184
2185 /* Feature tests against the various architecture variations, used to create
2186 ix86_arch_features based on the processor mask. */
2187 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2188 /* X86_ARCH_CMOVE: Conditional move was added for pentiumpro. */
2189 ~(m_386 | m_486 | m_PENT | m_K6),
2190
2191 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2192 ~m_386,
2193
2194 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2195 ~(m_386 | m_486),
2196
2197 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2198 ~m_386,
2199
2200 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2201 ~m_386,
2202 };
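/* ix86_arch_features[] is derived from the table above in the same way,
   using the (1 << ix86_arch) bit of the -march selection.  */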
2203
2204 static const unsigned int x86_accumulate_outgoing_args
2205 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC;
2206
2207 static const unsigned int x86_arch_always_fancy_math_387
2208 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2209
2210 static const unsigned int x86_avx256_split_unaligned_load
2211 = m_COREI7 | m_GENERIC;
2212
2213 static const unsigned int x86_avx256_split_unaligned_store
2214 = m_COREI7 | m_BDVER | m_GENERIC;
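/* The masks above are not expanded into feature arrays; they are tested
   directly against the selected processor bit in
   ix86_option_override_internal when deciding the corresponding default
   target flags, unless those flags were set explicitly on the command
   line.  */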
2215
2216 /* In case the average insn count for a single function invocation is
2217 lower than this constant, emit fast (but longer) prologue and
2218 epilogue code. */
2219 #define FAST_PROLOGUE_INSN_COUNT 20
2220
2221 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively. */
2222 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2223 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2224 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2225
2226 /* Array of the smallest class containing reg number REGNO, indexed by
2227 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2228
2229 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2230 {
2231 /* ax, dx, cx, bx */
2232 AREG, DREG, CREG, BREG,
2233 /* si, di, bp, sp */
2234 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2235 /* FP registers */
2236 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2237 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2238 /* arg pointer */
2239 NON_Q_REGS,
2240 /* flags, fpsr, fpcr, frame */
2241 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2242 /* SSE registers */
2243 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2244 SSE_REGS, SSE_REGS,
2245 /* MMX registers */
2246 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2247 MMX_REGS, MMX_REGS,
2248 /* REX registers */
2249 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2250 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2251 /* SSE REX registers */
2252 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2253 SSE_REGS, SSE_REGS,
2254 };
2255
2256 /* The "default" register map used in 32bit mode. */
2257
2258 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2259 {
2260 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2261 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2262 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2263 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2264 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2265 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2266 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2267 };
2268
2269 /* The "default" register map used in 64bit mode. */
2270
2271 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2272 {
2273 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2274 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2275 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2276 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2277 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2278 8, 9, 10, 11, 12, 13, 14, 15, /* extended integer registers */
2279 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2280 };
2281
2282 /* Define the register numbers to be used in Dwarf debugging information.
2283 The SVR4 reference port C compiler uses the following register numbers
2284 in its Dwarf output code:
2285 0 for %eax (gcc regno = 0)
2286 1 for %ecx (gcc regno = 2)
2287 2 for %edx (gcc regno = 1)
2288 3 for %ebx (gcc regno = 3)
2289 4 for %esp (gcc regno = 7)
2290 5 for %ebp (gcc regno = 6)
2291 6 for %esi (gcc regno = 4)
2292 7 for %edi (gcc regno = 5)
2293 The following three DWARF register numbers are never generated by
2294 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2295 believes these numbers have these meanings.
2296 8 for %eip (no gcc equivalent)
2297 9 for %eflags (gcc regno = 17)
2298 10 for %trapno (no gcc equivalent)
2299 It is not at all clear how we should number the FP stack registers
2300 for the x86 architecture. If the version of SDB on x86/svr4 were
2301 a bit less brain dead with respect to floating-point then we would
2302 have a precedent to follow with respect to DWARF register numbers
2303 for x86 FP registers, but the SDB on x86/svr4 is so completely
2304 broken with respect to FP registers that it is hardly worth thinking
2305 of it as something to strive for compatibility with.
2306 The version of x86/svr4 SDB I have at the moment does (partially)
2307 seem to believe that DWARF register number 11 is associated with
2308 the x86 register %st(0), but that's about all. Higher DWARF
2309 register numbers don't seem to be associated with anything in
2310 particular, and even for DWARF regno 11, SDB only seems to under-
2311 stand that it should say that a variable lives in %st(0) (when
2312 asked via an `=' command) if we said it was in DWARF regno 11,
2313 but SDB still prints garbage when asked for the value of the
2314 variable in question (via a `/' command).
2315 (Also note that the labels SDB prints for various FP stack regs
2316 when doing an `x' command are all wrong.)
2317 Note that these problems generally don't affect the native SVR4
2318 C compiler because it doesn't allow the use of -O with -g and
2319 because when it is *not* optimizing, it allocates a memory
2320 location for each floating-point variable, and the memory
2321 location is what gets described in the DWARF AT_location
2322 attribute for the variable in question.
2323 Regardless of the severe mental illness of the x86/svr4 SDB, we
2324 do something sensible here and we use the following DWARF
2325 register numbers. Note that these are all stack-top-relative
2326 numbers.
2327 11 for %st(0) (gcc regno = 8)
2328 12 for %st(1) (gcc regno = 9)
2329 13 for %st(2) (gcc regno = 10)
2330 14 for %st(3) (gcc regno = 11)
2331 15 for %st(4) (gcc regno = 12)
2332 16 for %st(5) (gcc regno = 13)
2333 17 for %st(6) (gcc regno = 14)
2334 18 for %st(7) (gcc regno = 15)
2335 */
2336 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2337 {
2338 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2339 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2340 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2341 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2342 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2343 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2344 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2345 };
2346
2347 /* Define parameter passing and return registers. */
2348
2349 static int const x86_64_int_parameter_registers[6] =
2350 {
2351 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2352 };
2353
2354 static int const x86_64_ms_abi_int_parameter_registers[4] =
2355 {
2356 CX_REG, DX_REG, R8_REG, R9_REG
2357 };
2358
2359 static int const x86_64_int_return_registers[4] =
2360 {
2361 AX_REG, DX_REG, DI_REG, SI_REG
2362 };
2363
2364 /* Define the structure for the machine field in struct function. */
2365
2366 struct GTY(()) stack_local_entry {
2367 unsigned short mode;
2368 unsigned short n;
2369 rtx rtl;
2370 struct stack_local_entry *next;
2371 };
2372
2373 /* Structure describing stack frame layout.
2374 Stack grows downward:
2375
2376 [arguments]
2377 <- ARG_POINTER
2378 saved pc
2379
2380 saved static chain if ix86_static_chain_on_stack
2381
2382 saved frame pointer if frame_pointer_needed
2383 <- HARD_FRAME_POINTER
2384 [saved regs]
2385 <- regs_save_offset
2386 [padding0]
2387
2388 [saved SSE regs]
2389 <- sse_regs_save_offset
2390 [padding1] |
2391 | <- FRAME_POINTER
2392 [va_arg registers] |
2393 |
2394 [frame] |
2395 |
2396 [padding2] | = to_allocate
2397 <- STACK_POINTER
2398 */
2399 struct ix86_frame
2400 {
2401 int nsseregs;
2402 int nregs;
2403 int va_arg_size;
2404 int red_zone_size;
2405 int outgoing_arguments_size;
2406 HOST_WIDE_INT frame;
2407
2408 /* The offsets relative to ARG_POINTER. */
2409 HOST_WIDE_INT frame_pointer_offset;
2410 HOST_WIDE_INT hard_frame_pointer_offset;
2411 HOST_WIDE_INT stack_pointer_offset;
2412 HOST_WIDE_INT hfp_save_offset;
2413 HOST_WIDE_INT reg_save_offset;
2414 HOST_WIDE_INT sse_reg_save_offset;
2415
2416 /* When save_regs_using_mov is set, emit prologue using
2417 move instead of push instructions. */
2418 bool save_regs_using_mov;
2419 };
2420
2421 /* Which cpu are we scheduling for. */
2422 enum attr_cpu ix86_schedule;
2423
2424 /* Which cpu are we optimizing for. */
2425 enum processor_type ix86_tune;
2426
2427 /* Which instruction set architecture to use. */
2428 enum processor_type ix86_arch;
2429
2430 /* True if the SSE prefetch instruction is not a NOP. */
2431 int x86_prefetch_sse;
2432
2433 /* -mstackrealign option */
2434 static const char ix86_force_align_arg_pointer_string[]
2435 = "force_align_arg_pointer";
2436
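/* Code-generation helper hooks.  These are pointed at either the SImode
   or the DImode insn generators (e.g. gen_addsi3 vs. gen_adddi3) when
   options are processed, depending on whether 64-bit code is being
   generated.  */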
2437 static rtx (*ix86_gen_leave) (void);
2438 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2439 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2440 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2441 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2442 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2443 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2444 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2445 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2446 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2447
2448 /* Preferred alignment for stack boundary in bits. */
2449 unsigned int ix86_preferred_stack_boundary;
2450
2451 /* Alignment for incoming stack boundary in bits specified at
2452 command line. */
2453 static unsigned int ix86_user_incoming_stack_boundary;
2454
2455 /* Default alignment for incoming stack boundary in bits. */
2456 static unsigned int ix86_default_incoming_stack_boundary;
2457
2458 /* Alignment for incoming stack boundary in bits. */
2459 unsigned int ix86_incoming_stack_boundary;
2460
2461 /* Calling abi specific va_list type nodes. */
2462 static GTY(()) tree sysv_va_list_type_node;
2463 static GTY(()) tree ms_va_list_type_node;
2464
2465 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2466 char internal_label_prefix[16];
2467 int internal_label_prefix_len;
2468
2469 /* Fence to use after loop using movnt. */
2470 tree x86_mfence;
2471
2472 /* Register class used for passing a given 64bit part of the argument.
2473 These represent the classes documented by the psABI, with the exception
2474 of the SSESF and SSEDF classes, which are basically the SSE class; gcc just
2475 uses an SFmode or DFmode move instead of DImode to avoid reformatting penalties.
2476
2477 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2478 whenever possible (the upper half then contains only padding). */
2479 enum x86_64_reg_class
2480 {
2481 X86_64_NO_CLASS,
2482 X86_64_INTEGER_CLASS,
2483 X86_64_INTEGERSI_CLASS,
2484 X86_64_SSE_CLASS,
2485 X86_64_SSESF_CLASS,
2486 X86_64_SSEDF_CLASS,
2487 X86_64_SSEUP_CLASS,
2488 X86_64_X87_CLASS,
2489 X86_64_X87UP_CLASS,
2490 X86_64_COMPLEX_X87_CLASS,
2491 X86_64_MEMORY_CLASS
2492 };
2493
2494 #define MAX_CLASSES 4
2495
2496 /* Table of constants used by fldpi, fldln2, etc.... */
2497 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2498 static bool ext_80387_constants_init = 0;
2499
2500 \f
2501 static struct machine_function * ix86_init_machine_status (void);
2502 static rtx ix86_function_value (const_tree, const_tree, bool);
2503 static bool ix86_function_value_regno_p (const unsigned int);
2504 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2505 const_tree);
2506 static rtx ix86_static_chain (const_tree, bool);
2507 static int ix86_function_regparm (const_tree, const_tree);
2508 static void ix86_compute_frame_layout (struct ix86_frame *);
2509 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2510 rtx, rtx, int);
2511 static void ix86_add_new_builtins (HOST_WIDE_INT);
2512 static tree ix86_canonical_va_list_type (tree);
2513 static void predict_jump (int);
2514 static unsigned int split_stack_prologue_scratch_regno (void);
2515 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2516
2517 enum ix86_function_specific_strings
2518 {
2519 IX86_FUNCTION_SPECIFIC_ARCH,
2520 IX86_FUNCTION_SPECIFIC_TUNE,
2521 IX86_FUNCTION_SPECIFIC_MAX
2522 };
2523
2524 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2525 const char *, enum fpmath_unit, bool);
2526 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2527 static void ix86_function_specific_save (struct cl_target_option *);
2528 static void ix86_function_specific_restore (struct cl_target_option *);
2529 static void ix86_function_specific_print (FILE *, int,
2530 struct cl_target_option *);
2531 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2532 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2533 struct gcc_options *);
2534 static bool ix86_can_inline_p (tree, tree);
2535 static void ix86_set_current_function (tree);
2536 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2537
2538 static enum calling_abi ix86_function_abi (const_tree);
2539
2540 \f
2541 #ifndef SUBTARGET32_DEFAULT_CPU
2542 #define SUBTARGET32_DEFAULT_CPU "i386"
2543 #endif
2544
2545 /* The svr4 ABI for the i386 says that records and unions are returned
2546 in memory. */
2547 #ifndef DEFAULT_PCC_STRUCT_RETURN
2548 #define DEFAULT_PCC_STRUCT_RETURN 1
2549 #endif
2550
2551 /* Whether -mtune= or -march= were specified */
2552 static int ix86_tune_defaulted;
2553 static int ix86_arch_specified;
2554
2555 /* Vectorization library interface and handlers. */
2556 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2557
2558 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2559 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2560
2561 /* Processor target table, indexed by processor number */
2562 struct ptt
2563 {
2564 const struct processor_costs *cost; /* Processor costs */
2565 const int align_loop; /* Default alignments. */
2566 const int align_loop_max_skip;
2567 const int align_jump;
2568 const int align_jump_max_skip;
2569 const int align_func;
2570 };
2571
2572 static const struct ptt processor_target_table[PROCESSOR_max] =
2573 {
2574 {&i386_cost, 4, 3, 4, 3, 4},
2575 {&i486_cost, 16, 15, 16, 15, 16},
2576 {&pentium_cost, 16, 7, 16, 7, 16},
2577 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2578 {&geode_cost, 0, 0, 0, 0, 0},
2579 {&k6_cost, 32, 7, 32, 7, 32},
2580 {&athlon_cost, 16, 7, 16, 7, 16},
2581 {&pentium4_cost, 0, 0, 0, 0, 0},
2582 {&k8_cost, 16, 7, 16, 7, 16},
2583 {&nocona_cost, 0, 0, 0, 0, 0},
2584 /* Core 2 32-bit. */
2585 {&generic32_cost, 16, 10, 16, 10, 16},
2586 /* Core 2 64-bit. */
2587 {&generic64_cost, 16, 10, 16, 10, 16},
2588 /* Core i7 32-bit. */
2589 {&generic32_cost, 16, 10, 16, 10, 16},
2590 /* Core i7 64-bit. */
2591 {&generic64_cost, 16, 10, 16, 10, 16},
2592 {&generic32_cost, 16, 7, 16, 7, 16},
2593 {&generic64_cost, 16, 10, 16, 10, 16},
2594 {&amdfam10_cost, 32, 24, 32, 7, 32},
2595 {&bdver1_cost, 32, 24, 32, 7, 32},
2596 {&bdver2_cost, 32, 24, 32, 7, 32},
2597 {&btver1_cost, 32, 24, 32, 7, 32},
2598 {&atom_cost, 16, 15, 16, 7, 16}
2599 };
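/* The rows of processor_target_table above must stay in the same order
   as the processor_type enumeration (i386.h), since the table is indexed
   by processor number.  */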
2600
2601 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2602 {
2603 "generic",
2604 "i386",
2605 "i486",
2606 "pentium",
2607 "pentium-mmx",
2608 "pentiumpro",
2609 "pentium2",
2610 "pentium3",
2611 "pentium4",
2612 "pentium-m",
2613 "prescott",
2614 "nocona",
2615 "core2",
2616 "corei7",
2617 "atom",
2618 "geode",
2619 "k6",
2620 "k6-2",
2621 "k6-3",
2622 "athlon",
2623 "athlon-4",
2624 "k8",
2625 "amdfam10",
2626 "bdver1",
2627 "bdver2",
2628 "btver1"
2629 };
2630 \f
2631 /* Return true if a red-zone is in use. */
2632
2633 static inline bool
2634 ix86_using_red_zone (void)
2635 {
2636 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2637 }
2638 \f
2639 /* Return a string that documents the current -m options. The caller is
2640 responsible for freeing the string. */
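/* For illustration only, a returned string looks roughly like
   "-march=k8 -mtune=generic -m64 -msse2 -mfpmath=sse"; the exact
   contents depend on the active ISA bits, target flags and fpmath
   setting.  */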
2641
2642 static char *
2643 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2644 const char *tune, enum fpmath_unit fpmath,
2645 bool add_nl_p)
2646 {
2647 struct ix86_target_opts
2648 {
2649 const char *option; /* option string */
2650 HOST_WIDE_INT mask; /* isa mask options */
2651 };
2652
2653 /* This table is ordered so that options like -msse4.2 that imply
2654 preceding options are matched first. */
2655 static struct ix86_target_opts isa_opts[] =
2656 {
2657 { "-m64", OPTION_MASK_ISA_64BIT },
2658 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2659 { "-mfma", OPTION_MASK_ISA_FMA },
2660 { "-mxop", OPTION_MASK_ISA_XOP },
2661 { "-mlwp", OPTION_MASK_ISA_LWP },
2662 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2663 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2664 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2665 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2666 { "-msse3", OPTION_MASK_ISA_SSE3 },
2667 { "-msse2", OPTION_MASK_ISA_SSE2 },
2668 { "-msse", OPTION_MASK_ISA_SSE },
2669 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2670 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2671 { "-mmmx", OPTION_MASK_ISA_MMX },
2672 { "-mabm", OPTION_MASK_ISA_ABM },
2673 { "-mbmi", OPTION_MASK_ISA_BMI },
2674 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2675 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2676 { "-mtbm", OPTION_MASK_ISA_TBM },
2677 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2678 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2679 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2680 { "-maes", OPTION_MASK_ISA_AES },
2681 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2682 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2683 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2684 { "-mf16c", OPTION_MASK_ISA_F16C },
2685 };
2686
2687 /* Flag options. */
2688 static struct ix86_target_opts flag_opts[] =
2689 {
2690 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2691 { "-m80387", MASK_80387 },
2692 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2693 { "-malign-double", MASK_ALIGN_DOUBLE },
2694 { "-mcld", MASK_CLD },
2695 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2696 { "-mieee-fp", MASK_IEEE_FP },
2697 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2698 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2699 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2700 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2701 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2702 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2703 { "-mno-red-zone", MASK_NO_RED_ZONE },
2704 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2705 { "-mrecip", MASK_RECIP },
2706 { "-mrtd", MASK_RTD },
2707 { "-msseregparm", MASK_SSEREGPARM },
2708 { "-mstack-arg-probe", MASK_STACK_PROBE },
2709 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2710 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2711 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2712 { "-mvzeroupper", MASK_VZEROUPPER },
2713 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2714 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2715 { "-mprefer-avx128", MASK_PREFER_AVX128},
2716 };
2717
2718 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2719
2720 char isa_other[40];
2721 char target_other[40];
2722 unsigned num = 0;
2723 unsigned i, j;
2724 char *ret;
2725 char *ptr;
2726 size_t len;
2727 size_t line_len;
2728 size_t sep_len;
2729
2730 memset (opts, '\0', sizeof (opts));
2731
2732 /* Add -march= option. */
2733 if (arch)
2734 {
2735 opts[num][0] = "-march=";
2736 opts[num++][1] = arch;
2737 }
2738
2739 /* Add -mtune= option. */
2740 if (tune)
2741 {
2742 opts[num][0] = "-mtune=";
2743 opts[num++][1] = tune;
2744 }
2745
2746 /* Pick out the options in isa options. */
2747 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2748 {
2749 if ((isa & isa_opts[i].mask) != 0)
2750 {
2751 opts[num++][0] = isa_opts[i].option;
2752 isa &= ~ isa_opts[i].mask;
2753 }
2754 }
2755
2756 if (isa && add_nl_p)
2757 {
2758 opts[num++][0] = isa_other;
2759 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2760 isa);
2761 }
2762
2763 /* Add flag options. */
2764 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2765 {
2766 if ((flags & flag_opts[i].mask) != 0)
2767 {
2768 opts[num++][0] = flag_opts[i].option;
2769 flags &= ~ flag_opts[i].mask;
2770 }
2771 }
2772
2773 if (flags && add_nl_p)
2774 {
2775 opts[num++][0] = target_other;
2776 sprintf (target_other, "(other flags: %#x)", flags);
2777 }
2778
2779 /* Add -fpmath= option. */
2780 if (fpmath)
2781 {
2782 opts[num][0] = "-mfpmath=";
2783 switch ((int) fpmath)
2784 {
2785 case FPMATH_387:
2786 opts[num++][1] = "387";
2787 break;
2788
2789 case FPMATH_SSE:
2790 opts[num++][1] = "sse";
2791 break;
2792
2793 case FPMATH_387 | FPMATH_SSE:
2794 opts[num++][1] = "sse+387";
2795 break;
2796
2797 default:
2798 gcc_unreachable ();
2799 }
2800 }
2801
2802 /* Any options? */
2803 if (num == 0)
2804 return NULL;
2805
2806 gcc_assert (num < ARRAY_SIZE (opts));
2807
2808 /* Size the string. */
2809 len = 0;
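/* The separator between options is a single space, or at most a space
   followed by "\\\n" (three characters) when line breaks may be emitted.  */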
2810 sep_len = (add_nl_p) ? 3 : 1;
2811 for (i = 0; i < num; i++)
2812 {
2813 len += sep_len;
2814 for (j = 0; j < 2; j++)
2815 if (opts[i][j])
2816 len += strlen (opts[i][j]);
2817 }
2818
2819 /* Build the string. */
2820 ret = ptr = (char *) xmalloc (len);
2821 line_len = 0;
2822
2823 for (i = 0; i < num; i++)
2824 {
2825 size_t len2[2];
2826
2827 for (j = 0; j < 2; j++)
2828 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2829
2830 if (i != 0)
2831 {
2832 *ptr++ = ' ';
2833 line_len++;
2834
2835 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2836 {
2837 *ptr++ = '\\';
2838 *ptr++ = '\n';
2839 line_len = 0;
2840 }
2841 }
2842
2843 for (j = 0; j < 2; j++)
2844 if (opts[i][j])
2845 {
2846 memcpy (ptr, opts[i][j], len2[j]);
2847 ptr += len2[j];
2848 line_len += len2[j];
2849 }
2850 }
2851
2852 *ptr = '\0';
2853 gcc_assert (ret + len >= ptr);
2854
2855 return ret;
2856 }
2857
2858 /* Return true if profiling code should be emitted before the
2859 prologue, otherwise false.
2860 Note: for x86 with "hotfix" (hot patching) this is not done. */
2861 static bool
2862 ix86_profile_before_prologue (void)
2863 {
2864 return flag_fentry != 0;
2865 }
2866
2867 /* Function that is callable from the debugger to print the current
2868 options. */
2869 void
2870 ix86_debug_options (void)
2871 {
2872 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2873 ix86_arch_string, ix86_tune_string,
2874 ix86_fpmath, true);
2875
2876 if (opts)
2877 {
2878 fprintf (stderr, "%s\n\n", opts);
2879 free (opts);
2880 }
2881 else
2882 fputs ("<no options>\n\n", stderr);
2883
2884 return;
2885 }
2886 \f
2887 /* Override various settings based on options. If MAIN_ARGS_P, the
2888 options are from the command line, otherwise they are from
2889 attributes. */
2890
2891 static void
2892 ix86_option_override_internal (bool main_args_p)
2893 {
2894 int i;
2895 unsigned int ix86_arch_mask, ix86_tune_mask;
2896 const bool ix86_tune_specified = (ix86_tune_string != NULL);
2897 const char *prefix;
2898 const char *suffix;
2899 const char *sw;
2900
2901 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2902 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2903 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2904 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2905 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2906 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2907 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2908 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2909 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2910 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2911 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2912 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2913 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
2914 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2915 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
2916 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2917 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2918 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2919 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2920 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2921 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2922 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
2923 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2924 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2925 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2926 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2927 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2928 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2929 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
2930 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
2931 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2932 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2933 /* If this reaches 64, we need to widen the struct pta flags field below. */
2934
2935 static struct pta
2936 {
2937 const char *const name; /* processor name or nickname. */
2938 const enum processor_type processor;
2939 const enum attr_cpu schedule;
2940 const unsigned HOST_WIDE_INT flags;
2941 }
2942 const processor_alias_table[] =
2943 {
2944 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2945 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2946 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2947 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2948 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2949 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2950 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2951 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2952 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
2953 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2954 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2955 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX},
2956 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2957 PTA_MMX | PTA_SSE},
2958 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2959 PTA_MMX | PTA_SSE},
2960 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2961 PTA_MMX | PTA_SSE | PTA_SSE2},
2962 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
2963 PTA_MMX | PTA_SSE | PTA_SSE2},
2964 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
2965 PTA_MMX | PTA_SSE | PTA_SSE2},
2966 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
2967 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3},
2968 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
2969 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2970 | PTA_CX16 | PTA_NO_SAHF},
2971 {"core2", PROCESSOR_CORE2_64, CPU_CORE2,
2972 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2973 | PTA_SSSE3 | PTA_CX16},
2974 {"corei7", PROCESSOR_COREI7_64, CPU_COREI7,
2975 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2976 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16},
2977 {"corei7-avx", PROCESSOR_COREI7_64, CPU_COREI7,
2978 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2979 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2980 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL},
2981 {"core-avx-i", PROCESSOR_COREI7_64, CPU_COREI7,
2982 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2983 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2984 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2985 | PTA_RDRND | PTA_F16C},
2986 {"core-avx2", PROCESSOR_COREI7_64, CPU_COREI7,
2987 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2988 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
2989 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2990 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
2991 | PTA_FMA | PTA_MOVBE},
2992 {"atom", PROCESSOR_ATOM, CPU_ATOM,
2993 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2994 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
2995 {"geode", PROCESSOR_GEODE, CPU_GEODE,
2996 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2997 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
2998 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
2999 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3000 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3001 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3002 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3003 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3004 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3005 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3006 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3007 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3008 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3009 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3010 {"x86-64", PROCESSOR_K8, CPU_K8,
3011 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
3012 {"k8", PROCESSOR_K8, CPU_K8,
3013 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3014 | PTA_SSE2 | PTA_NO_SAHF},
3015 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3016 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3017 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3018 {"opteron", PROCESSOR_K8, CPU_K8,
3019 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3020 | PTA_SSE2 | PTA_NO_SAHF},
3021 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3022 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3023 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3024 {"athlon64", PROCESSOR_K8, CPU_K8,
3025 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3026 | PTA_SSE2 | PTA_NO_SAHF},
3027 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3028 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3029 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3030 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3031 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3032 | PTA_SSE2 | PTA_NO_SAHF},
3033 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3034 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3035 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3036 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3037 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3038 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3039 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3040 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3041 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3042 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3043 | PTA_XOP | PTA_LWP},
3044 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3045 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3046 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3047 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3048 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3049 | PTA_FMA},
3050 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
3051 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3052 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16},
3053 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3054 0 /* flags are only used for -march switch. */ },
3055 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3056 PTA_64BIT /* flags are only used for -march switch. */ },
3057 };
3058
3059 /* -mrecip options. */
3060 static struct
3061 {
3062 const char *string; /* option name */
3063 unsigned int mask; /* mask bits to set */
3064 }
3065 const recip_options[] =
3066 {
3067 { "all", RECIP_MASK_ALL },
3068 { "none", RECIP_MASK_NONE },
3069 { "div", RECIP_MASK_DIV },
3070 { "sqrt", RECIP_MASK_SQRT },
3071 { "vec-div", RECIP_MASK_VEC_DIV },
3072 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3073 };
3074
3075 int const pta_size = ARRAY_SIZE (processor_alias_table);
3076
3077 /* Set up prefix/suffix so the error messages refer to either the command
3078 line argument, or the attribute(target). */
3079 if (main_args_p)
3080 {
3081 prefix = "-m";
3082 suffix = "";
3083 sw = "switch";
3084 }
3085 else
3086 {
3087 prefix = "option(\"";
3088 suffix = "\")";
3089 sw = "attribute";
3090 }
3091
3092 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3093 SUBTARGET_OVERRIDE_OPTIONS;
3094 #endif
3095
3096 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3097 SUBSUBTARGET_OVERRIDE_OPTIONS;
3098 #endif
3099
3100 if (TARGET_X32)
3101 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3102
3103 /* -fPIC is the default for x86_64 Darwin. */
3104 if (TARGET_MACHO && TARGET_64BIT)
3105 flag_pic = 2;
3106
3107 /* Need to check -mtune=generic first. */
3108 if (ix86_tune_string)
3109 {
3110 if (!strcmp (ix86_tune_string, "generic")
3111 || !strcmp (ix86_tune_string, "i686")
3112 /* As special support for cross compilers we read -mtune=native
3113 as -mtune=generic. With native compilers we won't see the
3114 -mtune=native, as it was changed by the driver. */
3115 || !strcmp (ix86_tune_string, "native"))
3116 {
3117 if (TARGET_64BIT)
3118 ix86_tune_string = "generic64";
3119 else
3120 ix86_tune_string = "generic32";
3121 }
3122 /* If this call is for setting the option attribute, allow the
3123 generic32/generic64 that was previously set. */
3124 else if (!main_args_p
3125 && (!strcmp (ix86_tune_string, "generic32")
3126 || !strcmp (ix86_tune_string, "generic64")))
3127 ;
3128 else if (!strncmp (ix86_tune_string, "generic", 7))
3129 error ("bad value (%s) for %stune=%s %s",
3130 ix86_tune_string, prefix, suffix, sw);
3131 else if (!strcmp (ix86_tune_string, "x86-64"))
3132 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3133 "%stune=k8%s or %stune=generic%s instead as appropriate",
3134 prefix, suffix, prefix, suffix, prefix, suffix);
3135 }
3136 else
3137 {
3138 if (ix86_arch_string)
3139 ix86_tune_string = ix86_arch_string;
3140 if (!ix86_tune_string)
3141 {
3142 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3143 ix86_tune_defaulted = 1;
3144 }
3145
3146 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3147 need to use a sensible tune option. */
3148 if (!strcmp (ix86_tune_string, "generic")
3149 || !strcmp (ix86_tune_string, "x86-64")
3150 || !strcmp (ix86_tune_string, "i686"))
3151 {
3152 if (TARGET_64BIT)
3153 ix86_tune_string = "generic64";
3154 else
3155 ix86_tune_string = "generic32";
3156 }
3157 }
3158
3159 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3160 {
3161 /* rep; movq isn't available in 32-bit code. */
3162 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3163 ix86_stringop_alg = no_stringop;
3164 }
3165
3166 if (!ix86_arch_string)
3167 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3168 else
3169 ix86_arch_specified = 1;
3170
3171 if (!global_options_set.x_ix86_abi)
3172 ix86_abi = DEFAULT_ABI;
3173
3174 if (global_options_set.x_ix86_cmodel)
3175 {
3176 switch (ix86_cmodel)
3177 {
3178 case CM_SMALL:
3179 case CM_SMALL_PIC:
3180 if (flag_pic)
3181 ix86_cmodel = CM_SMALL_PIC;
3182 if (!TARGET_64BIT)
3183 error ("code model %qs not supported in the %s bit mode",
3184 "small", "32");
3185 break;
3186
3187 case CM_MEDIUM:
3188 case CM_MEDIUM_PIC:
3189 if (flag_pic)
3190 ix86_cmodel = CM_MEDIUM_PIC;
3191 if (!TARGET_64BIT)
3192 error ("code model %qs not supported in the %s bit mode",
3193 "medium", "32");
3194 else if (TARGET_X32)
3195 error ("code model %qs not supported in x32 mode",
3196 "medium");
3197 break;
3198
3199 case CM_LARGE:
3200 case CM_LARGE_PIC:
3201 if (flag_pic)
3202 ix86_cmodel = CM_LARGE_PIC;
3203 if (!TARGET_64BIT)
3204 error ("code model %qs not supported in the %s bit mode",
3205 "large", "32");
3206 else if (TARGET_X32)
3207 error ("code model %qs not supported in x32 mode",
3208 "medium");
3209 break;
3210
3211 case CM_32:
3212 if (flag_pic)
3213 error ("code model %s does not support PIC mode", "32");
3214 if (TARGET_64BIT)
3215 error ("code model %qs not supported in the %s bit mode",
3216 "32", "64");
3217 break;
3218
3219 case CM_KERNEL:
3220 if (flag_pic)
3221 {
3222 error ("code model %s does not support PIC mode", "kernel");
3223 ix86_cmodel = CM_32;
3224 }
3225 if (!TARGET_64BIT)
3226 error ("code model %qs not supported in the %s bit mode",
3227 "kernel", "32");
3228 break;
3229
3230 default:
3231 gcc_unreachable ();
3232 }
3233 }
3234 else
3235 {
3236 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3237 use of rip-relative addressing. This eliminates fixups that
3238 would otherwise be needed if this object is to be placed in a
3239 DLL, and is essentially just as efficient as direct addressing. */
3240 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3241 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3242 else if (TARGET_64BIT)
3243 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3244 else
3245 ix86_cmodel = CM_32;
3246 }
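/* Illustration of the defaulting above (not additional logic): a plain
   64-bit, non-MS-ABI compilation without -fpic ends up with CM_SMALL,
   adding -fpic makes it CM_SMALL_PIC, and 32-bit targets always get
   CM_32 when no -mcmodel= was given.  */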
3247 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3248 {
3249 error ("-masm=intel not supported in this configuration");
3250 ix86_asm_dialect = ASM_ATT;
3251 }
3252 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3253 sorry ("%i-bit mode not compiled in",
3254 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3255
3256 for (i = 0; i < pta_size; i++)
3257 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3258 {
3259 ix86_schedule = processor_alias_table[i].schedule;
3260 ix86_arch = processor_alias_table[i].processor;
3261 /* Default cpu tuning to the architecture. */
3262 ix86_tune = ix86_arch;
3263
3264 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3265 error ("CPU you selected does not support x86-64 "
3266 "instruction set");
3267
3268 if (processor_alias_table[i].flags & PTA_MMX
3269 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3270 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3271 if (processor_alias_table[i].flags & PTA_3DNOW
3272 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3273 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3274 if (processor_alias_table[i].flags & PTA_3DNOW_A
3275 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3276 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3277 if (processor_alias_table[i].flags & PTA_SSE
3278 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3279 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3280 if (processor_alias_table[i].flags & PTA_SSE2
3281 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3282 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3283 if (processor_alias_table[i].flags & PTA_SSE3
3284 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3285 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3286 if (processor_alias_table[i].flags & PTA_SSSE3
3287 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3288 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3289 if (processor_alias_table[i].flags & PTA_SSE4_1
3290 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3291 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3292 if (processor_alias_table[i].flags & PTA_SSE4_2
3293 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3294 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3295 if (processor_alias_table[i].flags & PTA_AVX
3296 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3297 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3298 if (processor_alias_table[i].flags & PTA_AVX2
3299 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3300 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3301 if (processor_alias_table[i].flags & PTA_FMA
3302 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3303 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3304 if (processor_alias_table[i].flags & PTA_SSE4A
3305 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3306 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3307 if (processor_alias_table[i].flags & PTA_FMA4
3308 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3309 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3310 if (processor_alias_table[i].flags & PTA_XOP
3311 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3312 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3313 if (processor_alias_table[i].flags & PTA_LWP
3314 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3315 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3316 if (processor_alias_table[i].flags & PTA_ABM
3317 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3318 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3319 if (processor_alias_table[i].flags & PTA_BMI
3320 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3321 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3322 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3323 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3324 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3325 if (processor_alias_table[i].flags & PTA_TBM
3326 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3327 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3328 if (processor_alias_table[i].flags & PTA_BMI2
3329 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3330 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3331 if (processor_alias_table[i].flags & PTA_CX16
3332 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3333 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3334 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3335 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3336 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3337 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3338 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3339 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3340 if (processor_alias_table[i].flags & PTA_MOVBE
3341 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3342 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3343 if (processor_alias_table[i].flags & PTA_AES
3344 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3345 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3346 if (processor_alias_table[i].flags & PTA_PCLMUL
3347 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3348 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3349 if (processor_alias_table[i].flags & PTA_FSGSBASE
3350 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3351 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3352 if (processor_alias_table[i].flags & PTA_RDRND
3353 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3354 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3355 if (processor_alias_table[i].flags & PTA_F16C
3356 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3357 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3358 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3359 x86_prefetch_sse = true;
3360
3361 break;
3362 }
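/* Illustration (not part of the loop above): with the alias table defined
   earlier, -march=bdver1 implies OPTION_MASK_ISA_AVX and OPTION_MASK_ISA_XOP
   among others, unless an explicit -mno-avx or -mno-xop recorded the bit in
   ix86_isa_flags_explicit, in which case the implication is suppressed.  */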
3363
3364 if (!strcmp (ix86_arch_string, "generic"))
3365 error ("generic CPU can be used only for %stune=%s %s",
3366 prefix, suffix, sw);
3367 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3368 error ("bad value (%s) for %sarch=%s %s",
3369 ix86_arch_string, prefix, suffix, sw);
3370
3371 ix86_arch_mask = 1u << ix86_arch;
3372 for (i = 0; i < X86_ARCH_LAST; ++i)
3373 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3374
3375 for (i = 0; i < pta_size; i++)
3376 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3377 {
3378 ix86_schedule = processor_alias_table[i].schedule;
3379 ix86_tune = processor_alias_table[i].processor;
3380 if (TARGET_64BIT)
3381 {
3382 if (!(processor_alias_table[i].flags & PTA_64BIT))
3383 {
3384 if (ix86_tune_defaulted)
3385 {
3386 ix86_tune_string = "x86-64";
3387 for (i = 0; i < pta_size; i++)
3388 if (! strcmp (ix86_tune_string,
3389 processor_alias_table[i].name))
3390 break;
3391 ix86_schedule = processor_alias_table[i].schedule;
3392 ix86_tune = processor_alias_table[i].processor;
3393 }
3394 else
3395 error ("CPU you selected does not support x86-64 "
3396 "instruction set");
3397 }
3398 }
3399 else
3400 {
3401 /* Adjust tuning when compiling for 32-bit ABI. */
3402 switch (ix86_tune)
3403 {
3404 case PROCESSOR_GENERIC64:
3405 ix86_tune = PROCESSOR_GENERIC32;
3406 ix86_schedule = CPU_PENTIUMPRO;
3407 break;
3408
3409 case PROCESSOR_CORE2_64:
3410 ix86_tune = PROCESSOR_CORE2_32;
3411 break;
3412
3413 case PROCESSOR_COREI7_64:
3414 ix86_tune = PROCESSOR_COREI7_32;
3415 break;
3416
3417 default:
3418 break;
3419 }
3420 }
3421 /* Intel CPUs have always interpreted SSE prefetch instructions as
3422 NOPs; so, we can enable SSE prefetch instructions even when
3423 -mtune (rather than -march) points us to a processor that has them.
3424 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3425 higher processors. */
3426 if (TARGET_CMOVE
3427 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3428 x86_prefetch_sse = true;
3429 break;
3430 }
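/* Example of the adjustment above (illustrative): a -m32 compilation whose
   tuning entry names a 64-bit variant such as PROCESSOR_COREI7_64 is
   retargeted to PROCESSOR_COREI7_32 here.  */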
3431
3432 if (ix86_tune_specified && i == pta_size)
3433 error ("bad value (%s) for %stune=%s %s",
3434 ix86_tune_string, prefix, suffix, sw);
3435
3436 ix86_tune_mask = 1u << ix86_tune;
3437 for (i = 0; i < X86_TUNE_LAST; ++i)
3438 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3439
3440 #ifndef USE_IX86_FRAME_POINTER
3441 #define USE_IX86_FRAME_POINTER 0
3442 #endif
3443
3444 #ifndef USE_X86_64_FRAME_POINTER
3445 #define USE_X86_64_FRAME_POINTER 0
3446 #endif
3447
3448 /* Set the default values for switches whose default depends on TARGET_64BIT
3449 in case they weren't overwritten by command line options. */
3450 if (TARGET_64BIT)
3451 {
3452 if (optimize > 1 && !global_options_set.x_flag_zee)
3453 flag_zee = 1;
3454 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3455 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3456 if (flag_asynchronous_unwind_tables == 2)
3457 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3458 if (flag_pcc_struct_return == 2)
3459 flag_pcc_struct_return = 0;
3460 }
3461 else
3462 {
3463 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3464 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3465 if (flag_asynchronous_unwind_tables == 2)
3466 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3467 if (flag_pcc_struct_return == 2)
3468 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3469 }
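/* For instance, with the usual USE_X86_64_FRAME_POINTER == 0 default, a
   64-bit -O2 compilation reaches this point with flag_omit_frame_pointer == 1
   unless -fno-omit-frame-pointer was given (illustration of the defaulting
   above).  */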
3470
3471 if (optimize_size)
3472 ix86_cost = &ix86_size_cost;
3473 else
3474 ix86_cost = processor_target_table[ix86_tune].cost;
3475
3476 /* Arrange to set up i386_stack_locals for all functions. */
3477 init_machine_status = ix86_init_machine_status;
3478
3479 /* Validate -mregparm= value. */
3480 if (global_options_set.x_ix86_regparm)
3481 {
3482 if (TARGET_64BIT)
3483 warning (0, "-mregparm is ignored in 64-bit mode");
3484 if (ix86_regparm > REGPARM_MAX)
3485 {
3486 error ("-mregparm=%d is not between 0 and %d",
3487 ix86_regparm, REGPARM_MAX);
3488 ix86_regparm = 0;
3489 }
3490 }
3491 if (TARGET_64BIT)
3492 ix86_regparm = REGPARM_MAX;
3493
3494 /* Default align_* from the processor table. */
3495 if (align_loops == 0)
3496 {
3497 align_loops = processor_target_table[ix86_tune].align_loop;
3498 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3499 }
3500 if (align_jumps == 0)
3501 {
3502 align_jumps = processor_target_table[ix86_tune].align_jump;
3503 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3504 }
3505 if (align_functions == 0)
3506 {
3507 align_functions = processor_target_table[ix86_tune].align_func;
3508 }
3509
3510 /* Provide default for -mbranch-cost= value. */
3511 if (!global_options_set.x_ix86_branch_cost)
3512 ix86_branch_cost = ix86_cost->branch_cost;
3513
3514 if (TARGET_64BIT)
3515 {
3516 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3517
3518 /* Enable by default the SSE and MMX builtins. Do allow the user to
3519 explicitly disable any of these. In particular, disabling SSE and
3520 MMX for kernel code is extremely useful. */
3521 if (!ix86_arch_specified)
3522 ix86_isa_flags
3523 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3524 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3525
3526 if (TARGET_RTD)
3527 warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3528 }
3529 else
3530 {
3531 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3532
3533 if (!ix86_arch_specified)
3534 ix86_isa_flags
3535 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3536
3537 /* The i386 ABI does not specify a red zone. It still makes sense to use it
3538 when the programmer takes care to keep the stack from being destroyed. */
3539 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3540 target_flags |= MASK_NO_RED_ZONE;
3541 }
3542
3543 /* Keep nonleaf frame pointers. */
3544 if (flag_omit_frame_pointer)
3545 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3546 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3547 flag_omit_frame_pointer = 1;
3548
3549 /* If we're doing fast math, we don't care about comparison order
3550 wrt NaNs. This lets us use a shorter comparison sequence. */
3551 if (flag_finite_math_only)
3552 target_flags &= ~MASK_IEEE_FP;
3553
3554 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3555 since the insns won't need emulation. */
3556 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3557 target_flags &= ~MASK_NO_FANCY_MATH_387;
3558
3559 /* Likewise, if the target doesn't have a 387, or we've specified
3560 software floating point, don't use 387 inline intrinsics. */
3561 if (!TARGET_80387)
3562 target_flags |= MASK_NO_FANCY_MATH_387;
3563
3564 /* Turn on MMX builtins for -msse. */
3565 if (TARGET_SSE)
3566 {
3567 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3568 x86_prefetch_sse = true;
3569 }
3570
3571 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
3572 if (TARGET_SSE4_2 || TARGET_ABM)
3573 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3574
3575 /* Turn on lzcnt instruction for -mabm. */
3576 if (TARGET_ABM)
3577 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3578
3579 /* Validate -mpreferred-stack-boundary= value or default it to
3580 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3581 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3582 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3583 {
3584 int min = (TARGET_64BIT ? 4 : 2);
3585 int max = (TARGET_SEH ? 4 : 12);
3586
3587 if (ix86_preferred_stack_boundary_arg < min
3588 || ix86_preferred_stack_boundary_arg > max)
3589 {
3590 if (min == max)
3591 error ("-mpreferred-stack-boundary is not supported "
3592 "for this target");
3593 else
3594 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3595 ix86_preferred_stack_boundary_arg, min, max);
3596 }
3597 else
3598 ix86_preferred_stack_boundary
3599 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
3600 }
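/* Worked example of the computation above, assuming BITS_PER_UNIT == 8 as on
   x86: -mpreferred-stack-boundary=4 gives (1 << 4) * 8 = 128 bits, i.e. a
   16-byte preferred stack alignment.  */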
3601
3602 /* Set the default value for -mstackrealign. */
3603 if (ix86_force_align_arg_pointer == -1)
3604 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3605
3606 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3607
3608 /* Validate -mincoming-stack-boundary= value or default it to
3609 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3610 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3611 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3612 {
3613 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3614 || ix86_incoming_stack_boundary_arg > 12)
3615 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3616 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3617 else
3618 {
3619 ix86_user_incoming_stack_boundary
3620 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3621 ix86_incoming_stack_boundary
3622 = ix86_user_incoming_stack_boundary;
3623 }
3624 }
3625
3626 /* Accept -msseregparm only if at least SSE support is enabled. */
3627 if (TARGET_SSEREGPARM
3628 && ! TARGET_SSE)
3629 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3630
3631 if (global_options_set.x_ix86_fpmath)
3632 {
3633 if (ix86_fpmath & FPMATH_SSE)
3634 {
3635 if (!TARGET_SSE)
3636 {
3637 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3638 ix86_fpmath = FPMATH_387;
3639 }
3640 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3641 {
3642 warning (0, "387 instruction set disabled, using SSE arithmetics");
3643 ix86_fpmath = FPMATH_SSE;
3644 }
3645 }
3646 }
3647 else
3648 ix86_fpmath = TARGET_FPMATH_DEFAULT;
3649
3650 /* If the i387 is disabled, then do not return values in it. */
3651 if (!TARGET_80387)
3652 target_flags &= ~MASK_FLOAT_RETURNS;
3653
3654 /* Use an external vectorized library when vectorizing intrinsics. */
3655 if (global_options_set.x_ix86_veclibabi_type)
3656 switch (ix86_veclibabi_type)
3657 {
3658 case ix86_veclibabi_type_svml:
3659 ix86_veclib_handler = ix86_veclibabi_svml;
3660 break;
3661
3662 case ix86_veclibabi_type_acml:
3663 ix86_veclib_handler = ix86_veclibabi_acml;
3664 break;
3665
3666 default:
3667 gcc_unreachable ();
3668 }
3669
3670 if ((!USE_IX86_FRAME_POINTER
3671 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3672 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3673 && !optimize_size)
3674 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3675
3676 /* ??? Unwind info is not correct around the CFG unless either a frame
3677 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3678 unwind info generation to be aware of the CFG and propagating states
3679 around edges. */
3680 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3681 || flag_exceptions || flag_non_call_exceptions)
3682 && flag_omit_frame_pointer
3683 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3684 {
3685 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3686 warning (0, "unwind tables currently require either a frame pointer "
3687 "or %saccumulate-outgoing-args%s for correctness",
3688 prefix, suffix);
3689 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3690 }
3691
3692 /* If stack probes are required, the space used for large function
3693 arguments on the stack must also be probed, so enable
3694 -maccumulate-outgoing-args so this happens in the prologue. */
3695 if (TARGET_STACK_PROBE
3696 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3697 {
3698 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3699 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3700 "for correctness", prefix, suffix);
3701 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3702 }
3703
3704 /* For sane SSE instruction set generation we need the fcomi instruction.
3705 It is safe to enable all CMOVE instructions. Also, the RDRAND intrinsic
3706 expands to a sequence that includes a conditional move. */
3707 if (TARGET_SSE || TARGET_RDRND)
3708 TARGET_CMOVE = 1;
3709
3710 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3711 {
3712 char *p;
3713 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3714 p = strchr (internal_label_prefix, 'X');
3715 internal_label_prefix_len = p - internal_label_prefix;
3716 *p = '\0';
3717 }
3718
3719 /* When a scheduling description is not available, disable the scheduler pass
3720 so it won't slow down the compilation and make x87 code slower. */
3721 if (!TARGET_SCHEDULE)
3722 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3723
3724 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3725 ix86_cost->simultaneous_prefetches,
3726 global_options.x_param_values,
3727 global_options_set.x_param_values);
3728 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE, ix86_cost->prefetch_block,
3729 global_options.x_param_values,
3730 global_options_set.x_param_values);
3731 maybe_set_param_value (PARAM_L1_CACHE_SIZE, ix86_cost->l1_cache_size,
3732 global_options.x_param_values,
3733 global_options_set.x_param_values);
3734 maybe_set_param_value (PARAM_L2_CACHE_SIZE, ix86_cost->l2_cache_size,
3735 global_options.x_param_values,
3736 global_options_set.x_param_values);
3737
3738 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
3739 if (flag_prefetch_loop_arrays < 0
3740 && HAVE_prefetch
3741 && optimize >= 3
3742 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3743 flag_prefetch_loop_arrays = 1;
3744
3745 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3746 can be optimized to ap = __builtin_next_arg (0). */
3747 if (!TARGET_64BIT && !flag_split_stack)
3748 targetm.expand_builtin_va_start = NULL;
3749
3750 if (TARGET_64BIT)
3751 {
3752 ix86_gen_leave = gen_leave_rex64;
3753 ix86_gen_add3 = gen_adddi3;
3754 ix86_gen_sub3 = gen_subdi3;
3755 ix86_gen_sub3_carry = gen_subdi3_carry;
3756 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3757 ix86_gen_monitor = gen_sse3_monitor64;
3758 ix86_gen_andsp = gen_anddi3;
3759 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3760 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3761 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3762 }
3763 else
3764 {
3765 ix86_gen_leave = gen_leave;
3766 ix86_gen_add3 = gen_addsi3;
3767 ix86_gen_sub3 = gen_subsi3;
3768 ix86_gen_sub3_carry = gen_subsi3_carry;
3769 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3770 ix86_gen_monitor = gen_sse3_monitor;
3771 ix86_gen_andsp = gen_andsi3;
3772 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3773 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3774 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3775 }
3776
3777 #ifdef USE_IX86_CLD
3778 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3779 if (!TARGET_64BIT)
3780 target_flags |= MASK_CLD & ~target_flags_explicit;
3781 #endif
3782
3783 if (!TARGET_64BIT && flag_pic)
3784 {
3785 if (flag_fentry > 0)
3786 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3787 "with -fpic");
3788 flag_fentry = 0;
3789 }
3790 else if (TARGET_SEH)
3791 {
3792 if (flag_fentry == 0)
3793 sorry ("-mno-fentry isn%'t compatible with SEH");
3794 flag_fentry = 1;
3795 }
3796 else if (flag_fentry < 0)
3797 {
3798 #if defined(PROFILE_BEFORE_PROLOGUE)
3799 flag_fentry = 1;
3800 #else
3801 flag_fentry = 0;
3802 #endif
3803 }
3804
3805 if (TARGET_AVX)
3806 {
3807 /* When not optimizing for size, enable the vzeroupper optimization for
3808 TARGET_AVX with -fexpensive-optimizations and split 32-byte
3809 AVX unaligned loads/stores. */
3810 if (!optimize_size)
3811 {
3812 if (flag_expensive_optimizations
3813 && !(target_flags_explicit & MASK_VZEROUPPER))
3814 target_flags |= MASK_VZEROUPPER;
3815 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
3816 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3817 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3818 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
3819 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3820 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3821 /* Enable 128-bit AVX instruction generation for the auto-vectorizer. */
3822 if (TARGET_AVX128_OPTIMAL && !(target_flags_explicit & MASK_PREFER_AVX128))
3823 target_flags |= MASK_PREFER_AVX128;
3824 }
3825 }
3826 else
3827 {
3828 /* Disable vzeroupper pass if TARGET_AVX is disabled. */
3829 target_flags &= ~MASK_VZEROUPPER;
3830 }
3831
3832 if (ix86_recip_name)
3833 {
3834 char *p = ASTRDUP (ix86_recip_name);
3835 char *q;
3836 unsigned int mask, i;
3837 bool invert;
3838
3839 while ((q = strtok (p, ",")) != NULL)
3840 {
3841 p = NULL;
3842 if (*q == '!')
3843 {
3844 invert = true;
3845 q++;
3846 }
3847 else
3848 invert = false;
3849
3850 if (!strcmp (q, "default"))
3851 mask = RECIP_MASK_ALL;
3852 else
3853 {
3854 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
3855 if (!strcmp (q, recip_options[i].string))
3856 {
3857 mask = recip_options[i].mask;
3858 break;
3859 }
3860
3861 if (i == ARRAY_SIZE (recip_options))
3862 {
3863 error ("unknown option for -mrecip=%s", q);
3864 invert = false;
3865 mask = RECIP_MASK_NONE;
3866 }
3867 }
3868
3869 recip_mask_explicit |= mask;
3870 if (invert)
3871 recip_mask &= ~mask;
3872 else
3873 recip_mask |= mask;
3874 }
3875 }
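/* Example of the syntax parsed above (illustrative):
     -mrecip=all,!sqrt
   first ORs RECIP_MASK_ALL into recip_mask and then clears RECIP_MASK_SQRT
   again, because the leading '!' inverts that token; "default" behaves like
   "all".  */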
3876
3877 if (TARGET_RECIP)
3878 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
3879 else if (target_flags_explicit & MASK_RECIP)
3880 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
3881
3882 /* Save the initial options in case the user uses function-specific
3883 options. */
3884 if (main_args_p)
3885 target_option_default_node = target_option_current_node
3886 = build_target_option_node ();
3887 }
3888
3889 /* Return TRUE if VAL is passed in registers with 256-bit AVX modes. */
3890
3891 static bool
3892 function_pass_avx256_p (const_rtx val)
3893 {
3894 if (!val)
3895 return false;
3896
3897 if (REG_P (val) && VALID_AVX256_REG_MODE (GET_MODE (val)))
3898 return true;
3899
3900 if (GET_CODE (val) == PARALLEL)
3901 {
3902 int i;
3903 rtx r;
3904
3905 for (i = XVECLEN (val, 0) - 1; i >= 0; i--)
3906 {
3907 r = XVECEXP (val, 0, i);
3908 if (GET_CODE (r) == EXPR_LIST
3909 && XEXP (r, 0)
3910 && REG_P (XEXP (r, 0))
3911 && (GET_MODE (XEXP (r, 0)) == OImode
3912 || VALID_AVX256_REG_MODE (GET_MODE (XEXP (r, 0)))))
3913 return true;
3914 }
3915 }
3916
3917 return false;
3918 }
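/* Illustrative note: a value such as an __m256 return lives in a REG with a
   256-bit vector mode and is caught by the REG_P test above; values spread
   over several registers show up as the PARALLEL case.  */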
3919
3920 /* Implement the TARGET_OPTION_OVERRIDE hook. */
3921
3922 static void
3923 ix86_option_override (void)
3924 {
3925 ix86_option_override_internal (true);
3926 }
3927
3928 /* Update register usage after having seen the compiler flags. */
3929
3930 static void
3931 ix86_conditional_register_usage (void)
3932 {
3933 int i;
3934 unsigned int j;
3935
3936 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3937 {
3938 if (fixed_regs[i] > 1)
3939 fixed_regs[i] = (fixed_regs[i] == (TARGET_64BIT ? 3 : 2));
3940 if (call_used_regs[i] > 1)
3941 call_used_regs[i] = (call_used_regs[i] == (TARGET_64BIT ? 3 : 2));
3942 }
3943
3944 /* The PIC register, if it exists, is fixed. */
3945 j = PIC_OFFSET_TABLE_REGNUM;
3946 if (j != INVALID_REGNUM)
3947 fixed_regs[j] = call_used_regs[j] = 1;
3948
3949 /* The 64-bit MS_ABI changes the set of call-used registers. */
3950 if (TARGET_64BIT_MS_ABI)
3951 {
3952 call_used_regs[SI_REG] = 0;
3953 call_used_regs[DI_REG] = 0;
3954 call_used_regs[XMM6_REG] = 0;
3955 call_used_regs[XMM7_REG] = 0;
3956 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3957 call_used_regs[i] = 0;
3958 }
3959
3960 /* The default setting of CLOBBERED_REGS is for 32-bit; add in the
3961 other call-clobbered regs for 64-bit. */
3962 if (TARGET_64BIT)
3963 {
3964 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
3965
3966 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3967 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
3968 && call_used_regs[i])
3969 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
3970 }
3971
3972 /* If MMX is disabled, squash the registers. */
3973 if (! TARGET_MMX)
3974 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3975 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
3976 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3977
3978 /* If SSE is disabled, squash the registers. */
3979 if (! TARGET_SSE)
3980 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3981 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
3982 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3983
3984 /* If the FPU is disabled, squash the registers. */
3985 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
3986 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3987 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
3988 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3989
3990 /* If 32-bit, squash the 64-bit registers. */
3991 if (! TARGET_64BIT)
3992 {
3993 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
3994 reg_names[i] = "";
3995 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3996 reg_names[i] = "";
3997 }
3998 }
3999
4000 \f
4001 /* Save the current options */
4002
4003 static void
4004 ix86_function_specific_save (struct cl_target_option *ptr)
4005 {
4006 ptr->arch = ix86_arch;
4007 ptr->schedule = ix86_schedule;
4008 ptr->tune = ix86_tune;
4009 ptr->branch_cost = ix86_branch_cost;
4010 ptr->tune_defaulted = ix86_tune_defaulted;
4011 ptr->arch_specified = ix86_arch_specified;
4012 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4013 ptr->ix86_target_flags_explicit = target_flags_explicit;
4014 ptr->x_recip_mask_explicit = recip_mask_explicit;
4015
4016 /* The fields are char but the variables are not; make sure the
4017 values fit in the fields. */
4018 gcc_assert (ptr->arch == ix86_arch);
4019 gcc_assert (ptr->schedule == ix86_schedule);
4020 gcc_assert (ptr->tune == ix86_tune);
4021 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4022 }
4023
4024 /* Restore the current options */
4025
4026 static void
4027 ix86_function_specific_restore (struct cl_target_option *ptr)
4028 {
4029 enum processor_type old_tune = ix86_tune;
4030 enum processor_type old_arch = ix86_arch;
4031 unsigned int ix86_arch_mask, ix86_tune_mask;
4032 int i;
4033
4034 ix86_arch = (enum processor_type) ptr->arch;
4035 ix86_schedule = (enum attr_cpu) ptr->schedule;
4036 ix86_tune = (enum processor_type) ptr->tune;
4037 ix86_branch_cost = ptr->branch_cost;
4038 ix86_tune_defaulted = ptr->tune_defaulted;
4039 ix86_arch_specified = ptr->arch_specified;
4040 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4041 target_flags_explicit = ptr->ix86_target_flags_explicit;
4042 recip_mask_explicit = ptr->x_recip_mask_explicit;
4043
4044 /* Recreate the arch feature tests if the arch changed */
4045 if (old_arch != ix86_arch)
4046 {
4047 ix86_arch_mask = 1u << ix86_arch;
4048 for (i = 0; i < X86_ARCH_LAST; ++i)
4049 ix86_arch_features[i]
4050 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4051 }
4052
4053 /* Recreate the tune optimization tests */
4054 if (old_tune != ix86_tune)
4055 {
4056 ix86_tune_mask = 1u << ix86_tune;
4057 for (i = 0; i < X86_TUNE_LAST; ++i)
4058 ix86_tune_features[i]
4059 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4060 }
4061 }
4062
4063 /* Print the current options */
4064
4065 static void
4066 ix86_function_specific_print (FILE *file, int indent,
4067 struct cl_target_option *ptr)
4068 {
4069 char *target_string
4070 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4071 NULL, NULL, ptr->x_ix86_fpmath, false);
4072
4073 fprintf (file, "%*sarch = %d (%s)\n",
4074 indent, "",
4075 ptr->arch,
4076 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4077 ? cpu_names[ptr->arch]
4078 : "<unknown>"));
4079
4080 fprintf (file, "%*stune = %d (%s)\n",
4081 indent, "",
4082 ptr->tune,
4083 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4084 ? cpu_names[ptr->tune]
4085 : "<unknown>"));
4086
4087 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4088
4089 if (target_string)
4090 {
4091 fprintf (file, "%*s%s\n", indent, "", target_string);
4092 free (target_string);
4093 }
4094 }
4095
4096 \f
4097 /* Inner function to process the attribute((target(...))), take an argument and
4098 set the current options from the argument. If we have a list, recursively go
4099 over the list. */
4100
4101 static bool
4102 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4103 struct gcc_options *enum_opts_set)
4104 {
4105 char *next_optstr;
4106 bool ret = true;
4107
4108 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4109 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4110 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4111 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4112 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4113
4114 enum ix86_opt_type
4115 {
4116 ix86_opt_unknown,
4117 ix86_opt_yes,
4118 ix86_opt_no,
4119 ix86_opt_str,
4120 ix86_opt_enum,
4121 ix86_opt_isa
4122 };
4123
4124 static const struct
4125 {
4126 const char *string;
4127 size_t len;
4128 enum ix86_opt_type type;
4129 int opt;
4130 int mask;
4131 } attrs[] = {
4132 /* isa options */
4133 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4134 IX86_ATTR_ISA ("abm", OPT_mabm),
4135 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4136 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4137 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4138 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4139 IX86_ATTR_ISA ("aes", OPT_maes),
4140 IX86_ATTR_ISA ("avx", OPT_mavx),
4141 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4142 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4143 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4144 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4145 IX86_ATTR_ISA ("sse", OPT_msse),
4146 IX86_ATTR_ISA ("sse2", OPT_msse2),
4147 IX86_ATTR_ISA ("sse3", OPT_msse3),
4148 IX86_ATTR_ISA ("sse4", OPT_msse4),
4149 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4150 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4151 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4152 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4153 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4154 IX86_ATTR_ISA ("fma", OPT_mfma),
4155 IX86_ATTR_ISA ("xop", OPT_mxop),
4156 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4157 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4158 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4159 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4160
4161 /* enum options */
4162 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4163
4164 /* string options */
4165 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4166 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4167
4168 /* flag options */
4169 IX86_ATTR_YES ("cld",
4170 OPT_mcld,
4171 MASK_CLD),
4172
4173 IX86_ATTR_NO ("fancy-math-387",
4174 OPT_mfancy_math_387,
4175 MASK_NO_FANCY_MATH_387),
4176
4177 IX86_ATTR_YES ("ieee-fp",
4178 OPT_mieee_fp,
4179 MASK_IEEE_FP),
4180
4181 IX86_ATTR_YES ("inline-all-stringops",
4182 OPT_minline_all_stringops,
4183 MASK_INLINE_ALL_STRINGOPS),
4184
4185 IX86_ATTR_YES ("inline-stringops-dynamically",
4186 OPT_minline_stringops_dynamically,
4187 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4188
4189 IX86_ATTR_NO ("align-stringops",
4190 OPT_mno_align_stringops,
4191 MASK_NO_ALIGN_STRINGOPS),
4192
4193 IX86_ATTR_YES ("recip",
4194 OPT_mrecip,
4195 MASK_RECIP),
4196
4197 };
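/* Illustrative (hypothetical) attribute string accepted by the parsing loop
   below:
     __attribute__((target("sse4.2,no-fancy-math-387,arch=core2")))
   "sse4.2" is an isa option, the "no-" prefix flips a flag option, and
   "arch=" takes a string argument.  */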
4198
4199 /* If this is a list, recurse to get the options. */
4200 if (TREE_CODE (args) == TREE_LIST)
4201 {
4202 bool ret = true;
4203
4204 for (; args; args = TREE_CHAIN (args))
4205 if (TREE_VALUE (args)
4206 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4207 p_strings, enum_opts_set))
4208 ret = false;
4209
4210 return ret;
4211 }
4212
4213 else if (TREE_CODE (args) != STRING_CST)
4214 gcc_unreachable ();
4215
4216 /* Handle multiple arguments separated by commas. */
4217 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4218
4219 while (next_optstr && *next_optstr != '\0')
4220 {
4221 char *p = next_optstr;
4222 char *orig_p = p;
4223 char *comma = strchr (next_optstr, ',');
4224 const char *opt_string;
4225 size_t len, opt_len;
4226 int opt;
4227 bool opt_set_p;
4228 char ch;
4229 unsigned i;
4230 enum ix86_opt_type type = ix86_opt_unknown;
4231 int mask = 0;
4232
4233 if (comma)
4234 {
4235 *comma = '\0';
4236 len = comma - next_optstr;
4237 next_optstr = comma + 1;
4238 }
4239 else
4240 {
4241 len = strlen (p);
4242 next_optstr = NULL;
4243 }
4244
4245 /* Recognize no-xxx. */
4246 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4247 {
4248 opt_set_p = false;
4249 p += 3;
4250 len -= 3;
4251 }
4252 else
4253 opt_set_p = true;
4254
4255 /* Find the option. */
4256 ch = *p;
4257 opt = N_OPTS;
4258 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4259 {
4260 type = attrs[i].type;
4261 opt_len = attrs[i].len;
4262 if (ch == attrs[i].string[0]
4263 && ((type != ix86_opt_str && type != ix86_opt_enum)
4264 ? len == opt_len
4265 : len > opt_len)
4266 && memcmp (p, attrs[i].string, opt_len) == 0)
4267 {
4268 opt = attrs[i].opt;
4269 mask = attrs[i].mask;
4270 opt_string = attrs[i].string;
4271 break;
4272 }
4273 }
4274
4275 /* Process the option. */
4276 if (opt == N_OPTS)
4277 {
4278 error ("attribute(target(\"%s\")) is unknown", orig_p);
4279 ret = false;
4280 }
4281
4282 else if (type == ix86_opt_isa)
4283 {
4284 struct cl_decoded_option decoded;
4285
4286 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4287 ix86_handle_option (&global_options, &global_options_set,
4288 &decoded, input_location);
4289 }
4290
4291 else if (type == ix86_opt_yes || type == ix86_opt_no)
4292 {
4293 if (type == ix86_opt_no)
4294 opt_set_p = !opt_set_p;
4295
4296 if (opt_set_p)
4297 target_flags |= mask;
4298 else
4299 target_flags &= ~mask;
4300 }
4301
4302 else if (type == ix86_opt_str)
4303 {
4304 if (p_strings[opt])
4305 {
4306 error ("option(\"%s\") was already specified", opt_string);
4307 ret = false;
4308 }
4309 else
4310 p_strings[opt] = xstrdup (p + opt_len);
4311 }
4312
4313 else if (type == ix86_opt_enum)
4314 {
4315 bool arg_ok;
4316 int value;
4317
4318 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4319 if (arg_ok)
4320 set_option (&global_options, enum_opts_set, opt, value,
4321 p + opt_len, DK_UNSPECIFIED, input_location,
4322 global_dc);
4323 else
4324 {
4325 error ("attribute(target(\"%s\")) is unknown", orig_p);
4326 ret = false;
4327 }
4328 }
4329
4330 else
4331 gcc_unreachable ();
4332 }
4333
4334 return ret;
4335 }
4336
4337 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4338
4339 tree
4340 ix86_valid_target_attribute_tree (tree args)
4341 {
4342 const char *orig_arch_string = ix86_arch_string;
4343 const char *orig_tune_string = ix86_tune_string;
4344 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4345 int orig_tune_defaulted = ix86_tune_defaulted;
4346 int orig_arch_specified = ix86_arch_specified;
4347 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4348 tree t = NULL_TREE;
4349 int i;
4350 struct cl_target_option *def
4351 = TREE_TARGET_OPTION (target_option_default_node);
4352 struct gcc_options enum_opts_set;
4353
4354 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4355
4356 /* Process each of the options on the chain. */
4357 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4358 &enum_opts_set))
4359 return NULL_TREE;
4360
4361 /* If the changed options are different from the default, rerun
4362 ix86_option_override_internal, and then save the options away.
4363 The string options are attribute options, and will be undone
4364 when we copy the save structure. */
4365 if (ix86_isa_flags != def->x_ix86_isa_flags
4366 || target_flags != def->x_target_flags
4367 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4368 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4369 || enum_opts_set.x_ix86_fpmath)
4370 {
4371 /* If we are using the default tune= or arch=, undo the string assigned,
4372 and use the default. */
4373 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4374 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4375 else if (!orig_arch_specified)
4376 ix86_arch_string = NULL;
4377
4378 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4379 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4380 else if (orig_tune_defaulted)
4381 ix86_tune_string = NULL;
4382
4383 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4384 if (enum_opts_set.x_ix86_fpmath)
4385 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4386 else if (!TARGET_64BIT && TARGET_SSE)
4387 {
4388 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4389 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4390 }
4391
4392 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4393 ix86_option_override_internal (false);
4394
4395 /* Add any builtin functions with the new isa if any. */
4396 ix86_add_new_builtins (ix86_isa_flags);
4397
4398 /* Save the current options unless we are validating options for
4399 #pragma. */
4400 t = build_target_option_node ();
4401
4402 ix86_arch_string = orig_arch_string;
4403 ix86_tune_string = orig_tune_string;
4404 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4405
4406 /* Free up memory allocated to hold the strings */
4407 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4408 free (option_strings[i]);
4409 }
4410
4411 return t;
4412 }
4413
4414 /* Hook to validate attribute((target("string"))). */
4415
4416 static bool
4417 ix86_valid_target_attribute_p (tree fndecl,
4418 tree ARG_UNUSED (name),
4419 tree args,
4420 int ARG_UNUSED (flags))
4421 {
4422 struct cl_target_option cur_target;
4423 bool ret = true;
4424 tree old_optimize = build_optimization_node ();
4425 tree new_target, new_optimize;
4426 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4427
4428 /* If the function changed the optimization levels as well as setting target
4429 options, start with the optimizations specified. */
4430 if (func_optimize && func_optimize != old_optimize)
4431 cl_optimization_restore (&global_options,
4432 TREE_OPTIMIZATION (func_optimize));
4433
4434 /* The target attributes may also change some optimization flags, so update
4435 the optimization options if necessary. */
4436 cl_target_option_save (&cur_target, &global_options);
4437 new_target = ix86_valid_target_attribute_tree (args);
4438 new_optimize = build_optimization_node ();
4439
4440 if (!new_target)
4441 ret = false;
4442
4443 else if (fndecl)
4444 {
4445 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4446
4447 if (old_optimize != new_optimize)
4448 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4449 }
4450
4451 cl_target_option_restore (&global_options, &cur_target);
4452
4453 if (old_optimize != new_optimize)
4454 cl_optimization_restore (&global_options,
4455 TREE_OPTIMIZATION (old_optimize));
4456
4457 return ret;
4458 }
4459
4460 \f
4461 /* Hook to determine if one function can safely inline another. */
4462
4463 static bool
4464 ix86_can_inline_p (tree caller, tree callee)
4465 {
4466 bool ret = false;
4467 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4468 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4469
4470 /* If callee has no option attributes, then it is ok to inline. */
4471 if (!callee_tree)
4472 ret = true;
4473
4474 /* If caller has no option attributes, but callee does then it is not ok to
4475 inline. */
4476 else if (!caller_tree)
4477 ret = false;
4478
4479 else
4480 {
4481 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4482 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4483
4484 /* Callee's isa options should be a subset of the caller's, i.e. an SSE4
4485 function can inline an SSE2 function but an SSE2 function can't inline
4486 an SSE4 function. */
4487 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4488 != callee_opts->x_ix86_isa_flags)
4489 ret = false;
4490
4491 /* See if we have the same non-isa options. */
4492 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4493 ret = false;
4494
4495 /* See if arch, tune, etc. are the same. */
4496 else if (caller_opts->arch != callee_opts->arch)
4497 ret = false;
4498
4499 else if (caller_opts->tune != callee_opts->tune)
4500 ret = false;
4501
4502 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4503 ret = false;
4504
4505 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4506 ret = false;
4507
4508 else
4509 ret = true;
4510 }
4511
4512 return ret;
4513 }
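/* Illustration of the subset test above: when both functions carry target
   attributes, a target("sse2") callee can be inlined into a target("sse4.2")
   caller, since -msse4.2 implies SSE2, but not the other way around.  */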
4514
4515 \f
4516 /* Remember the last target of ix86_set_current_function. */
4517 static GTY(()) tree ix86_previous_fndecl;
4518
4519 /* Establish appropriate back-end context for processing the function
4520 FNDECL. The argument might be NULL to indicate processing at top
4521 level, outside of any function scope. */
4522 static void
4523 ix86_set_current_function (tree fndecl)
4524 {
4525 /* Only change the context if the function changes. This hook is called
4526 several times in the course of compiling a function, and we don't want to
4527 slow things down too much or call target_reinit when it isn't safe. */
4528 if (fndecl && fndecl != ix86_previous_fndecl)
4529 {
4530 tree old_tree = (ix86_previous_fndecl
4531 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4532 : NULL_TREE);
4533
4534 tree new_tree = (fndecl
4535 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4536 : NULL_TREE);
4537
4538 ix86_previous_fndecl = fndecl;
4539 if (old_tree == new_tree)
4540 ;
4541
4542 else if (new_tree)
4543 {
4544 cl_target_option_restore (&global_options,
4545 TREE_TARGET_OPTION (new_tree));
4546 target_reinit ();
4547 }
4548
4549 else if (old_tree)
4550 {
4551 struct cl_target_option *def
4552 = TREE_TARGET_OPTION (target_option_current_node);
4553
4554 cl_target_option_restore (&global_options, def);
4555 target_reinit ();
4556 }
4557 }
4558 }
4559
4560 \f
4561 /* Return true if this goes in large data/bss. */
4562
4563 static bool
4564 ix86_in_large_data_p (tree exp)
4565 {
4566 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4567 return false;
4568
4569 /* Functions are never large data. */
4570 if (TREE_CODE (exp) == FUNCTION_DECL)
4571 return false;
4572
4573 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4574 {
4575 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4576 if (strcmp (section, ".ldata") == 0
4577 || strcmp (section, ".lbss") == 0)
4578 return true;
4579 return false;
4580 }
4581 else
4582 {
4583 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4584
4585 /* If this is an incomplete type with size 0, then we can't put it
4586 in data because it might be too big when completed. */
4587 if (!size || size > ix86_section_threshold)
4588 return true;
4589 }
4590
4591 return false;
4592 }
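/* Illustrative note: with -mcmodel=medium, an object whose size exceeds
   ix86_section_threshold (the -mlarge-data-threshold value) is treated as
   large data here and later directed into .ldata/.lbss by the section hooks
   below.  */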
4593
4594 /* Switch to the appropriate section for output of DECL.
4595 DECL is either a `VAR_DECL' node or a constant of some sort.
4596 RELOC indicates whether forming the initial value of DECL requires
4597 link-time relocations. */
4598
4599 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4600 ATTRIBUTE_UNUSED;
4601
4602 static section *
4603 x86_64_elf_select_section (tree decl, int reloc,
4604 unsigned HOST_WIDE_INT align)
4605 {
4606 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4607 && ix86_in_large_data_p (decl))
4608 {
4609 const char *sname = NULL;
4610 unsigned int flags = SECTION_WRITE;
4611 switch (categorize_decl_for_section (decl, reloc))
4612 {
4613 case SECCAT_DATA:
4614 sname = ".ldata";
4615 break;
4616 case SECCAT_DATA_REL:
4617 sname = ".ldata.rel";
4618 break;
4619 case SECCAT_DATA_REL_LOCAL:
4620 sname = ".ldata.rel.local";
4621 break;
4622 case SECCAT_DATA_REL_RO:
4623 sname = ".ldata.rel.ro";
4624 break;
4625 case SECCAT_DATA_REL_RO_LOCAL:
4626 sname = ".ldata.rel.ro.local";
4627 break;
4628 case SECCAT_BSS:
4629 sname = ".lbss";
4630 flags |= SECTION_BSS;
4631 break;
4632 case SECCAT_RODATA:
4633 case SECCAT_RODATA_MERGE_STR:
4634 case SECCAT_RODATA_MERGE_STR_INIT:
4635 case SECCAT_RODATA_MERGE_CONST:
4636 sname = ".lrodata";
4637 flags = 0;
4638 break;
4639 case SECCAT_SRODATA:
4640 case SECCAT_SDATA:
4641 case SECCAT_SBSS:
4642 gcc_unreachable ();
4643 case SECCAT_TEXT:
4644 case SECCAT_TDATA:
4645 case SECCAT_TBSS:
4646 /* We don't split these for the medium model. Place them into
4647 default sections and hope for the best. */
4648 break;
4649 }
4650 if (sname)
4651 {
4652 /* We might get called with string constants, but get_named_section
4653 doesn't like them as they are not DECLs. Also, we need to set
4654 flags in that case. */
4655 if (!DECL_P (decl))
4656 return get_section (sname, flags, NULL);
4657 return get_named_section (decl, sname, reloc);
4658 }
4659 }
4660 return default_elf_select_section (decl, reloc, align);
4661 }
4662
4663 /* Build up a unique section name, expressed as a
4664 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4665 RELOC indicates whether the initial value of EXP requires
4666 link-time relocations. */
4667
4668 static void ATTRIBUTE_UNUSED
4669 x86_64_elf_unique_section (tree decl, int reloc)
4670 {
4671 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4672 && ix86_in_large_data_p (decl))
4673 {
4674 const char *prefix = NULL;
4675 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4676 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4677
4678 switch (categorize_decl_for_section (decl, reloc))
4679 {
4680 case SECCAT_DATA:
4681 case SECCAT_DATA_REL:
4682 case SECCAT_DATA_REL_LOCAL:
4683 case SECCAT_DATA_REL_RO:
4684 case SECCAT_DATA_REL_RO_LOCAL:
4685 prefix = one_only ? ".ld" : ".ldata";
4686 break;
4687 case SECCAT_BSS:
4688 prefix = one_only ? ".lb" : ".lbss";
4689 break;
4690 case SECCAT_RODATA:
4691 case SECCAT_RODATA_MERGE_STR:
4692 case SECCAT_RODATA_MERGE_STR_INIT:
4693 case SECCAT_RODATA_MERGE_CONST:
4694 prefix = one_only ? ".lr" : ".lrodata";
4695 break;
4696 case SECCAT_SRODATA:
4697 case SECCAT_SDATA:
4698 case SECCAT_SBSS:
4699 gcc_unreachable ();
4700 case SECCAT_TEXT:
4701 case SECCAT_TDATA:
4702 case SECCAT_TBSS:
4703 /* We don't split these for the medium model. Place them into
4704 default sections and hope for the best. */
4705 break;
4706 }
4707 if (prefix)
4708 {
4709 const char *name, *linkonce;
4710 char *string;
4711
4712 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4713 name = targetm.strip_name_encoding (name);
4714
4715 /* If we're using one_only, then there needs to be a .gnu.linkonce
4716 prefix to the section name. */
4717 linkonce = one_only ? ".gnu.linkonce" : "";
4718
4719 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4720
4721 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4722 return;
4723 }
4724 }
4725 default_unique_section (decl, reloc);
4726 }
4727
4728 #ifdef COMMON_ASM_OP
4729 /* This says how to output assembler code to declare an
4730 uninitialized external linkage data object.
4731
4732 For medium model x86-64 we need to use the .largecomm directive for
4733 large objects. */
4734 void
4735 x86_elf_aligned_common (FILE *file,
4736 const char *name, unsigned HOST_WIDE_INT size,
4737 int align)
4738 {
4739 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4740 && size > (unsigned int)ix86_section_threshold)
4741 fputs (".largecomm\t", file);
4742 else
4743 fputs (COMMON_ASM_OP, file);
4744 assemble_name (file, name);
4745 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4746 size, align / BITS_PER_UNIT);
4747 }
4748 #endif
4749
4750 /* Utility function for targets to use in implementing
4751 ASM_OUTPUT_ALIGNED_BSS. */
4752
4753 void
4754 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4755 const char *name, unsigned HOST_WIDE_INT size,
4756 int align)
4757 {
4758 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4759 && size > (unsigned int)ix86_section_threshold)
4760 switch_to_section (get_named_section (decl, ".lbss", 0));
4761 else
4762 switch_to_section (bss_section);
4763 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4764 #ifdef ASM_DECLARE_OBJECT_NAME
4765 last_assemble_variable_decl = decl;
4766 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4767 #else
4768 /* The standard thing is to just output a label for the object. */
4769 ASM_OUTPUT_LABEL (file, name);
4770 #endif /* ASM_DECLARE_OBJECT_NAME */
4771 ASM_OUTPUT_SKIP (file, size ? size : 1);
4772 }
4773 \f
4774 /* Decide whether we must probe the stack before any space allocation
4775 on this target. It's essentially TARGET_STACK_PROBE except when
4776 -fstack-check causes the stack to be already probed differently. */
4777
4778 bool
4779 ix86_target_stack_probe (void)
4780 {
4781 /* Do not probe the stack twice if static stack checking is enabled. */
4782 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4783 return false;
4784
4785 return TARGET_STACK_PROBE;
4786 }
4787 \f
4788 /* Decide whether we can make a sibling call to a function. DECL is the
4789 declaration of the function being targeted by the call and EXP is the
4790 CALL_EXPR representing the call. */
4791
4792 static bool
4793 ix86_function_ok_for_sibcall (tree decl, tree exp)
4794 {
4795 tree type, decl_or_type;
4796 rtx a, b;
4797
4798 /* If we are generating position-independent code, we cannot sibcall
4799 optimize any indirect call, or a direct call to a global function,
4800 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
4801 if (!TARGET_MACHO
4802 && !TARGET_64BIT
4803 && flag_pic
4804 && (!decl || !targetm.binds_local_p (decl)))
4805 return false;
4806
4807 /* If we need to align the outgoing stack, then sibcalling would
4808 unalign the stack, which may break the called function. */
4809 if (ix86_minimum_incoming_stack_boundary (true)
4810 < PREFERRED_STACK_BOUNDARY)
4811 return false;
4812
4813 if (decl)
4814 {
4815 decl_or_type = decl;
4816 type = TREE_TYPE (decl);
4817 }
4818 else
4819 {
4820 /* We're looking at the CALL_EXPR, we need the type of the function. */
4821 type = CALL_EXPR_FN (exp); /* pointer expression */
4822 type = TREE_TYPE (type); /* pointer type */
4823 type = TREE_TYPE (type); /* function type */
4824 decl_or_type = type;
4825 }
4826
4827 /* Check that the return value locations are the same. For example,
4828 if we are returning floats on the 80387 register stack, we cannot
4829 make a sibcall from a function that doesn't return a float to a
4830 function that does or, conversely, from a function that does return
4831 a float to a function that doesn't; the necessary stack adjustment
4832 would not be executed. This is also the place we notice
4833 differences in the return value ABI. Note that it is ok for one
4834 of the functions to have void return type as long as the return
4835 value of the other is passed in a register. */
4836 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
4837 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
4838 cfun->decl, false);
4839 if (STACK_REG_P (a) || STACK_REG_P (b))
4840 {
4841 if (!rtx_equal_p (a, b))
4842 return false;
4843 }
4844 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
4845 {
4846 /* Disable sibcall if we need to generate vzeroupper after
4847 callee returns. */
4848 if (TARGET_VZEROUPPER
4849 && cfun->machine->callee_return_avx256_p
4850 && !cfun->machine->caller_return_avx256_p)
4851 return false;
4852 }
4853 else if (!rtx_equal_p (a, b))
4854 return false;
4855
4856 if (TARGET_64BIT)
4857 {
4858 /* The SYSV ABI has more call-clobbered registers;
4859 disallow sibcalls from MS to SYSV. */
4860 if (cfun->machine->call_abi == MS_ABI
4861 && ix86_function_type_abi (type) == SYSV_ABI)
4862 return false;
4863 }
4864 else
4865 {
4866 /* If this call is indirect, we'll need to be able to use a
4867 call-clobbered register for the address of the target function.
4868 Make sure that all such registers are not used for passing
4869 parameters. Note that DLLIMPORT functions are indirect. */
4870 if (!decl
4871 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
4872 {
4873 if (ix86_function_regparm (type, NULL) >= 3)
4874 {
4875 /* ??? Need to count the actual number of registers to be used,
4876 not the possible number of registers. Fix later. */
4877 return false;
4878 }
4879 }
4880 }
4881
4882 /* Otherwise okay. That also includes certain types of indirect calls. */
4883 return true;
4884 }
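/* A rough example of the return-location check above (declarations are
   hypothetical): with -m32, `extern float g (float); float f (float x)
   { return g (x); }' keeps the ST(0) return slot of caller and callee
   identical, so the sibcall can go ahead (other conditions permitting),
   whereas a caller returning `int' in %eax that tail-called `g' would be
   rejected here, since the x87 stack adjustment for the ST(0) result
   would never be executed.  */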
4885
4886 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
4887 and "sseregparm" calling convention attributes;
4888 arguments as in struct attribute_spec.handler. */
4889
4890 static tree
4891 ix86_handle_cconv_attribute (tree *node, tree name,
4892 tree args,
4893 int flags ATTRIBUTE_UNUSED,
4894 bool *no_add_attrs)
4895 {
4896 if (TREE_CODE (*node) != FUNCTION_TYPE
4897 && TREE_CODE (*node) != METHOD_TYPE
4898 && TREE_CODE (*node) != FIELD_DECL
4899 && TREE_CODE (*node) != TYPE_DECL)
4900 {
4901 warning (OPT_Wattributes, "%qE attribute only applies to functions",
4902 name);
4903 *no_add_attrs = true;
4904 return NULL_TREE;
4905 }
4906
4907 /* Can combine regparm with all attributes but fastcall and thiscall. */
4908 if (is_attribute_p ("regparm", name))
4909 {
4910 tree cst;
4911
4912 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4913 {
4914 error ("fastcall and regparm attributes are not compatible");
4915 }
4916
4917 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4918 {
4919 error ("regparam and thiscall attributes are not compatible");
4920 }
4921
4922 cst = TREE_VALUE (args);
4923 if (TREE_CODE (cst) != INTEGER_CST)
4924 {
4925 warning (OPT_Wattributes,
4926 "%qE attribute requires an integer constant argument",
4927 name);
4928 *no_add_attrs = true;
4929 }
4930 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
4931 {
4932 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
4933 name, REGPARM_MAX);
4934 *no_add_attrs = true;
4935 }
4936
4937 return NULL_TREE;
4938 }
4939
4940 if (TARGET_64BIT)
4941 {
4942 /* Do not warn when emulating the MS ABI. */
4943 if ((TREE_CODE (*node) != FUNCTION_TYPE
4944 && TREE_CODE (*node) != METHOD_TYPE)
4945 || ix86_function_type_abi (*node) != MS_ABI)
4946 warning (OPT_Wattributes, "%qE attribute ignored",
4947 name);
4948 *no_add_attrs = true;
4949 return NULL_TREE;
4950 }
4951
4952 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
4953 if (is_attribute_p ("fastcall", name))
4954 {
4955 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4956 {
4957 error ("fastcall and cdecl attributes are not compatible");
4958 }
4959 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4960 {
4961 error ("fastcall and stdcall attributes are not compatible");
4962 }
4963 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
4964 {
4965 error ("fastcall and regparm attributes are not compatible");
4966 }
4967 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4968 {
4969 error ("fastcall and thiscall attributes are not compatible");
4970 }
4971 }
4972
4973 /* Can combine stdcall with fastcall (redundant), regparm and
4974 sseregparm. */
4975 else if (is_attribute_p ("stdcall", name))
4976 {
4977 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4978 {
4979 error ("stdcall and cdecl attributes are not compatible");
4980 }
4981 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4982 {
4983 error ("stdcall and fastcall attributes are not compatible");
4984 }
4985 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4986 {
4987 error ("stdcall and thiscall attributes are not compatible");
4988 }
4989 }
4990
4991 /* Can combine cdecl with regparm and sseregparm. */
4992 else if (is_attribute_p ("cdecl", name))
4993 {
4994 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4995 {
4996 error ("stdcall and cdecl attributes are not compatible");
4997 }
4998 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4999 {
5000 error ("fastcall and cdecl attributes are not compatible");
5001 }
5002 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5003 {
5004 error ("cdecl and thiscall attributes are not compatible");
5005 }
5006 }
5007 else if (is_attribute_p ("thiscall", name))
5008 {
5009 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5010 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5011 name);
5012 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5013 {
5014 error ("stdcall and thiscall attributes are not compatible");
5015 }
5016 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5017 {
5018 error ("fastcall and thiscall attributes are not compatible");
5019 }
5020 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5021 {
5022 error ("cdecl and thiscall attributes are not compatible");
5023 }
5024 }
5025
5026 /* Can combine sseregparm with all attributes. */
5027
5028 return NULL_TREE;
5029 }
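/* Illustrative user-level declarations (hypothetical) and how this handler
   treats them on a 32-bit target:

       int __attribute__((regparm (3))) f (int, int, int);   accepted
       int __attribute__((regparm (4))) g (int);             warning: larger than REGPARM_MAX
       int __attribute__((fastcall, regparm (2))) h (int);   error: not compatible
       int __attribute__((stdcall, fastcall)) i (int);       error: not compatible

   On 64-bit targets the attribute is dropped, with a warning unless the
   MS ABI is being emulated.  */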
5030
5031 /* This function determines from TYPE the calling-convention. */
5032
5033 unsigned int
5034 ix86_get_callcvt (const_tree type)
5035 {
5036 unsigned int ret = 0;
5037 bool is_stdarg;
5038 tree attrs;
5039
5040 if (TARGET_64BIT)
5041 return IX86_CALLCVT_CDECL;
5042
5043 attrs = TYPE_ATTRIBUTES (type);
5044 if (attrs != NULL_TREE)
5045 {
5046 if (lookup_attribute ("cdecl", attrs))
5047 ret |= IX86_CALLCVT_CDECL;
5048 else if (lookup_attribute ("stdcall", attrs))
5049 ret |= IX86_CALLCVT_STDCALL;
5050 else if (lookup_attribute ("fastcall", attrs))
5051 ret |= IX86_CALLCVT_FASTCALL;
5052 else if (lookup_attribute ("thiscall", attrs))
5053 ret |= IX86_CALLCVT_THISCALL;
5054
5055 /* Regparm isn't allowed for thiscall and fastcall. */
5056 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5057 {
5058 if (lookup_attribute ("regparm", attrs))
5059 ret |= IX86_CALLCVT_REGPARM;
5060 if (lookup_attribute ("sseregparm", attrs))
5061 ret |= IX86_CALLCVT_SSEREGPARM;
5062 }
5063
5064 if (IX86_BASE_CALLCVT(ret) != 0)
5065 return ret;
5066 }
5067
5068 is_stdarg = stdarg_p (type);
5069 if (TARGET_RTD && !is_stdarg)
5070 return IX86_CALLCVT_STDCALL | ret;
5071
5072 if (ret != 0
5073 || is_stdarg
5074 || TREE_CODE (type) != METHOD_TYPE
5075 || ix86_function_type_abi (type) != MS_ABI)
5076 return IX86_CALLCVT_CDECL | ret;
5077
5078 return IX86_CALLCVT_THISCALL;
5079 }
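/* For example (hypothetical prototypes), on a 32-bit target:

       void __attribute__((stdcall)) f (int);       IX86_CALLCVT_STDCALL
       void __attribute__((regparm (2))) g (int);   IX86_CALLCVT_CDECL | IX86_CALLCVT_REGPARM
       void h (int);                                IX86_CALLCVT_CDECL (STDCALL with -mrtd)
       void v (int, ...);                           IX86_CALLCVT_CDECL, regardless of -mrtd

   while any type seen when compiling 64-bit code yields IX86_CALLCVT_CDECL.  */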
5080
5081 /* Return 0 if the attributes for two types are incompatible, 1 if they
5082 are compatible, and 2 if they are nearly compatible (which causes a
5083 warning to be generated). */
5084
5085 static int
5086 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5087 {
5088 unsigned int ccvt1, ccvt2;
5089
5090 if (TREE_CODE (type1) != FUNCTION_TYPE
5091 && TREE_CODE (type1) != METHOD_TYPE)
5092 return 1;
5093
5094 ccvt1 = ix86_get_callcvt (type1);
5095 ccvt2 = ix86_get_callcvt (type2);
5096 if (ccvt1 != ccvt2)
5097 return 0;
5098 if (ix86_function_regparm (type1, NULL)
5099 != ix86_function_regparm (type2, NULL))
5100 return 0;
5101
5102 return 1;
5103 }
5104 \f
5105 /* Return the regparm value for a function with the indicated TYPE and DECL.
5106 DECL may be NULL when calling function indirectly
5107 or considering a libcall. */
5108
5109 static int
5110 ix86_function_regparm (const_tree type, const_tree decl)
5111 {
5112 tree attr;
5113 int regparm;
5114 unsigned int ccvt;
5115
5116 if (TARGET_64BIT)
5117 return (ix86_function_type_abi (type) == SYSV_ABI
5118 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5119 ccvt = ix86_get_callcvt (type);
5120 regparm = ix86_regparm;
5121
5122 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5123 {
5124 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5125 if (attr)
5126 {
5127 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5128 return regparm;
5129 }
5130 }
5131 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5132 return 2;
5133 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5134 return 1;
5135
5136 /* Use register calling convention for local functions when possible. */
5137 if (decl
5138 && TREE_CODE (decl) == FUNCTION_DECL
5139 && optimize
5140 && !(profile_flag && !flag_fentry))
5141 {
5142 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5143 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5144 if (i && i->local && i->can_change_signature)
5145 {
5146 int local_regparm, globals = 0, regno;
5147
5148 /* Make sure no regparm register is taken by a
5149 fixed register variable. */
5150 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5151 if (fixed_regs[local_regparm])
5152 break;
5153
5154 /* We don't want to use regparm(3) for nested functions as
5155 these use a static chain pointer in the third argument. */
5156 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5157 local_regparm = 2;
5158
5159 /* In 32-bit mode save a register for the split stack. */
5160 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5161 local_regparm = 2;
5162
5163 /* Each fixed register usage increases register pressure,
5164 so fewer registers should be used for argument passing.
5165 This functionality can be overridden by an explicit
5166 regparm value. */
5167 for (regno = 0; regno <= DI_REG; regno++)
5168 if (fixed_regs[regno])
5169 globals++;
5170
5171 local_regparm
5172 = globals < local_regparm ? local_regparm - globals : 0;
5173
5174 if (local_regparm > regparm)
5175 regparm = local_regparm;
5176 }
5177 }
5178
5179 return regparm;
5180 }
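/* For illustration (hypothetical declarations), with -m32:

       int __attribute__((regparm (3))) f (int a, int b, int c);
           regparm == 3: a, b, c arrive in %eax, %edx, %ecx
       int __attribute__((fastcall)) g (int a, int b);
           regparm == 2: a, b arrive in %ecx, %edx
       int __attribute__((thiscall)) h (int a);
           regparm == 1: a arrives in %ecx

   Local, optimized functions without an explicit attribute may in addition
   be promoted to use up to REGPARM_MAX registers, as computed above.  */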
5181
5182 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5183 DFmode (2) arguments in SSE registers for a function with the
5184 indicated TYPE and DECL. DECL may be NULL when calling function
5185 indirectly or considering a libcall. Otherwise return 0. */
5186
5187 static int
5188 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5189 {
5190 gcc_assert (!TARGET_64BIT);
5191
5192 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5193 by the sseregparm attribute. */
5194 if (TARGET_SSEREGPARM
5195 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5196 {
5197 if (!TARGET_SSE)
5198 {
5199 if (warn)
5200 {
5201 if (decl)
5202 error ("calling %qD with attribute sseregparm without "
5203 "SSE/SSE2 enabled", decl);
5204 else
5205 error ("calling %qT with attribute sseregparm without "
5206 "SSE/SSE2 enabled", type);
5207 }
5208 return 0;
5209 }
5210
5211 return 2;
5212 }
5213
5214 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5215 (and DFmode for SSE2) arguments in SSE registers. */
5216 if (decl && TARGET_SSE_MATH && optimize
5217 && !(profile_flag && !flag_fentry))
5218 {
5219 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5220 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5221 if (i && i->local && i->can_change_signature)
5222 return TARGET_SSE2 ? 2 : 1;
5223 }
5224
5225 return 0;
5226 }
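/* A small illustration (hypothetical prototype, -m32 only): with -msse,

       float __attribute__((sseregparm)) f (float x);

   makes this function return 2, so x is passed in %xmm0 rather than on the
   stack; the same declaration with SSE disabled triggers the hard error
   above.  Local optimized functions built with -mfpmath=sse get the SSE
   argument registers implicitly.  */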
5227
5228 /* Return true if EAX is live at the start of the function. Used by
5229 ix86_expand_prologue to determine if we need special help before
5230 calling allocate_stack_worker. */
5231
5232 static bool
5233 ix86_eax_live_at_start_p (void)
5234 {
5235 /* Cheat. Don't bother working forward from ix86_function_regparm
5236 to the function type to whether an actual argument is located in
5237 eax. Instead just look at cfg info, which is still close enough
5238 to correct at this point. This gives false positives for broken
5239 functions that might use uninitialized data that happens to be
5240 allocated in eax, but who cares? */
5241 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5242 }
5243
5244 static bool
5245 ix86_keep_aggregate_return_pointer (tree fntype)
5246 {
5247 tree attr;
5248
5249 if (!TARGET_64BIT)
5250 {
5251 attr = lookup_attribute ("callee_pop_aggregate_return",
5252 TYPE_ATTRIBUTES (fntype));
5253 if (attr)
5254 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5255
5256 /* For 32-bit MS-ABI the default is to keep aggregate
5257 return pointer. */
5258 if (ix86_function_type_abi (fntype) == MS_ABI)
5259 return true;
5260 }
5261 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5262 }
5263
5264 /* Value is the number of bytes of arguments automatically
5265 popped when returning from a subroutine call.
5266 FUNDECL is the declaration node of the function (as a tree),
5267 FUNTYPE is the data type of the function (as a tree),
5268 or for a library call it is an identifier node for the subroutine name.
5269 SIZE is the number of bytes of arguments passed on the stack.
5270
5271 On the 80386, the RTD insn may be used to pop them if the number
5272 of args is fixed, but if the number is variable then the caller
5273 must pop them all. RTD can't be used for library calls now
5274 because the library is compiled with the Unix compiler.
5275 Use of RTD is a selectable option, since it is incompatible with
5276 standard Unix calling sequences. If the option is not selected,
5277 the caller must always pop the args.
5278
5279 The attribute stdcall is equivalent to RTD on a per module basis. */
5280
5281 static int
5282 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5283 {
5284 unsigned int ccvt;
5285
5286 /* None of the 64-bit ABIs pop arguments. */
5287 if (TARGET_64BIT)
5288 return 0;
5289
5290 ccvt = ix86_get_callcvt (funtype);
5291
5292 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5293 | IX86_CALLCVT_THISCALL)) != 0
5294 && ! stdarg_p (funtype))
5295 return size;
5296
5297 /* Lose any fake structure return argument if it is passed on the stack. */
5298 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5299 && !ix86_keep_aggregate_return_pointer (funtype))
5300 {
5301 int nregs = ix86_function_regparm (funtype, fundecl);
5302 if (nregs == 0)
5303 return GET_MODE_SIZE (Pmode);
5304 }
5305
5306 return 0;
5307 }
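/* Two quick examples (hypothetical prototypes), with -m32:

       void __attribute__((stdcall)) f (int a, int b);   callee pops 8 bytes (SIZE)
       void g (const char *fmt, ...);                    variadic: caller pops, result is 0

   A function returning a large struct through a hidden pointer additionally
   pops that pointer (GET_MODE_SIZE (Pmode) bytes) when the aggregate return
   pointer is not kept and no argument registers are used.  */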
5308 \f
5309 /* Argument support functions. */
5310
5311 /* Return true when register may be used to pass function parameters. */
5312 bool
5313 ix86_function_arg_regno_p (int regno)
5314 {
5315 int i;
5316 const int *parm_regs;
5317
5318 if (!TARGET_64BIT)
5319 {
5320 if (TARGET_MACHO)
5321 return (regno < REGPARM_MAX
5322 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5323 else
5324 return (regno < REGPARM_MAX
5325 || (TARGET_MMX && MMX_REGNO_P (regno)
5326 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5327 || (TARGET_SSE && SSE_REGNO_P (regno)
5328 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5329 }
5330
5331 if (TARGET_MACHO)
5332 {
5333 if (SSE_REGNO_P (regno) && TARGET_SSE)
5334 return true;
5335 }
5336 else
5337 {
5338 if (TARGET_SSE && SSE_REGNO_P (regno)
5339 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5340 return true;
5341 }
5342
5343 /* TODO: The function should depend on the current function's ABI, but
5344 builtins.c would need updating then. Therefore we use the
5345 default ABI. */
5346
5347 /* RAX is used as hidden argument to va_arg functions. */
5348 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5349 return true;
5350
5351 if (ix86_abi == MS_ABI)
5352 parm_regs = x86_64_ms_abi_int_parameter_registers;
5353 else
5354 parm_regs = x86_64_int_parameter_registers;
5355 for (i = 0; i < (ix86_abi == MS_ABI
5356 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5357 if (regno == parm_regs[i])
5358 return true;
5359 return false;
5360 }
5361
5362 /* Return true if we do not know how to pass TYPE solely in registers. */
5363
5364 static bool
5365 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5366 {
5367 if (must_pass_in_stack_var_size_or_pad (mode, type))
5368 return true;
5369
5370 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5371 The layout_type routine is crafty and tries to trick us into passing
5372 currently unsupported vector types on the stack by using TImode. */
5373 return (!TARGET_64BIT && mode == TImode
5374 && type && TREE_CODE (type) != VECTOR_TYPE);
5375 }
5376
5377 /* Return the size, in bytes, of the area reserved for arguments passed
5378 in registers for the function represented by FNDECL, depending on the
5379 ABI format used. */
5380 int
5381 ix86_reg_parm_stack_space (const_tree fndecl)
5382 {
5383 enum calling_abi call_abi = SYSV_ABI;
5384 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5385 call_abi = ix86_function_abi (fndecl);
5386 else
5387 call_abi = ix86_function_type_abi (fndecl);
5388 if (TARGET_64BIT && call_abi == MS_ABI)
5389 return 32;
5390 return 0;
5391 }
5392
5393 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the
5394 call ABI used. */
5395 enum calling_abi
5396 ix86_function_type_abi (const_tree fntype)
5397 {
5398 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5399 {
5400 enum calling_abi abi = ix86_abi;
5401 if (abi == SYSV_ABI)
5402 {
5403 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5404 abi = MS_ABI;
5405 }
5406 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5407 abi = SYSV_ABI;
5408 return abi;
5409 }
5410 return ix86_abi;
5411 }
5412
5413 static bool
5414 ix86_function_ms_hook_prologue (const_tree fn)
5415 {
5416 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5417 {
5418 if (decl_function_context (fn) != NULL_TREE)
5419 error_at (DECL_SOURCE_LOCATION (fn),
5420 "ms_hook_prologue is not compatible with nested function");
5421 else
5422 return true;
5423 }
5424 return false;
5425 }
5426
5427 static enum calling_abi
5428 ix86_function_abi (const_tree fndecl)
5429 {
5430 if (! fndecl)
5431 return ix86_abi;
5432 return ix86_function_type_abi (TREE_TYPE (fndecl));
5433 }
5434
5435 /* Return SYSV_ABI or MS_ABI, depending on CFUN, specifying the
5436 call ABI used. */
5437 enum calling_abi
5438 ix86_cfun_abi (void)
5439 {
5440 if (! cfun)
5441 return ix86_abi;
5442 return cfun->machine->call_abi;
5443 }
5444
5445 /* Write the extra assembler code needed to declare a function properly. */
5446
5447 void
5448 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5449 tree decl)
5450 {
5451 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5452
5453 if (is_ms_hook)
5454 {
5455 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5456 unsigned int filler_cc = 0xcccccccc;
5457
5458 for (i = 0; i < filler_count; i += 4)
5459 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5460 }
5461
5462 #ifdef SUBTARGET_ASM_UNWIND_INIT
5463 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5464 #endif
5465
5466 ASM_OUTPUT_LABEL (asm_out_file, fname);
5467
5468 /* Output magic byte marker, if hot-patch attribute is set. */
5469 if (is_ms_hook)
5470 {
5471 if (TARGET_64BIT)
5472 {
5473 /* leaq [%rsp + 0], %rsp */
5474 asm_fprintf (asm_out_file, ASM_BYTE
5475 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5476 }
5477 else
5478 {
5479 /* movl.s %edi, %edi
5480 push %ebp
5481 movl.s %esp, %ebp */
5482 asm_fprintf (asm_out_file, ASM_BYTE
5483 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5484 }
5485 }
5486 }
5487
5488 /* regclass.c */
5489 extern void init_regs (void);
5490
5491 /* Implementation of the call ABI switching target hook. The call
5492 register sets specific to FNDECL are set up. See also
5493 ix86_conditional_register_usage for more details. */
5494 void
5495 ix86_call_abi_override (const_tree fndecl)
5496 {
5497 if (fndecl == NULL_TREE)
5498 cfun->machine->call_abi = ix86_abi;
5499 else
5500 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5501 }
5502
5503 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
5504 Avoid expensive re-initialization of init_regs each time we switch function
5505 context, since this is needed only during RTL expansion. */
5506 static void
5507 ix86_maybe_switch_abi (void)
5508 {
5509 if (TARGET_64BIT &&
5510 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5511 reinit_regs ();
5512 }
5513
5514 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5515 for a call to a function whose data type is FNTYPE.
5516 For a library call, FNTYPE is 0. */
5517
5518 void
5519 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5520 tree fntype, /* tree ptr for function decl */
5521 rtx libname, /* SYMBOL_REF of library name or 0 */
5522 tree fndecl,
5523 int caller)
5524 {
5525 struct cgraph_local_info *i;
5526 tree fnret_type;
5527
5528 memset (cum, 0, sizeof (*cum));
5529
5530 /* Initialize for the current callee. */
5531 if (caller)
5532 {
5533 cfun->machine->callee_pass_avx256_p = false;
5534 cfun->machine->callee_return_avx256_p = false;
5535 }
5536
5537 if (fndecl)
5538 {
5539 i = cgraph_local_info (fndecl);
5540 cum->call_abi = ix86_function_abi (fndecl);
5541 fnret_type = TREE_TYPE (TREE_TYPE (fndecl));
5542 }
5543 else
5544 {
5545 i = NULL;
5546 cum->call_abi = ix86_function_type_abi (fntype);
5547 if (fntype)
5548 fnret_type = TREE_TYPE (fntype);
5549 else
5550 fnret_type = NULL;
5551 }
5552
5553 if (TARGET_VZEROUPPER && fnret_type)
5554 {
5555 rtx fnret_value = ix86_function_value (fnret_type, fntype,
5556 false);
5557 if (function_pass_avx256_p (fnret_value))
5558 {
5559 /* The return value of this function uses 256bit AVX modes. */
5560 if (caller)
5561 cfun->machine->callee_return_avx256_p = true;
5562 else
5563 cfun->machine->caller_return_avx256_p = true;
5564 }
5565 }
5566
5567 cum->caller = caller;
5568
5569 /* Set up the number of registers to use for passing arguments. */
5570
5571 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5572 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5573 "or subtarget optimization implying it");
5574 cum->nregs = ix86_regparm;
5575 if (TARGET_64BIT)
5576 {
5577 cum->nregs = (cum->call_abi == SYSV_ABI
5578 ? X86_64_REGPARM_MAX
5579 : X86_64_MS_REGPARM_MAX);
5580 }
5581 if (TARGET_SSE)
5582 {
5583 cum->sse_nregs = SSE_REGPARM_MAX;
5584 if (TARGET_64BIT)
5585 {
5586 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5587 ? X86_64_SSE_REGPARM_MAX
5588 : X86_64_MS_SSE_REGPARM_MAX);
5589 }
5590 }
5591 if (TARGET_MMX)
5592 cum->mmx_nregs = MMX_REGPARM_MAX;
5593 cum->warn_avx = true;
5594 cum->warn_sse = true;
5595 cum->warn_mmx = true;
5596
5597 /* Because the type might differ between caller and callee, we need to
5598 use the actual type of the function for local calls.
5599 FIXME: cgraph_analyze can be told to actually record if a function uses
5600 va_start, so for local functions maybe_vaarg can be made more aggressive,
5601 helping K&R code.
5602 FIXME: once the type system is fixed, we won't need this code anymore. */
5603 if (i && i->local && i->can_change_signature)
5604 fntype = TREE_TYPE (fndecl);
5605 cum->maybe_vaarg = (fntype
5606 ? (!prototype_p (fntype) || stdarg_p (fntype))
5607 : !libname);
5608
5609 if (!TARGET_64BIT)
5610 {
5611 /* If there are variable arguments, then we won't pass anything
5612 in registers in 32-bit mode. */
5613 if (stdarg_p (fntype))
5614 {
5615 cum->nregs = 0;
5616 cum->sse_nregs = 0;
5617 cum->mmx_nregs = 0;
5618 cum->warn_avx = 0;
5619 cum->warn_sse = 0;
5620 cum->warn_mmx = 0;
5621 return;
5622 }
5623
5624 /* Use ecx and edx registers if function has fastcall attribute,
5625 else look for regparm information. */
5626 if (fntype)
5627 {
5628 unsigned int ccvt = ix86_get_callcvt (fntype);
5629 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5630 {
5631 cum->nregs = 1;
5632 cum->fastcall = 1; /* Same first register as in fastcall. */
5633 }
5634 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5635 {
5636 cum->nregs = 2;
5637 cum->fastcall = 1;
5638 }
5639 else
5640 cum->nregs = ix86_function_regparm (fntype, fndecl);
5641 }
5642
5643 /* Set up the number of SSE registers used for passing SFmode
5644 and DFmode arguments. Warn for mismatching ABI. */
5645 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5646 }
5647 }
5648
5649 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5650 But in the case of vector types, it is some vector mode.
5651
5652 When we have only some of our vector isa extensions enabled, then there
5653 are some modes for which vector_mode_supported_p is false. For these
5654 modes, the generic vector support in gcc will choose some non-vector mode
5655 in order to implement the type. By computing the natural mode, we'll
5656 select the proper ABI location for the operand and not depend on whatever
5657 the middle-end decides to do with these vector types.
5658
5659 The middle-end can't deal with vector types larger than 16 bytes. In
5660 this case, we return the original mode and warn about the ABI change if
5661 CUM isn't NULL. */
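/* For instance (hypothetical typedefs):

       typedef int   v4si __attribute__((vector_size (16)));
       typedef float v8sf __attribute__((vector_size (32)));

   a v4si argument has natural mode V4SImode even when SSE is disabled, so
   its ABI slot does not depend on the scalar mode the middle-end falls back
   to, while a v8sf argument without AVX enabled keeps its original mode and
   triggers the one-time ABI warning below.  */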
5662
5663 static enum machine_mode
5664 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5665 {
5666 enum machine_mode mode = TYPE_MODE (type);
5667
5668 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5669 {
5670 HOST_WIDE_INT size = int_size_in_bytes (type);
5671 if ((size == 8 || size == 16 || size == 32)
5672 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5673 && TYPE_VECTOR_SUBPARTS (type) > 1)
5674 {
5675 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5676
5677 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5678 mode = MIN_MODE_VECTOR_FLOAT;
5679 else
5680 mode = MIN_MODE_VECTOR_INT;
5681
5682 /* Get the mode which has this inner mode and number of units. */
5683 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5684 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5685 && GET_MODE_INNER (mode) == innermode)
5686 {
5687 if (size == 32 && !TARGET_AVX)
5688 {
5689 static bool warnedavx;
5690
5691 if (cum
5692 && !warnedavx
5693 && cum->warn_avx)
5694 {
5695 warnedavx = true;
5696 warning (0, "AVX vector argument without AVX "
5697 "enabled changes the ABI");
5698 }
5699 return TYPE_MODE (type);
5700 }
5701 else
5702 return mode;
5703 }
5704
5705 gcc_unreachable ();
5706 }
5707 }
5708
5709 return mode;
5710 }
5711
5712 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
5713 this may not agree with the mode that the type system has chosen for the
5714 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
5715 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
5716
5717 static rtx
5718 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5719 unsigned int regno)
5720 {
5721 rtx tmp;
5722
5723 if (orig_mode != BLKmode)
5724 tmp = gen_rtx_REG (orig_mode, regno);
5725 else
5726 {
5727 tmp = gen_rtx_REG (mode, regno);
5728 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5729 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5730 }
5731
5732 return tmp;
5733 }
5734
5735 /* x86-64 register passing implementation. See the x86-64 ABI for details.
5736 The goal of this code is to classify each eightbyte of an incoming argument
5737 by register class and assign registers accordingly. */
5738
5739 /* Return the union class of CLASS1 and CLASS2.
5740 See the x86-64 PS ABI for details. */
5741
5742 static enum x86_64_reg_class
5743 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5744 {
5745 /* Rule #1: If both classes are equal, this is the resulting class. */
5746 if (class1 == class2)
5747 return class1;
5748
5749 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5750 the other class. */
5751 if (class1 == X86_64_NO_CLASS)
5752 return class2;
5753 if (class2 == X86_64_NO_CLASS)
5754 return class1;
5755
5756 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
5757 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
5758 return X86_64_MEMORY_CLASS;
5759
5760 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
5761 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
5762 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
5763 return X86_64_INTEGERSI_CLASS;
5764 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
5765 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
5766 return X86_64_INTEGER_CLASS;
5767
5768 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
5769 MEMORY is used. */
5770 if (class1 == X86_64_X87_CLASS
5771 || class1 == X86_64_X87UP_CLASS
5772 || class1 == X86_64_COMPLEX_X87_CLASS
5773 || class2 == X86_64_X87_CLASS
5774 || class2 == X86_64_X87UP_CLASS
5775 || class2 == X86_64_COMPLEX_X87_CLASS)
5776 return X86_64_MEMORY_CLASS;
5777
5778 /* Rule #6: Otherwise class SSE is used. */
5779 return X86_64_SSE_CLASS;
5780 }
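/* A few concrete merges that follow directly from the rules above:

       merge_classes (X86_64_INTEGERSI_CLASS, X86_64_SSESF_CLASS)
         == X86_64_INTEGERSI_CLASS                         rule #4
       merge_classes (X86_64_SSE_CLASS, X86_64_X87_CLASS)
         == X86_64_MEMORY_CLASS                            rule #5
       merge_classes (X86_64_NO_CLASS, X86_64_SSEDF_CLASS)
         == X86_64_SSEDF_CLASS                             rule #2  */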
5781
5782 /* Classify the argument of type TYPE and mode MODE.
5783 CLASSES will be filled by the register class used to pass each word
5784 of the operand. The number of words is returned. In case the parameter
5785 should be passed in memory, 0 is returned. As a special case for zero
5786 sized containers, classes[0] will be NO_CLASS and 1 is returned.
5787
5788 BIT_OFFSET is used internally for handling records and specifies the
5789 offset in bits modulo 256 to avoid overflow cases.
5790
5791 See the x86-64 PS ABI for details.
5792 */
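/* As a worked example (a hypothetical C struct):

       struct s { double d; int i; };      16 bytes

   is classified into two eightbytes, classes[0] = X86_64_SSEDF_CLASS from
   the `double' member and classes[1] = X86_64_INTEGERSI_CLASS from the
   `int' member, so the struct travels in one SSE and one integer register.
   Replacing the `double' with a `long double' brings in the X87 classes,
   which are only acceptable in return values, and the struct is passed in
   memory instead.  */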
5793
5794 static int
5795 classify_argument (enum machine_mode mode, const_tree type,
5796 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
5797 {
5798 HOST_WIDE_INT bytes =
5799 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
5800 int words = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5801
5802 /* Variable sized entities are always passed/returned in memory. */
5803 if (bytes < 0)
5804 return 0;
5805
5806 if (mode != VOIDmode
5807 && targetm.calls.must_pass_in_stack (mode, type))
5808 return 0;
5809
5810 if (type && AGGREGATE_TYPE_P (type))
5811 {
5812 int i;
5813 tree field;
5814 enum x86_64_reg_class subclasses[MAX_CLASSES];
5815
5816 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
5817 if (bytes > 32)
5818 return 0;
5819
5820 for (i = 0; i < words; i++)
5821 classes[i] = X86_64_NO_CLASS;
5822
5823 /* Zero-sized arrays or structures are NO_CLASS. We return 0 to
5824 signal the memory class, so handle them as a special case. */
5825 if (!words)
5826 {
5827 classes[0] = X86_64_NO_CLASS;
5828 return 1;
5829 }
5830
5831 /* Classify each field of record and merge classes. */
5832 switch (TREE_CODE (type))
5833 {
5834 case RECORD_TYPE:
5835 /* And now merge the fields of structure. */
5836 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5837 {
5838 if (TREE_CODE (field) == FIELD_DECL)
5839 {
5840 int num;
5841
5842 if (TREE_TYPE (field) == error_mark_node)
5843 continue;
5844
5845 /* Bitfields are always classified as integer. Handle them
5846 early, since later code would consider them to be
5847 misaligned integers. */
5848 if (DECL_BIT_FIELD (field))
5849 {
5850 for (i = (int_bit_position (field) + (bit_offset % 64)) / 8 / 8;
5851 i < ((int_bit_position (field) + (bit_offset % 64))
5852 + tree_low_cst (DECL_SIZE (field), 0)
5853 + 63) / 8 / 8; i++)
5854 classes[i] =
5855 merge_classes (X86_64_INTEGER_CLASS,
5856 classes[i]);
5857 }
5858 else
5859 {
5860 int pos;
5861
5862 type = TREE_TYPE (field);
5863
5864 /* Flexible array member is ignored. */
5865 if (TYPE_MODE (type) == BLKmode
5866 && TREE_CODE (type) == ARRAY_TYPE
5867 && TYPE_SIZE (type) == NULL_TREE
5868 && TYPE_DOMAIN (type) != NULL_TREE
5869 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
5870 == NULL_TREE))
5871 {
5872 static bool warned;
5873
5874 if (!warned && warn_psabi)
5875 {
5876 warned = true;
5877 inform (input_location,
5878 "the ABI of passing struct with"
5879 " a flexible array member has"
5880 " changed in GCC 4.4");
5881 }
5882 continue;
5883 }
5884 num = classify_argument (TYPE_MODE (type), type,
5885 subclasses,
5886 (int_bit_position (field)
5887 + bit_offset) % 256);
5888 if (!num)
5889 return 0;
5890 pos = (int_bit_position (field) + (bit_offset % 64)) / 8 / 8;
5891 for (i = 0; i < num && (i + pos) < words; i++)
5892 classes[i + pos] =
5893 merge_classes (subclasses[i], classes[i + pos]);
5894 }
5895 }
5896 }
5897 break;
5898
5899 case ARRAY_TYPE:
5900 /* Arrays are handled as small records. */
5901 {
5902 int num;
5903 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
5904 TREE_TYPE (type), subclasses, bit_offset);
5905 if (!num)
5906 return 0;
5907
5908 /* The partial classes are now full classes. */
5909 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
5910 subclasses[0] = X86_64_SSE_CLASS;
5911 if (subclasses[0] == X86_64_INTEGERSI_CLASS
5912 && !((bit_offset % 64) == 0 && bytes == 4))
5913 subclasses[0] = X86_64_INTEGER_CLASS;
5914
5915 for (i = 0; i < words; i++)
5916 classes[i] = subclasses[i % num];
5917
5918 break;
5919 }
5920 case UNION_TYPE:
5921 case QUAL_UNION_TYPE:
5922 /* Unions are similar to RECORD_TYPE but offset is always 0.
5923 */
5924 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5925 {
5926 if (TREE_CODE (field) == FIELD_DECL)
5927 {
5928 int num;
5929
5930 if (TREE_TYPE (field) == error_mark_node)
5931 continue;
5932
5933 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
5934 TREE_TYPE (field), subclasses,
5935 bit_offset);
5936 if (!num)
5937 return 0;
5938 for (i = 0; i < num; i++)
5939 classes[i] = merge_classes (subclasses[i], classes[i]);
5940 }
5941 }
5942 break;
5943
5944 default:
5945 gcc_unreachable ();
5946 }
5947
5948 if (words > 2)
5949 {
5950 /* When the size exceeds 16 bytes, if the first class isn't
5951 X86_64_SSE_CLASS or any of the others isn't
5952 X86_64_SSEUP_CLASS, everything should be passed in
5953 memory. */
5954 if (classes[0] != X86_64_SSE_CLASS)
5955 return 0;
5956
5957 for (i = 1; i < words; i++)
5958 if (classes[i] != X86_64_SSEUP_CLASS)
5959 return 0;
5960 }
5961
5962 /* Final merger cleanup. */
5963 for (i = 0; i < words; i++)
5964 {
5965 /* If one class is MEMORY, everything should be passed in
5966 memory. */
5967 if (classes[i] == X86_64_MEMORY_CLASS)
5968 return 0;
5969
5970 /* X86_64_SSEUP_CLASS should always be preceded by
5971 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
5972 if (classes[i] == X86_64_SSEUP_CLASS
5973 && classes[i - 1] != X86_64_SSE_CLASS
5974 && classes[i - 1] != X86_64_SSEUP_CLASS)
5975 {
5976 /* The first one should never be X86_64_SSEUP_CLASS. */
5977 gcc_assert (i != 0);
5978 classes[i] = X86_64_SSE_CLASS;
5979 }
5980
5981 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
5982 everything should be passed in memory. */
5983 if (classes[i] == X86_64_X87UP_CLASS
5984 && (classes[i - 1] != X86_64_X87_CLASS))
5985 {
5986 static bool warned;
5987
5988 /* The first one should never be X86_64_X87UP_CLASS. */
5989 gcc_assert (i != 0);
5990 if (!warned && warn_psabi)
5991 {
5992 warned = true;
5993 inform (input_location,
5994 "the ABI of passing union with long double"
5995 " has changed in GCC 4.4");
5996 }
5997 return 0;
5998 }
5999 }
6000 return words;
6001 }
6002
6003 /* Compute the alignment needed. We align all types to their natural
6004 boundaries, with the exception of XFmode, which is aligned to 64 bits. */
6005 if (mode != VOIDmode && mode != BLKmode)
6006 {
6007 int mode_alignment = GET_MODE_BITSIZE (mode);
6008
6009 if (mode == XFmode)
6010 mode_alignment = 128;
6011 else if (mode == XCmode)
6012 mode_alignment = 256;
6013 if (COMPLEX_MODE_P (mode))
6014 mode_alignment /= 2;
6015 /* Misaligned fields are always returned in memory. */
6016 if (bit_offset % mode_alignment)
6017 return 0;
6018 }
6019
6020 /* For V1xx modes, just use the base mode. */
6021 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6022 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6023 mode = GET_MODE_INNER (mode);
6024
6025 /* Classification of atomic types. */
6026 switch (mode)
6027 {
6028 case SDmode:
6029 case DDmode:
6030 classes[0] = X86_64_SSE_CLASS;
6031 return 1;
6032 case TDmode:
6033 classes[0] = X86_64_SSE_CLASS;
6034 classes[1] = X86_64_SSEUP_CLASS;
6035 return 2;
6036 case DImode:
6037 case SImode:
6038 case HImode:
6039 case QImode:
6040 case CSImode:
6041 case CHImode:
6042 case CQImode:
6043 {
6044 int size = (bit_offset % 64)+ (int) GET_MODE_BITSIZE (mode);
6045
6046 if (size <= 32)
6047 {
6048 classes[0] = X86_64_INTEGERSI_CLASS;
6049 return 1;
6050 }
6051 else if (size <= 64)
6052 {
6053 classes[0] = X86_64_INTEGER_CLASS;
6054 return 1;
6055 }
6056 else if (size <= 64+32)
6057 {
6058 classes[0] = X86_64_INTEGER_CLASS;
6059 classes[1] = X86_64_INTEGERSI_CLASS;
6060 return 2;
6061 }
6062 else if (size <= 64+64)
6063 {
6064 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6065 return 2;
6066 }
6067 else
6068 gcc_unreachable ();
6069 }
6070 case CDImode:
6071 case TImode:
6072 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6073 return 2;
6074 case COImode:
6075 case OImode:
6076 /* OImode shouldn't be used directly. */
6077 gcc_unreachable ();
6078 case CTImode:
6079 return 0;
6080 case SFmode:
6081 if (!(bit_offset % 64))
6082 classes[0] = X86_64_SSESF_CLASS;
6083 else
6084 classes[0] = X86_64_SSE_CLASS;
6085 return 1;
6086 case DFmode:
6087 classes[0] = X86_64_SSEDF_CLASS;
6088 return 1;
6089 case XFmode:
6090 classes[0] = X86_64_X87_CLASS;
6091 classes[1] = X86_64_X87UP_CLASS;
6092 return 2;
6093 case TFmode:
6094 classes[0] = X86_64_SSE_CLASS;
6095 classes[1] = X86_64_SSEUP_CLASS;
6096 return 2;
6097 case SCmode:
6098 classes[0] = X86_64_SSE_CLASS;
6099 if (!(bit_offset % 64))
6100 return 1;
6101 else
6102 {
6103 static bool warned;
6104
6105 if (!warned && warn_psabi)
6106 {
6107 warned = true;
6108 inform (input_location,
6109 "the ABI of passing structure with complex float"
6110 " member has changed in GCC 4.4");
6111 }
6112 classes[1] = X86_64_SSESF_CLASS;
6113 return 2;
6114 }
6115 case DCmode:
6116 classes[0] = X86_64_SSEDF_CLASS;
6117 classes[1] = X86_64_SSEDF_CLASS;
6118 return 2;
6119 case XCmode:
6120 classes[0] = X86_64_COMPLEX_X87_CLASS;
6121 return 1;
6122 case TCmode:
6123 /* This mode is larger than 16 bytes. */
6124 return 0;
6125 case V8SFmode:
6126 case V8SImode:
6127 case V32QImode:
6128 case V16HImode:
6129 case V4DFmode:
6130 case V4DImode:
6131 classes[0] = X86_64_SSE_CLASS;
6132 classes[1] = X86_64_SSEUP_CLASS;
6133 classes[2] = X86_64_SSEUP_CLASS;
6134 classes[3] = X86_64_SSEUP_CLASS;
6135 return 4;
6136 case V4SFmode:
6137 case V4SImode:
6138 case V16QImode:
6139 case V8HImode:
6140 case V2DFmode:
6141 case V2DImode:
6142 classes[0] = X86_64_SSE_CLASS;
6143 classes[1] = X86_64_SSEUP_CLASS;
6144 return 2;
6145 case V1TImode:
6146 case V1DImode:
6147 case V2SFmode:
6148 case V2SImode:
6149 case V4HImode:
6150 case V8QImode:
6151 classes[0] = X86_64_SSE_CLASS;
6152 return 1;
6153 case BLKmode:
6154 case VOIDmode:
6155 return 0;
6156 default:
6157 gcc_assert (VECTOR_MODE_P (mode));
6158
6159 if (bytes > 16)
6160 return 0;
6161
6162 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6163
6164 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6165 classes[0] = X86_64_INTEGERSI_CLASS;
6166 else
6167 classes[0] = X86_64_INTEGER_CLASS;
6168 classes[1] = X86_64_INTEGER_CLASS;
6169 return 1 + (bytes > 8);
6170 }
6171 }
6172
6173 /* Examine the argument and set the number of registers required in each
6174 class. Return 0 iff the parameter should be passed in memory. */
6175 static int
6176 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6177 int *int_nregs, int *sse_nregs)
6178 {
6179 enum x86_64_reg_class regclass[MAX_CLASSES];
6180 int n = classify_argument (mode, type, regclass, 0);
6181
6182 *int_nregs = 0;
6183 *sse_nregs = 0;
6184 if (!n)
6185 return 0;
6186 for (n--; n >= 0; n--)
6187 switch (regclass[n])
6188 {
6189 case X86_64_INTEGER_CLASS:
6190 case X86_64_INTEGERSI_CLASS:
6191 (*int_nregs)++;
6192 break;
6193 case X86_64_SSE_CLASS:
6194 case X86_64_SSESF_CLASS:
6195 case X86_64_SSEDF_CLASS:
6196 (*sse_nregs)++;
6197 break;
6198 case X86_64_NO_CLASS:
6199 case X86_64_SSEUP_CLASS:
6200 break;
6201 case X86_64_X87_CLASS:
6202 case X86_64_X87UP_CLASS:
6203 if (!in_return)
6204 return 0;
6205 break;
6206 case X86_64_COMPLEX_X87_CLASS:
6207 return in_return ? 2 : 0;
6208 case X86_64_MEMORY_CLASS:
6209 gcc_unreachable ();
6210 }
6211 return 1;
6212 }
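/* For example, a TImode argument (__int128) yields *INT_NREGS == 2 and
   *SSE_NREGS == 0; a TFmode argument yields *INT_NREGS == 0 and
   *SSE_NREGS == 1 (SSE + SSEUP); an XFmode `long double' is X87/X87UP and
   is therefore rejected here as an argument (result 0) but accepted when
   IN_RETURN is set.  */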
6213
6214 /* Construct container for the argument used by GCC interface. See
6215 FUNCTION_ARG for the detailed description. */
6216
6217 static rtx
6218 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6219 const_tree type, int in_return, int nintregs, int nsseregs,
6220 const int *intreg, int sse_regno)
6221 {
6222 /* The following variables hold the static issued_error state. */
6223 static bool issued_sse_arg_error;
6224 static bool issued_sse_ret_error;
6225 static bool issued_x87_ret_error;
6226
6227 enum machine_mode tmpmode;
6228 int bytes =
6229 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6230 enum x86_64_reg_class regclass[MAX_CLASSES];
6231 int n;
6232 int i;
6233 int nexps = 0;
6234 int needed_sseregs, needed_intregs;
6235 rtx exp[MAX_CLASSES];
6236 rtx ret;
6237
6238 n = classify_argument (mode, type, regclass, 0);
6239 if (!n)
6240 return NULL;
6241 if (!examine_argument (mode, type, in_return, &needed_intregs,
6242 &needed_sseregs))
6243 return NULL;
6244 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6245 return NULL;
6246
6247 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6248 some less clueful developer tries to use floating-point anyway. */
6249 if (needed_sseregs && !TARGET_SSE)
6250 {
6251 if (in_return)
6252 {
6253 if (!issued_sse_ret_error)
6254 {
6255 error ("SSE register return with SSE disabled");
6256 issued_sse_ret_error = true;
6257 }
6258 }
6259 else if (!issued_sse_arg_error)
6260 {
6261 error ("SSE register argument with SSE disabled");
6262 issued_sse_arg_error = true;
6263 }
6264 return NULL;
6265 }
6266
6267 /* Likewise, error if the ABI requires us to return values in the
6268 x87 registers and the user specified -mno-80387. */
6269 if (!TARGET_80387 && in_return)
6270 for (i = 0; i < n; i++)
6271 if (regclass[i] == X86_64_X87_CLASS
6272 || regclass[i] == X86_64_X87UP_CLASS
6273 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6274 {
6275 if (!issued_x87_ret_error)
6276 {
6277 error ("x87 register return with x87 disabled");
6278 issued_x87_ret_error = true;
6279 }
6280 return NULL;
6281 }
6282
6283 /* First construct the simple cases. Avoid SCmode, since we want to use
6284 a single register to pass this type. */
6285 if (n == 1 && mode != SCmode)
6286 switch (regclass[0])
6287 {
6288 case X86_64_INTEGER_CLASS:
6289 case X86_64_INTEGERSI_CLASS:
6290 return gen_rtx_REG (mode, intreg[0]);
6291 case X86_64_SSE_CLASS:
6292 case X86_64_SSESF_CLASS:
6293 case X86_64_SSEDF_CLASS:
6294 if (mode != BLKmode)
6295 return gen_reg_or_parallel (mode, orig_mode,
6296 SSE_REGNO (sse_regno));
6297 break;
6298 case X86_64_X87_CLASS:
6299 case X86_64_COMPLEX_X87_CLASS:
6300 return gen_rtx_REG (mode, FIRST_STACK_REG);
6301 case X86_64_NO_CLASS:
6302 /* Zero sized array, struct or class. */
6303 return NULL;
6304 default:
6305 gcc_unreachable ();
6306 }
6307 if (n == 2 && regclass[0] == X86_64_SSE_CLASS
6308 && regclass[1] == X86_64_SSEUP_CLASS && mode != BLKmode)
6309 return gen_rtx_REG (mode, SSE_REGNO (sse_regno));
6310 if (n == 4
6311 && regclass[0] == X86_64_SSE_CLASS
6312 && regclass[1] == X86_64_SSEUP_CLASS
6313 && regclass[2] == X86_64_SSEUP_CLASS
6314 && regclass[3] == X86_64_SSEUP_CLASS
6315 && mode != BLKmode)
6316 return gen_rtx_REG (mode, SSE_REGNO (sse_regno));
6317
6318 if (n == 2
6319 && regclass[0] == X86_64_X87_CLASS && regclass[1] == X86_64_X87UP_CLASS)
6320 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6321 if (n == 2 && regclass[0] == X86_64_INTEGER_CLASS
6322 && regclass[1] == X86_64_INTEGER_CLASS
6323 && (mode == CDImode || mode == TImode || mode == TFmode)
6324 && intreg[0] + 1 == intreg[1])
6325 return gen_rtx_REG (mode, intreg[0]);
6326
6327 /* Otherwise figure out the entries of the PARALLEL. */
6328 for (i = 0; i < n; i++)
6329 {
6330 int pos;
6331
6332 switch (regclass[i])
6333 {
6334 case X86_64_NO_CLASS:
6335 break;
6336 case X86_64_INTEGER_CLASS:
6337 case X86_64_INTEGERSI_CLASS:
6338 /* Merge TImodes on aligned occasions here too. */
6339 if (i * 8 + 8 > bytes)
6340 tmpmode = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6341 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6342 tmpmode = SImode;
6343 else
6344 tmpmode = DImode;
6345 /* We've requested 24 bytes for which we have no mode. Use DImode. */
6346 if (tmpmode == BLKmode)
6347 tmpmode = DImode;
6348 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6349 gen_rtx_REG (tmpmode, *intreg),
6350 GEN_INT (i*8));
6351 intreg++;
6352 break;
6353 case X86_64_SSESF_CLASS:
6354 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6355 gen_rtx_REG (SFmode,
6356 SSE_REGNO (sse_regno)),
6357 GEN_INT (i*8));
6358 sse_regno++;
6359 break;
6360 case X86_64_SSEDF_CLASS:
6361 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6362 gen_rtx_REG (DFmode,
6363 SSE_REGNO (sse_regno)),
6364 GEN_INT (i*8));
6365 sse_regno++;
6366 break;
6367 case X86_64_SSE_CLASS:
6368 pos = i;
6369 switch (n)
6370 {
6371 case 1:
6372 tmpmode = DImode;
6373 break;
6374 case 2:
6375 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6376 {
6377 tmpmode = TImode;
6378 i++;
6379 }
6380 else
6381 tmpmode = DImode;
6382 break;
6383 case 4:
6384 gcc_assert (i == 0
6385 && regclass[1] == X86_64_SSEUP_CLASS
6386 && regclass[2] == X86_64_SSEUP_CLASS
6387 && regclass[3] == X86_64_SSEUP_CLASS);
6388 tmpmode = OImode;
6389 i += 3;
6390 break;
6391 default:
6392 gcc_unreachable ();
6393 }
6394 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6395 gen_rtx_REG (tmpmode,
6396 SSE_REGNO (sse_regno)),
6397 GEN_INT (pos*8));
6398 sse_regno++;
6399 break;
6400 default:
6401 gcc_unreachable ();
6402 }
6403 }
6404
6405 /* Empty aligned struct, union or class. */
6406 if (nexps == 0)
6407 return NULL;
6408
6409 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6410 for (i = 0; i < nexps; i++)
6411 XVECEXP (ret, 0, i) = exp [i];
6412 return ret;
6413 }
6414
6415 /* Update the data in CUM to advance over an argument of mode MODE
6416 and data type TYPE. (TYPE is null for libcalls where that information
6417 may not be available.) */
6418
6419 static void
6420 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6421 const_tree type, HOST_WIDE_INT bytes,
6422 HOST_WIDE_INT words)
6423 {
6424 switch (mode)
6425 {
6426 default:
6427 break;
6428
6429 case BLKmode:
6430 if (bytes < 0)
6431 break;
6432 /* FALLTHRU */
6433
6434 case DImode:
6435 case SImode:
6436 case HImode:
6437 case QImode:
6438 cum->words += words;
6439 cum->nregs -= words;
6440 cum->regno += words;
6441
6442 if (cum->nregs <= 0)
6443 {
6444 cum->nregs = 0;
6445 cum->regno = 0;
6446 }
6447 break;
6448
6449 case OImode:
6450 /* OImode shouldn't be used directly. */
6451 gcc_unreachable ();
6452
6453 case DFmode:
6454 if (cum->float_in_sse < 2)
6455 break;
6456 case SFmode:
6457 if (cum->float_in_sse < 1)
6458 break;
6459 /* FALLTHRU */
6460
6461 case V8SFmode:
6462 case V8SImode:
6463 case V32QImode:
6464 case V16HImode:
6465 case V4DFmode:
6466 case V4DImode:
6467 case TImode:
6468 case V16QImode:
6469 case V8HImode:
6470 case V4SImode:
6471 case V2DImode:
6472 case V4SFmode:
6473 case V2DFmode:
6474 if (!type || !AGGREGATE_TYPE_P (type))
6475 {
6476 cum->sse_words += words;
6477 cum->sse_nregs -= 1;
6478 cum->sse_regno += 1;
6479 if (cum->sse_nregs <= 0)
6480 {
6481 cum->sse_nregs = 0;
6482 cum->sse_regno = 0;
6483 }
6484 }
6485 break;
6486
6487 case V8QImode:
6488 case V4HImode:
6489 case V2SImode:
6490 case V2SFmode:
6491 case V1TImode:
6492 case V1DImode:
6493 if (!type || !AGGREGATE_TYPE_P (type))
6494 {
6495 cum->mmx_words += words;
6496 cum->mmx_nregs -= 1;
6497 cum->mmx_regno += 1;
6498 if (cum->mmx_nregs <= 0)
6499 {
6500 cum->mmx_nregs = 0;
6501 cum->mmx_regno = 0;
6502 }
6503 }
6504 break;
6505 }
6506 }
6507
6508 static void
6509 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6510 const_tree type, HOST_WIDE_INT words, bool named)
6511 {
6512 int int_nregs, sse_nregs;
6513
6514 /* Unnamed 256bit vector mode parameters are passed on the stack. */
6515 if (!named && VALID_AVX256_REG_MODE (mode))
6516 return;
6517
6518 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6519 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6520 {
6521 cum->nregs -= int_nregs;
6522 cum->sse_nregs -= sse_nregs;
6523 cum->regno += int_nregs;
6524 cum->sse_regno += sse_nregs;
6525 }
6526 else
6527 {
6528 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6529 cum->words = (cum->words + align - 1) & ~(align - 1);
6530 cum->words += words;
6531 }
6532 }
6533
6534 static void
6535 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6536 HOST_WIDE_INT words)
6537 {
6538 /* Otherwise, this should be passed indirectly. */
6539 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6540
6541 cum->words += words;
6542 if (cum->nregs > 0)
6543 {
6544 cum->nregs -= 1;
6545 cum->regno += 1;
6546 }
6547 }
6548
6549 /* Update the data in CUM to advance over an argument of mode MODE and
6550 data type TYPE. (TYPE is null for libcalls where that information
6551 may not be available.) */
6552
6553 static void
6554 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6555 const_tree type, bool named)
6556 {
6557 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6558 HOST_WIDE_INT bytes, words;
6559
6560 if (mode == BLKmode)
6561 bytes = int_size_in_bytes (type);
6562 else
6563 bytes = GET_MODE_SIZE (mode);
6564 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6565
6566 if (type)
6567 mode = type_natural_mode (type, NULL);
6568
6569 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6570 function_arg_advance_ms_64 (cum, bytes, words);
6571 else if (TARGET_64BIT)
6572 function_arg_advance_64 (cum, mode, type, words, named);
6573 else
6574 function_arg_advance_32 (cum, mode, type, bytes, words);
6575 }
6576
6577 /* Define where to put the arguments to a function.
6578 Value is zero to push the argument on the stack,
6579 or a hard register in which to store the argument.
6580
6581 MODE is the argument's machine mode.
6582 TYPE is the data type of the argument (as a tree).
6583 This is null for libcalls where that information may
6584 not be available.
6585 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6586 the preceding args and about the function being called.
6587 NAMED is nonzero if this argument is a named parameter
6588 (otherwise it is an extra parameter matching an ellipsis). */
6589
6590 static rtx
6591 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6592 enum machine_mode orig_mode, const_tree type,
6593 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6594 {
6595 static bool warnedsse, warnedmmx;
6596
6597 /* Avoid the AL settings for the Unix64 ABI. */
6598 if (mode == VOIDmode)
6599 return constm1_rtx;
6600
6601 switch (mode)
6602 {
6603 default:
6604 break;
6605
6606 case BLKmode:
6607 if (bytes < 0)
6608 break;
6609 /* FALLTHRU */
6610 case DImode:
6611 case SImode:
6612 case HImode:
6613 case QImode:
6614 if (words <= cum->nregs)
6615 {
6616 int regno = cum->regno;
6617
6618 /* Fastcall allocates the first two DWORD (SImode) or
6619 smaller arguments in ECX and EDX if the argument isn't
6620 an aggregate type. */
6621 if (cum->fastcall)
6622 {
6623 if (mode == BLKmode
6624 || mode == DImode
6625 || (type && AGGREGATE_TYPE_P (type)))
6626 break;
6627
6628 /* ECX, not EAX, is the first allocated register. */
6629 if (regno == AX_REG)
6630 regno = CX_REG;
6631 }
6632 return gen_rtx_REG (mode, regno);
6633 }
6634 break;
6635
6636 case DFmode:
6637 if (cum->float_in_sse < 2)
6638 break;
6639 case SFmode:
6640 if (cum->float_in_sse < 1)
6641 break;
6642 /* FALLTHRU */
6643 case TImode:
6644 /* In 32bit, we pass TImode in xmm registers. */
6645 case V16QImode:
6646 case V8HImode:
6647 case V4SImode:
6648 case V2DImode:
6649 case V4SFmode:
6650 case V2DFmode:
6651 if (!type || !AGGREGATE_TYPE_P (type))
6652 {
6653 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6654 {
6655 warnedsse = true;
6656 warning (0, "SSE vector argument without SSE enabled "
6657 "changes the ABI");
6658 }
6659 if (cum->sse_nregs)
6660 return gen_reg_or_parallel (mode, orig_mode,
6661 cum->sse_regno + FIRST_SSE_REG);
6662 }
6663 break;
6664
6665 case OImode:
6666 /* OImode shouldn't be used directly. */
6667 gcc_unreachable ();
6668
6669 case V8SFmode:
6670 case V8SImode:
6671 case V32QImode:
6672 case V16HImode:
6673 case V4DFmode:
6674 case V4DImode:
6675 if (!type || !AGGREGATE_TYPE_P (type))
6676 {
6677 if (cum->sse_nregs)
6678 return gen_reg_or_parallel (mode, orig_mode,
6679 cum->sse_regno + FIRST_SSE_REG);
6680 }
6681 break;
6682
6683 case V8QImode:
6684 case V4HImode:
6685 case V2SImode:
6686 case V2SFmode:
6687 case V1TImode:
6688 case V1DImode:
6689 if (!type || !AGGREGATE_TYPE_P (type))
6690 {
6691 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
6692 {
6693 warnedmmx = true;
6694 warning (0, "MMX vector argument without MMX enabled "
6695 "changes the ABI");
6696 }
6697 if (cum->mmx_nregs)
6698 return gen_reg_or_parallel (mode, orig_mode,
6699 cum->mmx_regno + FIRST_MMX_REG);
6700 }
6701 break;
6702 }
6703
6704 return NULL_RTX;
6705 }
6706
6707 static rtx
6708 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6709 enum machine_mode orig_mode, const_tree type, bool named)
6710 {
6711 /* Handle a hidden AL argument containing the number of SSE registers
6712 used for varargs x86-64 functions. */
6713 if (mode == VOIDmode)
6714 return GEN_INT (cum->maybe_vaarg
6715 ? (cum->sse_nregs < 0
6716 ? X86_64_SSE_REGPARM_MAX
6717 : cum->sse_regno)
6718 : -1);
6719
6720 switch (mode)
6721 {
6722 default:
6723 break;
6724
6725 case V8SFmode:
6726 case V8SImode:
6727 case V32QImode:
6728 case V16HImode:
6729 case V4DFmode:
6730 case V4DImode:
6731 /* Unnamed 256bit vector mode parameters are passed on the stack. */
6732 if (!named)
6733 return NULL;
6734 break;
6735 }
6736
6737 return construct_container (mode, orig_mode, type, 0, cum->nregs,
6738 cum->sse_nregs,
6739 &x86_64_int_parameter_registers [cum->regno],
6740 cum->sse_regno);
6741 }
6742
6743 static rtx
6744 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6745 enum machine_mode orig_mode, bool named,
6746 HOST_WIDE_INT bytes)
6747 {
6748 unsigned int regno;
6749
6750 /* We need to add a clobber for MS_ABI -> SYSV ABI calls in expand_call.
6751 We use the value -2 to specify that the current function call is MS_ABI. */
6752 if (mode == VOIDmode)
6753 return GEN_INT (-2);
6754
6755 /* If we've run out of registers, it goes on the stack. */
6756 if (cum->nregs == 0)
6757 return NULL_RTX;
6758
6759 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
6760
6761 /* Only floating point modes are passed in anything but integer regs. */
6762 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
6763 {
6764 if (named)
6765 regno = cum->regno + FIRST_SSE_REG;
6766 else
6767 {
6768 rtx t1, t2;
6769
6770 /* Unnamed floating parameters are passed in both the
6771 SSE and integer registers. */
6772 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
6773 t2 = gen_rtx_REG (mode, regno);
6774 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
6775 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
6776 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
6777 }
6778 }
6779 /* Handle aggregate types passed in registers. */
6780 if (orig_mode == BLKmode)
6781 {
6782 if (bytes > 0 && bytes <= 8)
6783 mode = (bytes > 4 ? DImode : SImode);
6784 if (mode == BLKmode)
6785 mode = DImode;
6786 }
6787
6788 return gen_reg_or_parallel (mode, orig_mode, regno);
6789 }
6790
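/* Illustrative sketch (based on the Windows x64 convention; the struct
   below is hypothetical):

	struct pair { short lo, hi; };		4 bytes, BLKmode

   passed by value to an MS-ABI function is remapped above to SImode
   (DImode for 5..8 byte aggregates) so that it travels whole in the
   next integer parameter register.  */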
6791 /* Return where to put the arguments to a function.
6792 Return zero to push the argument on the stack, or a hard register in which to store the argument.
6793
6794 MODE is the argument's machine mode. TYPE is the data type of the
6795 argument. It is null for libcalls where that information may not be
6796 available. CUM gives information about the preceding args and about
6797 the function being called. NAMED is nonzero if this argument is a
6798 named parameter (otherwise it is an extra parameter matching an
6799 ellipsis). */
6800
6801 static rtx
6802 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
6803 const_tree type, bool named)
6804 {
6805 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6806 enum machine_mode mode = omode;
6807 HOST_WIDE_INT bytes, words;
6808 rtx arg;
6809
6810 if (mode == BLKmode)
6811 bytes = int_size_in_bytes (type);
6812 else
6813 bytes = GET_MODE_SIZE (mode);
6814 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6815
6816 /* To simplify the code below, represent vector types with a vector mode
6817 even if MMX/SSE are not active. */
6818 if (type && TREE_CODE (type) == VECTOR_TYPE)
6819 mode = type_natural_mode (type, cum);
6820
6821 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6822 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
6823 else if (TARGET_64BIT)
6824 arg = function_arg_64 (cum, mode, omode, type, named);
6825 else
6826 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
6827
6828 if (TARGET_VZEROUPPER && function_pass_avx256_p (arg))
6829 {
6830 /* This argument uses 256bit AVX modes. */
6831 if (cum->caller)
6832 cfun->machine->callee_pass_avx256_p = true;
6833 else
6834 cfun->machine->caller_pass_avx256_p = true;
6835 }
6836
6837 return arg;
6838 }
6839
6840 /* A C expression that indicates when an argument must be passed by
6841 reference. If nonzero for an argument, a copy of that argument is
6842 made in memory and a pointer to the argument is passed instead of
6843 the argument itself. The pointer is passed in whatever way is
6844 appropriate for passing a pointer to that type. */
6845
6846 static bool
6847 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
6848 enum machine_mode mode ATTRIBUTE_UNUSED,
6849 const_tree type, bool named ATTRIBUTE_UNUSED)
6850 {
6851 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6852
6853 /* See Windows x64 Software Convention. */
6854 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6855 {
6856 int msize = (int) GET_MODE_SIZE (mode);
6857 if (type)
6858 {
6859 /* Arrays are passed by reference. */
6860 if (TREE_CODE (type) == ARRAY_TYPE)
6861 return true;
6862
6863 if (AGGREGATE_TYPE_P (type))
6864 {
6865 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
6866 are passed by reference. */
6867 msize = int_size_in_bytes (type);
6868 }
6869 }
6870
6871 /* __m128 is passed by reference. */
6872 switch (msize) {
6873 case 1: case 2: case 4: case 8:
6874 break;
6875 default:
6876 return true;
6877 }
6878 }
6879 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
6880 return 1;
6881
6882 return 0;
6883 }
6884
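/* Examples of the rules above (illustrative, using hypothetical types):
   under the MS ABI

	struct s3 { char c[3]; };	size 3  -> by reference
	__m128 v;			size 16 -> by reference
	struct s8 { double d; };	size 8  -> by value in a register

   while for the SysV 64-bit ABI only types whose size cannot be
   determined (int_size_in_bytes == -1) take the reference path here.  */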
6885 /* Return true when TYPE should be 128bit aligned for 32bit argument
6886 passing ABI. XXX: This function is obsolete and is only used for
6887 checking psABI compatibility with previous versions of GCC. */
6888
6889 static bool
6890 ix86_compat_aligned_value_p (const_tree type)
6891 {
6892 enum machine_mode mode = TYPE_MODE (type);
6893 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
6894 || mode == TDmode
6895 || mode == TFmode
6896 || mode == TCmode)
6897 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
6898 return true;
6899 if (TYPE_ALIGN (type) < 128)
6900 return false;
6901
6902 if (AGGREGATE_TYPE_P (type))
6903 {
6904 /* Walk the aggregates recursively. */
6905 switch (TREE_CODE (type))
6906 {
6907 case RECORD_TYPE:
6908 case UNION_TYPE:
6909 case QUAL_UNION_TYPE:
6910 {
6911 tree field;
6912
6913 /* Walk all the structure fields. */
6914 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6915 {
6916 if (TREE_CODE (field) == FIELD_DECL
6917 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
6918 return true;
6919 }
6920 break;
6921 }
6922
6923 case ARRAY_TYPE:
6924 /* Just for use if some languages pass arrays by value. */
6925 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
6926 return true;
6927 break;
6928
6929 default:
6930 gcc_unreachable ();
6931 }
6932 }
6933 return false;
6934 }
6935
6936 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
6937 XXX: This function is obsolete and is only used for checking psABI
6938 compatibility with previous versions of GCC. */
6939
6940 static unsigned int
6941 ix86_compat_function_arg_boundary (enum machine_mode mode,
6942 const_tree type, unsigned int align)
6943 {
6944 /* In 32bit, only _Decimal128 and __float128 are aligned to their
6945 natural boundaries. */
6946 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
6947 {
6948 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
6949 make an exception for SSE modes since these require 128bit
6950 alignment.
6951
6952 The handling here differs from field_alignment. ICC aligns MMX
6953 arguments to 4 byte boundaries, while structure fields are aligned
6954 to 8 byte boundaries. */
6955 if (!type)
6956 {
6957 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
6958 align = PARM_BOUNDARY;
6959 }
6960 else
6961 {
6962 if (!ix86_compat_aligned_value_p (type))
6963 align = PARM_BOUNDARY;
6964 }
6965 }
6966 if (align > BIGGEST_ALIGNMENT)
6967 align = BIGGEST_ALIGNMENT;
6968 return align;
6969 }
6970
6971 /* Return true when TYPE should be 128bit aligned for 32bit argument
6972 passing ABI. */
6973
6974 static bool
6975 ix86_contains_aligned_value_p (const_tree type)
6976 {
6977 enum machine_mode mode = TYPE_MODE (type);
6978
6979 if (mode == XFmode || mode == XCmode)
6980 return false;
6981
6982 if (TYPE_ALIGN (type) < 128)
6983 return false;
6984
6985 if (AGGREGATE_TYPE_P (type))
6986 {
6987 /* Walk the aggregates recursively. */
6988 switch (TREE_CODE (type))
6989 {
6990 case RECORD_TYPE:
6991 case UNION_TYPE:
6992 case QUAL_UNION_TYPE:
6993 {
6994 tree field;
6995
6996 /* Walk all the structure fields. */
6997 for (field = TYPE_FIELDS (type);
6998 field;
6999 field = DECL_CHAIN (field))
7000 {
7001 if (TREE_CODE (field) == FIELD_DECL
7002 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7003 return true;
7004 }
7005 break;
7006 }
7007
7008 case ARRAY_TYPE:
7009 /* Just for use if some languages pass arrays by value. */
7010 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7011 return true;
7012 break;
7013
7014 default:
7015 gcc_unreachable ();
7016 }
7017 }
7018 else
7019 return TYPE_ALIGN (type) >= 128;
7020
7021 return false;
7022 }
7023
7024 /* Gives the alignment boundary, in bits, of an argument with the
7025 specified mode and type. */
7026
7027 static unsigned int
7028 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7029 {
7030 unsigned int align;
7031 if (type)
7032 {
7033 /* Since the main variant type is what is used for the call, convert
7034 TYPE to its main variant. */
7035 type = TYPE_MAIN_VARIANT (type);
7036 align = TYPE_ALIGN (type);
7037 }
7038 else
7039 align = GET_MODE_ALIGNMENT (mode);
7040 if (align < PARM_BOUNDARY)
7041 align = PARM_BOUNDARY;
7042 else
7043 {
7044 static bool warned;
7045 unsigned int saved_align = align;
7046
7047 if (!TARGET_64BIT)
7048 {
7049 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7050 if (!type)
7051 {
7052 if (mode == XFmode || mode == XCmode)
7053 align = PARM_BOUNDARY;
7054 }
7055 else if (!ix86_contains_aligned_value_p (type))
7056 align = PARM_BOUNDARY;
7057
7058 if (align < 128)
7059 align = PARM_BOUNDARY;
7060 }
7061
7062 if (warn_psabi
7063 && !warned
7064 && align != ix86_compat_function_arg_boundary (mode, type,
7065 saved_align))
7066 {
7067 warned = true;
7068 inform (input_location,
7069 "The ABI for passing parameters with %d-byte"
7070 " alignment has changed in GCC 4.6",
7071 align / BITS_PER_UNIT);
7072 }
7073 }
7074
7075 return align;
7076 }
7077
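/* Illustrative results of the boundary computation above: with -m32 a
   plain double argument is passed on a PARM_BOUNDARY (32-bit) boundary,
   whereas an __m128 argument keeps its 128-bit alignment; when the
   result differs from the pre-GCC-4.6 rule, the one-time psABI note
   above is emitted.  */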
7078 /* Return true if N is a possible register number of function value. */
7079
7080 static bool
7081 ix86_function_value_regno_p (const unsigned int regno)
7082 {
7083 switch (regno)
7084 {
7085 case AX_REG:
7086 return true;
7087
7088 case FIRST_FLOAT_REG:
7089 /* TODO: The function should depend on current function ABI but
7090 builtins.c would need updating then. Therefore we use the
7091 default ABI. */
7092 if (TARGET_64BIT && ix86_abi == MS_ABI)
7093 return false;
7094 return TARGET_FLOAT_RETURNS_IN_80387;
7095
7096 case FIRST_SSE_REG:
7097 return TARGET_SSE;
7098
7099 case FIRST_MMX_REG:
7100 if (TARGET_MACHO || TARGET_64BIT)
7101 return false;
7102 return TARGET_MMX;
7103 }
7104
7105 return false;
7106 }
7107
7108 /* Define how to find the value returned by a function.
7109 VALTYPE is the data type of the value (as a tree).
7110 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7111 otherwise, FUNC is 0. */
7112
7113 static rtx
7114 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7115 const_tree fntype, const_tree fn)
7116 {
7117 unsigned int regno;
7118
7119 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7120 we normally prevent this case when mmx is not available. However
7121 some ABIs may require the result to be returned like DImode. */
7122 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7123 regno = FIRST_MMX_REG;
7124
7125 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7126 we prevent this case when sse is not available. However some ABIs
7127 may require the result to be returned like integer TImode. */
7128 else if (mode == TImode
7129 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7130 regno = FIRST_SSE_REG;
7131
7132 /* 32-byte vector modes in %ymm0. */
7133 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7134 regno = FIRST_SSE_REG;
7135
7136 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7137 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7138 regno = FIRST_FLOAT_REG;
7139 else
7140 /* Most things go in %eax. */
7141 regno = AX_REG;
7142
7143 /* Override FP return register with %xmm0 for local functions when
7144 SSE math is enabled or for functions with sseregparm attribute. */
7145 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7146 {
7147 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7148 if ((sse_level >= 1 && mode == SFmode)
7149 || (sse_level == 2 && mode == DFmode))
7150 regno = FIRST_SSE_REG;
7151 }
7152
7153 /* OImode shouldn't be used directly. */
7154 gcc_assert (mode != OImode);
7155
7156 return gen_rtx_REG (orig_mode, regno);
7157 }
7158
7159 static rtx
7160 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7161 const_tree valtype)
7162 {
7163 rtx ret;
7164
7165 /* Handle libcalls, which don't provide a type node. */
7166 if (valtype == NULL)
7167 {
7168 unsigned int regno;
7169
7170 switch (mode)
7171 {
7172 case SFmode:
7173 case SCmode:
7174 case DFmode:
7175 case DCmode:
7176 case TFmode:
7177 case SDmode:
7178 case DDmode:
7179 case TDmode:
7180 regno = FIRST_SSE_REG;
7181 break;
7182 case XFmode:
7183 case XCmode:
7184 regno = FIRST_FLOAT_REG;
7185 break;
7186 case TCmode:
7187 return NULL;
7188 default:
7189 regno = AX_REG;
7190 }
7191
7192 return gen_rtx_REG (mode, regno);
7193 }
7194 else if (POINTER_TYPE_P (valtype))
7195 {
7196 /* Pointers are always returned in Pmode. */
7197 mode = Pmode;
7198 }
7199
7200 ret = construct_container (mode, orig_mode, valtype, 1,
7201 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7202 x86_64_int_return_registers, 0);
7203
7204 /* For zero-sized structures, construct_container returns NULL, but we
7205 need to keep the rest of the compiler happy by returning a meaningful value. */
7206 if (!ret)
7207 ret = gen_rtx_REG (orig_mode, AX_REG);
7208
7209 return ret;
7210 }
7211
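/* Example return-value assignments made above (illustration only, per
   the SysV x86-64 ABI): a libcall returning DFmode gets %xmm0 and one
   returning XFmode gets %st(0); a small aggregate such as

	struct { long l; double d; };

   comes back split across %rax and %xmm0 via construct_container.  */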
7212 static rtx
7213 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode)
7214 {
7215 unsigned int regno = AX_REG;
7216
7217 if (TARGET_SSE)
7218 {
7219 switch (GET_MODE_SIZE (mode))
7220 {
7221 case 16:
7222 if((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7223 && !COMPLEX_MODE_P (mode))
7224 regno = FIRST_SSE_REG;
7225 break;
7226 case 8:
7227 case 4:
7228 if (mode == SFmode || mode == DFmode)
7229 regno = FIRST_SSE_REG;
7230 break;
7231 default:
7232 break;
7233 }
7234 }
7235 return gen_rtx_REG (orig_mode, regno);
7236 }
7237
7238 static rtx
7239 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7240 enum machine_mode orig_mode, enum machine_mode mode)
7241 {
7242 const_tree fn, fntype;
7243
7244 fn = NULL_TREE;
7245 if (fntype_or_decl && DECL_P (fntype_or_decl))
7246 fn = fntype_or_decl;
7247 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7248
7249 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7250 return function_value_ms_64 (orig_mode, mode);
7251 else if (TARGET_64BIT)
7252 return function_value_64 (orig_mode, mode, valtype);
7253 else
7254 return function_value_32 (orig_mode, mode, fntype, fn);
7255 }
7256
7257 static rtx
7258 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7259 bool outgoing ATTRIBUTE_UNUSED)
7260 {
7261 enum machine_mode mode, orig_mode;
7262
7263 orig_mode = TYPE_MODE (valtype);
7264 mode = type_natural_mode (valtype, NULL);
7265 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7266 }
7267
7268 /* Pointer function arguments and return values are promoted to Pmode. */
7269
7270 static enum machine_mode
7271 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7272 int *punsignedp, const_tree fntype,
7273 int for_return)
7274 {
7275 if (type != NULL_TREE && POINTER_TYPE_P (type))
7276 {
7277 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7278 return Pmode;
7279 }
7280 return default_promote_function_mode (type, mode, punsignedp, fntype,
7281 for_return);
7282 }
7283
7284 rtx
7285 ix86_libcall_value (enum machine_mode mode)
7286 {
7287 return ix86_function_value_1 (NULL, NULL, mode, mode);
7288 }
7289
7290 /* Return true iff type is returned in memory. */
7291
7292 static bool ATTRIBUTE_UNUSED
7293 return_in_memory_32 (const_tree type, enum machine_mode mode)
7294 {
7295 HOST_WIDE_INT size;
7296
7297 if (mode == BLKmode)
7298 return true;
7299
7300 size = int_size_in_bytes (type);
7301
7302 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7303 return false;
7304
7305 if (VECTOR_MODE_P (mode) || mode == TImode)
7306 {
7307 /* User-created vectors small enough to fit in EAX. */
7308 if (size < 8)
7309 return false;
7310
7311 /* MMX/3dNow values are returned in MM0,
7312 except when it doesn't exist or the ABI prescribes otherwise. */
7313 if (size == 8)
7314 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7315
7316 /* SSE values are returned in XMM0, except when it doesn't exist. */
7317 if (size == 16)
7318 return !TARGET_SSE;
7319
7320 /* AVX values are returned in YMM0, except when it doesn't exist. */
7321 if (size == 32)
7322 return !TARGET_AVX;
7323 }
7324
7325 if (mode == XFmode)
7326 return false;
7327
7328 if (size > 12)
7329 return true;
7330
7331 /* OImode shouldn't be used directly. */
7332 gcc_assert (mode != OImode);
7333
7334 return false;
7335 }
7336
7337 static bool ATTRIBUTE_UNUSED
7338 return_in_memory_64 (const_tree type, enum machine_mode mode)
7339 {
7340 int needed_intregs, needed_sseregs;
7341 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7342 }
7343
7344 static bool ATTRIBUTE_UNUSED
7345 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7346 {
7347 HOST_WIDE_INT size = int_size_in_bytes (type);
7348
7349 /* __m128 is returned in xmm0. */
7350 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7351 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7352 return false;
7353
7354 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
7355 return size != 1 && size != 2 && size != 4 && size != 8;
7356 }
7357
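/* Illustration of the MS-ABI return rules above (hypothetical types):

	__m128				returned in %xmm0
	struct { double d; }		size 8  -> in %rax
	struct { char c[3]; }		size 3  -> in memory

   i.e. only sizes 1, 2, 4 or 8, or 16-byte vector-like values, come
   back in registers.  */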
7358 static bool
7359 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7360 {
7361 #ifdef SUBTARGET_RETURN_IN_MEMORY
7362 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7363 #else
7364 const enum machine_mode mode = type_natural_mode (type, NULL);
7365
7366 if (TARGET_64BIT)
7367 {
7368 if (ix86_function_type_abi (fntype) == MS_ABI)
7369 return return_in_memory_ms_64 (type, mode);
7370 else
7371 return return_in_memory_64 (type, mode);
7372 }
7373 else
7374 return return_in_memory_32 (type, mode);
7375 #endif
7376 }
7377
7378 /* When returning SSE vector types, we have a choice of either
7379 (1) being abi incompatible with a -march switch, or
7380 (2) generating an error.
7381 Given no good solution, I think the safest thing is one warning.
7382 The user won't be able to use -Werror, but....
7383
7384 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7385 called in response to actually generating a caller or callee that
7386 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7387 via aggregate_value_p for general type probing from tree-ssa. */
7388
7389 static rtx
7390 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7391 {
7392 static bool warnedsse, warnedmmx;
7393
7394 if (!TARGET_64BIT && type)
7395 {
7396 /* Look at the return type of the function, not the function type. */
7397 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7398
7399 if (!TARGET_SSE && !warnedsse)
7400 {
7401 if (mode == TImode
7402 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7403 {
7404 warnedsse = true;
7405 warning (0, "SSE vector return without SSE enabled "
7406 "changes the ABI");
7407 }
7408 }
7409
7410 if (!TARGET_MMX && !warnedmmx)
7411 {
7412 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7413 {
7414 warnedmmx = true;
7415 warning (0, "MMX vector return without MMX enabled "
7416 "changes the ABI");
7417 }
7418 }
7419 }
7420
7421 return NULL;
7422 }
7423
7424 \f
7425 /* Create the va_list data type. */
7426
7427 /* Returns the calling convention specific va_list data type.
7428 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7429
7430 static tree
7431 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7432 {
7433 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7434
7435 /* For i386 we use plain pointer to argument area. */
7436 if (!TARGET_64BIT || abi == MS_ABI)
7437 return build_pointer_type (char_type_node);
7438
7439 record = lang_hooks.types.make_type (RECORD_TYPE);
7440 type_decl = build_decl (BUILTINS_LOCATION,
7441 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7442
7443 f_gpr = build_decl (BUILTINS_LOCATION,
7444 FIELD_DECL, get_identifier ("gp_offset"),
7445 unsigned_type_node);
7446 f_fpr = build_decl (BUILTINS_LOCATION,
7447 FIELD_DECL, get_identifier ("fp_offset"),
7448 unsigned_type_node);
7449 f_ovf = build_decl (BUILTINS_LOCATION,
7450 FIELD_DECL, get_identifier ("overflow_arg_area"),
7451 ptr_type_node);
7452 f_sav = build_decl (BUILTINS_LOCATION,
7453 FIELD_DECL, get_identifier ("reg_save_area"),
7454 ptr_type_node);
7455
7456 va_list_gpr_counter_field = f_gpr;
7457 va_list_fpr_counter_field = f_fpr;
7458
7459 DECL_FIELD_CONTEXT (f_gpr) = record;
7460 DECL_FIELD_CONTEXT (f_fpr) = record;
7461 DECL_FIELD_CONTEXT (f_ovf) = record;
7462 DECL_FIELD_CONTEXT (f_sav) = record;
7463
7464 TYPE_STUB_DECL (record) = type_decl;
7465 TYPE_NAME (record) = type_decl;
7466 TYPE_FIELDS (record) = f_gpr;
7467 DECL_CHAIN (f_gpr) = f_fpr;
7468 DECL_CHAIN (f_fpr) = f_ovf;
7469 DECL_CHAIN (f_ovf) = f_sav;
7470
7471 layout_type (record);
7472
7473 /* The correct type is an array type of one element. */
7474 return build_array_type (record, build_index_type (size_zero_node));
7475 }
7476
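/* For reference, the record built above corresponds to the psABI's

	typedef struct __va_list_tag {
	  unsigned int gp_offset;
	  unsigned int fp_offset;
	  void *overflow_arg_area;
	  void *reg_save_area;
	} va_list[1];

   (an illustrative restatement, not additional source code).  */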
7477 /* Set up the builtin va_list data type and, for 64-bit, the additional
7478 calling convention specific va_list data types. */
7479
7480 static tree
7481 ix86_build_builtin_va_list (void)
7482 {
7483 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7484
7485 /* Initialize abi specific va_list builtin types. */
7486 if (TARGET_64BIT)
7487 {
7488 tree t;
7489 if (ix86_abi == MS_ABI)
7490 {
7491 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7492 if (TREE_CODE (t) != RECORD_TYPE)
7493 t = build_variant_type_copy (t);
7494 sysv_va_list_type_node = t;
7495 }
7496 else
7497 {
7498 t = ret;
7499 if (TREE_CODE (t) != RECORD_TYPE)
7500 t = build_variant_type_copy (t);
7501 sysv_va_list_type_node = t;
7502 }
7503 if (ix86_abi != MS_ABI)
7504 {
7505 t = ix86_build_builtin_va_list_abi (MS_ABI);
7506 if (TREE_CODE (t) != RECORD_TYPE)
7507 t = build_variant_type_copy (t);
7508 ms_va_list_type_node = t;
7509 }
7510 else
7511 {
7512 t = ret;
7513 if (TREE_CODE (t) != RECORD_TYPE)
7514 t = build_variant_type_copy (t);
7515 ms_va_list_type_node = t;
7516 }
7517 }
7518
7519 return ret;
7520 }
7521
7522 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7523
7524 static void
7525 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7526 {
7527 rtx save_area, mem;
7528 alias_set_type set;
7529 int i, max;
7530
7531 /* GPR size of varargs save area. */
7532 if (cfun->va_list_gpr_size)
7533 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7534 else
7535 ix86_varargs_gpr_size = 0;
7536
7537 /* FPR size of varargs save area. We don't need it if we don't pass
7538 anything in SSE registers. */
7539 if (TARGET_SSE && cfun->va_list_fpr_size)
7540 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7541 else
7542 ix86_varargs_fpr_size = 0;
7543
7544 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7545 return;
7546
7547 save_area = frame_pointer_rtx;
7548 set = get_varargs_alias_set ();
7549
7550 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7551 if (max > X86_64_REGPARM_MAX)
7552 max = X86_64_REGPARM_MAX;
7553
7554 for (i = cum->regno; i < max; i++)
7555 {
7556 mem = gen_rtx_MEM (Pmode,
7557 plus_constant (save_area, i * UNITS_PER_WORD));
7558 MEM_NOTRAP_P (mem) = 1;
7559 set_mem_alias_set (mem, set);
7560 emit_move_insn (mem, gen_rtx_REG (Pmode,
7561 x86_64_int_parameter_registers[i]));
7562 }
7563
7564 if (ix86_varargs_fpr_size)
7565 {
7566 enum machine_mode smode;
7567 rtx label, test;
7568
7569 /* Now emit code to save SSE registers. The AX parameter contains the number
7570 of SSE parameter registers used to call this function, though all we
7571 actually check here is the zero/non-zero status. */
7572
7573 label = gen_label_rtx ();
7574 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7575 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7576 label));
7577
7578 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7579 we used movdqa (i.e. TImode) instead? Perhaps even better would
7580 be if we could determine the real mode of the data, via a hook
7581 into pass_stdarg. Ignore all that for now. */
7582 smode = V4SFmode;
7583 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7584 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7585
7586 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7587 if (max > X86_64_SSE_REGPARM_MAX)
7588 max = X86_64_SSE_REGPARM_MAX;
7589
7590 for (i = cum->sse_regno; i < max; ++i)
7591 {
7592 mem = plus_constant (save_area, i * 16 + ix86_varargs_gpr_size);
7593 mem = gen_rtx_MEM (smode, mem);
7594 MEM_NOTRAP_P (mem) = 1;
7595 set_mem_alias_set (mem, set);
7596 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7597
7598 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7599 }
7600
7601 emit_label (label);
7602 }
7603 }
7604
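/* Illustration of the register save area laid out above, assuming all
   six GP and all eight SSE argument registers must be spilled:

	save area +   0 ..  47 : rdi, rsi, rdx, rcx, r8, r9   (8 bytes each)
	save area +  48 .. 175 : xmm0 .. xmm7                  (16 bytes each)

   gp_offset and fp_offset, initialized in ix86_va_start, index into
   this block.  */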
7605 static void
7606 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7607 {
7608 alias_set_type set = get_varargs_alias_set ();
7609 int i;
7610
7611 /* Reset to zero, as a SysV va_arg may have been used
7612 before. */
7613 ix86_varargs_gpr_size = 0;
7614 ix86_varargs_fpr_size = 0;
7615
7616 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7617 {
7618 rtx reg, mem;
7619
7620 mem = gen_rtx_MEM (Pmode,
7621 plus_constant (virtual_incoming_args_rtx,
7622 i * UNITS_PER_WORD));
7623 MEM_NOTRAP_P (mem) = 1;
7624 set_mem_alias_set (mem, set);
7625
7626 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7627 emit_move_insn (mem, reg);
7628 }
7629 }
7630
7631 static void
7632 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7633 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7634 int no_rtl)
7635 {
7636 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7637 CUMULATIVE_ARGS next_cum;
7638 tree fntype;
7639
7640 /* This argument doesn't appear to be used anymore. Which is good,
7641 because the old code here didn't suppress rtl generation. */
7642 gcc_assert (!no_rtl);
7643
7644 if (!TARGET_64BIT)
7645 return;
7646
7647 fntype = TREE_TYPE (current_function_decl);
7648
7649 /* For varargs, we do not want to skip the dummy va_dcl argument.
7650 For stdargs, we do want to skip the last named argument. */
7651 next_cum = *cum;
7652 if (stdarg_p (fntype))
7653 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7654 true);
7655
7656 if (cum->call_abi == MS_ABI)
7657 setup_incoming_varargs_ms_64 (&next_cum);
7658 else
7659 setup_incoming_varargs_64 (&next_cum);
7660 }
7661
7662 /* Checks whether TYPE is a char * style va_list. */
7663
7664 static bool
7665 is_va_list_char_pointer (tree type)
7666 {
7667 tree canonic;
7668
7669 /* For 32-bit it is always true. */
7670 if (!TARGET_64BIT)
7671 return true;
7672 canonic = ix86_canonical_va_list_type (type);
7673 return (canonic == ms_va_list_type_node
7674 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7675 }
7676
7677 /* Implement va_start. */
7678
7679 static void
7680 ix86_va_start (tree valist, rtx nextarg)
7681 {
7682 HOST_WIDE_INT words, n_gpr, n_fpr;
7683 tree f_gpr, f_fpr, f_ovf, f_sav;
7684 tree gpr, fpr, ovf, sav, t;
7685 tree type;
7686 rtx ovf_rtx;
7687
7688 if (flag_split_stack
7689 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7690 {
7691 unsigned int scratch_regno;
7692
7693 /* When we are splitting the stack, we can't refer to the stack
7694 arguments using internal_arg_pointer, because they may be on
7695 the old stack. The split stack prologue will arrange to
7696 leave a pointer to the old stack arguments in a scratch
7697 register, which we here copy to a pseudo-register. The split
7698 stack prologue can't set the pseudo-register directly because
7699 it (the prologue) runs before any registers have been saved. */
7700
7701 scratch_regno = split_stack_prologue_scratch_regno ();
7702 if (scratch_regno != INVALID_REGNUM)
7703 {
7704 rtx reg, seq;
7705
7706 reg = gen_reg_rtx (Pmode);
7707 cfun->machine->split_stack_varargs_pointer = reg;
7708
7709 start_sequence ();
7710 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
7711 seq = get_insns ();
7712 end_sequence ();
7713
7714 push_topmost_sequence ();
7715 emit_insn_after (seq, entry_of_function ());
7716 pop_topmost_sequence ();
7717 }
7718 }
7719
7720 /* Only 64bit target needs something special. */
7721 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7722 {
7723 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7724 std_expand_builtin_va_start (valist, nextarg);
7725 else
7726 {
7727 rtx va_r, next;
7728
7729 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
7730 next = expand_binop (ptr_mode, add_optab,
7731 cfun->machine->split_stack_varargs_pointer,
7732 crtl->args.arg_offset_rtx,
7733 NULL_RTX, 0, OPTAB_LIB_WIDEN);
7734 convert_move (va_r, next, 0);
7735 }
7736 return;
7737 }
7738
7739 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7740 f_fpr = DECL_CHAIN (f_gpr);
7741 f_ovf = DECL_CHAIN (f_fpr);
7742 f_sav = DECL_CHAIN (f_ovf);
7743
7744 valist = build_simple_mem_ref (valist);
7745 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
7746 /* The following should be folded into the MEM_REF offset. */
7747 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
7748 f_gpr, NULL_TREE);
7749 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
7750 f_fpr, NULL_TREE);
7751 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
7752 f_ovf, NULL_TREE);
7753 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
7754 f_sav, NULL_TREE);
7755
7756 /* Count number of gp and fp argument registers used. */
7757 words = crtl->args.info.words;
7758 n_gpr = crtl->args.info.regno;
7759 n_fpr = crtl->args.info.sse_regno;
7760
7761 if (cfun->va_list_gpr_size)
7762 {
7763 type = TREE_TYPE (gpr);
7764 t = build2 (MODIFY_EXPR, type,
7765 gpr, build_int_cst (type, n_gpr * 8));
7766 TREE_SIDE_EFFECTS (t) = 1;
7767 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7768 }
7769
7770 if (TARGET_SSE && cfun->va_list_fpr_size)
7771 {
7772 type = TREE_TYPE (fpr);
7773 t = build2 (MODIFY_EXPR, type, fpr,
7774 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
7775 TREE_SIDE_EFFECTS (t) = 1;
7776 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7777 }
7778
7779 /* Find the overflow area. */
7780 type = TREE_TYPE (ovf);
7781 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7782 ovf_rtx = crtl->args.internal_arg_pointer;
7783 else
7784 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
7785 t = make_tree (type, ovf_rtx);
7786 if (words != 0)
7787 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
7788 t = build2 (MODIFY_EXPR, type, ovf, t);
7789 TREE_SIDE_EFFECTS (t) = 1;
7790 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7791
7792 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
7793 {
7794 /* Find the register save area.
7795 The function prologue saves it right above the stack frame. */
7796 type = TREE_TYPE (sav);
7797 t = make_tree (type, frame_pointer_rtx);
7798 if (!ix86_varargs_gpr_size)
7799 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
7800 t = build2 (MODIFY_EXPR, type, sav, t);
7801 TREE_SIDE_EFFECTS (t) = 1;
7802 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7803 }
7804 }
7805
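/* Worked example of the va_start bookkeeping above (illustrative): for

	int f (int n, ...)

   one GP register is consumed by the named argument, so gp_offset is
   initialized to 8, fp_offset to 8 * X86_64_REGPARM_MAX = 48, the
   overflow area points at the incoming stack arguments, and
   reg_save_area at the block saved by setup_incoming_varargs_64.  */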
7806 /* Implement va_arg. */
7807
7808 static tree
7809 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
7810 gimple_seq *post_p)
7811 {
7812 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
7813 tree f_gpr, f_fpr, f_ovf, f_sav;
7814 tree gpr, fpr, ovf, sav, t;
7815 int size, rsize;
7816 tree lab_false, lab_over = NULL_TREE;
7817 tree addr, t2;
7818 rtx container;
7819 int indirect_p = 0;
7820 tree ptrtype;
7821 enum machine_mode nat_mode;
7822 unsigned int arg_boundary;
7823
7824 /* Only 64bit target needs something special. */
7825 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7826 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
7827
7828 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7829 f_fpr = DECL_CHAIN (f_gpr);
7830 f_ovf = DECL_CHAIN (f_fpr);
7831 f_sav = DECL_CHAIN (f_ovf);
7832
7833 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
7834 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
7835 valist = build_va_arg_indirect_ref (valist);
7836 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
7837 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
7838 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
7839
7840 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7841 if (indirect_p)
7842 type = build_pointer_type (type);
7843 size = int_size_in_bytes (type);
7844 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7845
7846 nat_mode = type_natural_mode (type, NULL);
7847 switch (nat_mode)
7848 {
7849 case V8SFmode:
7850 case V8SImode:
7851 case V32QImode:
7852 case V16HImode:
7853 case V4DFmode:
7854 case V4DImode:
7855 /* Unnamed 256bit vector mode parameters are passed on stack. */
7856 if (!TARGET_64BIT_MS_ABI)
7857 {
7858 container = NULL;
7859 break;
7860 }
7861
7862 default:
7863 container = construct_container (nat_mode, TYPE_MODE (type),
7864 type, 0, X86_64_REGPARM_MAX,
7865 X86_64_SSE_REGPARM_MAX, intreg,
7866 0);
7867 break;
7868 }
7869
7870 /* Pull the value out of the saved registers. */
7871
7872 addr = create_tmp_var (ptr_type_node, "addr");
7873
7874 if (container)
7875 {
7876 int needed_intregs, needed_sseregs;
7877 bool need_temp;
7878 tree int_addr, sse_addr;
7879
7880 lab_false = create_artificial_label (UNKNOWN_LOCATION);
7881 lab_over = create_artificial_label (UNKNOWN_LOCATION);
7882
7883 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
7884
7885 need_temp = (!REG_P (container)
7886 && ((needed_intregs && TYPE_ALIGN (type) > 64)
7887 || TYPE_ALIGN (type) > 128));
7888
7889 /* If we are passing a structure, verify that it is a consecutive block
7890 in the register save area. If not, we need to do moves. */
7891 if (!need_temp && !REG_P (container))
7892 {
7893 /* Verify that all registers are strictly consecutive.  */
7894 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
7895 {
7896 int i;
7897
7898 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
7899 {
7900 rtx slot = XVECEXP (container, 0, i);
7901 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
7902 || INTVAL (XEXP (slot, 1)) != i * 16)
7903 need_temp = 1;
7904 }
7905 }
7906 else
7907 {
7908 int i;
7909
7910 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
7911 {
7912 rtx slot = XVECEXP (container, 0, i);
7913 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
7914 || INTVAL (XEXP (slot, 1)) != i * 8)
7915 need_temp = 1;
7916 }
7917 }
7918 }
7919 if (!need_temp)
7920 {
7921 int_addr = addr;
7922 sse_addr = addr;
7923 }
7924 else
7925 {
7926 int_addr = create_tmp_var (ptr_type_node, "int_addr");
7927 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
7928 }
7929
7930 /* First ensure that we fit completely in registers. */
7931 if (needed_intregs)
7932 {
7933 t = build_int_cst (TREE_TYPE (gpr),
7934 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
7935 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
7936 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
7937 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
7938 gimplify_and_add (t, pre_p);
7939 }
7940 if (needed_sseregs)
7941 {
7942 t = build_int_cst (TREE_TYPE (fpr),
7943 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
7944 + X86_64_REGPARM_MAX * 8);
7945 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
7946 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
7947 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
7948 gimplify_and_add (t, pre_p);
7949 }
7950
7951 /* Compute index to start of area used for integer regs. */
7952 if (needed_intregs)
7953 {
7954 /* int_addr = gpr + sav; */
7955 t = fold_build_pointer_plus (sav, gpr);
7956 gimplify_assign (int_addr, t, pre_p);
7957 }
7958 if (needed_sseregs)
7959 {
7960 /* sse_addr = fpr + sav; */
7961 t = fold_build_pointer_plus (sav, fpr);
7962 gimplify_assign (sse_addr, t, pre_p);
7963 }
7964 if (need_temp)
7965 {
7966 int i, prev_size = 0;
7967 tree temp = create_tmp_var (type, "va_arg_tmp");
7968
7969 /* addr = &temp; */
7970 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
7971 gimplify_assign (addr, t, pre_p);
7972
7973 for (i = 0; i < XVECLEN (container, 0); i++)
7974 {
7975 rtx slot = XVECEXP (container, 0, i);
7976 rtx reg = XEXP (slot, 0);
7977 enum machine_mode mode = GET_MODE (reg);
7978 tree piece_type;
7979 tree addr_type;
7980 tree daddr_type;
7981 tree src_addr, src;
7982 int src_offset;
7983 tree dest_addr, dest;
7984 int cur_size = GET_MODE_SIZE (mode);
7985
7986 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
7987 prev_size = INTVAL (XEXP (slot, 1));
7988 if (prev_size + cur_size > size)
7989 {
7990 cur_size = size - prev_size;
7991 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
7992 if (mode == BLKmode)
7993 mode = QImode;
7994 }
7995 piece_type = lang_hooks.types.type_for_mode (mode, 1);
7996 if (mode == GET_MODE (reg))
7997 addr_type = build_pointer_type (piece_type);
7998 else
7999 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8000 true);
8001 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8002 true);
8003
8004 if (SSE_REGNO_P (REGNO (reg)))
8005 {
8006 src_addr = sse_addr;
8007 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8008 }
8009 else
8010 {
8011 src_addr = int_addr;
8012 src_offset = REGNO (reg) * 8;
8013 }
8014 src_addr = fold_convert (addr_type, src_addr);
8015 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8016
8017 dest_addr = fold_convert (daddr_type, addr);
8018 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8019 if (cur_size == GET_MODE_SIZE (mode))
8020 {
8021 src = build_va_arg_indirect_ref (src_addr);
8022 dest = build_va_arg_indirect_ref (dest_addr);
8023
8024 gimplify_assign (dest, src, pre_p);
8025 }
8026 else
8027 {
8028 tree copy
8029 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8030 3, dest_addr, src_addr,
8031 size_int (cur_size));
8032 gimplify_and_add (copy, pre_p);
8033 }
8034 prev_size += cur_size;
8035 }
8036 }
8037
8038 if (needed_intregs)
8039 {
8040 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8041 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8042 gimplify_assign (gpr, t, pre_p);
8043 }
8044
8045 if (needed_sseregs)
8046 {
8047 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8048 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8049 gimplify_assign (fpr, t, pre_p);
8050 }
8051
8052 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8053
8054 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8055 }
8056
8057 /* ... otherwise out of the overflow area. */
8058
8059 /* When we align a parameter on the stack for the caller, if its
8060 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will only be
8061 aligned to MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee here
8062 with the caller. */
8063 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8064 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8065 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8066
8067 /* Care for on-stack alignment if needed. */
8068 if (arg_boundary <= 64 || size == 0)
8069 t = ovf;
8070 else
8071 {
8072 HOST_WIDE_INT align = arg_boundary / 8;
8073 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8074 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8075 build_int_cst (TREE_TYPE (t), -align));
8076 }
8077
8078 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8079 gimplify_assign (addr, t, pre_p);
8080
8081 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8082 gimplify_assign (unshare_expr (ovf), t, pre_p);
8083
8084 if (container)
8085 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8086
8087 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8088 addr = fold_convert (ptrtype, addr);
8089
8090 if (indirect_p)
8091 addr = build_va_arg_indirect_ref (addr);
8092 return build_va_arg_indirect_ref (addr);
8093 }
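/* Example of the overflow-area handling at the end of the function
   above (illustration): for a type needing 16-byte alignment, align is
   16, so the gimplified sequence is effectively

	addr = (overflow_arg_area + 15) & -16;
	overflow_arg_area = addr + rsize * 8;

   i.e. the fetch address is rounded up and the overflow pointer is
   advanced past the word-rounded argument.  */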
8094 \f
8095 /* Return true if OPNUM's MEM should be matched
8096 in movabs* patterns. */
8097
8098 bool
8099 ix86_check_movabs (rtx insn, int opnum)
8100 {
8101 rtx set, mem;
8102
8103 set = PATTERN (insn);
8104 if (GET_CODE (set) == PARALLEL)
8105 set = XVECEXP (set, 0, 0);
8106 gcc_assert (GET_CODE (set) == SET);
8107 mem = XEXP (set, opnum);
8108 while (GET_CODE (mem) == SUBREG)
8109 mem = SUBREG_REG (mem);
8110 gcc_assert (MEM_P (mem));
8111 return volatile_ok || !MEM_VOLATILE_P (mem);
8112 }
8113 \f
8114 /* Initialize the table of extra 80387 mathematical constants. */
8115
8116 static void
8117 init_ext_80387_constants (void)
8118 {
8119 static const char * cst[5] =
8120 {
8121 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8122 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8123 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8124 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8125 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8126 };
8127 int i;
8128
8129 for (i = 0; i < 5; i++)
8130 {
8131 real_from_string (&ext_80387_constants_table[i], cst[i]);
8132 /* Ensure each constant is rounded to XFmode precision. */
8133 real_convert (&ext_80387_constants_table[i],
8134 XFmode, &ext_80387_constants_table[i]);
8135 }
8136
8137 ext_80387_constants_init = 1;
8138 }
8139
8140 /* Return non-zero if the constant is something that
8141 can be loaded with a special instruction. */
8142
8143 int
8144 standard_80387_constant_p (rtx x)
8145 {
8146 enum machine_mode mode = GET_MODE (x);
8147
8148 REAL_VALUE_TYPE r;
8149
8150 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8151 return -1;
8152
8153 if (x == CONST0_RTX (mode))
8154 return 1;
8155 if (x == CONST1_RTX (mode))
8156 return 2;
8157
8158 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8159
8160 /* For XFmode constants, try to find a special 80387 instruction when
8161 optimizing for size or on those CPUs that benefit from them. */
8162 if (mode == XFmode
8163 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8164 {
8165 int i;
8166
8167 if (! ext_80387_constants_init)
8168 init_ext_80387_constants ();
8169
8170 for (i = 0; i < 5; i++)
8171 if (real_identical (&r, &ext_80387_constants_table[i]))
8172 return i + 3;
8173 }
8174
8175 /* Load of the constant -0.0 or -1.0 will be split as
8176 fldz;fchs or fld1;fchs sequence. */
8177 if (real_isnegzero (&r))
8178 return 8;
8179 if (real_identical (&r, &dconstm1))
8180 return 9;
8181
8182 return 0;
8183 }
8184
8185 /* Return the opcode of the special instruction to be used to load
8186 the constant X. */
8187
8188 const char *
8189 standard_80387_constant_opcode (rtx x)
8190 {
8191 switch (standard_80387_constant_p (x))
8192 {
8193 case 1:
8194 return "fldz";
8195 case 2:
8196 return "fld1";
8197 case 3:
8198 return "fldlg2";
8199 case 4:
8200 return "fldln2";
8201 case 5:
8202 return "fldl2e";
8203 case 6:
8204 return "fldl2t";
8205 case 7:
8206 return "fldpi";
8207 case 8:
8208 case 9:
8209 return "#";
8210 default:
8211 gcc_unreachable ();
8212 }
8213 }
8214
8215 /* Return the CONST_DOUBLE representing the 80387 constant that is
8216 loaded by the specified special instruction. The argument IDX
8217 matches the return value from standard_80387_constant_p. */
8218
8219 rtx
8220 standard_80387_constant_rtx (int idx)
8221 {
8222 int i;
8223
8224 if (! ext_80387_constants_init)
8225 init_ext_80387_constants ();
8226
8227 switch (idx)
8228 {
8229 case 3:
8230 case 4:
8231 case 5:
8232 case 6:
8233 case 7:
8234 i = idx - 3;
8235 break;
8236
8237 default:
8238 gcc_unreachable ();
8239 }
8240
8241 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8242 XFmode);
8243 }
8244
8245 /* Return 1 if X is all 0s and 2 if X is all 1s
8246 in a supported SSE/AVX vector mode. */
8247
8248 int
8249 standard_sse_constant_p (rtx x)
8250 {
8251 enum machine_mode mode = GET_MODE (x);
8252
8253 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8254 return 1;
8255 if (vector_all_ones_operand (x, mode))
8256 switch (mode)
8257 {
8258 case V16QImode:
8259 case V8HImode:
8260 case V4SImode:
8261 case V2DImode:
8262 if (TARGET_SSE2)
8263 return 2;
8264 case V32QImode:
8265 case V16HImode:
8266 case V8SImode:
8267 case V4DImode:
8268 if (TARGET_AVX2)
8269 return 2;
8270 default:
8271 break;
8272 }
8273
8274 return 0;
8275 }
8276
8277 /* Return the opcode of the special instruction to be used to load
8278 the constant X. */
8279
8280 const char *
8281 standard_sse_constant_opcode (rtx insn, rtx x)
8282 {
8283 switch (standard_sse_constant_p (x))
8284 {
8285 case 1:
8286 switch (get_attr_mode (insn))
8287 {
8288 case MODE_TI:
8289 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8290 return "%vpxor\t%0, %d0";
8291 case MODE_V2DF:
8292 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8293 return "%vxorpd\t%0, %d0";
8294 case MODE_V4SF:
8295 return "%vxorps\t%0, %d0";
8296
8297 case MODE_OI:
8298 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8299 return "vpxor\t%x0, %x0, %x0";
8300 case MODE_V4DF:
8301 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8302 return "vxorpd\t%x0, %x0, %x0";
8303 case MODE_V8SF:
8304 return "vxorps\t%x0, %x0, %x0";
8305
8306 default:
8307 break;
8308 }
8309
8310 case 2:
8311 if (TARGET_AVX)
8312 return "vpcmpeqd\t%0, %0, %0";
8313 else
8314 return "pcmpeqd\t%0, %0";
8315
8316 default:
8317 break;
8318 }
8319 gcc_unreachable ();
8320 }
8321
8322 /* Returns true if OP contains a symbol reference.  */
8323
8324 bool
8325 symbolic_reference_mentioned_p (rtx op)
8326 {
8327 const char *fmt;
8328 int i;
8329
8330 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8331 return true;
8332
8333 fmt = GET_RTX_FORMAT (GET_CODE (op));
8334 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8335 {
8336 if (fmt[i] == 'E')
8337 {
8338 int j;
8339
8340 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8341 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8342 return true;
8343 }
8344
8345 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8346 return true;
8347 }
8348
8349 return false;
8350 }
8351
8352 /* Return true if it is appropriate to emit `ret' instructions in the
8353 body of a function. Do this only if the epilogue is simple, needing a
8354 couple of insns. Prior to reloading, we can't tell how many registers
8355 must be saved, so return false then. Return false if there is no frame
8356 marker to de-allocate. */
8357
8358 bool
8359 ix86_can_use_return_insn_p (void)
8360 {
8361 struct ix86_frame frame;
8362
8363 if (! reload_completed || frame_pointer_needed)
8364 return 0;
8365
8366 /* Don't allow more than 32k pop, since that's all we can do
8367 with one instruction. */
8368 if (crtl->args.pops_args && crtl->args.size >= 32768)
8369 return 0;
8370
8371 ix86_compute_frame_layout (&frame);
8372 return (frame.stack_pointer_offset == UNITS_PER_WORD
8373 && (frame.nregs + frame.nsseregs) == 0);
8374 }
8375 \f
8376 /* Value should be nonzero if functions must have frame pointers.
8377 Zero means the frame pointer need not be set up (and parms may
8378 be accessed via the stack pointer) in functions that seem suitable. */
8379
8380 static bool
8381 ix86_frame_pointer_required (void)
8382 {
8383 /* If we accessed previous frames, then the generated code expects
8384 to be able to access the saved ebp value in our frame. */
8385 if (cfun->machine->accesses_prev_frame)
8386 return true;
8387
8388 /* Several x86 OSes need a frame pointer for other reasons,
8389 usually pertaining to setjmp. */
8390 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8391 return true;
8392
8393 /* For older 32-bit runtimes setjmp requires a valid frame pointer. */
8394 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8395 return true;
8396
8397 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8398 turns off the frame pointer by default. Turn it back on now if
8399 we've not got a leaf function. */
8400 if (TARGET_OMIT_LEAF_FRAME_POINTER
8401 && (!current_function_is_leaf
8402 || ix86_current_function_calls_tls_descriptor))
8403 return true;
8404
8405 if (crtl->profile && !flag_fentry)
8406 return true;
8407
8408 return false;
8409 }
8410
8411 /* Record that the current function accesses previous call frames. */
8412
8413 void
8414 ix86_setup_frame_addresses (void)
8415 {
8416 cfun->machine->accesses_prev_frame = 1;
8417 }
8418 \f
8419 #ifndef USE_HIDDEN_LINKONCE
8420 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8421 # define USE_HIDDEN_LINKONCE 1
8422 # else
8423 # define USE_HIDDEN_LINKONCE 0
8424 # endif
8425 #endif
8426
8427 static int pic_labels_used;
8428
8429 /* Fills in the label name that should be used for a pc thunk for
8430 the given register. */
8431
8432 static void
8433 get_pc_thunk_name (char name[32], unsigned int regno)
8434 {
8435 gcc_assert (!TARGET_64BIT);
8436
8437 if (USE_HIDDEN_LINKONCE)
8438 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8439 else
8440 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8441 }
8442
8443
8444 /* This function generates the pc thunks used for -fpic; each thunk
8445 loads its register with the return address of the caller and then returns. */
8446
8447 static void
8448 ix86_code_end (void)
8449 {
8450 rtx xops[2];
8451 int regno;
8452
8453 for (regno = AX_REG; regno <= SP_REG; regno++)
8454 {
8455 char name[32];
8456 tree decl;
8457
8458 if (!(pic_labels_used & (1 << regno)))
8459 continue;
8460
8461 get_pc_thunk_name (name, regno);
8462
8463 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8464 get_identifier (name),
8465 build_function_type_list (void_type_node, NULL_TREE));
8466 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8467 NULL_TREE, void_type_node);
8468 TREE_PUBLIC (decl) = 1;
8469 TREE_STATIC (decl) = 1;
8470
8471 #if TARGET_MACHO
8472 if (TARGET_MACHO)
8473 {
8474 switch_to_section (darwin_sections[text_coal_section]);
8475 fputs ("\t.weak_definition\t", asm_out_file);
8476 assemble_name (asm_out_file, name);
8477 fputs ("\n\t.private_extern\t", asm_out_file);
8478 assemble_name (asm_out_file, name);
8479 putc ('\n', asm_out_file);
8480 ASM_OUTPUT_LABEL (asm_out_file, name);
8481 DECL_WEAK (decl) = 1;
8482 }
8483 else
8484 #endif
8485 if (USE_HIDDEN_LINKONCE)
8486 {
8487 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8488
8489 targetm.asm_out.unique_section (decl, 0);
8490 switch_to_section (get_named_section (decl, NULL, 0));
8491
8492 targetm.asm_out.globalize_label (asm_out_file, name);
8493 fputs ("\t.hidden\t", asm_out_file);
8494 assemble_name (asm_out_file, name);
8495 putc ('\n', asm_out_file);
8496 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8497 }
8498 else
8499 {
8500 switch_to_section (text_section);
8501 ASM_OUTPUT_LABEL (asm_out_file, name);
8502 }
8503
8504 DECL_INITIAL (decl) = make_node (BLOCK);
8505 current_function_decl = decl;
8506 init_function_start (decl);
8507 first_function_block_is_cold = false;
8508 /* Make sure unwind info is emitted for the thunk if needed. */
8509 final_start_function (emit_barrier (), asm_out_file, 1);
8510
8511 /* Pad stack IP move with 4 instructions (two NOPs count
8512 as one instruction). */
8513 if (TARGET_PAD_SHORT_FUNCTION)
8514 {
8515 int i = 8;
8516
8517 while (i--)
8518 fputs ("\tnop\n", asm_out_file);
8519 }
8520
8521 xops[0] = gen_rtx_REG (Pmode, regno);
8522 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8523 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8524 fputs ("\tret\n", asm_out_file);
8525 final_end_function ();
8526 init_insn_lengths ();
8527 free_after_compilation (cfun);
8528 set_cfun (NULL);
8529 current_function_decl = NULL;
8530 }
8531
8532 if (flag_split_stack)
8533 file_end_indicate_split_stack ();
8534 }
8535
8536 /* Emit code for the SET_GOT patterns. */
8537
8538 const char *
8539 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8540 {
8541 rtx xops[3];
8542
8543 xops[0] = dest;
8544
8545 if (TARGET_VXWORKS_RTP && flag_pic)
8546 {
8547 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8548 xops[2] = gen_rtx_MEM (Pmode,
8549 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8550 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8551
8552 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8553 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8554 an unadorned address. */
8555 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8556 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8557 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8558 return "";
8559 }
8560
8561 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8562
8563 if (!flag_pic)
8564 {
8565 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8566
8567 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8568
8569 #if TARGET_MACHO
8570 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8571 is what will be referenced by the Mach-O PIC subsystem. */
8572 if (!label)
8573 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8574 #endif
8575
8576 targetm.asm_out.internal_label (asm_out_file, "L",
8577 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8578 }
8579 else
8580 {
8581 char name[32];
8582 get_pc_thunk_name (name, REGNO (dest));
8583 pic_labels_used |= 1 << REGNO (dest);
8584
8585 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8586 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8587 output_asm_insn ("call\t%X2", xops);
8588 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8589 is what will be referenced by the Mach-O PIC subsystem. */
8590 #if TARGET_MACHO
8591 if (!label)
8592 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8593 else
8594 targetm.asm_out.internal_label (asm_out_file, "L",
8595 CODE_LABEL_NUMBER (label));
8596 #endif
8597 }
8598
8599 if (!TARGET_MACHO)
8600 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8601
8602 return "";
8603 }
8604
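/* Illustration of the non-VxWorks -fpic sequence emitted above, shown
   here for %ebx in AT&T syntax:

	call	__x86.get_pc_thunk.bx
	addl	$_GLOBAL_OFFSET_TABLE_, %ebx

   where the thunk generated by ix86_code_end is simply

	movl	(%esp), %ebx
	ret
   */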
8605 /* Generate a "push" pattern for input ARG. */
8606
8607 static rtx
8608 gen_push (rtx arg)
8609 {
8610 struct machine_function *m = cfun->machine;
8611
8612 if (m->fs.cfa_reg == stack_pointer_rtx)
8613 m->fs.cfa_offset += UNITS_PER_WORD;
8614 m->fs.sp_offset += UNITS_PER_WORD;
8615
8616 return gen_rtx_SET (VOIDmode,
8617 gen_rtx_MEM (Pmode,
8618 gen_rtx_PRE_DEC (Pmode,
8619 stack_pointer_rtx)),
8620 arg);
8621 }
8622
8623 /* Generate a "pop" pattern for input ARG. */
8624
8625 static rtx
8626 gen_pop (rtx arg)
8627 {
8628 return gen_rtx_SET (VOIDmode,
8629 arg,
8630 gen_rtx_MEM (Pmode,
8631 gen_rtx_POST_INC (Pmode,
8632 stack_pointer_rtx)));
8633 }
8634
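/* For reference (illustration): on 64-bit targets gen_push produces RTL
   of the form

	(set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI ...))

   and gen_pop the matching post_inc load; gen_push additionally updates
   the sp/cfa offsets tracked in cfun->machine->fs.  */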
8635 /* Return >= 0 if there is an unused call-clobbered register available
8636 for the entire function. */
8637
8638 static unsigned int
8639 ix86_select_alt_pic_regnum (void)
8640 {
8641 if (current_function_is_leaf
8642 && !crtl->profile
8643 && !ix86_current_function_calls_tls_descriptor)
8644 {
8645 int i, drap;
8646 /* Can't use the same register for both PIC and DRAP. */
8647 if (crtl->drap_reg)
8648 drap = REGNO (crtl->drap_reg);
8649 else
8650 drap = -1;
8651 for (i = 2; i >= 0; --i)
8652 if (i != drap && !df_regs_ever_live_p (i))
8653 return i;
8654 }
8655
8656 return INVALID_REGNUM;
8657 }
8658
8659 /* Return TRUE if we need to save REGNO. */
8660
8661 static bool
8662 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8663 {
8664 if (pic_offset_table_rtx
8665 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8666 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8667 || crtl->profile
8668 || crtl->calls_eh_return
8669 || crtl->uses_const_pool))
8670 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8671
8672 if (crtl->calls_eh_return && maybe_eh_return)
8673 {
8674 unsigned i;
8675 for (i = 0; ; i++)
8676 {
8677 unsigned test = EH_RETURN_DATA_REGNO (i);
8678 if (test == INVALID_REGNUM)
8679 break;
8680 if (test == regno)
8681 return true;
8682 }
8683 }
8684
8685 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
8686 return true;
8687
8688 return (df_regs_ever_live_p (regno)
8689 && !call_used_regs[regno]
8690 && !fixed_regs[regno]
8691 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
8692 }
8693
8694 /* Return the number of saved general purpose registers. */
8695
8696 static int
8697 ix86_nsaved_regs (void)
8698 {
8699 int nregs = 0;
8700 int regno;
8701
8702 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8703 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8704 nregs ++;
8705 return nregs;
8706 }
8707
8708 /* Return the number of saved SSE registers. */
8709
8710 static int
8711 ix86_nsaved_sseregs (void)
8712 {
8713 int nregs = 0;
8714 int regno;
8715
8716 if (!TARGET_64BIT_MS_ABI)
8717 return 0;
8718 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8719 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8720 nregs ++;
8721 return nregs;
8722 }
8723
8724 /* Given FROM and TO register numbers, say whether this elimination is
8725 allowed. If stack alignment is needed, we can only replace argument
8726 pointer with hard frame pointer, or replace frame pointer with stack
8727 pointer. Otherwise, frame pointer elimination is automatically
8728 handled and all other eliminations are valid. */
8729
8730 static bool
8731 ix86_can_eliminate (const int from, const int to)
8732 {
8733 if (stack_realign_fp)
8734 return ((from == ARG_POINTER_REGNUM
8735 && to == HARD_FRAME_POINTER_REGNUM)
8736 || (from == FRAME_POINTER_REGNUM
8737 && to == STACK_POINTER_REGNUM));
8738 else
8739 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
8740 }
8741
8742 /* Return the offset between two registers, one to be eliminated, and the other
8743 its replacement, at the start of a routine. */
8744
8745 HOST_WIDE_INT
8746 ix86_initial_elimination_offset (int from, int to)
8747 {
8748 struct ix86_frame frame;
8749 ix86_compute_frame_layout (&frame);
8750
8751 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
8752 return frame.hard_frame_pointer_offset;
8753 else if (from == FRAME_POINTER_REGNUM
8754 && to == HARD_FRAME_POINTER_REGNUM)
8755 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
8756 else
8757 {
8758 gcc_assert (to == STACK_POINTER_REGNUM);
8759
8760 if (from == ARG_POINTER_REGNUM)
8761 return frame.stack_pointer_offset;
8762
8763 gcc_assert (from == FRAME_POINTER_REGNUM);
8764 return frame.stack_pointer_offset - frame.frame_pointer_offset;
8765 }
8766 }
8767
8768 /* In a dynamically-aligned function, we can't know the offset from
8769 stack pointer to frame pointer, so we must ensure that setjmp
8770 eliminates fp against the hard fp (%ebp) rather than trying to
8771 index from %esp up to the top of the frame across a gap that is
8772 of unknown (at compile-time) size. */
8773 static rtx
8774 ix86_builtin_setjmp_frame_value (void)
8775 {
8776 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
8777 }
8778
8779 /* When using -fsplit-stack, the allocation routines set a field in
8780 the TCB to the bottom of the stack plus this much space, measured
8781 in bytes. */
8782
8783 #define SPLIT_STACK_AVAILABLE 256
8784
8785 /* Fill the structure ix86_frame describing the frame of the current function. */
8786
8787 static void
8788 ix86_compute_frame_layout (struct ix86_frame *frame)
8789 {
8790 unsigned int stack_alignment_needed;
8791 HOST_WIDE_INT offset;
8792 unsigned int preferred_alignment;
8793 HOST_WIDE_INT size = get_frame_size ();
8794 HOST_WIDE_INT to_allocate;
8795
8796 frame->nregs = ix86_nsaved_regs ();
8797 frame->nsseregs = ix86_nsaved_sseregs ();
8798
8799 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
8800 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
8801
8802 /* The 64-bit MS ABI seems to require the stack alignment to always be 16,
8803 except for function prologues and leaf functions. */
8804 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
8805 && (!current_function_is_leaf || cfun->calls_alloca != 0
8806 || ix86_current_function_calls_tls_descriptor))
8807 {
8808 preferred_alignment = 16;
8809 stack_alignment_needed = 16;
8810 crtl->preferred_stack_boundary = 128;
8811 crtl->stack_alignment_needed = 128;
8812 }
8813
8814 gcc_assert (!size || stack_alignment_needed);
8815 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
8816 gcc_assert (preferred_alignment <= stack_alignment_needed);
8817
8818 /* For SEH we have to limit the amount of code movement into the prologue.
8819 At present we do this via a BLOCKAGE, at which point there's very little
8820 scheduling that can be done, which means that there's very little point
8821 in doing anything except PUSHs. */
8822 if (TARGET_SEH)
8823 cfun->machine->use_fast_prologue_epilogue = false;
8824
8825 /* During reload iterations the number of registers saved can change.
8826 Recompute the value as needed. Do not recompute when the number of
8827 registers didn't change, as reload makes multiple calls to this function
8828 and does not expect the decision to change within a single iteration. */
8829 else if (!optimize_function_for_size_p (cfun)
8830 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
8831 {
8832 int count = frame->nregs;
8833 struct cgraph_node *node = cgraph_get_node (current_function_decl);
8834
8835 cfun->machine->use_fast_prologue_epilogue_nregs = count;
8836
8837 /* The fast prologue uses move instead of push to save registers. This
8838 is significantly longer, but also executes faster as modern hardware
8839 can execute the moves in parallel, but can't do that for push/pop.
8840
8841 Be careful about choosing which prologue to emit: when the function
8842 takes many instructions to execute we may as well use the slow version,
8843 and likewise when the function is known to be outside a hot spot (this
8844 is known with profile feedback only). Weight the size of the function
8845 by the number of registers to save, as it is cheap to use one or two
8846 push instructions but very slow to use many of them. */
8847 if (count)
8848 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
8849 if (node->frequency < NODE_FREQUENCY_NORMAL
8850 || (flag_branch_probabilities
8851 && node->frequency < NODE_FREQUENCY_HOT))
8852 cfun->machine->use_fast_prologue_epilogue = false;
8853 else
8854 cfun->machine->use_fast_prologue_epilogue
8855 = !expensive_function_p (count);
8856 }
8857
8858 frame->save_regs_using_mov
8859 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
8860 /* If static stack checking is enabled and done with probes,
8861 the registers need to be saved before allocating the frame. */
8862 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
8863
8864 /* Skip return address. */
8865 offset = UNITS_PER_WORD;
8866
8867 /* Skip pushed static chain. */
8868 if (ix86_static_chain_on_stack)
8869 offset += UNITS_PER_WORD;
8870
8871 /* Skip saved base pointer. */
8872 if (frame_pointer_needed)
8873 offset += UNITS_PER_WORD;
8874 frame->hfp_save_offset = offset;
8875
8876 /* The traditional frame pointer location is at the top of the frame. */
8877 frame->hard_frame_pointer_offset = offset;
8878
8879 /* Register save area */
8880 offset += frame->nregs * UNITS_PER_WORD;
8881 frame->reg_save_offset = offset;
8882
8883 /* Align and set SSE register save area. */
8884 if (frame->nsseregs)
8885 {
8886 /* The only ABI that has saved SSE registers (Win64) also has a
8887 16-byte aligned default stack, and thus we don't need to be
8888 within the re-aligned local stack frame to save them. */
8889 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
8890 offset = (offset + 16 - 1) & -16;
8891 offset += frame->nsseregs * 16;
8892 }
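/* The rounding idiom used above, (offset + align - 1) & -align, rounds
   OFFSET up to the next multiple of ALIGN (a power of two).  E.g. with
   align == 16 and offset == 40:  (40 + 15) & -16 == 55 & ~15 == 48.  */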
8893 frame->sse_reg_save_offset = offset;
8894
8895 /* The re-aligned stack starts here. Values before this point are not
8896 directly comparable with values below this point. In order to make
8897 sure that no value happens to be the same before and after, force
8898 the alignment computation below to add a non-zero value. */
8899 if (stack_realign_fp)
8900 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
8901
8902 /* Va-arg area */
8903 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
8904 offset += frame->va_arg_size;
8905
8906 /* Align start of frame for local function. */
8907 if (stack_realign_fp
8908 || offset != frame->sse_reg_save_offset
8909 || size != 0
8910 || !current_function_is_leaf
8911 || cfun->calls_alloca
8912 || ix86_current_function_calls_tls_descriptor)
8913 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
8914
8915 /* Frame pointer points here. */
8916 frame->frame_pointer_offset = offset;
8917
8918 offset += size;
8919
8920 /* Add the outgoing arguments area. It can be skipped if we eliminated
8921 all the function calls as dead code.
8922 Skipping is however impossible when the function calls alloca, since
8923 the alloca expander assumes that the last crtl->outgoing_args_size
8924 bytes of the stack frame are unused. */
8925 if (ACCUMULATE_OUTGOING_ARGS
8926 && (!current_function_is_leaf || cfun->calls_alloca
8927 || ix86_current_function_calls_tls_descriptor))
8928 {
8929 offset += crtl->outgoing_args_size;
8930 frame->outgoing_arguments_size = crtl->outgoing_args_size;
8931 }
8932 else
8933 frame->outgoing_arguments_size = 0;
8934
8935 /* Align stack boundary. Only needed if we're calling another function
8936 or using alloca. */
8937 if (!current_function_is_leaf || cfun->calls_alloca
8938 || ix86_current_function_calls_tls_descriptor)
8939 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
8940
8941 /* We've reached end of stack frame. */
8942 frame->stack_pointer_offset = offset;
8943
8944 /* Size prologue needs to allocate. */
8945 to_allocate = offset - frame->sse_reg_save_offset;
8946
8947 if ((!to_allocate && frame->nregs <= 1)
8948 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
8949 frame->save_regs_using_mov = false;
8950
8951 if (ix86_using_red_zone ()
8952 && current_function_sp_is_unchanging
8953 && current_function_is_leaf
8954 && !ix86_current_function_calls_tls_descriptor)
8955 {
8956 frame->red_zone_size = to_allocate;
8957 if (frame->save_regs_using_mov)
8958 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
8959 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
8960 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
8961 }
8962 else
8963 frame->red_zone_size = 0;
8964 frame->stack_pointer_offset -= frame->red_zone_size;
8965
8966 /* The SEH frame pointer location is near the bottom of the frame.
8967 This is enforced by the fact that the difference between the
8968 stack pointer and the frame pointer is limited to 240 bytes in
8969 the unwind data structure. */
8970 if (TARGET_SEH)
8971 {
8972 HOST_WIDE_INT diff;
8973
8974 /* If we can leave the frame pointer where it is, do so. */
8975 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
8976 if (diff > 240 || (diff & 15) != 0)
8977 {
8978 /* Ideally we'd determine what portion of the local stack frame
8979 (within the constraint of the lowest 240) is most heavily used.
8980 But without that complication, simply bias the frame pointer
8981 by 128 bytes so as to maximize the amount of the local stack
8982 frame that is addressable with 8-bit offsets. */
8983 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
8984 }
8985 }
8986 }
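/* A rough sketch of the layout computed above, from the CFA downward
   (which areas are present and their exact sizes depend on the target
   and options; bracketed areas may be absent):

       return address
       [pushed static chain]
       [saved frame pointer]                 <- hard_frame_pointer_offset
       general register save area            <- reg_save_offset
       [SSE register save area, 16-aligned]  <- sse_reg_save_offset
       [re-alignment gap]
       [va_arg register save area]
       local variables                       <- frame_pointer_offset
       [outgoing arguments]
                                             <- stack_pointer_offset
       [red zone, reduced by RED_ZONE_RESERVE]  */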
8987
8988 /* This is semi-inlined memory_address_length, but simplified
8989 since we know that we're always dealing with reg+offset, and
8990 to avoid having to create and discard all that rtl. */
8991
8992 static inline int
8993 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
8994 {
8995 int len = 4;
8996
8997 if (offset == 0)
8998 {
8999 /* EBP and R13 cannot be encoded without an offset. */
9000 len = (regno == BP_REG || regno == R13_REG);
9001 }
9002 else if (IN_RANGE (offset, -128, 127))
9003 len = 1;
9004
9005 /* ESP and R12 must be encoded with a SIB byte. */
9006 if (regno == SP_REG || regno == R12_REG)
9007 len++;
9008
9009 return len;
9010 }
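/* A few illustrative values of the length computed above (the count covers
   displacement and SIB bytes only, not the whole instruction):

       (reg = AX, offset = 0)    -> 0   no displacement needed
       (reg = BP, offset = 0)    -> 1   EBP/R13 always need a disp8
       (reg = AX, offset = 100)  -> 1   fits in a signed 8-bit displacement
       (reg = AX, offset = 200)  -> 4   needs a 32-bit displacement
       (reg = SP, offset = 200)  -> 5   32-bit displacement plus SIB byte  */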
9011
9012 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9013 The valid base registers are taken from CFUN->MACHINE->FS. */
9014
9015 static rtx
9016 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9017 {
9018 const struct machine_function *m = cfun->machine;
9019 rtx base_reg = NULL;
9020 HOST_WIDE_INT base_offset = 0;
9021
9022 if (m->use_fast_prologue_epilogue)
9023 {
9024 /* Choose the base register most likely to allow the most scheduling
9025 opportunities. Generally FP is valid throughout the function,
9026 while DRAP must be reloaded within the epilogue. But choose either
9027 over the SP due to increased encoding size. */
9028
9029 if (m->fs.fp_valid)
9030 {
9031 base_reg = hard_frame_pointer_rtx;
9032 base_offset = m->fs.fp_offset - cfa_offset;
9033 }
9034 else if (m->fs.drap_valid)
9035 {
9036 base_reg = crtl->drap_reg;
9037 base_offset = 0 - cfa_offset;
9038 }
9039 else if (m->fs.sp_valid)
9040 {
9041 base_reg = stack_pointer_rtx;
9042 base_offset = m->fs.sp_offset - cfa_offset;
9043 }
9044 }
9045 else
9046 {
9047 HOST_WIDE_INT toffset;
9048 int len = 16, tlen;
9049
9050 /* Choose the base register with the smallest address encoding.
9051 With a tie, choose FP > DRAP > SP. */
9052 if (m->fs.sp_valid)
9053 {
9054 base_reg = stack_pointer_rtx;
9055 base_offset = m->fs.sp_offset - cfa_offset;
9056 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9057 }
9058 if (m->fs.drap_valid)
9059 {
9060 toffset = 0 - cfa_offset;
9061 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9062 if (tlen <= len)
9063 {
9064 base_reg = crtl->drap_reg;
9065 base_offset = toffset;
9066 len = tlen;
9067 }
9068 }
9069 if (m->fs.fp_valid)
9070 {
9071 toffset = m->fs.fp_offset - cfa_offset;
9072 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9073 if (tlen <= len)
9074 {
9075 base_reg = hard_frame_pointer_rtx;
9076 base_offset = toffset;
9077 len = tlen;
9078 }
9079 }
9080 }
9081 gcc_assert (base_reg != NULL);
9082
9083 return plus_constant (base_reg, base_offset);
9084 }
9085
9086 /* Emit code to save registers in the prologue. */
9087
9088 static void
9089 ix86_emit_save_regs (void)
9090 {
9091 unsigned int regno;
9092 rtx insn;
9093
9094 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9095 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9096 {
9097 insn = emit_insn (gen_push (gen_rtx_REG (Pmode, regno)));
9098 RTX_FRAME_RELATED_P (insn) = 1;
9099 }
9100 }
9101
9102 /* Emit a single register save at CFA - CFA_OFFSET. */
9103
9104 static void
9105 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9106 HOST_WIDE_INT cfa_offset)
9107 {
9108 struct machine_function *m = cfun->machine;
9109 rtx reg = gen_rtx_REG (mode, regno);
9110 rtx mem, addr, base, insn;
9111
9112 addr = choose_baseaddr (cfa_offset);
9113 mem = gen_frame_mem (mode, addr);
9114
9115 /* For SSE saves, we need to indicate the 128-bit alignment. */
9116 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9117
9118 insn = emit_move_insn (mem, reg);
9119 RTX_FRAME_RELATED_P (insn) = 1;
9120
9121 base = addr;
9122 if (GET_CODE (base) == PLUS)
9123 base = XEXP (base, 0);
9124 gcc_checking_assert (REG_P (base));
9125
9126 /* When saving registers into a re-aligned local stack frame, avoid
9127 any tricky guessing by dwarf2out. */
9128 if (m->fs.realigned)
9129 {
9130 gcc_checking_assert (stack_realign_drap);
9131
9132 if (regno == REGNO (crtl->drap_reg))
9133 {
9134 /* A bit of a hack. We force the DRAP register to be saved in
9135 the re-aligned stack frame, which provides us with a copy
9136 of the CFA that will last past the prologue. Install it. */
9137 gcc_checking_assert (cfun->machine->fs.fp_valid);
9138 addr = plus_constant (hard_frame_pointer_rtx,
9139 cfun->machine->fs.fp_offset - cfa_offset);
9140 mem = gen_rtx_MEM (mode, addr);
9141 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9142 }
9143 else
9144 {
9145 /* The frame pointer is a stable reference within the
9146 aligned frame. Use it. */
9147 gcc_checking_assert (cfun->machine->fs.fp_valid);
9148 addr = plus_constant (hard_frame_pointer_rtx,
9149 cfun->machine->fs.fp_offset - cfa_offset);
9150 mem = gen_rtx_MEM (mode, addr);
9151 add_reg_note (insn, REG_CFA_EXPRESSION,
9152 gen_rtx_SET (VOIDmode, mem, reg));
9153 }
9154 }
9155
9156 /* The memory may not be relative to the current CFA register,
9157 which means that we may need to generate a new pattern for
9158 use by the unwind info. */
9159 else if (base != m->fs.cfa_reg)
9160 {
9161 addr = plus_constant (m->fs.cfa_reg, m->fs.cfa_offset - cfa_offset);
9162 mem = gen_rtx_MEM (mode, addr);
9163 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9164 }
9165 }
9166
9167 /* Emit code to save registers using MOV insns.
9168 First register is stored at CFA - CFA_OFFSET. */
9169 static void
9170 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9171 {
9172 unsigned int regno;
9173
9174 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9175 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9176 {
9177 ix86_emit_save_reg_using_mov (Pmode, regno, cfa_offset);
9178 cfa_offset -= UNITS_PER_WORD;
9179 }
9180 }
9181
9182 /* Emit code to save SSE registers using MOV insns.
9183 First register is stored at CFA - CFA_OFFSET. */
9184 static void
9185 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9186 {
9187 unsigned int regno;
9188
9189 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9190 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9191 {
9192 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9193 cfa_offset -= 16;
9194 }
9195 }
9196
9197 static GTY(()) rtx queued_cfa_restores;
9198
9199 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
9200 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9201 Don't add the note if the previously saved value will be left untouched
9202 within the stack red zone until return, as unwinders can find the same
9203 value in the register and on the stack. */
9204
9205 static void
9206 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9207 {
9208 if (!crtl->shrink_wrapped
9209 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9210 return;
9211
9212 if (insn)
9213 {
9214 add_reg_note (insn, REG_CFA_RESTORE, reg);
9215 RTX_FRAME_RELATED_P (insn) = 1;
9216 }
9217 else
9218 queued_cfa_restores
9219 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9220 }
9221
9222 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9223
9224 static void
9225 ix86_add_queued_cfa_restore_notes (rtx insn)
9226 {
9227 rtx last;
9228 if (!queued_cfa_restores)
9229 return;
9230 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9231 ;
9232 XEXP (last, 1) = REG_NOTES (insn);
9233 REG_NOTES (insn) = queued_cfa_restores;
9234 queued_cfa_restores = NULL_RTX;
9235 RTX_FRAME_RELATED_P (insn) = 1;
9236 }
9237
9238 /* Expand prologue or epilogue stack adjustment.
9239 The pattern exists to put a dependency on all ebp-based memory accesses.
9240 STYLE should be negative if instructions should be marked as frame related,
9241 zero if the %r11 register is live and cannot be freely used, and positive
9242 otherwise. */
9243
9244 static void
9245 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9246 int style, bool set_cfa)
9247 {
9248 struct machine_function *m = cfun->machine;
9249 rtx insn;
9250 bool add_frame_related_expr = false;
9251
9252 if (! TARGET_64BIT)
9253 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9254 else if (x86_64_immediate_operand (offset, DImode))
9255 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9256 else
9257 {
9258 rtx tmp;
9259 /* r11 is used by indirect sibcall return as well, set before the
9260 epilogue and used after the epilogue. */
9261 if (style)
9262 tmp = gen_rtx_REG (DImode, R11_REG);
9263 else
9264 {
9265 gcc_assert (src != hard_frame_pointer_rtx
9266 && dest != hard_frame_pointer_rtx);
9267 tmp = hard_frame_pointer_rtx;
9268 }
9269 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9270 if (style < 0)
9271 add_frame_related_expr = true;
9272
9273 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9274 }
9275
9276 insn = emit_insn (insn);
9277 if (style >= 0)
9278 ix86_add_queued_cfa_restore_notes (insn);
9279
9280 if (set_cfa)
9281 {
9282 rtx r;
9283
9284 gcc_assert (m->fs.cfa_reg == src);
9285 m->fs.cfa_offset += INTVAL (offset);
9286 m->fs.cfa_reg = dest;
9287
9288 r = gen_rtx_PLUS (Pmode, src, offset);
9289 r = gen_rtx_SET (VOIDmode, dest, r);
9290 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9291 RTX_FRAME_RELATED_P (insn) = 1;
9292 }
9293 else if (style < 0)
9294 {
9295 RTX_FRAME_RELATED_P (insn) = 1;
9296 if (add_frame_related_expr)
9297 {
9298 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9299 r = gen_rtx_SET (VOIDmode, dest, r);
9300 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9301 }
9302 }
9303
9304 if (dest == stack_pointer_rtx)
9305 {
9306 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9307 bool valid = m->fs.sp_valid;
9308
9309 if (src == hard_frame_pointer_rtx)
9310 {
9311 valid = m->fs.fp_valid;
9312 ooffset = m->fs.fp_offset;
9313 }
9314 else if (src == crtl->drap_reg)
9315 {
9316 valid = m->fs.drap_valid;
9317 ooffset = 0;
9318 }
9319 else
9320 {
9321 /* Else there are two possibilities: SP itself, which we set
9322 up as the default above. Or EH_RETURN_STACKADJ_RTX, which is
9323 taken care of by hand along the eh_return path. */
9324 gcc_checking_assert (src == stack_pointer_rtx
9325 || offset == const0_rtx);
9326 }
9327
9328 m->fs.sp_offset = ooffset - INTVAL (offset);
9329 m->fs.sp_valid = valid;
9330 }
9331 }
9332
9333 /* Find an available register to be used as the dynamic realign argument
9334 pointer register. Such a register will be written in the prologue and
9335 used at the beginning of the body, so it must not be
9336 1. a parameter passing register.
9337 2. the GOT pointer.
9338 We reuse the static-chain register if it is available. Otherwise, we
9339 use DI for i386 and R13 for x86-64. We chose R13 since it has a
9340 shorter encoding.
9341
9342 Return: the regno of the chosen register. */
9343
9344 static unsigned int
9345 find_drap_reg (void)
9346 {
9347 tree decl = cfun->decl;
9348
9349 if (TARGET_64BIT)
9350 {
9351 /* Use R13 for nested functions or functions that need a static chain.
9352 Since a function with a tail call may use any caller-saved
9353 registers in the epilogue, DRAP must not use a caller-saved
9354 register in that case. */
9355 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9356 return R13_REG;
9357
9358 return R10_REG;
9359 }
9360 else
9361 {
9362 /* Use DI for nested functions or functions that need a static chain.
9363 Since a function with a tail call may use any caller-saved
9364 registers in the epilogue, DRAP must not use a caller-saved
9365 register in that case. */
9366 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9367 return DI_REG;
9368
9369 /* Reuse static chain register if it isn't used for parameter
9370 passing. */
9371 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9372 {
9373 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9374 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9375 return CX_REG;
9376 }
9377 return DI_REG;
9378 }
9379 }
9380
9381 /* Return minimum incoming stack alignment. */
9382
9383 static unsigned int
9384 ix86_minimum_incoming_stack_boundary (bool sibcall)
9385 {
9386 unsigned int incoming_stack_boundary;
9387
9388 /* Prefer the one specified at command line. */
9389 if (ix86_user_incoming_stack_boundary)
9390 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9391 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9392 when -mstackrealign is used, we are not computing it for the sibcall
9393 check, and the estimated stack alignment is 128 bits. */
9394 else if (!sibcall
9395 && !TARGET_64BIT
9396 && ix86_force_align_arg_pointer
9397 && crtl->stack_alignment_estimated == 128)
9398 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9399 else
9400 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9401
9402 /* Incoming stack alignment can be changed on individual functions
9403 via force_align_arg_pointer attribute. We use the smallest
9404 incoming stack boundary. */
9405 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9406 && lookup_attribute (ix86_force_align_arg_pointer_string,
9407 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9408 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9409
9410 /* The incoming stack frame has to be aligned at least at
9411 parm_stack_boundary. */
9412 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9413 incoming_stack_boundary = crtl->parm_stack_boundary;
9414
9415 /* The stack at the entrance of main is aligned by the runtime. We use
9416 the smallest incoming stack boundary. */
9417 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9418 && DECL_NAME (current_function_decl)
9419 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9420 && DECL_FILE_SCOPE_P (current_function_decl))
9421 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9422
9423 return incoming_stack_boundary;
9424 }
9425
9426 /* Update incoming stack boundary and estimated stack alignment. */
9427
9428 static void
9429 ix86_update_stack_boundary (void)
9430 {
9431 ix86_incoming_stack_boundary
9432 = ix86_minimum_incoming_stack_boundary (false);
9433
9434 /* An x86_64 vararg function needs 16-byte stack alignment for the
9435 register save area. */
9436 if (TARGET_64BIT
9437 && cfun->stdarg
9438 && crtl->stack_alignment_estimated < 128)
9439 crtl->stack_alignment_estimated = 128;
9440 }
9441
9442 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9443 needed or an rtx for DRAP otherwise. */
9444
9445 static rtx
9446 ix86_get_drap_rtx (void)
9447 {
9448 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9449 crtl->need_drap = true;
9450
9451 if (stack_realign_drap)
9452 {
9453 /* Assign DRAP to vDRAP and return vDRAP. */
9454 unsigned int regno = find_drap_reg ();
9455 rtx drap_vreg;
9456 rtx arg_ptr;
9457 rtx seq, insn;
9458
9459 arg_ptr = gen_rtx_REG (Pmode, regno);
9460 crtl->drap_reg = arg_ptr;
9461
9462 start_sequence ();
9463 drap_vreg = copy_to_reg (arg_ptr);
9464 seq = get_insns ();
9465 end_sequence ();
9466
9467 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9468 if (!optimize)
9469 {
9470 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9471 RTX_FRAME_RELATED_P (insn) = 1;
9472 }
9473 return drap_vreg;
9474 }
9475 else
9476 return NULL;
9477 }
9478
9479 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9480
9481 static rtx
9482 ix86_internal_arg_pointer (void)
9483 {
9484 return virtual_incoming_args_rtx;
9485 }
9486
9487 struct scratch_reg {
9488 rtx reg;
9489 bool saved;
9490 };
9491
9492 /* Return a short-lived scratch register for use on function entry.
9493 In 32-bit mode, it is valid only after the registers are saved
9494 in the prologue. This register must be released by means of
9495 release_scratch_register_on_entry once it is dead. */
9496
9497 static void
9498 get_scratch_register_on_entry (struct scratch_reg *sr)
9499 {
9500 int regno;
9501
9502 sr->saved = false;
9503
9504 if (TARGET_64BIT)
9505 {
9506 /* We always use R11 in 64-bit mode. */
9507 regno = R11_REG;
9508 }
9509 else
9510 {
9511 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9512 bool fastcall_p
9513 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9514 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9515 int regparm = ix86_function_regparm (fntype, decl);
9516 int drap_regno
9517 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9518
9519 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9520 for the static chain register. */
9521 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9522 && drap_regno != AX_REG)
9523 regno = AX_REG;
9524 else if (regparm < 2 && drap_regno != DX_REG)
9525 regno = DX_REG;
9526 /* ecx is the static chain register. */
9527 else if (regparm < 3 && !fastcall_p && !static_chain_p
9528 && drap_regno != CX_REG)
9529 regno = CX_REG;
9530 else if (ix86_save_reg (BX_REG, true))
9531 regno = BX_REG;
9532 /* esi is the static chain register. */
9533 else if (!(regparm == 3 && static_chain_p)
9534 && ix86_save_reg (SI_REG, true))
9535 regno = SI_REG;
9536 else if (ix86_save_reg (DI_REG, true))
9537 regno = DI_REG;
9538 else
9539 {
9540 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9541 sr->saved = true;
9542 }
9543 }
9544
9545 sr->reg = gen_rtx_REG (Pmode, regno);
9546 if (sr->saved)
9547 {
9548 rtx insn = emit_insn (gen_push (sr->reg));
9549 RTX_FRAME_RELATED_P (insn) = 1;
9550 }
9551 }
9552
9553 /* Release a scratch register obtained from the preceding function. */
9554
9555 static void
9556 release_scratch_register_on_entry (struct scratch_reg *sr)
9557 {
9558 if (sr->saved)
9559 {
9560 rtx x, insn = emit_insn (gen_pop (sr->reg));
9561
9562 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9563 RTX_FRAME_RELATED_P (insn) = 1;
9564 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9565 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9566 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9567 }
9568 }
9569
9570 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
9571
9572 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9573
9574 static void
9575 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9576 {
9577 /* We skip the probe for the first interval + a small dope of 4 words and
9578 probe that many bytes past the specified size to maintain a protection
9579 area at the bottom of the stack. */
9580 const int dope = 4 * UNITS_PER_WORD;
9581 rtx size_rtx = GEN_INT (size), last;
9582
9583 /* See if we have a constant small number of probes to generate. If so,
9584 that's the easy case. The run-time loop is made up of 11 insns in the
9585 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9586 for n # of intervals. */
9587 if (size <= 5 * PROBE_INTERVAL)
9588 {
9589 HOST_WIDE_INT i, adjust;
9590 bool first_probe = true;
9591
9592 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9593 values of N from 1 until it exceeds SIZE. If only one probe is
9594 needed, this will not generate any code. Then adjust and probe
9595 to PROBE_INTERVAL + SIZE. */
9596 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9597 {
9598 if (first_probe)
9599 {
9600 adjust = 2 * PROBE_INTERVAL + dope;
9601 first_probe = false;
9602 }
9603 else
9604 adjust = PROBE_INTERVAL;
9605
9606 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9607 plus_constant (stack_pointer_rtx, -adjust)));
9608 emit_stack_probe (stack_pointer_rtx);
9609 }
9610
9611 if (first_probe)
9612 adjust = size + PROBE_INTERVAL + dope;
9613 else
9614 adjust = size + PROBE_INTERVAL - i;
9615
9616 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9617 plus_constant (stack_pointer_rtx, -adjust)));
9618 emit_stack_probe (stack_pointer_rtx);
9619
9620 /* Adjust back to account for the additional first interval. */
9621 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9622 plus_constant (stack_pointer_rtx,
9623 PROBE_INTERVAL + dope)));
9624 }
9625
9626 /* Otherwise, do the same as above, but in a loop. Note that we must be
9627 extra careful with variables wrapping around because we might be at
9628 the very top (or the very bottom) of the address space and we have
9629 to be able to handle this case properly; in particular, we use an
9630 equality test for the loop condition. */
9631 else
9632 {
9633 HOST_WIDE_INT rounded_size;
9634 struct scratch_reg sr;
9635
9636 get_scratch_register_on_entry (&sr);
9637
9638
9639 /* Step 1: round SIZE to the previous multiple of the interval. */
9640
9641 rounded_size = size & -PROBE_INTERVAL;
9642
9643
9644 /* Step 2: compute initial and final value of the loop counter. */
9645
9646 /* SP = SP_0 + PROBE_INTERVAL. */
9647 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9648 plus_constant (stack_pointer_rtx,
9649 - (PROBE_INTERVAL + dope))));
9650
9651 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
9652 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9653 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9654 gen_rtx_PLUS (Pmode, sr.reg,
9655 stack_pointer_rtx)));
9656
9657
9658 /* Step 3: the loop
9659
9660 while (SP != LAST_ADDR)
9661 {
9662 SP = SP + PROBE_INTERVAL
9663 probe at SP
9664 }
9665
9666 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
9667 values of N from 1 until it is equal to ROUNDED_SIZE. */
9668
9669 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
9670
9671
9672 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
9673 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
9674
9675 if (size != rounded_size)
9676 {
9677 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9678 plus_constant (stack_pointer_rtx,
9679 rounded_size - size)));
9680 emit_stack_probe (stack_pointer_rtx);
9681 }
9682
9683 /* Adjust back to account for the additional first interval. */
9684 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9685 plus_constant (stack_pointer_rtx,
9686 PROBE_INTERVAL + dope)));
9687
9688 release_scratch_register_on_entry (&sr);
9689 }
9690
9691 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
9692
9693 /* Even if the stack pointer isn't the CFA register, we need to correctly
9694 describe the adjustments made to it, in particular differentiate the
9695 frame-related ones from the frame-unrelated ones. */
9696 if (size > 0)
9697 {
9698 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
9699 XVECEXP (expr, 0, 0)
9700 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9701 plus_constant (stack_pointer_rtx, -size));
9702 XVECEXP (expr, 0, 1)
9703 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9704 plus_constant (stack_pointer_rtx,
9705 PROBE_INTERVAL + dope + size));
9706 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
9707 RTX_FRAME_RELATED_P (last) = 1;
9708
9709 cfun->machine->fs.sp_offset += size;
9710 }
9711
9712 /* Make sure nothing is scheduled before we are done. */
9713 emit_insn (gen_blockage ());
9714 }
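/* Worked example (a sketch assuming the usual 4KB probing interval,
   PROBE_INTERVAL == 4096, and UNITS_PER_WORD == 8, so dope == 32): for
   size == 10000 the small-count path above is taken (10000 <= 5 * 4096)
   and the emitted code is roughly

       sub $8224, %rsp ; probe (%rsp)
       sub $4096, %rsp ; probe (%rsp)
       sub $1808, %rsp ; probe (%rsp)
       add $4128, %rsp

   i.e. the extra interval + dope is given back at the end, leaving the
   stack pointer exactly 10000 bytes lower than on entry.  */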
9715
9716 /* Adjust the stack pointer up to REG while probing it. */
9717
9718 const char *
9719 output_adjust_stack_and_probe (rtx reg)
9720 {
9721 static int labelno = 0;
9722 char loop_lab[32], end_lab[32];
9723 rtx xops[2];
9724
9725 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9726 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9727
9728 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9729
9730 /* Jump to END_LAB if SP == LAST_ADDR. */
9731 xops[0] = stack_pointer_rtx;
9732 xops[1] = reg;
9733 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9734 fputs ("\tje\t", asm_out_file);
9735 assemble_name_raw (asm_out_file, end_lab);
9736 fputc ('\n', asm_out_file);
9737
9738 /* SP = SP + PROBE_INTERVAL. */
9739 xops[1] = GEN_INT (PROBE_INTERVAL);
9740 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9741
9742 /* Probe at SP. */
9743 xops[1] = const0_rtx;
9744 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
9745
9746 fprintf (asm_out_file, "\tjmp\t");
9747 assemble_name_raw (asm_out_file, loop_lab);
9748 fputc ('\n', asm_out_file);
9749
9750 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9751
9752 return "";
9753 }
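/* The routine above therefore emits a loop of roughly this shape
   (a sketch in AT&T syntax, 64-bit, assuming a 4KB probing interval and
   the r11 scratch register holding LAST_ADDR):

       .LPSRL0: cmpq %r11, %rsp
                je   .LPSRE0
                subq $4096, %rsp
                orq  $0, (%rsp)
                jmp  .LPSRL0
       .LPSRE0:

   i.e. the stack pointer is moved one interval at a time and each new
   page is touched until the precomputed last address is reached.  */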
9754
9755 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
9756 inclusive. These are offsets from the current stack pointer. */
9757
9758 static void
9759 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
9760 {
9761 /* See if we have a constant small number of probes to generate. If so,
9762 that's the easy case. The run-time loop is made up of 7 insns in the
9763 generic case while the compile-time loop is made up of n insns for n #
9764 of intervals. */
9765 if (size <= 7 * PROBE_INTERVAL)
9766 {
9767 HOST_WIDE_INT i;
9768
9769 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
9770 it exceeds SIZE. If only one probe is needed, this will not
9771 generate any code. Then probe at FIRST + SIZE. */
9772 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9773 emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + i)));
9774
9775 emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + size)));
9776 }
9777
9778 /* Otherwise, do the same as above, but in a loop. Note that we must be
9779 extra careful with variables wrapping around because we might be at
9780 the very top (or the very bottom) of the address space and we have
9781 to be able to handle this case properly; in particular, we use an
9782 equality test for the loop condition. */
9783 else
9784 {
9785 HOST_WIDE_INT rounded_size, last;
9786 struct scratch_reg sr;
9787
9788 get_scratch_register_on_entry (&sr);
9789
9790
9791 /* Step 1: round SIZE to the previous multiple of the interval. */
9792
9793 rounded_size = size & -PROBE_INTERVAL;
9794
9795
9796 /* Step 2: compute initial and final value of the loop counter. */
9797
9798 /* TEST_OFFSET = FIRST. */
9799 emit_move_insn (sr.reg, GEN_INT (-first));
9800
9801 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
9802 last = first + rounded_size;
9803
9804
9805 /* Step 3: the loop
9806
9807 while (TEST_ADDR != LAST_ADDR)
9808 {
9809 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
9810 probe at TEST_ADDR
9811 }
9812
9813 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
9814 until it is equal to ROUNDED_SIZE. */
9815
9816 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
9817
9818
9819 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
9820 that SIZE is equal to ROUNDED_SIZE. */
9821
9822 if (size != rounded_size)
9823 emit_stack_probe (plus_constant (gen_rtx_PLUS (Pmode,
9824 stack_pointer_rtx,
9825 sr.reg),
9826 rounded_size - size));
9827
9828 release_scratch_register_on_entry (&sr);
9829 }
9830
9831 /* Make sure nothing is scheduled before we are done. */
9832 emit_insn (gen_blockage ());
9833 }
9834
9835 /* Probe a range of stack addresses from REG to END, inclusive. These are
9836 offsets from the current stack pointer. */
9837
9838 const char *
9839 output_probe_stack_range (rtx reg, rtx end)
9840 {
9841 static int labelno = 0;
9842 char loop_lab[32], end_lab[32];
9843 rtx xops[3];
9844
9845 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9846 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9847
9848 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9849
9850 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
9851 xops[0] = reg;
9852 xops[1] = end;
9853 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9854 fputs ("\tje\t", asm_out_file);
9855 assemble_name_raw (asm_out_file, end_lab);
9856 fputc ('\n', asm_out_file);
9857
9858 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
9859 xops[1] = GEN_INT (PROBE_INTERVAL);
9860 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9861
9862 /* Probe at TEST_ADDR. */
9863 xops[0] = stack_pointer_rtx;
9864 xops[1] = reg;
9865 xops[2] = const0_rtx;
9866 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
9867
9868 fprintf (asm_out_file, "\tjmp\t");
9869 assemble_name_raw (asm_out_file, loop_lab);
9870 fputc ('\n', asm_out_file);
9871
9872 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9873
9874 return "";
9875 }
9876
9877 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
9878 to be generated in correct form. */
9879 static void
9880 ix86_finalize_stack_realign_flags (void)
9881 {
9882 /* Check if stack realignment is really needed after reload, and
9883 store the result in cfun. */
9884 unsigned int incoming_stack_boundary
9885 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
9886 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
9887 unsigned int stack_realign = (incoming_stack_boundary
9888 < (current_function_is_leaf
9889 ? crtl->max_used_stack_slot_alignment
9890 : crtl->stack_alignment_needed));
9891
9892 if (crtl->stack_realign_finalized)
9893 {
9894 /* After stack_realign_needed is finalized, we can no longer
9895 change it. */
9896 gcc_assert (crtl->stack_realign_needed == stack_realign);
9897 }
9898 else
9899 {
9900 crtl->stack_realign_needed = stack_realign;
9901 crtl->stack_realign_finalized = true;
9902 }
9903 }
9904
9905 /* Expand the prologue into a bunch of separate insns. */
9906
9907 void
9908 ix86_expand_prologue (void)
9909 {
9910 struct machine_function *m = cfun->machine;
9911 rtx insn, t;
9912 bool pic_reg_used;
9913 struct ix86_frame frame;
9914 HOST_WIDE_INT allocate;
9915 bool int_registers_saved;
9916
9917 ix86_finalize_stack_realign_flags ();
9918
9919 /* DRAP should not coexist with stack_realign_fp */
9920 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
9921
9922 memset (&m->fs, 0, sizeof (m->fs));
9923
9924 /* Initialize CFA state for before the prologue. */
9925 m->fs.cfa_reg = stack_pointer_rtx;
9926 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
9927
9928 /* Track SP offset to the CFA. We continue tracking this after we've
9929 swapped the CFA register away from SP. In the case of re-alignment
9930 this is fudged; we're interested in offsets within the local frame. */
9931 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
9932 m->fs.sp_valid = true;
9933
9934 ix86_compute_frame_layout (&frame);
9935
9936 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
9937 {
9938 /* We should have already generated an error for any use of
9939 ms_hook on a nested function. */
9940 gcc_checking_assert (!ix86_static_chain_on_stack);
9941
9942 /* Check if profiling is active and we shall use the profiling-before-
9943 prologue variant. If so, issue a sorry. */
9944 if (crtl->profile && flag_fentry != 0)
9945 sorry ("ms_hook_prologue attribute isn%'t compatible "
9946 "with -mfentry for 32-bit");
9947
9948 /* In ix86_asm_output_function_label we emitted:
9949 8b ff movl.s %edi,%edi
9950 55 push %ebp
9951 8b ec movl.s %esp,%ebp
9952
9953 This matches the hookable function prologue in Win32 API
9954 functions in Microsoft Windows XP Service Pack 2 and newer.
9955 Wine uses this to enable Windows apps to hook the Win32 API
9956 functions provided by Wine.
9957
9958 What that means is that we've already set up the frame pointer. */
9959
9960 if (frame_pointer_needed
9961 && !(crtl->drap_reg && crtl->stack_realign_needed))
9962 {
9963 rtx push, mov;
9964
9965 /* We've decided to use the frame pointer already set up.
9966 Describe this to the unwinder by pretending that both
9967 push and mov insns happen right here.
9968
9969 Putting the unwind info here at the end of the ms_hook
9970 is done so that we can make absolutely certain we get
9971 the required byte sequence at the start of the function,
9972 rather than relying on an assembler that can produce
9973 the exact encoding required.
9974
9975 However it does mean (in the unpatched case) that we have
9976 a 1 insn window where the asynchronous unwind info is
9977 incorrect. However, if we placed the unwind info at
9978 its correct location we would have incorrect unwind info
9979 in the patched case. Which is probably all moot since
9980 I don't expect Wine generates dwarf2 unwind info for the
9981 system libraries that use this feature. */
9982
9983 insn = emit_insn (gen_blockage ());
9984
9985 push = gen_push (hard_frame_pointer_rtx);
9986 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
9987 stack_pointer_rtx);
9988 RTX_FRAME_RELATED_P (push) = 1;
9989 RTX_FRAME_RELATED_P (mov) = 1;
9990
9991 RTX_FRAME_RELATED_P (insn) = 1;
9992 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
9993 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
9994
9995 /* Note that gen_push incremented m->fs.cfa_offset, even
9996 though we didn't emit the push insn here. */
9997 m->fs.cfa_reg = hard_frame_pointer_rtx;
9998 m->fs.fp_offset = m->fs.cfa_offset;
9999 m->fs.fp_valid = true;
10000 }
10001 else
10002 {
10003 /* The frame pointer is not needed so pop %ebp again.
10004 This leaves us with a pristine state. */
10005 emit_insn (gen_pop (hard_frame_pointer_rtx));
10006 }
10007 }
10008
10009 /* The first insn of a function that accepts its static chain on the
10010 stack is to push the register that would be filled in by a direct
10011 call. This insn will be skipped by the trampoline. */
10012 else if (ix86_static_chain_on_stack)
10013 {
10014 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10015 emit_insn (gen_blockage ());
10016
10017 /* We don't want to interpret this push insn as a register save,
10018 only as a stack adjustment. The real copy of the register as
10019 a save will be done later, if needed. */
10020 t = plus_constant (stack_pointer_rtx, -UNITS_PER_WORD);
10021 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10022 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10023 RTX_FRAME_RELATED_P (insn) = 1;
10024 }
10025
10026 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
10027 DRAP is needed and stack realignment is really needed after reload. */
10028 if (stack_realign_drap)
10029 {
10030 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10031
10032 /* Only need to push parameter pointer reg if it is caller saved. */
10033 if (!call_used_regs[REGNO (crtl->drap_reg)])
10034 {
10035 /* Push arg pointer reg */
10036 insn = emit_insn (gen_push (crtl->drap_reg));
10037 RTX_FRAME_RELATED_P (insn) = 1;
10038 }
10039
10040 /* Grab the argument pointer. */
10041 t = plus_constant (stack_pointer_rtx, m->fs.sp_offset);
10042 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10043 RTX_FRAME_RELATED_P (insn) = 1;
10044 m->fs.cfa_reg = crtl->drap_reg;
10045 m->fs.cfa_offset = 0;
10046
10047 /* Align the stack. */
10048 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10049 stack_pointer_rtx,
10050 GEN_INT (-align_bytes)));
10051 RTX_FRAME_RELATED_P (insn) = 1;
10052
10053 /* Replicate the return address on the stack so that the return
10054 address can be reached via the (argp - 1) slot. This is needed
10055 to implement macro RETURN_ADDR_RTX and intrinsic function
10056 expand_builtin_return_addr etc. */
10057 t = plus_constant (crtl->drap_reg, -UNITS_PER_WORD);
10058 t = gen_frame_mem (Pmode, t);
10059 insn = emit_insn (gen_push (t));
10060 RTX_FRAME_RELATED_P (insn) = 1;
10061
10062 /* For the purposes of frame and register save area addressing,
10063 we've started over with a new frame. */
10064 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10065 m->fs.realigned = true;
10066 }
10067
10068 if (frame_pointer_needed && !m->fs.fp_valid)
10069 {
10070 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10071 slower on all targets. Also sdb doesn't like it. */
10072 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10073 RTX_FRAME_RELATED_P (insn) = 1;
10074
10075 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10076 {
10077 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10078 RTX_FRAME_RELATED_P (insn) = 1;
10079
10080 if (m->fs.cfa_reg == stack_pointer_rtx)
10081 m->fs.cfa_reg = hard_frame_pointer_rtx;
10082 m->fs.fp_offset = m->fs.sp_offset;
10083 m->fs.fp_valid = true;
10084 }
10085 }
10086
10087 int_registers_saved = (frame.nregs == 0);
10088
10089 if (!int_registers_saved)
10090 {
10091 /* If saving registers via PUSH, do so now. */
10092 if (!frame.save_regs_using_mov)
10093 {
10094 ix86_emit_save_regs ();
10095 int_registers_saved = true;
10096 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10097 }
10098
10099 /* When using the red zone we may start register saving before allocating
10100 the stack frame, saving one cycle of the prologue. However, avoid
10101 doing this if we have to probe the stack; at least on x86_64 the
10102 stack probe can turn into a call that clobbers a red zone location. */
10103 else if (ix86_using_red_zone ()
10104 && (! TARGET_STACK_PROBE
10105 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10106 {
10107 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10108 int_registers_saved = true;
10109 }
10110 }
10111
10112 if (stack_realign_fp)
10113 {
10114 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10115 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10116
10117 /* The computation of the size of the re-aligned stack frame means
10118 that we must allocate the size of the register save area before
10119 performing the actual alignment. Otherwise we cannot guarantee
10120 that there's enough storage above the realignment point. */
10121 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10122 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10123 GEN_INT (m->fs.sp_offset
10124 - frame.sse_reg_save_offset),
10125 -1, false);
10126
10127 /* Align the stack. */
10128 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10129 stack_pointer_rtx,
10130 GEN_INT (-align_bytes)));
10131
10132 /* For the purposes of register save area addressing, the stack
10133 pointer is no longer valid. As for the value of sp_offset,
10134 see ix86_compute_frame_layout, which we need to match in order
10135 to pass verification of stack_pointer_offset at the end. */
10136 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10137 m->fs.sp_valid = false;
10138 }
10139
10140 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10141
10142 if (flag_stack_usage_info)
10143 {
10144 /* We start to count from ARG_POINTER. */
10145 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10146
10147 /* If it was realigned, take into account the fake frame. */
10148 if (stack_realign_drap)
10149 {
10150 if (ix86_static_chain_on_stack)
10151 stack_size += UNITS_PER_WORD;
10152
10153 if (!call_used_regs[REGNO (crtl->drap_reg)])
10154 stack_size += UNITS_PER_WORD;
10155
10156 /* This over-estimates by 1 minimal-stack-alignment-unit but
10157 mitigates that by counting in the new return address slot. */
10158 current_function_dynamic_stack_size
10159 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10160 }
10161
10162 current_function_static_stack_size = stack_size;
10163 }
10164
10165 /* The stack has already been decremented by the instruction calling us
10166 so probe if the size is non-negative to preserve the protection area. */
10167 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10168 {
10169 /* We expect the registers to be saved when probes are used. */
10170 gcc_assert (int_registers_saved);
10171
10172 if (STACK_CHECK_MOVING_SP)
10173 {
10174 ix86_adjust_stack_and_probe (allocate);
10175 allocate = 0;
10176 }
10177 else
10178 {
10179 HOST_WIDE_INT size = allocate;
10180
10181 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10182 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10183
10184 if (TARGET_STACK_PROBE)
10185 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10186 else
10187 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10188 }
10189 }
10190
10191 if (allocate == 0)
10192 ;
10193 else if (!ix86_target_stack_probe ()
10194 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10195 {
10196 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10197 GEN_INT (-allocate), -1,
10198 m->fs.cfa_reg == stack_pointer_rtx);
10199 }
10200 else
10201 {
10202 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10203 rtx r10 = NULL;
10204 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10205
10206 bool eax_live = false;
10207 bool r10_live = false;
10208
10209 if (TARGET_64BIT)
10210 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10211 if (!TARGET_64BIT_MS_ABI)
10212 eax_live = ix86_eax_live_at_start_p ();
10213
10214 if (eax_live)
10215 {
10216 emit_insn (gen_push (eax));
10217 allocate -= UNITS_PER_WORD;
10218 }
10219 if (r10_live)
10220 {
10221 r10 = gen_rtx_REG (Pmode, R10_REG);
10222 emit_insn (gen_push (r10));
10223 allocate -= UNITS_PER_WORD;
10224 }
10225
10226 emit_move_insn (eax, GEN_INT (allocate));
10227 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10228
10229 /* Use the fact that AX still contains ALLOCATE. */
10230 adjust_stack_insn = (TARGET_64BIT
10231 ? gen_pro_epilogue_adjust_stack_di_sub
10232 : gen_pro_epilogue_adjust_stack_si_sub);
10233
10234 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10235 stack_pointer_rtx, eax));
10236
10237 /* Note that SEH directives need to continue tracking the stack
10238 pointer even after the frame pointer has been set up. */
10239 if (m->fs.cfa_reg == stack_pointer_rtx || TARGET_SEH)
10240 {
10241 if (m->fs.cfa_reg == stack_pointer_rtx)
10242 m->fs.cfa_offset += allocate;
10243
10244 RTX_FRAME_RELATED_P (insn) = 1;
10245 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10246 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10247 plus_constant (stack_pointer_rtx,
10248 -allocate)));
10249 }
10250 m->fs.sp_offset += allocate;
10251
10252 if (r10_live && eax_live)
10253 {
10254 t = choose_baseaddr (m->fs.sp_offset - allocate);
10255 emit_move_insn (r10, gen_frame_mem (Pmode, t));
10256 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10257 emit_move_insn (eax, gen_frame_mem (Pmode, t));
10258 }
10259 else if (eax_live || r10_live)
10260 {
10261 t = choose_baseaddr (m->fs.sp_offset - allocate);
10262 emit_move_insn ((eax_live ? eax : r10), gen_frame_mem (Pmode, t));
10263 }
10264 }
10265 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10266
10267 /* If we haven't already set up the frame pointer, do so now. */
10268 if (frame_pointer_needed && !m->fs.fp_valid)
10269 {
10270 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10271 GEN_INT (frame.stack_pointer_offset
10272 - frame.hard_frame_pointer_offset));
10273 insn = emit_insn (insn);
10274 RTX_FRAME_RELATED_P (insn) = 1;
10275 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10276
10277 if (m->fs.cfa_reg == stack_pointer_rtx)
10278 m->fs.cfa_reg = hard_frame_pointer_rtx;
10279 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10280 m->fs.fp_valid = true;
10281 }
10282
10283 if (!int_registers_saved)
10284 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10285 if (frame.nsseregs)
10286 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10287
10288 pic_reg_used = false;
10289 if (pic_offset_table_rtx
10290 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10291 || crtl->profile))
10292 {
10293 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10294
10295 if (alt_pic_reg_used != INVALID_REGNUM)
10296 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10297
10298 pic_reg_used = true;
10299 }
10300
10301 if (pic_reg_used)
10302 {
10303 if (TARGET_64BIT)
10304 {
10305 if (ix86_cmodel == CM_LARGE_PIC)
10306 {
10307 rtx tmp_reg = gen_rtx_REG (DImode, R11_REG);
10308 rtx label = gen_label_rtx ();
10309 emit_label (label);
10310 LABEL_PRESERVE_P (label) = 1;
10311 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10312 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx, label));
10313 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10314 insn = emit_insn (gen_adddi3 (pic_offset_table_rtx,
10315 pic_offset_table_rtx, tmp_reg));
10316 }
10317 else
10318 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10319 }
10320 else
10321 {
10322 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10323 RTX_FRAME_RELATED_P (insn) = 1;
10324 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10325 }
10326 }
10327
10328 /* In the pic_reg_used case, make sure that the got load isn't deleted
10329 when mcount needs it. Blockage to avoid call movement across mcount
10330 call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10331 note. */
10332 if (crtl->profile && !flag_fentry && pic_reg_used)
10333 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10334
10335 if (crtl->drap_reg && !crtl->stack_realign_needed)
10336 {
10337 /* vDRAP is set up, but after reload it turns out stack realignment
10338 isn't necessary; here we emit prologue code to set up DRAP
10339 without the stack realignment adjustment. */
10340 t = choose_baseaddr (0);
10341 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10342 }
10343
10344 /* Prevent instructions from being scheduled into the register save push
10345 sequence when access to the red zone area is done through the frame pointer.
10346 The offset between the frame pointer and the stack pointer is calculated
10347 relative to the value of the stack pointer at the end of the function
10348 prologue, and moving instructions that access redzone area via frame
10349 pointer inside push sequence violates this assumption. */
10350 if (frame_pointer_needed && frame.red_zone_size)
10351 emit_insn (gen_memory_blockage ());
10352
10353 /* Emit cld instruction if stringops are used in the function. */
10354 if (TARGET_CLD && ix86_current_function_needs_cld)
10355 emit_insn (gen_cld ());
10356
10357 /* SEH requires that the prologue end within 256 bytes of the start of
10358 the function. Prevent instruction schedules that would extend that.
10359 Further, prevent alloca modifications to the stack pointer from being
10360 combined with prologue modifications. */
10361 if (TARGET_SEH)
10362 emit_insn (gen_prologue_use (stack_pointer_rtx));
10363 }
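/* For a typical 32-bit function that needs a frame pointer, no stack
   realignment and no stack probing, the code above boils down to the
   familiar sequence (illustrative only):

       pushl %ebp              ; save the caller's frame pointer
       movl  %esp, %ebp        ; set up the new frame pointer
       pushl %ebx              ; callee-saved registers, if used
       subl  $N, %esp          ; allocate locals + outgoing args

   with the DRAP, SSE-save, ms_hook and PIC register paths layered on top
   only when the corresponding conditions hold.  */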
10364
10365 /* Emit code to restore REG using a POP insn. */
10366
10367 static void
10368 ix86_emit_restore_reg_using_pop (rtx reg)
10369 {
10370 struct machine_function *m = cfun->machine;
10371 rtx insn = emit_insn (gen_pop (reg));
10372
10373 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10374 m->fs.sp_offset -= UNITS_PER_WORD;
10375
10376 if (m->fs.cfa_reg == crtl->drap_reg
10377 && REGNO (reg) == REGNO (crtl->drap_reg))
10378 {
10379 /* Previously we'd represented the CFA as an expression
10380 like *(%ebp - 8). We've just popped that value from
10381 the stack, which means we need to reset the CFA to
10382 the drap register. This will remain until we restore
10383 the stack pointer. */
10384 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10385 RTX_FRAME_RELATED_P (insn) = 1;
10386
10387 /* This means that the DRAP register is valid for addressing too. */
10388 m->fs.drap_valid = true;
10389 return;
10390 }
10391
10392 if (m->fs.cfa_reg == stack_pointer_rtx)
10393 {
10394 rtx x = plus_constant (stack_pointer_rtx, UNITS_PER_WORD);
10395 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10396 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10397 RTX_FRAME_RELATED_P (insn) = 1;
10398
10399 m->fs.cfa_offset -= UNITS_PER_WORD;
10400 }
10401
10402 /* When the frame pointer is the CFA, and we pop it, we are
10403 swapping back to the stack pointer as the CFA. This happens
10404 for stack frames that don't allocate other data, so we assume
10405 the stack pointer is now pointing at the return address, i.e.
10406 the function entry state, which makes the offset be 1 word. */
10407 if (reg == hard_frame_pointer_rtx)
10408 {
10409 m->fs.fp_valid = false;
10410 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10411 {
10412 m->fs.cfa_reg = stack_pointer_rtx;
10413 m->fs.cfa_offset -= UNITS_PER_WORD;
10414
10415 add_reg_note (insn, REG_CFA_DEF_CFA,
10416 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10417 GEN_INT (m->fs.cfa_offset)));
10418 RTX_FRAME_RELATED_P (insn) = 1;
10419 }
10420 }
10421 }
10422
10423 /* Emit code to restore saved registers using POP insns. */
10424
10425 static void
10426 ix86_emit_restore_regs_using_pop (void)
10427 {
10428 unsigned int regno;
10429
10430 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10431 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10432 ix86_emit_restore_reg_using_pop (gen_rtx_REG (Pmode, regno));
10433 }
10434
10435 /* Emit code and notes for the LEAVE instruction. */
10436
10437 static void
10438 ix86_emit_leave (void)
10439 {
10440 struct machine_function *m = cfun->machine;
10441 rtx insn = emit_insn (ix86_gen_leave ());
10442
10443 ix86_add_queued_cfa_restore_notes (insn);
10444
10445 gcc_assert (m->fs.fp_valid);
10446 m->fs.sp_valid = true;
10447 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10448 m->fs.fp_valid = false;
10449
10450 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10451 {
10452 m->fs.cfa_reg = stack_pointer_rtx;
10453 m->fs.cfa_offset = m->fs.sp_offset;
10454
10455 add_reg_note (insn, REG_CFA_DEF_CFA,
10456 plus_constant (stack_pointer_rtx, m->fs.sp_offset));
10457 RTX_FRAME_RELATED_P (insn) = 1;
10458 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10459 m->fs.fp_offset);
10460 }
10461 }
10462
10463 /* Emit code to restore saved registers using MOV insns.
10464 First register is restored from CFA - CFA_OFFSET. */
10465 static void
10466 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10467 bool maybe_eh_return)
10468 {
10469 struct machine_function *m = cfun->machine;
10470 unsigned int regno;
10471
10472 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10473 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10474 {
10475 rtx reg = gen_rtx_REG (Pmode, regno);
10476 rtx insn, mem;
10477
10478 mem = choose_baseaddr (cfa_offset);
10479 mem = gen_frame_mem (Pmode, mem);
10480 insn = emit_move_insn (reg, mem);
10481
10482 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10483 {
10484 /* Previously we'd represented the CFA as an expression
10485 	    like *(%ebp - 8).  We've just restored that value from
10486 the stack, which means we need to reset the CFA to
10487 the drap register. This will remain until we restore
10488 the stack pointer. */
10489 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10490 RTX_FRAME_RELATED_P (insn) = 1;
10491
10492 /* This means that the DRAP register is valid for addressing. */
10493 m->fs.drap_valid = true;
10494 }
10495 else
10496 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10497
10498 cfa_offset -= UNITS_PER_WORD;
10499 }
10500 }
10501
10502 /* Emit code to restore saved SSE registers using MOV insns.
10503 First register is restored from CFA - CFA_OFFSET. */
10504 static void
10505 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10506 bool maybe_eh_return)
10507 {
10508 unsigned int regno;
10509
10510 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10511 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10512 {
10513 rtx reg = gen_rtx_REG (V4SFmode, regno);
10514 rtx mem;
10515
10516 mem = choose_baseaddr (cfa_offset);
10517 mem = gen_rtx_MEM (V4SFmode, mem);
10518 set_mem_align (mem, 128);
10519 emit_move_insn (reg, mem);
10520
10521 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10522
10523 cfa_offset -= 16;
10524 }
10525 }
10526
10527 /* Restore function stack, frame, and registers. */
10528
10529 void
10530 ix86_expand_epilogue (int style)
10531 {
10532 struct machine_function *m = cfun->machine;
10533 struct machine_frame_state frame_state_save = m->fs;
10534 struct ix86_frame frame;
10535 bool restore_regs_via_mov;
10536 bool using_drap;
10537
10538 ix86_finalize_stack_realign_flags ();
10539 ix86_compute_frame_layout (&frame);
10540
10541 m->fs.sp_valid = (!frame_pointer_needed
10542 || (current_function_sp_is_unchanging
10543 && !stack_realign_fp));
10544 gcc_assert (!m->fs.sp_valid
10545 || m->fs.sp_offset == frame.stack_pointer_offset);
10546
10547 /* The FP must be valid if the frame pointer is present. */
10548 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
10549 gcc_assert (!m->fs.fp_valid
10550 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
10551
10552 /* We must have *some* valid pointer to the stack frame. */
10553 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
10554
10555 /* The DRAP is never valid at this point. */
10556 gcc_assert (!m->fs.drap_valid);
10557
10558 /* See the comment about red zone and frame
10559 pointer usage in ix86_expand_prologue. */
10560 if (frame_pointer_needed && frame.red_zone_size)
10561 emit_insn (gen_memory_blockage ());
10562
10563 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
10564 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
10565
10566 /* Determine the CFA offset of the end of the red-zone. */
10567 m->fs.red_zone_offset = 0;
10568 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
10569 {
10570 /* The red-zone begins below the return address. */
10571 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
10572
10573 /* When the register save area is in the aligned portion of
10574 the stack, determine the maximum runtime displacement that
10575 matches up with the aligned frame. */
10576 if (stack_realign_drap)
10577 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
10578 + UNITS_PER_WORD);
10579 }
10580
10581 /* Special care must be taken for the normal return case of a function
10582 using eh_return: the eax and edx registers are marked as saved, but
10583 not restored along this path. Adjust the save location to match. */
10584 if (crtl->calls_eh_return && style != 2)
10585 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
10586
10587 /* EH_RETURN requires the use of moves to function properly. */
10588 if (crtl->calls_eh_return)
10589 restore_regs_via_mov = true;
10590 /* SEH requires the use of pops to identify the epilogue. */
10591 else if (TARGET_SEH)
10592 restore_regs_via_mov = false;
10593   /* If we're only restoring one register and sp is not valid, then
10594      use a move instruction to restore the register, since it's
10595 less work than reloading sp and popping the register. */
10596 else if (!m->fs.sp_valid && frame.nregs <= 1)
10597 restore_regs_via_mov = true;
10598 else if (TARGET_EPILOGUE_USING_MOVE
10599 && cfun->machine->use_fast_prologue_epilogue
10600 && (frame.nregs > 1
10601 || m->fs.sp_offset != frame.reg_save_offset))
10602 restore_regs_via_mov = true;
10603 else if (frame_pointer_needed
10604 && !frame.nregs
10605 && m->fs.sp_offset != frame.reg_save_offset)
10606 restore_regs_via_mov = true;
10607 else if (frame_pointer_needed
10608 && TARGET_USE_LEAVE
10609 && cfun->machine->use_fast_prologue_epilogue
10610 && frame.nregs == 1)
10611 restore_regs_via_mov = true;
10612 else
10613 restore_regs_via_mov = false;
10614
10615 if (restore_regs_via_mov || frame.nsseregs)
10616 {
10617 /* Ensure that the entire register save area is addressable via
10618 the stack pointer, if we will restore via sp. */
10619 if (TARGET_64BIT
10620 && m->fs.sp_offset > 0x7fffffff
10621 && !(m->fs.fp_valid || m->fs.drap_valid)
10622 && (frame.nsseregs + frame.nregs) != 0)
10623 {
10624 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10625 GEN_INT (m->fs.sp_offset
10626 - frame.sse_reg_save_offset),
10627 style,
10628 m->fs.cfa_reg == stack_pointer_rtx);
10629 }
10630 }
10631
10632 /* If there are any SSE registers to restore, then we have to do it
10633 via moves, since there's obviously no pop for SSE regs. */
10634 if (frame.nsseregs)
10635 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
10636 style == 2);
10637
10638 if (restore_regs_via_mov)
10639 {
10640 rtx t;
10641
10642 if (frame.nregs)
10643 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
10644
10645 /* eh_return epilogues need %ecx added to the stack pointer. */
10646 if (style == 2)
10647 {
10648 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
10649
10650 /* Stack align doesn't work with eh_return. */
10651 gcc_assert (!stack_realign_drap);
10652 	  /* Neither do regparm nested functions.  */
10653 gcc_assert (!ix86_static_chain_on_stack);
10654
10655 if (frame_pointer_needed)
10656 {
10657 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
10658 t = plus_constant (t, m->fs.fp_offset - UNITS_PER_WORD);
10659 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
10660
10661 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
10662 insn = emit_move_insn (hard_frame_pointer_rtx, t);
10663
10664 /* Note that we use SA as a temporary CFA, as the return
10665 address is at the proper place relative to it. We
10666 pretend this happens at the FP restore insn because
10667 prior to this insn the FP would be stored at the wrong
10668 offset relative to SA, and after this insn we have no
10669 other reasonable register to use for the CFA. We don't
10670 bother resetting the CFA to the SP for the duration of
10671 the return insn. */
10672 add_reg_note (insn, REG_CFA_DEF_CFA,
10673 plus_constant (sa, UNITS_PER_WORD));
10674 ix86_add_queued_cfa_restore_notes (insn);
10675 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
10676 RTX_FRAME_RELATED_P (insn) = 1;
10677
10678 m->fs.cfa_reg = sa;
10679 m->fs.cfa_offset = UNITS_PER_WORD;
10680 m->fs.fp_valid = false;
10681
10682 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
10683 const0_rtx, style, false);
10684 }
10685 else
10686 {
10687 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
10688 t = plus_constant (t, m->fs.sp_offset - UNITS_PER_WORD);
10689 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
10690 ix86_add_queued_cfa_restore_notes (insn);
10691
10692 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
10693 if (m->fs.cfa_offset != UNITS_PER_WORD)
10694 {
10695 m->fs.cfa_offset = UNITS_PER_WORD;
10696 add_reg_note (insn, REG_CFA_DEF_CFA,
10697 plus_constant (stack_pointer_rtx,
10698 UNITS_PER_WORD));
10699 RTX_FRAME_RELATED_P (insn) = 1;
10700 }
10701 }
10702 m->fs.sp_offset = UNITS_PER_WORD;
10703 m->fs.sp_valid = true;
10704 }
10705 }
10706 else
10707 {
10708 /* SEH requires that the function end with (1) a stack adjustment
10709 if necessary, (2) a sequence of pops, and (3) a return or
10710 jump instruction. Prevent insns from the function body from
10711 being scheduled into this sequence. */
10712 if (TARGET_SEH)
10713 {
10714 /* Prevent a catch region from being adjacent to the standard
10715 	     epilogue sequence.  Unfortunately, neither crtl->uses_eh_lsda nor
10716 	     several other flags that would be interesting to test are
10717 	     set up yet.  */
10718 if (flag_non_call_exceptions)
10719 emit_insn (gen_nops (const1_rtx));
10720 else
10721 emit_insn (gen_blockage ());
10722 }
10723
10724 /* First step is to deallocate the stack frame so that we can
10725 pop the registers. */
10726 if (!m->fs.sp_valid)
10727 {
10728 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
10729 GEN_INT (m->fs.fp_offset
10730 - frame.reg_save_offset),
10731 style, false);
10732 }
10733 else if (m->fs.sp_offset != frame.reg_save_offset)
10734 {
10735 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10736 GEN_INT (m->fs.sp_offset
10737 - frame.reg_save_offset),
10738 style,
10739 m->fs.cfa_reg == stack_pointer_rtx);
10740 }
10741
10742 ix86_emit_restore_regs_using_pop ();
10743 }
10744
10745   /* If we used a frame pointer and haven't already got rid of it,
10746      then do so now.  */
10747 if (m->fs.fp_valid)
10748 {
10749 /* If the stack pointer is valid and pointing at the frame
10750 pointer store address, then we only need a pop. */
10751 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
10752 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10753 /* Leave results in shorter dependency chains on CPUs that are
10754 able to grok it fast. */
10755 else if (TARGET_USE_LEAVE
10756 || optimize_function_for_size_p (cfun)
10757 || !cfun->machine->use_fast_prologue_epilogue)
10758 ix86_emit_leave ();
10759 else
10760 {
10761 pro_epilogue_adjust_stack (stack_pointer_rtx,
10762 hard_frame_pointer_rtx,
10763 const0_rtx, style, !using_drap);
10764 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10765 }
10766 }
10767
10768 if (using_drap)
10769 {
10770 int param_ptr_offset = UNITS_PER_WORD;
10771 rtx insn;
10772
10773 gcc_assert (stack_realign_drap);
10774
10775 if (ix86_static_chain_on_stack)
10776 param_ptr_offset += UNITS_PER_WORD;
10777 if (!call_used_regs[REGNO (crtl->drap_reg)])
10778 param_ptr_offset += UNITS_PER_WORD;
10779
10780 insn = emit_insn (gen_rtx_SET
10781 (VOIDmode, stack_pointer_rtx,
10782 gen_rtx_PLUS (Pmode,
10783 crtl->drap_reg,
10784 GEN_INT (-param_ptr_offset))));
10785 m->fs.cfa_reg = stack_pointer_rtx;
10786 m->fs.cfa_offset = param_ptr_offset;
10787 m->fs.sp_offset = param_ptr_offset;
10788 m->fs.realigned = false;
10789
10790 add_reg_note (insn, REG_CFA_DEF_CFA,
10791 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10792 GEN_INT (param_ptr_offset)));
10793 RTX_FRAME_RELATED_P (insn) = 1;
10794
10795 if (!call_used_regs[REGNO (crtl->drap_reg)])
10796 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
10797 }
10798
10799 /* At this point the stack pointer must be valid, and we must have
10800 restored all of the registers. We may not have deallocated the
10801 entire stack frame. We've delayed this until now because it may
10802 be possible to merge the local stack deallocation with the
10803 deallocation forced by ix86_static_chain_on_stack. */
10804 gcc_assert (m->fs.sp_valid);
10805 gcc_assert (!m->fs.fp_valid);
10806 gcc_assert (!m->fs.realigned);
10807 if (m->fs.sp_offset != UNITS_PER_WORD)
10808 {
10809 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10810 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
10811 style, true);
10812 }
10813 else
10814 ix86_add_queued_cfa_restore_notes (get_last_insn ());
10815
10816 /* Sibcall epilogues don't want a return instruction. */
10817 if (style == 0)
10818 {
10819 m->fs = frame_state_save;
10820 return;
10821 }
10822
10823 /* Emit vzeroupper if needed. */
10824 if (TARGET_VZEROUPPER
10825 && !TREE_THIS_VOLATILE (cfun->decl)
10826 && !cfun->machine->caller_return_avx256_p)
10827 emit_insn (gen_avx_vzeroupper (GEN_INT (call_no_avx256)));
10828
10829 if (crtl->args.pops_args && crtl->args.size)
10830 {
10831 rtx popc = GEN_INT (crtl->args.pops_args);
10832
10833 /* i386 can only pop 64K bytes. If asked to pop more, pop return
10834 address, do explicit add, and jump indirectly to the caller. */
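      /* For illustration, the oversized-pop path below emits roughly

	     popl  %ecx          # recover the return address
	     addl  $N, %esp      # release the N-byte argument area
	     jmp   *%ecx         # return to the caller

	 whereas the common case further below is a single "ret $N".  */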
10835
10836 if (crtl->args.pops_args >= 65536)
10837 {
10838 rtx ecx = gen_rtx_REG (SImode, CX_REG);
10839 rtx insn;
10840
10841 /* There is no "pascal" calling convention in any 64bit ABI. */
10842 gcc_assert (!TARGET_64BIT);
10843
10844 insn = emit_insn (gen_pop (ecx));
10845 m->fs.cfa_offset -= UNITS_PER_WORD;
10846 m->fs.sp_offset -= UNITS_PER_WORD;
10847
10848 add_reg_note (insn, REG_CFA_ADJUST_CFA,
10849 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
10850 add_reg_note (insn, REG_CFA_REGISTER,
10851 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
10852 RTX_FRAME_RELATED_P (insn) = 1;
10853
10854 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10855 popc, -1, true);
10856 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
10857 }
10858 else
10859 emit_jump_insn (gen_simple_return_pop_internal (popc));
10860 }
10861 else
10862 emit_jump_insn (gen_simple_return_internal ());
10863
10864 /* Restore the state back to the state from the prologue,
10865 so that it's correct for the next epilogue. */
10866 m->fs = frame_state_save;
10867 }
10868
10869 /* Reset from the function's potential modifications. */
10870
10871 static void
10872 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
10873 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
10874 {
10875 if (pic_offset_table_rtx)
10876 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
10877 #if TARGET_MACHO
10878 /* Mach-O doesn't support labels at the end of objects, so if
10879 it looks like we might want one, insert a NOP. */
10880 {
10881 rtx insn = get_last_insn ();
10882 while (insn
10883 && NOTE_P (insn)
10884 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
10885 insn = PREV_INSN (insn);
10886 if (insn
10887 && (LABEL_P (insn)
10888 || (NOTE_P (insn)
10889 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
10890 fputs ("\tnop\n", file);
10891 }
10892 #endif
10893
10894 }
10895
10896 /* Return a scratch register to use in the split stack prologue. The
10897 split stack prologue is used for -fsplit-stack. It is the first
10898 instructions in the function, even before the regular prologue.
10899 The scratch register can be any caller-saved register which is not
10900 used for parameters or for the static chain. */
10901
10902 static unsigned int
10903 split_stack_prologue_scratch_regno (void)
10904 {
10905 if (TARGET_64BIT)
10906 return R11_REG;
10907 else
10908 {
10909 bool is_fastcall;
10910 int regparm;
10911
10912 is_fastcall = (lookup_attribute ("fastcall",
10913 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
10914 != NULL);
10915 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
10916
10917 if (is_fastcall)
10918 {
10919 if (DECL_STATIC_CHAIN (cfun->decl))
10920 {
10921 sorry ("-fsplit-stack does not support fastcall with "
10922 "nested function");
10923 return INVALID_REGNUM;
10924 }
10925 return AX_REG;
10926 }
10927 else if (regparm < 3)
10928 {
10929 if (!DECL_STATIC_CHAIN (cfun->decl))
10930 return CX_REG;
10931 else
10932 {
10933 if (regparm >= 2)
10934 {
10935 sorry ("-fsplit-stack does not support 2 register "
10936 		 "parameters for a nested function");
10937 return INVALID_REGNUM;
10938 }
10939 return DX_REG;
10940 }
10941 }
10942 else
10943 {
10944 /* FIXME: We could make this work by pushing a register
10945 around the addition and comparison. */
10946 sorry ("-fsplit-stack does not support 3 register parameters");
10947 return INVALID_REGNUM;
10948 }
10949 }
10950 }
10951
10952 /* A SYMBOL_REF for the function which allocates new stack space for
10953 -fsplit-stack. */
10954
10955 static GTY(()) rtx split_stack_fn;
10956
10957 /* A SYMBOL_REF for the variant of __morestack used when the large
10958    code model is in effect.  */
10959
10960 static GTY(()) rtx split_stack_fn_large;
10961
10962 /* Handle -fsplit-stack. These are the first instructions in the
10963 function, even before the regular prologue. */
10964
10965 void
10966 ix86_expand_split_stack_prologue (void)
10967 {
10968 struct ix86_frame frame;
10969 HOST_WIDE_INT allocate;
10970 unsigned HOST_WIDE_INT args_size;
10971 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
10972 rtx scratch_reg = NULL_RTX;
10973 rtx varargs_label = NULL_RTX;
10974 rtx fn;
10975
10976 gcc_assert (flag_split_stack && reload_completed);
10977
10978 ix86_finalize_stack_realign_flags ();
10979 ix86_compute_frame_layout (&frame);
10980 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
10981
10982 /* This is the label we will branch to if we have enough stack
10983 space. We expect the basic block reordering pass to reverse this
10984 branch if optimizing, so that we branch in the unlikely case. */
10985 label = gen_label_rtx ();
10986
10987 /* We need to compare the stack pointer minus the frame size with
10988 the stack boundary in the TCB. The stack boundary always gives
10989 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
10990 can compare directly. Otherwise we need to do an addition. */
10991
10992 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
10993 UNSPEC_STACK_CHECK);
10994 limit = gen_rtx_CONST (Pmode, limit);
10995 limit = gen_rtx_MEM (Pmode, limit);
10996 if (allocate < SPLIT_STACK_AVAILABLE)
10997 current = stack_pointer_rtx;
10998 else
10999 {
11000 unsigned int scratch_regno;
11001 rtx offset;
11002
11003 /* We need a scratch register to hold the stack pointer minus
11004 the required frame size. Since this is the very start of the
11005 function, the scratch register can be any caller-saved
11006 register which is not used for parameters. */
11007 offset = GEN_INT (- allocate);
11008 scratch_regno = split_stack_prologue_scratch_regno ();
11009 if (scratch_regno == INVALID_REGNUM)
11010 return;
11011 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11012 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11013 {
11014 /* We don't use ix86_gen_add3 in this case because it will
11015 want to split to lea, but when not optimizing the insn
11016 will not be split after this point. */
11017 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11018 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11019 offset)));
11020 }
11021 else
11022 {
11023 emit_move_insn (scratch_reg, offset);
11024 emit_insn (gen_adddi3 (scratch_reg, scratch_reg,
11025 stack_pointer_rtx));
11026 }
11027 current = scratch_reg;
11028 }
11029
11030 ix86_expand_branch (GEU, current, limit, label);
11031 jump_insn = get_last_insn ();
11032 JUMP_LABEL (jump_insn) = label;
11033
11034 /* Mark the jump as very likely to be taken. */
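  /* REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100 corresponds to a 99%
     probability that the branch is taken.  */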
11035 add_reg_note (jump_insn, REG_BR_PROB,
11036 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
11037
11038 if (split_stack_fn == NULL_RTX)
11039 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11040 fn = split_stack_fn;
11041
11042 /* Get more stack space. We pass in the desired stack space and the
11043 size of the arguments to copy to the new stack. In 32-bit mode
11044 we push the parameters; __morestack will return on a new stack
11045 anyhow. In 64-bit mode we pass the parameters in r10 and
11046 r11. */
11047 allocate_rtx = GEN_INT (allocate);
11048 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11049 call_fusage = NULL_RTX;
11050 if (TARGET_64BIT)
11051 {
11052 rtx reg10, reg11;
11053
11054 reg10 = gen_rtx_REG (Pmode, R10_REG);
11055 reg11 = gen_rtx_REG (Pmode, R11_REG);
11056
11057 /* If this function uses a static chain, it will be in %r10.
11058 Preserve it across the call to __morestack. */
11059 if (DECL_STATIC_CHAIN (cfun->decl))
11060 {
11061 rtx rax;
11062
11063 rax = gen_rtx_REG (Pmode, AX_REG);
11064 emit_move_insn (rax, reg10);
11065 use_reg (&call_fusage, rax);
11066 }
11067
11068 if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11069 {
11070 HOST_WIDE_INT argval;
11071
11072 /* When using the large model we need to load the address
11073 into a register, and we've run out of registers. So we
11074 switch to a different calling convention, and we call a
11075 different function: __morestack_large. We pass the
11076 argument size in the upper 32 bits of r10 and pass the
11077 frame size in the lower 32 bits. */
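	  /* For example, args_size == 0x18 and allocate == 0x200 give an
	     argval of 0x0000001800000200 below.  */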
11078 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11079 gcc_assert ((args_size & 0xffffffff) == args_size);
11080
11081 if (split_stack_fn_large == NULL_RTX)
11082 split_stack_fn_large =
11083 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11084
11085 if (ix86_cmodel == CM_LARGE_PIC)
11086 {
11087 rtx label, x;
11088
11089 label = gen_label_rtx ();
11090 emit_label (label);
11091 LABEL_PRESERVE_P (label) = 1;
11092 emit_insn (gen_set_rip_rex64 (reg10, label));
11093 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11094 emit_insn (gen_adddi3 (reg10, reg10, reg11));
11095 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11096 UNSPEC_GOT);
11097 x = gen_rtx_CONST (Pmode, x);
11098 emit_move_insn (reg11, x);
11099 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11100 x = gen_const_mem (Pmode, x);
11101 emit_move_insn (reg11, x);
11102 }
11103 else
11104 emit_move_insn (reg11, split_stack_fn_large);
11105
11106 fn = reg11;
11107
11108 argval = ((args_size << 16) << 16) + allocate;
11109 emit_move_insn (reg10, GEN_INT (argval));
11110 }
11111 else
11112 {
11113 emit_move_insn (reg10, allocate_rtx);
11114 emit_move_insn (reg11, GEN_INT (args_size));
11115 use_reg (&call_fusage, reg11);
11116 }
11117
11118 use_reg (&call_fusage, reg10);
11119 }
11120 else
11121 {
11122 emit_insn (gen_push (GEN_INT (args_size)));
11123 emit_insn (gen_push (allocate_rtx));
11124 }
11125 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11126 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11127 NULL_RTX, false);
11128 add_function_usage_to (call_insn, call_fusage);
11129
11130 /* In order to make call/return prediction work right, we now need
11131 to execute a return instruction. See
11132 libgcc/config/i386/morestack.S for the details on how this works.
11133
11134 For flow purposes gcc must not see this as a return
11135 instruction--we need control flow to continue at the subsequent
11136 label. Therefore, we use an unspec. */
11137 gcc_assert (crtl->args.pops_args < 65536);
11138 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11139
11140 /* If we are in 64-bit mode and this function uses a static chain,
11141      we saved %r10 in %rax before calling __morestack.  */
11142 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11143 emit_move_insn (gen_rtx_REG (Pmode, R10_REG),
11144 gen_rtx_REG (Pmode, AX_REG));
11145
11146 /* If this function calls va_start, we need to store a pointer to
11147 the arguments on the old stack, because they may not have been
11148 all copied to the new stack. At this point the old stack can be
11149 found at the frame pointer value used by __morestack, because
11150 __morestack has set that up before calling back to us. Here we
11151 store that pointer in a scratch register, and in
11152 ix86_expand_prologue we store the scratch register in a stack
11153 slot. */
11154 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11155 {
11156 unsigned int scratch_regno;
11157 rtx frame_reg;
11158 int words;
11159
11160 scratch_regno = split_stack_prologue_scratch_regno ();
11161 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11162 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11163
11164 /* 64-bit:
11165 fp -> old fp value
11166 return address within this function
11167 return address of caller of this function
11168 stack arguments
11169 So we add three words to get to the stack arguments.
11170
11171 32-bit:
11172 fp -> old fp value
11173 return address within this function
11174 first argument to __morestack
11175 second argument to __morestack
11176 return address of caller of this function
11177 stack arguments
11178 So we add five words to get to the stack arguments.
11179 */
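      /* For example, with UNITS_PER_WORD == 8 in 64-bit mode the scratch
	 register ends up holding %rbp + 24; with UNITS_PER_WORD == 4 in
	 32-bit mode it holds %ebp + 20.  */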
11180 words = TARGET_64BIT ? 3 : 5;
11181 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11182 gen_rtx_PLUS (Pmode, frame_reg,
11183 GEN_INT (words * UNITS_PER_WORD))));
11184
11185 varargs_label = gen_label_rtx ();
11186 emit_jump_insn (gen_jump (varargs_label));
11187 JUMP_LABEL (get_last_insn ()) = varargs_label;
11188
11189 emit_barrier ();
11190 }
11191
11192 emit_label (label);
11193 LABEL_NUSES (label) = 1;
11194
11195 /* If this function calls va_start, we now have to set the scratch
11196 register for the case where we do not call __morestack. In this
11197 case we need to set it based on the stack pointer. */
11198 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11199 {
11200 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11201 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11202 GEN_INT (UNITS_PER_WORD))));
11203
11204 emit_label (varargs_label);
11205 LABEL_NUSES (varargs_label) = 1;
11206 }
11207 }
11208
11209 /* We may have to tell the dataflow pass that the split stack prologue
11210 is initializing a scratch register. */
11211
11212 static void
11213 ix86_live_on_entry (bitmap regs)
11214 {
11215 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11216 {
11217 gcc_assert (flag_split_stack);
11218 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11219 }
11220 }
11221 \f
11222 /* Determine if OP is a suitable SUBREG RTX for an address.  */
11223
11224 static bool
11225 ix86_address_subreg_operand (rtx op)
11226 {
11227 enum machine_mode mode;
11228
11229 if (!REG_P (op))
11230 return false;
11231
11232 mode = GET_MODE (op);
11233
11234 if (GET_MODE_CLASS (mode) != MODE_INT)
11235 return false;
11236
11237 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11238 failures when the register is one word out of a two word structure. */
11239 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11240 return false;
11241
11242 /* Allow only SUBREGs of non-eliminable hard registers. */
11243 return register_no_elim_operand (op, mode);
11244 }
11245
11246 /* Extract the parts of an RTL expression that is a valid memory address
11247 for an instruction. Return 0 if the structure of the address is
11248 grossly off. Return -1 if the address contains ASHIFT, so it is not
11249    strictly valid, but still used for computing the length of the lea instruction.  */
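/* For example, the address %eax + %ebx*4 + 12 (12(%eax,%ebx,4) in AT&T
   syntax) decomposes into base %eax, index %ebx, scale 4 and
   displacement 12.  */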
11250
11251 int
11252 ix86_decompose_address (rtx addr, struct ix86_address *out)
11253 {
11254 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11255 rtx base_reg, index_reg;
11256 HOST_WIDE_INT scale = 1;
11257 rtx scale_rtx = NULL_RTX;
11258 rtx tmp;
11259 int retval = 1;
11260 enum ix86_address_seg seg = SEG_DEFAULT;
11261
11262 /* Allow zero-extended SImode addresses,
11263 they will be emitted with addr32 prefix. */
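  /* (The addr32 prefix is the 0x67 address-size override; in 64-bit mode
     it makes only the low 32 bits of the registers participate in the
     effective address.)  */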
11264 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11265 {
11266 if (GET_CODE (addr) == ZERO_EXTEND
11267 && GET_MODE (XEXP (addr, 0)) == SImode)
11268 addr = XEXP (addr, 0);
11269 else if (GET_CODE (addr) == AND
11270 && const_32bit_mask (XEXP (addr, 1), DImode))
11271 {
11272 addr = XEXP (addr, 0);
11273
11274 /* Strip subreg. */
11275 if (GET_CODE (addr) == SUBREG
11276 && GET_MODE (SUBREG_REG (addr)) == SImode)
11277 addr = SUBREG_REG (addr);
11278 }
11279 }
11280
11281 if (REG_P (addr))
11282 base = addr;
11283 else if (GET_CODE (addr) == SUBREG)
11284 {
11285 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11286 base = addr;
11287 else
11288 return 0;
11289 }
11290 else if (GET_CODE (addr) == PLUS)
11291 {
11292 rtx addends[4], op;
11293 int n = 0, i;
11294
11295 op = addr;
11296 do
11297 {
11298 if (n >= 4)
11299 return 0;
11300 addends[n++] = XEXP (op, 1);
11301 op = XEXP (op, 0);
11302 }
11303 while (GET_CODE (op) == PLUS);
11304 if (n >= 4)
11305 return 0;
11306 addends[n] = op;
11307
11308 for (i = n; i >= 0; --i)
11309 {
11310 op = addends[i];
11311 switch (GET_CODE (op))
11312 {
11313 case MULT:
11314 if (index)
11315 return 0;
11316 index = XEXP (op, 0);
11317 scale_rtx = XEXP (op, 1);
11318 break;
11319
11320 case ASHIFT:
11321 if (index)
11322 return 0;
11323 index = XEXP (op, 0);
11324 tmp = XEXP (op, 1);
11325 if (!CONST_INT_P (tmp))
11326 return 0;
11327 scale = INTVAL (tmp);
11328 if ((unsigned HOST_WIDE_INT) scale > 3)
11329 return 0;
11330 scale = 1 << scale;
11331 break;
11332
11333 case UNSPEC:
11334 if (XINT (op, 1) == UNSPEC_TP
11335 && TARGET_TLS_DIRECT_SEG_REFS
11336 && seg == SEG_DEFAULT)
11337 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11338 else
11339 return 0;
11340 break;
11341
11342 case SUBREG:
11343 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11344 return 0;
11345 /* FALLTHRU */
11346
11347 case REG:
11348 if (!base)
11349 base = op;
11350 else if (!index)
11351 index = op;
11352 else
11353 return 0;
11354 break;
11355
11356 case CONST:
11357 case CONST_INT:
11358 case SYMBOL_REF:
11359 case LABEL_REF:
11360 if (disp)
11361 return 0;
11362 disp = op;
11363 break;
11364
11365 default:
11366 return 0;
11367 }
11368 }
11369 }
11370 else if (GET_CODE (addr) == MULT)
11371 {
11372 index = XEXP (addr, 0); /* index*scale */
11373 scale_rtx = XEXP (addr, 1);
11374 }
11375 else if (GET_CODE (addr) == ASHIFT)
11376 {
11377 /* We're called for lea too, which implements ashift on occasion. */
11378 index = XEXP (addr, 0);
11379 tmp = XEXP (addr, 1);
11380 if (!CONST_INT_P (tmp))
11381 return 0;
11382 scale = INTVAL (tmp);
11383 if ((unsigned HOST_WIDE_INT) scale > 3)
11384 return 0;
11385 scale = 1 << scale;
11386 retval = -1;
11387 }
11388 else
11389 disp = addr; /* displacement */
11390
11391 if (index)
11392 {
11393 if (REG_P (index))
11394 ;
11395 else if (GET_CODE (index) == SUBREG
11396 && ix86_address_subreg_operand (SUBREG_REG (index)))
11397 ;
11398 else
11399 return 0;
11400 }
11401
11402 /* Extract the integral value of scale. */
11403 if (scale_rtx)
11404 {
11405 if (!CONST_INT_P (scale_rtx))
11406 return 0;
11407 scale = INTVAL (scale_rtx);
11408 }
11409
11410 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11411 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11412
11413 /* Avoid useless 0 displacement. */
11414 if (disp == const0_rtx && (base || index))
11415 disp = NULL_RTX;
11416
11417   /* Allow arg pointer and stack pointer as index if there is no scaling.  */
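  /* (The stack pointer can never be an index register: SIB index 100b
     means "no index".  The arg and frame pointers may later be eliminated
     to the stack pointer, so while the scale is 1 they are moved into the
     base position here.)  */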
11418 if (base_reg && index_reg && scale == 1
11419 && (index_reg == arg_pointer_rtx
11420 || index_reg == frame_pointer_rtx
11421 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11422 {
11423 rtx tmp;
11424 tmp = base, base = index, index = tmp;
11425 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11426 }
11427
11428 /* Special case: %ebp cannot be encoded as a base without a displacement.
11429 Similarly %r13. */
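  /* (With mod 00, a base/r/m field of 101b selects a 32-bit displacement,
     or RIP-relative addressing in 64-bit mode, rather than %ebp/%r13, so a
     zero disp8 byte has to be emitted to use them as a plain base.)  */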
11430 if (!disp
11431 && base_reg
11432 && (base_reg == hard_frame_pointer_rtx
11433 || base_reg == frame_pointer_rtx
11434 || base_reg == arg_pointer_rtx
11435 || (REG_P (base_reg)
11436 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11437 || REGNO (base_reg) == R13_REG))))
11438 disp = const0_rtx;
11439
11440 /* Special case: on K6, [%esi] makes the instruction vector decoded.
11441 Avoid this by transforming to [%esi+0].
11442 Reload calls address legitimization without cfun defined, so we need
11443 to test cfun for being non-NULL. */
11444 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11445 && base_reg && !index_reg && !disp
11446 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11447 disp = const0_rtx;
11448
11449 /* Special case: encode reg+reg instead of reg*2. */
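  /* (A scaled index with no base requires a 32-bit displacement in the SIB
     encoding, while reg+reg with scale 1 normally needs no displacement,
     so the latter is shorter.)  */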
11450 if (!base && index && scale == 2)
11451 base = index, base_reg = index_reg, scale = 1;
11452
11453 /* Special case: scaling cannot be encoded without base or displacement. */
11454 if (!base && !disp && index && scale != 1)
11455 disp = const0_rtx;
11456
11457 out->base = base;
11458 out->index = index;
11459 out->disp = disp;
11460 out->scale = scale;
11461 out->seg = seg;
11462
11463 return retval;
11464 }
11465 \f
11466 /* Return the cost of the memory address X.
11467 For i386, it is better to use a complex address than let gcc copy
11468 the address into a reg and make a new pseudo. But not if the address
11469    requires two regs - that would mean more pseudos with longer
11470 lifetimes. */
11471 static int
11472 ix86_address_cost (rtx x, bool speed ATTRIBUTE_UNUSED)
11473 {
11474 struct ix86_address parts;
11475 int cost = 1;
11476 int ok = ix86_decompose_address (x, &parts);
11477
11478 gcc_assert (ok);
11479
11480 if (parts.base && GET_CODE (parts.base) == SUBREG)
11481 parts.base = SUBREG_REG (parts.base);
11482 if (parts.index && GET_CODE (parts.index) == SUBREG)
11483 parts.index = SUBREG_REG (parts.index);
11484
11485 /* Attempt to minimize number of registers in the address. */
11486 if ((parts.base
11487 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
11488 || (parts.index
11489 && (!REG_P (parts.index)
11490 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
11491 cost++;
11492
11493 if (parts.base
11494 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
11495 && parts.index
11496 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
11497 && parts.base != parts.index)
11498 cost++;
11499
11500   /* The AMD-K6 doesn't like addresses with the ModR/M byte set to 00_xxx_100b,
11501      since its predecode logic can't detect the length of instructions
11502      and it degenerates to vector decoding.  Increase the cost of such
11503      addresses here.  The penalty is a minimum of 2 cycles.  It may be
11504      worthwhile to split such addresses or even refuse them at all.
11505
11506 Following addressing modes are affected:
11507 [base+scale*index]
11508 [scale*index+disp]
11509 [base+index]
11510
11511      The first and last cases may be avoidable by explicitly coding the zero
11512      into the memory address, but I don't have an AMD-K6 machine handy to check this
11513 theory. */
11514
11515 if (TARGET_K6
11516 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
11517 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
11518 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
11519 cost += 10;
11520
11521 return cost;
11522 }
11523 \f
11524 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
11525    this is used to form addresses to local data when -fPIC is in
11526 use. */
11527
11528 static bool
11529 darwin_local_data_pic (rtx disp)
11530 {
11531 return (GET_CODE (disp) == UNSPEC
11532 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
11533 }
11534
11535 /* Determine if a given RTX is a valid constant. We already know this
11536 satisfies CONSTANT_P. */
11537
11538 static bool
11539 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
11540 {
11541 switch (GET_CODE (x))
11542 {
11543 case CONST:
11544 x = XEXP (x, 0);
11545
11546 if (GET_CODE (x) == PLUS)
11547 {
11548 if (!CONST_INT_P (XEXP (x, 1)))
11549 return false;
11550 x = XEXP (x, 0);
11551 }
11552
11553 if (TARGET_MACHO && darwin_local_data_pic (x))
11554 return true;
11555
11556 /* Only some unspecs are valid as "constants". */
11557 if (GET_CODE (x) == UNSPEC)
11558 switch (XINT (x, 1))
11559 {
11560 case UNSPEC_GOT:
11561 case UNSPEC_GOTOFF:
11562 case UNSPEC_PLTOFF:
11563 return TARGET_64BIT;
11564 case UNSPEC_TPOFF:
11565 case UNSPEC_NTPOFF:
11566 x = XVECEXP (x, 0, 0);
11567 return (GET_CODE (x) == SYMBOL_REF
11568 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11569 case UNSPEC_DTPOFF:
11570 x = XVECEXP (x, 0, 0);
11571 return (GET_CODE (x) == SYMBOL_REF
11572 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
11573 default:
11574 return false;
11575 }
11576
11577 /* We must have drilled down to a symbol. */
11578 if (GET_CODE (x) == LABEL_REF)
11579 return true;
11580 if (GET_CODE (x) != SYMBOL_REF)
11581 return false;
11582 /* FALLTHRU */
11583
11584 case SYMBOL_REF:
11585 /* TLS symbols are never valid. */
11586 if (SYMBOL_REF_TLS_MODEL (x))
11587 return false;
11588
11589 /* DLLIMPORT symbols are never valid. */
11590 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
11591 && SYMBOL_REF_DLLIMPORT_P (x))
11592 return false;
11593
11594 #if TARGET_MACHO
11595 /* mdynamic-no-pic */
11596 if (MACHO_DYNAMIC_NO_PIC_P)
11597 return machopic_symbol_defined_p (x);
11598 #endif
11599 break;
11600
11601 case CONST_DOUBLE:
11602 if (GET_MODE (x) == TImode
11603 && x != CONST0_RTX (TImode)
11604 && !TARGET_64BIT)
11605 return false;
11606 break;
11607
11608 case CONST_VECTOR:
11609 if (!standard_sse_constant_p (x))
11610 return false;
11611
11612 default:
11613 break;
11614 }
11615
11616 /* Otherwise we handle everything else in the move patterns. */
11617 return true;
11618 }
11619
11620 /* Determine if it's legal to put X into the constant pool. This
11621 is not possible for the address of thread-local symbols, which
11622 is checked above. */
11623
11624 static bool
11625 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
11626 {
11627 /* We can always put integral constants and vectors in memory. */
11628 switch (GET_CODE (x))
11629 {
11630 case CONST_INT:
11631 case CONST_DOUBLE:
11632 case CONST_VECTOR:
11633 return false;
11634
11635 default:
11636 break;
11637 }
11638 return !ix86_legitimate_constant_p (mode, x);
11639 }
11640
11641
11642 /* Nonzero if the constant value X is a legitimate general operand
11643 when generating PIC code. It is given that flag_pic is on and
11644 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
11645
11646 bool
11647 legitimate_pic_operand_p (rtx x)
11648 {
11649 rtx inner;
11650
11651 switch (GET_CODE (x))
11652 {
11653 case CONST:
11654 inner = XEXP (x, 0);
11655 if (GET_CODE (inner) == PLUS
11656 && CONST_INT_P (XEXP (inner, 1)))
11657 inner = XEXP (inner, 0);
11658
11659 /* Only some unspecs are valid as "constants". */
11660 if (GET_CODE (inner) == UNSPEC)
11661 switch (XINT (inner, 1))
11662 {
11663 case UNSPEC_GOT:
11664 case UNSPEC_GOTOFF:
11665 case UNSPEC_PLTOFF:
11666 return TARGET_64BIT;
11667 case UNSPEC_TPOFF:
11668 x = XVECEXP (inner, 0, 0);
11669 return (GET_CODE (x) == SYMBOL_REF
11670 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11671 case UNSPEC_MACHOPIC_OFFSET:
11672 return legitimate_pic_address_disp_p (x);
11673 default:
11674 return false;
11675 }
11676 /* FALLTHRU */
11677
11678 case SYMBOL_REF:
11679 case LABEL_REF:
11680 return legitimate_pic_address_disp_p (x);
11681
11682 default:
11683 return true;
11684 }
11685 }
11686
11687 /* Determine if a given CONST RTX is a valid memory displacement
11688 in PIC mode. */
11689
11690 bool
11691 legitimate_pic_address_disp_p (rtx disp)
11692 {
11693 bool saw_plus;
11694
11695 /* In 64bit mode we can allow direct addresses of symbols and labels
11696 when they are not dynamic symbols. */
11697 if (TARGET_64BIT)
11698 {
11699 rtx op0 = disp, op1;
11700
11701 switch (GET_CODE (disp))
11702 {
11703 case LABEL_REF:
11704 return true;
11705
11706 case CONST:
11707 if (GET_CODE (XEXP (disp, 0)) != PLUS)
11708 break;
11709 op0 = XEXP (XEXP (disp, 0), 0);
11710 op1 = XEXP (XEXP (disp, 0), 1);
11711 if (!CONST_INT_P (op1)
11712 || INTVAL (op1) >= 16*1024*1024
11713 || INTVAL (op1) < -16*1024*1024)
11714 break;
11715 if (GET_CODE (op0) == LABEL_REF)
11716 return true;
11717 if (GET_CODE (op0) != SYMBOL_REF)
11718 break;
11719 /* FALLTHRU */
11720
11721 case SYMBOL_REF:
11722 /* TLS references should always be enclosed in UNSPEC. */
11723 if (SYMBOL_REF_TLS_MODEL (op0))
11724 return false;
11725 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
11726 && ix86_cmodel != CM_LARGE_PIC)
11727 return true;
11728 break;
11729
11730 default:
11731 break;
11732 }
11733 }
11734 if (GET_CODE (disp) != CONST)
11735 return false;
11736 disp = XEXP (disp, 0);
11737
11738 if (TARGET_64BIT)
11739 {
11740       /* It is unsafe to allow PLUS expressions; this limits the allowed
11741 	 distance of GOT table references.  We should not need these anyway.  */
11742 if (GET_CODE (disp) != UNSPEC
11743 || (XINT (disp, 1) != UNSPEC_GOTPCREL
11744 && XINT (disp, 1) != UNSPEC_GOTOFF
11745 && XINT (disp, 1) != UNSPEC_PCREL
11746 && XINT (disp, 1) != UNSPEC_PLTOFF))
11747 return false;
11748
11749 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
11750 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
11751 return false;
11752 return true;
11753 }
11754
11755 saw_plus = false;
11756 if (GET_CODE (disp) == PLUS)
11757 {
11758 if (!CONST_INT_P (XEXP (disp, 1)))
11759 return false;
11760 disp = XEXP (disp, 0);
11761 saw_plus = true;
11762 }
11763
11764 if (TARGET_MACHO && darwin_local_data_pic (disp))
11765 return true;
11766
11767 if (GET_CODE (disp) != UNSPEC)
11768 return false;
11769
11770 switch (XINT (disp, 1))
11771 {
11772 case UNSPEC_GOT:
11773 if (saw_plus)
11774 return false;
11775 /* We need to check for both symbols and labels because VxWorks loads
11776 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
11777 details. */
11778 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
11779 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
11780 case UNSPEC_GOTOFF:
11781 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
11782 	 While the ABI also specifies a 32bit relocation, we don't produce
11783 	 it in the small PIC model at all.  */
11784 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
11785 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
11786 && !TARGET_64BIT)
11787 return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
11788 return false;
11789 case UNSPEC_GOTTPOFF:
11790 case UNSPEC_GOTNTPOFF:
11791 case UNSPEC_INDNTPOFF:
11792 if (saw_plus)
11793 return false;
11794 disp = XVECEXP (disp, 0, 0);
11795 return (GET_CODE (disp) == SYMBOL_REF
11796 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
11797 case UNSPEC_NTPOFF:
11798 disp = XVECEXP (disp, 0, 0);
11799 return (GET_CODE (disp) == SYMBOL_REF
11800 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
11801 case UNSPEC_DTPOFF:
11802 disp = XVECEXP (disp, 0, 0);
11803 return (GET_CODE (disp) == SYMBOL_REF
11804 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
11805 }
11806
11807 return false;
11808 }
11809
11810 /* Recognizes RTL expressions that are valid memory addresses for an
11811 instruction. The MODE argument is the machine mode for the MEM
11812 expression that wants to use this address.
11813
11814    It only recognizes addresses in canonical form.  LEGITIMIZE_ADDRESS should
11815 convert common non-canonical forms to canonical form so that they will
11816 be recognized. */
11817
11818 static bool
11819 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
11820 rtx addr, bool strict)
11821 {
11822 struct ix86_address parts;
11823 rtx base, index, disp;
11824 HOST_WIDE_INT scale;
11825
11826 if (ix86_decompose_address (addr, &parts) <= 0)
11827 /* Decomposition failed. */
11828 return false;
11829
11830 base = parts.base;
11831 index = parts.index;
11832 disp = parts.disp;
11833 scale = parts.scale;
11834
11835 /* Validate base register. */
11836 if (base)
11837 {
11838 rtx reg;
11839
11840 if (REG_P (base))
11841 reg = base;
11842 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
11843 reg = SUBREG_REG (base);
11844 else
11845 /* Base is not a register. */
11846 return false;
11847
11848 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
11849 return false;
11850
11851 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
11852 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
11853 /* Base is not valid. */
11854 return false;
11855 }
11856
11857 /* Validate index register. */
11858 if (index)
11859 {
11860 rtx reg;
11861
11862 if (REG_P (index))
11863 reg = index;
11864 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
11865 reg = SUBREG_REG (index);
11866 else
11867 /* Index is not a register. */
11868 return false;
11869
11870 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
11871 return false;
11872
11873 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
11874 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
11875 /* Index is not valid. */
11876 return false;
11877 }
11878
11879 /* Index and base should have the same mode. */
11880 if (base && index
11881 && GET_MODE (base) != GET_MODE (index))
11882 return false;
11883
11884 /* Validate scale factor. */
11885 if (scale != 1)
11886 {
11887 if (!index)
11888 /* Scale without index. */
11889 return false;
11890
11891 if (scale != 2 && scale != 4 && scale != 8)
11892 /* Scale is not a valid multiplier. */
11893 return false;
11894 }
11895
11896 /* Validate displacement. */
11897 if (disp)
11898 {
11899 if (GET_CODE (disp) == CONST
11900 && GET_CODE (XEXP (disp, 0)) == UNSPEC
11901 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
11902 switch (XINT (XEXP (disp, 0), 1))
11903 {
11904 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
11905 	     used.  While the ABI also specifies 32bit relocations, we don't produce
11906 them at all and use IP relative instead. */
11907 case UNSPEC_GOT:
11908 case UNSPEC_GOTOFF:
11909 gcc_assert (flag_pic);
11910 if (!TARGET_64BIT)
11911 goto is_legitimate_pic;
11912
11913 /* 64bit address unspec. */
11914 return false;
11915
11916 case UNSPEC_GOTPCREL:
11917 case UNSPEC_PCREL:
11918 gcc_assert (flag_pic);
11919 goto is_legitimate_pic;
11920
11921 case UNSPEC_GOTTPOFF:
11922 case UNSPEC_GOTNTPOFF:
11923 case UNSPEC_INDNTPOFF:
11924 case UNSPEC_NTPOFF:
11925 case UNSPEC_DTPOFF:
11926 break;
11927
11928 case UNSPEC_STACK_CHECK:
11929 gcc_assert (flag_split_stack);
11930 break;
11931
11932 default:
11933 /* Invalid address unspec. */
11934 return false;
11935 }
11936
11937 else if (SYMBOLIC_CONST (disp)
11938 && (flag_pic
11939 || (TARGET_MACHO
11940 #if TARGET_MACHO
11941 && MACHOPIC_INDIRECT
11942 && !machopic_operand_p (disp)
11943 #endif
11944 )))
11945 {
11946
11947 is_legitimate_pic:
11948 if (TARGET_64BIT && (index || base))
11949 {
11950 /* foo@dtpoff(%rX) is ok. */
11951 if (GET_CODE (disp) != CONST
11952 || GET_CODE (XEXP (disp, 0)) != PLUS
11953 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
11954 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
11955 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
11956 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
11957 /* Non-constant pic memory reference. */
11958 return false;
11959 }
11960 else if ((!TARGET_MACHO || flag_pic)
11961 && ! legitimate_pic_address_disp_p (disp))
11962 /* Displacement is an invalid pic construct. */
11963 return false;
11964 #if TARGET_MACHO
11965 else if (MACHO_DYNAMIC_NO_PIC_P
11966 && !ix86_legitimate_constant_p (Pmode, disp))
11967 	    /* The displacement must be referenced via a non_lazy_pointer.  */
11968 return false;
11969 #endif
11970
11971 /* This code used to verify that a symbolic pic displacement
11972 includes the pic_offset_table_rtx register.
11973
11974 	     While this is a good idea, unfortunately these constructs may
11975 	     be created by the "adds using lea" optimization for incorrect
11976 code like:
11977
11978 int a;
11979 int foo(int i)
11980 {
11981 return *(&a+i);
11982 }
11983
11984 	     This code is nonsensical, but results in addressing the
11985 	     GOT table with a pic_offset_table_rtx base.  We can't
11986 	     just refuse it easily, since it gets matched by the
11987 	     "addsi3" pattern, which later gets split to lea in case
11988 	     the output register differs from the input.  While this
11989 	     could be handled by a separate addsi pattern for this case
11990 	     that never results in lea, disabling this test seems to be
11991 	     the easier and correct fix for the crash.  */
11992 }
11993 else if (GET_CODE (disp) != LABEL_REF
11994 && !CONST_INT_P (disp)
11995 && (GET_CODE (disp) != CONST
11996 || !ix86_legitimate_constant_p (Pmode, disp))
11997 && (GET_CODE (disp) != SYMBOL_REF
11998 || !ix86_legitimate_constant_p (Pmode, disp)))
11999 /* Displacement is not constant. */
12000 return false;
12001 else if (TARGET_64BIT
12002 && !x86_64_immediate_operand (disp, VOIDmode))
12003 /* Displacement is out of range. */
12004 return false;
12005 }
12006
12007 /* Everything looks valid. */
12008 return true;
12009 }
12010
12011 /* Determine if a given RTX is a valid constant address. */
12012
12013 bool
12014 constant_address_p (rtx x)
12015 {
12016 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12017 }
12018 \f
12019 /* Return a unique alias set for the GOT. */
12020
12021 static alias_set_type
12022 ix86_GOT_alias_set (void)
12023 {
12024 static alias_set_type set = -1;
12025 if (set == -1)
12026 set = new_alias_set ();
12027 return set;
12028 }
12029
12030 /* Return a legitimate reference for ORIG (an address) using the
12031 register REG. If REG is 0, a new pseudo is generated.
12032
12033 There are two types of references that must be handled:
12034
12035 1. Global data references must load the address from the GOT, via
12036 the PIC reg. An insn is emitted to do this load, and the reg is
12037 returned.
12038
12039 2. Static data references, constant pool addresses, and code labels
12040 compute the address as an offset from the GOT, whose base is in
12041 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12042 differentiate them from global data objects. The returned
12043 address is the PIC reg + an unspec constant.
12044
12045 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12046 reg also appears in the address. */
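/* For example, in 32-bit PIC code a reference to a preemptible global
   "foo" is loaded from the GOT as foo@GOT(%ebx), whereas a file-local
   "bar" is addressed directly as bar@GOTOFF(%ebx).  */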
12047
12048 static rtx
12049 legitimize_pic_address (rtx orig, rtx reg)
12050 {
12051 rtx addr = orig;
12052 rtx new_rtx = orig;
12053 rtx base;
12054
12055 #if TARGET_MACHO
12056 if (TARGET_MACHO && !TARGET_64BIT)
12057 {
12058 if (reg == 0)
12059 reg = gen_reg_rtx (Pmode);
12060 /* Use the generic Mach-O PIC machinery. */
12061 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12062 }
12063 #endif
12064
12065 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12066 new_rtx = addr;
12067 else if (TARGET_64BIT
12068 && ix86_cmodel != CM_SMALL_PIC
12069 && gotoff_operand (addr, Pmode))
12070 {
12071 rtx tmpreg;
12072 /* This symbol may be referenced via a displacement from the PIC
12073 base address (@GOTOFF). */
12074
12075 if (reload_in_progress)
12076 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12077 if (GET_CODE (addr) == CONST)
12078 addr = XEXP (addr, 0);
12079 if (GET_CODE (addr) == PLUS)
12080 {
12081 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12082 UNSPEC_GOTOFF);
12083 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12084 }
12085 else
12086 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12087 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12088 if (!reg)
12089 tmpreg = gen_reg_rtx (Pmode);
12090 else
12091 tmpreg = reg;
12092 emit_move_insn (tmpreg, new_rtx);
12093
12094 if (reg != 0)
12095 {
12096 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12097 tmpreg, 1, OPTAB_DIRECT);
12098 new_rtx = reg;
12099 }
12100 else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12101 }
12102 else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
12103 {
12104 /* This symbol may be referenced via a displacement from the PIC
12105 base address (@GOTOFF). */
12106
12107 if (reload_in_progress)
12108 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12109 if (GET_CODE (addr) == CONST)
12110 addr = XEXP (addr, 0);
12111 if (GET_CODE (addr) == PLUS)
12112 {
12113 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12114 UNSPEC_GOTOFF);
12115 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12116 }
12117 else
12118 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12119 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12120 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12121
12122 if (reg != 0)
12123 {
12124 emit_move_insn (reg, new_rtx);
12125 new_rtx = reg;
12126 }
12127 }
12128 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12129 /* We can't use @GOTOFF for text labels on VxWorks;
12130 see gotoff_operand. */
12131 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12132 {
12133 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12134 {
12135 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
12136 return legitimize_dllimport_symbol (addr, true);
12137 if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
12138 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
12139 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
12140 {
12141 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
12142 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
12143 }
12144 }
12145
12146 	  /* For x64 PE-COFF there is no GOT table, so we use the address
12147 	     directly.  */
12148 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12149 {
12150 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12151 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12152
12153 if (reg == 0)
12154 reg = gen_reg_rtx (Pmode);
12155 emit_move_insn (reg, new_rtx);
12156 new_rtx = reg;
12157 }
12158 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12159 {
12160 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12161 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12162 new_rtx = gen_const_mem (Pmode, new_rtx);
12163 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12164
12165 if (reg == 0)
12166 reg = gen_reg_rtx (Pmode);
12167 	  /* Use gen_movsi directly, otherwise the address is loaded into a
12168 	     register for CSE.  We don't want to CSE these addresses; instead
12169 	     we CSE addresses loaded from the GOT table, so skip this.  */
12170 emit_insn (gen_movsi (reg, new_rtx));
12171 new_rtx = reg;
12172 }
12173 else
12174 {
12175 /* This symbol must be referenced via a load from the
12176 Global Offset Table (@GOT). */
12177
12178 if (reload_in_progress)
12179 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12180 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12181 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12182 if (TARGET_64BIT)
12183 new_rtx = force_reg (Pmode, new_rtx);
12184 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12185 new_rtx = gen_const_mem (Pmode, new_rtx);
12186 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12187
12188 if (reg == 0)
12189 reg = gen_reg_rtx (Pmode);
12190 emit_move_insn (reg, new_rtx);
12191 new_rtx = reg;
12192 }
12193 }
12194 else
12195 {
12196 if (CONST_INT_P (addr)
12197 && !x86_64_immediate_operand (addr, VOIDmode))
12198 {
12199 if (reg)
12200 {
12201 emit_move_insn (reg, addr);
12202 new_rtx = reg;
12203 }
12204 else
12205 new_rtx = force_reg (Pmode, addr);
12206 }
12207 else if (GET_CODE (addr) == CONST)
12208 {
12209 addr = XEXP (addr, 0);
12210
12211 /* We must match stuff we generate before. Assume the only
12212 unspecs that can get here are ours. Not that we could do
12213 anything with them anyway.... */
12214 if (GET_CODE (addr) == UNSPEC
12215 || (GET_CODE (addr) == PLUS
12216 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12217 return orig;
12218 gcc_assert (GET_CODE (addr) == PLUS);
12219 }
12220 if (GET_CODE (addr) == PLUS)
12221 {
12222 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12223
12224 /* Check first to see if this is a constant offset from a @GOTOFF
12225 symbol reference. */
12226 if (gotoff_operand (op0, Pmode)
12227 && CONST_INT_P (op1))
12228 {
12229 if (!TARGET_64BIT)
12230 {
12231 if (reload_in_progress)
12232 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12233 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12234 UNSPEC_GOTOFF);
12235 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12236 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12237 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12238
12239 if (reg != 0)
12240 {
12241 emit_move_insn (reg, new_rtx);
12242 new_rtx = reg;
12243 }
12244 }
12245 else
12246 {
12247 if (INTVAL (op1) < -16*1024*1024
12248 || INTVAL (op1) >= 16*1024*1024)
12249 {
12250 if (!x86_64_immediate_operand (op1, Pmode))
12251 op1 = force_reg (Pmode, op1);
12252 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12253 }
12254 }
12255 }
12256 else
12257 {
12258 base = legitimize_pic_address (XEXP (addr, 0), reg);
12259 new_rtx = legitimize_pic_address (XEXP (addr, 1),
12260 base == reg ? NULL_RTX : reg);
12261
12262 if (CONST_INT_P (new_rtx))
12263 new_rtx = plus_constant (base, INTVAL (new_rtx));
12264 else
12265 {
12266 if (GET_CODE (new_rtx) == PLUS && CONSTANT_P (XEXP (new_rtx, 1)))
12267 {
12268 base = gen_rtx_PLUS (Pmode, base, XEXP (new_rtx, 0));
12269 new_rtx = XEXP (new_rtx, 1);
12270 }
12271 new_rtx = gen_rtx_PLUS (Pmode, base, new_rtx);
12272 }
12273 }
12274 }
12275 }
12276 return new_rtx;
12277 }
12278 \f
12279 /* Load the thread pointer. If TO_REG is true, force it into a register. */
12280
12281 static rtx
12282 get_thread_pointer (bool to_reg)
12283 {
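  /* UNSPEC_TP stands for the thread pointer, which on x86 is addressed
     through the %gs (32-bit) or %fs (64-bit) segment base.  */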
12284 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12285
12286 if (GET_MODE (tp) != Pmode)
12287 tp = convert_to_mode (Pmode, tp, 1);
12288
12289 if (to_reg)
12290 tp = copy_addr_to_reg (tp);
12291
12292 return tp;
12293 }
12294
12295 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12296
12297 static GTY(()) rtx ix86_tls_symbol;
12298
12299 static rtx
12300 ix86_tls_get_addr (void)
12301 {
12302 if (!ix86_tls_symbol)
12303 {
12304 const char *sym
12305 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12306 ? "___tls_get_addr" : "__tls_get_addr");
12307
12308 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12309 }
12310
12311 return ix86_tls_symbol;
12312 }
12313
12314 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12315
12316 static GTY(()) rtx ix86_tls_module_base_symbol;
12317
12318 rtx
12319 ix86_tls_module_base (void)
12320 {
12321 if (!ix86_tls_module_base_symbol)
12322 {
12323 ix86_tls_module_base_symbol
12324 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12325
12326 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12327 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12328 }
12329
12330 return ix86_tls_module_base_symbol;
12331 }
12332
12333 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12334 false if we expect this to be used for a memory address and true if
12335 we expect to load the address into a register. */
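 /* Roughly, the models handled below expand as follows:
      global-dynamic: a call to the tls_get_addr helper for the
        (module, offset) pair of X;
      local-dynamic:  one tls_get_addr call for the module base, with
        each variable then addressed via an @DTPOFF displacement;
      initial-exec:   a load of the variable's thread-pointer offset
        from its GOT slot, combined with the thread pointer;
      local-exec:     a link-time constant (@TPOFF / @NTPOFF) combined
        with the thread pointer.  */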
12336
12337 static rtx
12338 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12339 {
12340 rtx dest, base, off;
12341 rtx pic = NULL_RTX, tp = NULL_RTX;
12342 int type;
12343
12344 switch (model)
12345 {
12346 case TLS_MODEL_GLOBAL_DYNAMIC:
12347 dest = gen_reg_rtx (Pmode);
12348
12349 if (!TARGET_64BIT)
12350 {
12351 if (flag_pic)
12352 pic = pic_offset_table_rtx;
12353 else
12354 {
12355 pic = gen_reg_rtx (Pmode);
12356 emit_insn (gen_set_got (pic));
12357 }
12358 }
12359
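      /* With GNU2 (descriptor-based) TLS, the address is computed inline
	 by a tls_dynamic_gnu2 sequence and added to the thread pointer;
	 otherwise we emit an explicit call to the tls_get_addr helper.  */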
12360 if (TARGET_GNU2_TLS)
12361 {
12362 if (TARGET_64BIT)
12363 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
12364 else
12365 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
12366
12367 tp = get_thread_pointer (true);
12368 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
12369
12370 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12371 }
12372 else
12373 {
12374 rtx caddr = ix86_tls_get_addr ();
12375
12376 if (TARGET_64BIT)
12377 {
12378 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns;
12379
12380 start_sequence ();
12381 emit_call_insn (gen_tls_global_dynamic_64 (rax, x, caddr));
12382 insns = get_insns ();
12383 end_sequence ();
12384
12385 RTL_CONST_CALL_P (insns) = 1;
12386 emit_libcall_block (insns, dest, rax, x);
12387 }
12388 else
12389 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
12390 }
12391 break;
12392
12393 case TLS_MODEL_LOCAL_DYNAMIC:
12394 base = gen_reg_rtx (Pmode);
12395
12396 if (!TARGET_64BIT)
12397 {
12398 if (flag_pic)
12399 pic = pic_offset_table_rtx;
12400 else
12401 {
12402 pic = gen_reg_rtx (Pmode);
12403 emit_insn (gen_set_got (pic));
12404 }
12405 }
12406
12407 if (TARGET_GNU2_TLS)
12408 {
12409 rtx tmp = ix86_tls_module_base ();
12410
12411 if (TARGET_64BIT)
12412 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
12413 else
12414 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
12415
12416 tp = get_thread_pointer (true);
12417 set_unique_reg_note (get_last_insn (), REG_EQUAL,
12418 gen_rtx_MINUS (Pmode, tmp, tp));
12419 }
12420 else
12421 {
12422 rtx caddr = ix86_tls_get_addr ();
12423
12424 if (TARGET_64BIT)
12425 {
12426 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns, eqv;
12427
12428 start_sequence ();
12429 emit_call_insn (gen_tls_local_dynamic_base_64 (rax, caddr));
12430 insns = get_insns ();
12431 end_sequence ();
12432
12433 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
12434 share the LD_BASE result with other LD model accesses. */
12435 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
12436 UNSPEC_TLS_LD_BASE);
12437
12438 RTL_CONST_CALL_P (insns) = 1;
12439 emit_libcall_block (insns, base, rax, eqv);
12440 }
12441 else
12442 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
12443 }
12444
12445 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
12446 off = gen_rtx_CONST (Pmode, off);
12447
12448 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
12449
12450 if (TARGET_GNU2_TLS)
12451 {
12452 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
12453
12454 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12455 }
12456 break;
12457
12458 case TLS_MODEL_INITIAL_EXEC:
12459 if (TARGET_64BIT)
12460 {
12461 if (TARGET_SUN_TLS)
12462 {
12463 /* The Sun linker took the AMD64 TLS spec literally
12464 and can only handle %rax as destination of the
12465 initial executable code sequence. */
12466
12467 dest = gen_reg_rtx (Pmode);
12468 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
12469 return dest;
12470 }
12471
12472 pic = NULL;
12473 type = UNSPEC_GOTNTPOFF;
12474 }
12475 else if (flag_pic)
12476 {
12477 if (reload_in_progress)
12478 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12479 pic = pic_offset_table_rtx;
12480 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
12481 }
12482 else if (!TARGET_ANY_GNU_TLS)
12483 {
12484 pic = gen_reg_rtx (Pmode);
12485 emit_insn (gen_set_got (pic));
12486 type = UNSPEC_GOTTPOFF;
12487 }
12488 else
12489 {
12490 pic = NULL;
12491 type = UNSPEC_INDNTPOFF;
12492 }
12493
12494 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), type);
12495 off = gen_rtx_CONST (Pmode, off);
12496 if (pic)
12497 off = gen_rtx_PLUS (Pmode, pic, off);
12498 off = gen_const_mem (Pmode, off);
12499 set_mem_alias_set (off, ix86_GOT_alias_set ());
12500
12501 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12502 {
12503 base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12504 off = force_reg (Pmode, off);
12505 return gen_rtx_PLUS (Pmode, base, off);
12506 }
12507 else
12508 {
12509 base = get_thread_pointer (true);
12510 dest = gen_reg_rtx (Pmode);
12511 emit_insn (gen_subsi3 (dest, base, off));
12512 }
12513 break;
12514
12515 case TLS_MODEL_LOCAL_EXEC:
12516 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
12517 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12518 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
12519 off = gen_rtx_CONST (Pmode, off);
12520
12521 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12522 {
12523 base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12524 return gen_rtx_PLUS (Pmode, base, off);
12525 }
12526 else
12527 {
12528 base = get_thread_pointer (true);
12529 dest = gen_reg_rtx (Pmode);
12530 emit_insn (gen_subsi3 (dest, base, off));
12531 }
12532 break;
12533
12534 default:
12535 gcc_unreachable ();
12536 }
12537
12538 return dest;
12539 }
12540
12541 /* Create or return the unique __imp_DECL dllimport symbol corresponding
12542 to symbol DECL. */
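 /* E.g. for a decl named "foo" this builds a read-only VAR_DECL whose
    DECL_RTL is a memory reference to the import-table slot "__imp_foo"
    (or "__imp__foo" on targets that use an extra user label prefix).  */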
12543
12544 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
12545 htab_t dllimport_map;
12546
12547 static tree
12548 get_dllimport_decl (tree decl)
12549 {
12550 struct tree_map *h, in;
12551 void **loc;
12552 const char *name;
12553 const char *prefix;
12554 size_t namelen, prefixlen;
12555 char *imp_name;
12556 tree to;
12557 rtx rtl;
12558
12559 if (!dllimport_map)
12560 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
12561
12562 in.hash = htab_hash_pointer (decl);
12563 in.base.from = decl;
12564 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
12565 h = (struct tree_map *) *loc;
12566 if (h)
12567 return h->to;
12568
12569 *loc = h = ggc_alloc_tree_map ();
12570 h->hash = in.hash;
12571 h->base.from = decl;
12572 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
12573 VAR_DECL, NULL, ptr_type_node);
12574 DECL_ARTIFICIAL (to) = 1;
12575 DECL_IGNORED_P (to) = 1;
12576 DECL_EXTERNAL (to) = 1;
12577 TREE_READONLY (to) = 1;
12578
12579 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
12580 name = targetm.strip_name_encoding (name);
12581 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
12582 ? "*__imp_" : "*__imp__";
12583 namelen = strlen (name);
12584 prefixlen = strlen (prefix);
12585 imp_name = (char *) alloca (namelen + prefixlen + 1);
12586 memcpy (imp_name, prefix, prefixlen);
12587 memcpy (imp_name + prefixlen, name, namelen + 1);
12588
12589 name = ggc_alloc_string (imp_name, namelen + prefixlen);
12590 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
12591 SET_SYMBOL_REF_DECL (rtl, to);
12592 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
12593
12594 rtl = gen_const_mem (Pmode, rtl);
12595 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
12596
12597 SET_DECL_RTL (to, rtl);
12598 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
12599
12600 return to;
12601 }
12602
12603 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
12604 true if we require the result be a register. */
12605
12606 static rtx
12607 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
12608 {
12609 tree imp_decl;
12610 rtx x;
12611
12612 gcc_assert (SYMBOL_REF_DECL (symbol));
12613 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
12614
12615 x = DECL_RTL (imp_decl);
12616 if (want_reg)
12617 x = force_reg (Pmode, x);
12618 return x;
12619 }
12620
12621 /* Try machine-dependent ways of modifying an illegitimate address
12622 to be legitimate. If we find one, return the new, valid address.
12623 This macro is used in only one place: `memory_address' in explow.c.
12624
12625 OLDX is the address as it was before break_out_memory_refs was called.
12626 In some cases it is useful to look at this to decide what needs to be done.
12627
12628 It is always safe for this macro to do nothing. It exists to recognize
12629 opportunities to optimize the output.
12630
12631 For the 80386, we handle X+REG by loading X into a register R and
12632 using R+REG. R will go in a general reg and indexing will be used.
12633 However, if REG is a broken-out memory address or multiplication,
12634 nothing needs to be done because REG can certainly go in a general reg.
12635
12636 When -fpic is used, special handling is needed for symbolic references.
12637 See comments by legitimize_pic_address in i386.c for details. */
12638
12639 static rtx
12640 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
12641 enum machine_mode mode)
12642 {
12643 int changed = 0;
12644 unsigned log;
12645
12646 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
12647 if (log)
12648 return legitimize_tls_address (x, (enum tls_model) log, false);
12649 if (GET_CODE (x) == CONST
12650 && GET_CODE (XEXP (x, 0)) == PLUS
12651 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12652 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
12653 {
12654 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
12655 (enum tls_model) log, false);
12656 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
12657 }
12658
12659 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12660 {
12661 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
12662 return legitimize_dllimport_symbol (x, true);
12663 if (GET_CODE (x) == CONST
12664 && GET_CODE (XEXP (x, 0)) == PLUS
12665 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12666 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
12667 {
12668 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
12669 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
12670 }
12671 }
12672
12673 if (flag_pic && SYMBOLIC_CONST (x))
12674 return legitimize_pic_address (x, 0);
12675
12676 #if TARGET_MACHO
12677 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
12678 return machopic_indirect_data_reference (x, 0);
12679 #endif
12680
12681 /* Canonicalize shifts by 0, 1, 2, 3 into multiply.  */
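  /* An address scale must be expressed as a MULT rather than an ASHIFT,
     e.g. (ashift reg 2) becomes (mult reg 4), so that it can match the
     base + index*scale addressing form.  */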
12682 if (GET_CODE (x) == ASHIFT
12683 && CONST_INT_P (XEXP (x, 1))
12684 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
12685 {
12686 changed = 1;
12687 log = INTVAL (XEXP (x, 1));
12688 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
12689 GEN_INT (1 << log));
12690 }
12691
12692 if (GET_CODE (x) == PLUS)
12693 {
12694 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
12695
12696 if (GET_CODE (XEXP (x, 0)) == ASHIFT
12697 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
12698 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
12699 {
12700 changed = 1;
12701 log = INTVAL (XEXP (XEXP (x, 0), 1));
12702 XEXP (x, 0) = gen_rtx_MULT (Pmode,
12703 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
12704 GEN_INT (1 << log));
12705 }
12706
12707 if (GET_CODE (XEXP (x, 1)) == ASHIFT
12708 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
12709 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
12710 {
12711 changed = 1;
12712 log = INTVAL (XEXP (XEXP (x, 1), 1));
12713 XEXP (x, 1) = gen_rtx_MULT (Pmode,
12714 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
12715 GEN_INT (1 << log));
12716 }
12717
12718 /* Put multiply first if it isn't already. */
12719 if (GET_CODE (XEXP (x, 1)) == MULT)
12720 {
12721 rtx tmp = XEXP (x, 0);
12722 XEXP (x, 0) = XEXP (x, 1);
12723 XEXP (x, 1) = tmp;
12724 changed = 1;
12725 }
12726
12727 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
12728 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
12729 created by virtual register instantiation, register elimination, and
12730 similar optimizations. */
12731 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
12732 {
12733 changed = 1;
12734 x = gen_rtx_PLUS (Pmode,
12735 gen_rtx_PLUS (Pmode, XEXP (x, 0),
12736 XEXP (XEXP (x, 1), 0)),
12737 XEXP (XEXP (x, 1), 1));
12738 }
12739
12740 /* Canonicalize
12741 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
12742 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
12743 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
12744 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
12745 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
12746 && CONSTANT_P (XEXP (x, 1)))
12747 {
12748 rtx constant;
12749 rtx other = NULL_RTX;
12750
12751 if (CONST_INT_P (XEXP (x, 1)))
12752 {
12753 constant = XEXP (x, 1);
12754 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
12755 }
12756 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
12757 {
12758 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
12759 other = XEXP (x, 1);
12760 }
12761 else
12762 constant = 0;
12763
12764 if (constant)
12765 {
12766 changed = 1;
12767 x = gen_rtx_PLUS (Pmode,
12768 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
12769 XEXP (XEXP (XEXP (x, 0), 1), 0)),
12770 plus_constant (other, INTVAL (constant)));
12771 }
12772 }
12773
12774 if (changed && ix86_legitimate_address_p (mode, x, false))
12775 return x;
12776
12777 if (GET_CODE (XEXP (x, 0)) == MULT)
12778 {
12779 changed = 1;
12780 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
12781 }
12782
12783 if (GET_CODE (XEXP (x, 1)) == MULT)
12784 {
12785 changed = 1;
12786 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
12787 }
12788
12789 if (changed
12790 && REG_P (XEXP (x, 1))
12791 && REG_P (XEXP (x, 0)))
12792 return x;
12793
12794 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
12795 {
12796 changed = 1;
12797 x = legitimize_pic_address (x, 0);
12798 }
12799
12800 if (changed && ix86_legitimate_address_p (mode, x, false))
12801 return x;
12802
12803 if (REG_P (XEXP (x, 0)))
12804 {
12805 rtx temp = gen_reg_rtx (Pmode);
12806 rtx val = force_operand (XEXP (x, 1), temp);
12807 if (val != temp)
12808 {
12809 if (GET_MODE (val) != Pmode)
12810 val = convert_to_mode (Pmode, val, 1);
12811 emit_move_insn (temp, val);
12812 }
12813
12814 XEXP (x, 1) = temp;
12815 return x;
12816 }
12817
12818 else if (REG_P (XEXP (x, 1)))
12819 {
12820 rtx temp = gen_reg_rtx (Pmode);
12821 rtx val = force_operand (XEXP (x, 0), temp);
12822 if (val != temp)
12823 {
12824 if (GET_MODE (val) != Pmode)
12825 val = convert_to_mode (Pmode, val, 1);
12826 emit_move_insn (temp, val);
12827 }
12828
12829 XEXP (x, 0) = temp;
12830 return x;
12831 }
12832 }
12833
12834 return x;
12835 }
12836 \f
12837 /* Print an integer constant expression in assembler syntax. Addition
12838 and subtraction are the only arithmetic that may appear in these
12839 expressions. FILE is the stdio stream to write to, X is the rtx, and
12840 CODE is the operand print code from the output string. */
12841
12842 static void
12843 output_pic_addr_const (FILE *file, rtx x, int code)
12844 {
12845 char buf[256];
12846
12847 switch (GET_CODE (x))
12848 {
12849 case PC:
12850 gcc_assert (flag_pic);
12851 putc ('.', file);
12852 break;
12853
12854 case SYMBOL_REF:
12855 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
12856 output_addr_const (file, x);
12857 else
12858 {
12859 const char *name = XSTR (x, 0);
12860
12861 /* Mark the decl as referenced so that cgraph will
12862 output the function. */
12863 if (SYMBOL_REF_DECL (x))
12864 mark_decl_referenced (SYMBOL_REF_DECL (x));
12865
12866 #if TARGET_MACHO
12867 if (MACHOPIC_INDIRECT
12868 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
12869 name = machopic_indirection_name (x, /*stub_p=*/true);
12870 #endif
12871 assemble_name (file, name);
12872 }
12873 if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12874 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
12875 fputs ("@PLT", file);
12876 break;
12877
12878 case LABEL_REF:
12879 x = XEXP (x, 0);
12880 /* FALLTHRU */
12881 case CODE_LABEL:
12882 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
12883 assemble_name (asm_out_file, buf);
12884 break;
12885
12886 case CONST_INT:
12887 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
12888 break;
12889
12890 case CONST:
12891 /* This used to output parentheses around the expression,
12892 but that does not work on the 386 (either ATT or BSD assembler). */
12893 output_pic_addr_const (file, XEXP (x, 0), code);
12894 break;
12895
12896 case CONST_DOUBLE:
12897 if (GET_MODE (x) == VOIDmode)
12898 {
12899 /* We can use %d if the number is <32 bits and positive. */
12900 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
12901 fprintf (file, "0x%lx%08lx",
12902 (unsigned long) CONST_DOUBLE_HIGH (x),
12903 (unsigned long) CONST_DOUBLE_LOW (x));
12904 else
12905 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
12906 }
12907 else
12908 /* We can't handle floating point constants;
12909 TARGET_PRINT_OPERAND must handle them. */
12910 output_operand_lossage ("floating constant misused");
12911 break;
12912
12913 case PLUS:
12914 /* Some assemblers need integer constants to appear first. */
12915 if (CONST_INT_P (XEXP (x, 0)))
12916 {
12917 output_pic_addr_const (file, XEXP (x, 0), code);
12918 putc ('+', file);
12919 output_pic_addr_const (file, XEXP (x, 1), code);
12920 }
12921 else
12922 {
12923 gcc_assert (CONST_INT_P (XEXP (x, 1)));
12924 output_pic_addr_const (file, XEXP (x, 1), code);
12925 putc ('+', file);
12926 output_pic_addr_const (file, XEXP (x, 0), code);
12927 }
12928 break;
12929
12930 case MINUS:
12931 if (!TARGET_MACHO)
12932 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
12933 output_pic_addr_const (file, XEXP (x, 0), code);
12934 putc ('-', file);
12935 output_pic_addr_const (file, XEXP (x, 1), code);
12936 if (!TARGET_MACHO)
12937 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
12938 break;
12939
12940 case UNSPEC:
12941 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
12942 {
12943 bool f = i386_asm_output_addr_const_extra (file, x);
12944 gcc_assert (f);
12945 break;
12946 }
12947
12948 gcc_assert (XVECLEN (x, 0) == 1);
12949 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
12950 switch (XINT (x, 1))
12951 {
12952 case UNSPEC_GOT:
12953 fputs ("@GOT", file);
12954 break;
12955 case UNSPEC_GOTOFF:
12956 fputs ("@GOTOFF", file);
12957 break;
12958 case UNSPEC_PLTOFF:
12959 fputs ("@PLTOFF", file);
12960 break;
12961 case UNSPEC_PCREL:
12962 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
12963 "(%rip)" : "[rip]", file);
12964 break;
12965 case UNSPEC_GOTPCREL:
12966 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
12967 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
12968 break;
12969 case UNSPEC_GOTTPOFF:
12970 /* FIXME: This might be @TPOFF in Sun ld too. */
12971 fputs ("@gottpoff", file);
12972 break;
12973 case UNSPEC_TPOFF:
12974 fputs ("@tpoff", file);
12975 break;
12976 case UNSPEC_NTPOFF:
12977 if (TARGET_64BIT)
12978 fputs ("@tpoff", file);
12979 else
12980 fputs ("@ntpoff", file);
12981 break;
12982 case UNSPEC_DTPOFF:
12983 fputs ("@dtpoff", file);
12984 break;
12985 case UNSPEC_GOTNTPOFF:
12986 if (TARGET_64BIT)
12987 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
12988 "@gottpoff(%rip)": "@gottpoff[rip]", file);
12989 else
12990 fputs ("@gotntpoff", file);
12991 break;
12992 case UNSPEC_INDNTPOFF:
12993 fputs ("@indntpoff", file);
12994 break;
12995 #if TARGET_MACHO
12996 case UNSPEC_MACHOPIC_OFFSET:
12997 putc ('-', file);
12998 machopic_output_function_base_name (file);
12999 break;
13000 #endif
13001 default:
13002 output_operand_lossage ("invalid UNSPEC as operand");
13003 break;
13004 }
13005 break;
13006
13007 default:
13008 output_operand_lossage ("invalid expression as operand");
13009 }
13010 }
13011
13012 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13013 We need to emit DTP-relative relocations. */
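 /* The 4-byte form emits ASM_LONG followed by "sym@dtpoff"; the 8-byte
    form appends ", 0" for the upper half.  */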
13014
13015 static void ATTRIBUTE_UNUSED
13016 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13017 {
13018 fputs (ASM_LONG, file);
13019 output_addr_const (file, x);
13020 fputs ("@dtpoff", file);
13021 switch (size)
13022 {
13023 case 4:
13024 break;
13025 case 8:
13026 fputs (", 0", file);
13027 break;
13028 default:
13029 gcc_unreachable ();
13030 }
13031 }
13032
13033 /* Return true if X is a representation of the PIC register. This copes
13034 with calls from ix86_find_base_term, where the register might have
13035 been replaced by a cselib value. */
13036
13037 static bool
13038 ix86_pic_register_p (rtx x)
13039 {
13040 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13041 return (pic_offset_table_rtx
13042 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13043 else
13044 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13045 }
13046
13047 /* Helper function for ix86_delegitimize_address.
13048 Attempt to delegitimize TLS local-exec accesses. */
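 /* E.g. an %fs:/%gs:-relative access whose displacement is
    (const (unspec [SYM] UNSPEC_NTPOFF)) is rewritten back into a plain
    reference to SYM, with any base, index and scale re-applied.  */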
13049
13050 static rtx
13051 ix86_delegitimize_tls_address (rtx orig_x)
13052 {
13053 rtx x = orig_x, unspec;
13054 struct ix86_address addr;
13055
13056 if (!TARGET_TLS_DIRECT_SEG_REFS)
13057 return orig_x;
13058 if (MEM_P (x))
13059 x = XEXP (x, 0);
13060 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13061 return orig_x;
13062 if (ix86_decompose_address (x, &addr) == 0
13063 || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13064 || addr.disp == NULL_RTX
13065 || GET_CODE (addr.disp) != CONST)
13066 return orig_x;
13067 unspec = XEXP (addr.disp, 0);
13068 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13069 unspec = XEXP (unspec, 0);
13070 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13071 return orig_x;
13072 x = XVECEXP (unspec, 0, 0);
13073 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13074 if (unspec != XEXP (addr.disp, 0))
13075 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13076 if (addr.index)
13077 {
13078 rtx idx = addr.index;
13079 if (addr.scale != 1)
13080 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13081 x = gen_rtx_PLUS (Pmode, idx, x);
13082 }
13083 if (addr.base)
13084 x = gen_rtx_PLUS (Pmode, addr.base, x);
13085 if (MEM_P (orig_x))
13086 x = replace_equiv_address_nv (orig_x, x);
13087 return x;
13088 }
13089
13090 /* In the name of slightly smaller debug output, and to cater to
13091 general assembler lossage, recognize PIC+GOTOFF and turn it back
13092 into a direct symbol reference.
13093
13094 On Darwin, this is necessary to avoid a crash, because Darwin
13095 has a different PIC label for each routine but the DWARF debugging
13096 information is not associated with any particular routine, so it's
13097 necessary to remove references to the PIC label from RTL stored by
13098 the DWARF output code. */
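 /* E.g. (plus %ebx (const (unspec [SYM] UNSPEC_GOTOFF))) is turned back
    into a plain reference to SYM; GOT loads and 64-bit @GOTPCREL
    references are handled similarly.  */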
13099
13100 static rtx
13101 ix86_delegitimize_address (rtx x)
13102 {
13103 rtx orig_x = delegitimize_mem_from_attrs (x);
13104 /* addend is NULL or some rtx if x is something+GOTOFF where
13105 something doesn't include the PIC register. */
13106 rtx addend = NULL_RTX;
13107 /* reg_addend is NULL or a multiple of some register. */
13108 rtx reg_addend = NULL_RTX;
13109 /* const_addend is NULL or a const_int. */
13110 rtx const_addend = NULL_RTX;
13111 /* This is the result, or NULL. */
13112 rtx result = NULL_RTX;
13113
13114 x = orig_x;
13115
13116 if (MEM_P (x))
13117 x = XEXP (x, 0);
13118
13119 if (TARGET_64BIT)
13120 {
13121 if (GET_CODE (x) != CONST
13122 || GET_CODE (XEXP (x, 0)) != UNSPEC
13123 || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
13124 && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
13125 || !MEM_P (orig_x))
13126 return ix86_delegitimize_tls_address (orig_x);
13127 x = XVECEXP (XEXP (x, 0), 0, 0);
13128 if (GET_MODE (orig_x) != GET_MODE (x))
13129 {
13130 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13131 GET_MODE (x), 0);
13132 if (x == NULL_RTX)
13133 return orig_x;
13134 }
13135 return x;
13136 }
13137
13138 if (GET_CODE (x) != PLUS
13139 || GET_CODE (XEXP (x, 1)) != CONST)
13140 return ix86_delegitimize_tls_address (orig_x);
13141
13142 if (ix86_pic_register_p (XEXP (x, 0)))
13143 /* %ebx + GOT/GOTOFF */
13144 ;
13145 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13146 {
13147 /* %ebx + %reg * scale + GOT/GOTOFF */
13148 reg_addend = XEXP (x, 0);
13149 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13150 reg_addend = XEXP (reg_addend, 1);
13151 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13152 reg_addend = XEXP (reg_addend, 0);
13153 else
13154 {
13155 reg_addend = NULL_RTX;
13156 addend = XEXP (x, 0);
13157 }
13158 }
13159 else
13160 addend = XEXP (x, 0);
13161
13162 x = XEXP (XEXP (x, 1), 0);
13163 if (GET_CODE (x) == PLUS
13164 && CONST_INT_P (XEXP (x, 1)))
13165 {
13166 const_addend = XEXP (x, 1);
13167 x = XEXP (x, 0);
13168 }
13169
13170 if (GET_CODE (x) == UNSPEC
13171 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13172 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
13173 result = XVECEXP (x, 0, 0);
13174
13175 if (TARGET_MACHO && darwin_local_data_pic (x)
13176 && !MEM_P (orig_x))
13177 result = XVECEXP (x, 0, 0);
13178
13179 if (! result)
13180 return ix86_delegitimize_tls_address (orig_x);
13181
13182 if (const_addend)
13183 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13184 if (reg_addend)
13185 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13186 if (addend)
13187 {
13188 /* If the rest of original X doesn't involve the PIC register, add
13189 addend and subtract pic_offset_table_rtx. This can happen e.g.
13190 for code like:
13191 leal (%ebx, %ecx, 4), %ecx
13192 ...
13193 movl foo@GOTOFF(%ecx), %edx
13194 in which case we return (%ecx - %ebx) + foo. */
13195 if (pic_offset_table_rtx)
13196 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13197 pic_offset_table_rtx),
13198 result);
13199 else
13200 return orig_x;
13201 }
13202 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13203 {
13204 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13205 if (result == NULL_RTX)
13206 return orig_x;
13207 }
13208 return result;
13209 }
13210
13211 /* If X is a machine specific address (i.e. a symbol or label being
13212 referenced as a displacement from the GOT implemented using an
13213 UNSPEC), then return the base term. Otherwise return X. */
13214
13215 rtx
13216 ix86_find_base_term (rtx x)
13217 {
13218 rtx term;
13219
13220 if (TARGET_64BIT)
13221 {
13222 if (GET_CODE (x) != CONST)
13223 return x;
13224 term = XEXP (x, 0);
13225 if (GET_CODE (term) == PLUS
13226 && (CONST_INT_P (XEXP (term, 1))
13227 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13228 term = XEXP (term, 0);
13229 if (GET_CODE (term) != UNSPEC
13230 || (XINT (term, 1) != UNSPEC_GOTPCREL
13231 && XINT (term, 1) != UNSPEC_PCREL))
13232 return x;
13233
13234 return XVECEXP (term, 0, 0);
13235 }
13236
13237 return ix86_delegitimize_address (x);
13238 }
13239 \f
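 /* Print to FILE the assembler condition-code suffix ("e", "ne", "g",
    "b", ...) for comparison CODE in mode MODE.  If REVERSE is nonzero,
    print the suffix for the reversed condition.  FP is nonzero for
    floating-point fcmov/set sequences, which need the alternate
    spellings handled in the GTU/GEU/LEU/UNORDERED cases below.  */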
13240 static void
13241 put_condition_code (enum rtx_code code, enum machine_mode mode, int reverse,
13242 int fp, FILE *file)
13243 {
13244 const char *suffix;
13245
13246 if (mode == CCFPmode || mode == CCFPUmode)
13247 {
13248 code = ix86_fp_compare_code_to_integer (code);
13249 mode = CCmode;
13250 }
13251 if (reverse)
13252 code = reverse_condition (code);
13253
13254 switch (code)
13255 {
13256 case EQ:
13257 switch (mode)
13258 {
13259 case CCAmode:
13260 suffix = "a";
13261 break;
13262
13263 case CCCmode:
13264 suffix = "c";
13265 break;
13266
13267 case CCOmode:
13268 suffix = "o";
13269 break;
13270
13271 case CCSmode:
13272 suffix = "s";
13273 break;
13274
13275 default:
13276 suffix = "e";
13277 }
13278 break;
13279 case NE:
13280 switch (mode)
13281 {
13282 case CCAmode:
13283 suffix = "na";
13284 break;
13285
13286 case CCCmode:
13287 suffix = "nc";
13288 break;
13289
13290 case CCOmode:
13291 suffix = "no";
13292 break;
13293
13294 case CCSmode:
13295 suffix = "ns";
13296 break;
13297
13298 default:
13299 suffix = "ne";
13300 }
13301 break;
13302 case GT:
13303 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13304 suffix = "g";
13305 break;
13306 case GTU:
13307 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13308 Those same assemblers have the same but opposite lossage on cmov. */
13309 if (mode == CCmode)
13310 suffix = fp ? "nbe" : "a";
13311 else if (mode == CCCmode)
13312 suffix = "b";
13313 else
13314 gcc_unreachable ();
13315 break;
13316 case LT:
13317 switch (mode)
13318 {
13319 case CCNOmode:
13320 case CCGOCmode:
13321 suffix = "s";
13322 break;
13323
13324 case CCmode:
13325 case CCGCmode:
13326 suffix = "l";
13327 break;
13328
13329 default:
13330 gcc_unreachable ();
13331 }
13332 break;
13333 case LTU:
13334 gcc_assert (mode == CCmode || mode == CCCmode);
13335 suffix = "b";
13336 break;
13337 case GE:
13338 switch (mode)
13339 {
13340 case CCNOmode:
13341 case CCGOCmode:
13342 suffix = "ns";
13343 break;
13344
13345 case CCmode:
13346 case CCGCmode:
13347 suffix = "ge";
13348 break;
13349
13350 default:
13351 gcc_unreachable ();
13352 }
13353 break;
13354 case GEU:
13355 /* ??? As above. */
13356 gcc_assert (mode == CCmode || mode == CCCmode);
13357 suffix = fp ? "nb" : "ae";
13358 break;
13359 case LE:
13360 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
13361 suffix = "le";
13362 break;
13363 case LEU:
13364 /* ??? As above. */
13365 if (mode == CCmode)
13366 suffix = "be";
13367 else if (mode == CCCmode)
13368 suffix = fp ? "nb" : "ae";
13369 else
13370 gcc_unreachable ();
13371 break;
13372 case UNORDERED:
13373 suffix = fp ? "u" : "p";
13374 break;
13375 case ORDERED:
13376 suffix = fp ? "nu" : "np";
13377 break;
13378 default:
13379 gcc_unreachable ();
13380 }
13381 fputs (suffix, file);
13382 }
13383
13384 /* Print the name of register X to FILE based on its machine mode and number.
13385 If CODE is 'w', pretend the mode is HImode.
13386 If CODE is 'b', pretend the mode is QImode.
13387 If CODE is 'k', pretend the mode is SImode.
13388 If CODE is 'q', pretend the mode is DImode.
13389 If CODE is 'x', pretend the mode is V4SFmode.
13390 If CODE is 't', pretend the mode is V8SFmode.
13391 If CODE is 'h', pretend the reg is the 'high' byte register.
13392 If CODE is 'y', print "st(0)" instead of "st", if the reg is a stack op.
13393 If CODE is 'd', duplicate the operand for an AVX instruction.
13394 */
13395
13396 void
13397 print_reg (rtx x, int code, FILE *file)
13398 {
13399 const char *reg;
13400 bool duplicated = code == 'd' && TARGET_AVX;
13401
13402 gcc_assert (x == pc_rtx
13403 || (REGNO (x) != ARG_POINTER_REGNUM
13404 && REGNO (x) != FRAME_POINTER_REGNUM
13405 && REGNO (x) != FLAGS_REG
13406 && REGNO (x) != FPSR_REG
13407 && REGNO (x) != FPCR_REG));
13408
13409 if (ASSEMBLER_DIALECT == ASM_ATT)
13410 putc ('%', file);
13411
13412 if (x == pc_rtx)
13413 {
13414 gcc_assert (TARGET_64BIT);
13415 fputs ("rip", file);
13416 return;
13417 }
13418
13419 if (code == 'w' || MMX_REG_P (x))
13420 code = 2;
13421 else if (code == 'b')
13422 code = 1;
13423 else if (code == 'k')
13424 code = 4;
13425 else if (code == 'q')
13426 code = 8;
13427 else if (code == 'y')
13428 code = 3;
13429 else if (code == 'h')
13430 code = 0;
13431 else if (code == 'x')
13432 code = 16;
13433 else if (code == 't')
13434 code = 32;
13435 else
13436 code = GET_MODE_SIZE (GET_MODE (x));
13437
13438 /* Irritatingly, AMD extended registers use a different naming convention
13439 from the normal registers.  */
13440 if (REX_INT_REG_P (x))
13441 {
13442 gcc_assert (TARGET_64BIT);
13443 switch (code)
13444 {
13445 case 0:
13446 error ("extended registers have no high halves");
13447 break;
13448 case 1:
13449 fprintf (file, "r%ib", REGNO (x) - FIRST_REX_INT_REG + 8);
13450 break;
13451 case 2:
13452 fprintf (file, "r%iw", REGNO (x) - FIRST_REX_INT_REG + 8);
13453 break;
13454 case 4:
13455 fprintf (file, "r%id", REGNO (x) - FIRST_REX_INT_REG + 8);
13456 break;
13457 case 8:
13458 fprintf (file, "r%i", REGNO (x) - FIRST_REX_INT_REG + 8);
13459 break;
13460 default:
13461 error ("unsupported operand size for extended register");
13462 break;
13463 }
13464 return;
13465 }
13466
13467 reg = NULL;
13468 switch (code)
13469 {
13470 case 3:
13471 if (STACK_TOP_P (x))
13472 {
13473 reg = "st(0)";
13474 break;
13475 }
13476 /* FALLTHRU */
13477 case 8:
13478 case 4:
13479 case 12:
13480 if (! ANY_FP_REG_P (x))
13481 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
13482 /* FALLTHRU */
13483 case 16:
13484 case 2:
13485 normal:
13486 reg = hi_reg_name[REGNO (x)];
13487 break;
13488 case 1:
13489 if (REGNO (x) >= ARRAY_SIZE (qi_reg_name))
13490 goto normal;
13491 reg = qi_reg_name[REGNO (x)];
13492 break;
13493 case 0:
13494 if (REGNO (x) >= ARRAY_SIZE (qi_high_reg_name))
13495 goto normal;
13496 reg = qi_high_reg_name[REGNO (x)];
13497 break;
13498 case 32:
13499 if (SSE_REG_P (x))
13500 {
13501 gcc_assert (!duplicated);
13502 putc ('y', file);
13503 fputs (hi_reg_name[REGNO (x)] + 1, file);
13504 return;
13505 }
13506 break;
13507 default:
13508 gcc_unreachable ();
13509 }
13510
13511 fputs (reg, file);
13512 if (duplicated)
13513 {
13514 if (ASSEMBLER_DIALECT == ASM_ATT)
13515 fprintf (file, ", %%%s", reg);
13516 else
13517 fprintf (file, ", %s", reg);
13518 }
13519 }
13520
13521 /* Locate some local-dynamic symbol still in use by this function
13522 so that we can print its name in some tls_local_dynamic_base
13523 pattern. */
13524
13525 static int
13526 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
13527 {
13528 rtx x = *px;
13529
13530 if (GET_CODE (x) == SYMBOL_REF
13531 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
13532 {
13533 cfun->machine->some_ld_name = XSTR (x, 0);
13534 return 1;
13535 }
13536
13537 return 0;
13538 }
13539
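 /* Return the name of some local-dynamic TLS symbol referenced by the
    current function, scanning its insns on the first call and caching
    the result; return NULL if there is none.  */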
13540 static const char *
13541 get_some_local_dynamic_name (void)
13542 {
13543 rtx insn;
13544
13545 if (cfun->machine->some_ld_name)
13546 return cfun->machine->some_ld_name;
13547
13548 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
13549 if (NONDEBUG_INSN_P (insn)
13550 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
13551 return cfun->machine->some_ld_name;
13552
13553 return NULL;
13554 }
13555
13556 /* Meaning of CODE:
13557 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
13558 C -- print opcode suffix for set/cmov insn.
13559 c -- like C, but print reversed condition
13560 F,f -- likewise, but for floating-point.
13561 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
13562 otherwise nothing
13563 R -- print the prefix for register names.
13564 z -- print the opcode suffix for the size of the current operand.
13565 Z -- likewise, with special suffixes for x87 instructions.
13566 * -- print a star (in certain assembler syntax)
13567 A -- print an absolute memory reference.
13568 w -- print the operand as if it's a "word" (HImode) even if it isn't.
13569 s -- print a shift double count, followed by the assembler's argument
13570 delimiter.
13571 b -- print the QImode name of the register for the indicated operand.
13572 %b0 would print %al if operands[0] is reg 0.
13573 w -- likewise, print the HImode name of the register.
13574 k -- likewise, print the SImode name of the register.
13575 q -- likewise, print the DImode name of the register.
13576 x -- likewise, print the V4SFmode name of the register.
13577 t -- likewise, print the V8SFmode name of the register.
13578 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
13579 y -- print "st(0)" instead of "st" as a register.
13580 d -- print duplicated register operand for AVX instruction.
13581 D -- print condition for SSE cmp instruction.
13582 P -- if PIC, print an @PLT suffix.
13583 p -- print raw symbol name.
13584 X -- don't print any sort of PIC '@' suffix for a symbol.
13585 & -- print some in-use local-dynamic symbol name.
13586 H -- print a memory address offset by 8; used for sse high-parts
13587 Y -- print condition for XOP pcom* instruction.
13588 + -- print a branch hint as 'cs' or 'ds' prefix
13589 ; -- print a semicolon (after prefixes, due to a bug in older gas).
13590 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
13591 @ -- print a segment register of thread base pointer load
13592 */
13593
13594 void
13595 ix86_print_operand (FILE *file, rtx x, int code)
13596 {
13597 if (code)
13598 {
13599 switch (code)
13600 {
13601 case '*':
13602 if (ASSEMBLER_DIALECT == ASM_ATT)
13603 putc ('*', file);
13604 return;
13605
13606 case '&':
13607 {
13608 const char *name = get_some_local_dynamic_name ();
13609 if (name == NULL)
13610 output_operand_lossage ("'%%&' used without any "
13611 "local dynamic TLS references");
13612 else
13613 assemble_name (file, name);
13614 return;
13615 }
13616
13617 case 'A':
13618 switch (ASSEMBLER_DIALECT)
13619 {
13620 case ASM_ATT:
13621 putc ('*', file);
13622 break;
13623
13624 case ASM_INTEL:
13625 /* Intel syntax.  For absolute addresses, registers should not
13626 be surrounded by brackets.  */
13627 if (!REG_P (x))
13628 {
13629 putc ('[', file);
13630 ix86_print_operand (file, x, 0);
13631 putc (']', file);
13632 return;
13633 }
13634 break;
13635
13636 default:
13637 gcc_unreachable ();
13638 }
13639
13640 ix86_print_operand (file, x, 0);
13641 return;
13642
13643
13644 case 'L':
13645 if (ASSEMBLER_DIALECT == ASM_ATT)
13646 putc ('l', file);
13647 return;
13648
13649 case 'W':
13650 if (ASSEMBLER_DIALECT == ASM_ATT)
13651 putc ('w', file);
13652 return;
13653
13654 case 'B':
13655 if (ASSEMBLER_DIALECT == ASM_ATT)
13656 putc ('b', file);
13657 return;
13658
13659 case 'Q':
13660 if (ASSEMBLER_DIALECT == ASM_ATT)
13661 putc ('l', file);
13662 return;
13663
13664 case 'S':
13665 if (ASSEMBLER_DIALECT == ASM_ATT)
13666 putc ('s', file);
13667 return;
13668
13669 case 'T':
13670 if (ASSEMBLER_DIALECT == ASM_ATT)
13671 putc ('t', file);
13672 return;
13673
13674 case 'z':
13675 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
13676 {
13677 /* Opcodes don't get size suffixes when using Intel syntax.  */
13678 if (ASSEMBLER_DIALECT == ASM_INTEL)
13679 return;
13680
13681 switch (GET_MODE_SIZE (GET_MODE (x)))
13682 {
13683 case 1:
13684 putc ('b', file);
13685 return;
13686
13687 case 2:
13688 putc ('w', file);
13689 return;
13690
13691 case 4:
13692 putc ('l', file);
13693 return;
13694
13695 case 8:
13696 putc ('q', file);
13697 return;
13698
13699 default:
13700 output_operand_lossage
13701 ("invalid operand size for operand code '%c'", code);
13702 return;
13703 }
13704 }
13705
13706 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
13707 warning
13708 (0, "non-integer operand used with operand code '%c'", code);
13709 /* FALLTHRU */
13710
13711 case 'Z':
13712 /* 387 opcodes don't get size suffixes when using Intel syntax.  */
13713 if (ASSEMBLER_DIALECT == ASM_INTEL)
13714 return;
13715
13716 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
13717 {
13718 switch (GET_MODE_SIZE (GET_MODE (x)))
13719 {
13720 case 2:
13721 #ifdef HAVE_AS_IX86_FILDS
13722 putc ('s', file);
13723 #endif
13724 return;
13725
13726 case 4:
13727 putc ('l', file);
13728 return;
13729
13730 case 8:
13731 #ifdef HAVE_AS_IX86_FILDQ
13732 putc ('q', file);
13733 #else
13734 fputs ("ll", file);
13735 #endif
13736 return;
13737
13738 default:
13739 break;
13740 }
13741 }
13742 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
13743 {
13744 /* 387 opcodes don't get size suffixes
13745 if the operands are registers. */
13746 if (STACK_REG_P (x))
13747 return;
13748
13749 switch (GET_MODE_SIZE (GET_MODE (x)))
13750 {
13751 case 4:
13752 putc ('s', file);
13753 return;
13754
13755 case 8:
13756 putc ('l', file);
13757 return;
13758
13759 case 12:
13760 case 16:
13761 putc ('t', file);
13762 return;
13763
13764 default:
13765 break;
13766 }
13767 }
13768 else
13769 {
13770 output_operand_lossage
13771 ("invalid operand type used with operand code '%c'", code);
13772 return;
13773 }
13774
13775 output_operand_lossage
13776 ("invalid operand size for operand code '%c'", code);
13777 return;
13778
13779 case 'd':
13780 case 'b':
13781 case 'w':
13782 case 'k':
13783 case 'q':
13784 case 'h':
13785 case 't':
13786 case 'y':
13787 case 'x':
13788 case 'X':
13789 case 'P':
13790 case 'p':
13791 break;
13792
13793 case 's':
13794 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
13795 {
13796 ix86_print_operand (file, x, 0);
13797 fputs (", ", file);
13798 }
13799 return;
13800
13801 case 'D':
13802 /* A little bit of braindamage here.  The SSE compare instructions
13803 use completely different names for the comparisons than the
13804 fp conditional moves do.  */
13805 if (TARGET_AVX)
13806 {
13807 switch (GET_CODE (x))
13808 {
13809 case EQ:
13810 fputs ("eq", file);
13811 break;
13812 case UNEQ:
13813 fputs ("eq_us", file);
13814 break;
13815 case LT:
13816 fputs ("lt", file);
13817 break;
13818 case UNLT:
13819 fputs ("nge", file);
13820 break;
13821 case LE:
13822 fputs ("le", file);
13823 break;
13824 case UNLE:
13825 fputs ("ngt", file);
13826 break;
13827 case UNORDERED:
13828 fputs ("unord", file);
13829 break;
13830 case NE:
13831 fputs ("neq", file);
13832 break;
13833 case LTGT:
13834 fputs ("neq_oq", file);
13835 break;
13836 case GE:
13837 fputs ("ge", file);
13838 break;
13839 case UNGE:
13840 fputs ("nlt", file);
13841 break;
13842 case GT:
13843 fputs ("gt", file);
13844 break;
13845 case UNGT:
13846 fputs ("nle", file);
13847 break;
13848 case ORDERED:
13849 fputs ("ord", file);
13850 break;
13851 default:
13852 output_operand_lossage ("operand is not a condition code, "
13853 "invalid operand code 'D'");
13854 return;
13855 }
13856 }
13857 else
13858 {
13859 switch (GET_CODE (x))
13860 {
13861 case EQ:
13862 case UNEQ:
13863 fputs ("eq", file);
13864 break;
13865 case LT:
13866 case UNLT:
13867 fputs ("lt", file);
13868 break;
13869 case LE:
13870 case UNLE:
13871 fputs ("le", file);
13872 break;
13873 case UNORDERED:
13874 fputs ("unord", file);
13875 break;
13876 case NE:
13877 case LTGT:
13878 fputs ("neq", file);
13879 break;
13880 case UNGE:
13881 case GE:
13882 fputs ("nlt", file);
13883 break;
13884 case UNGT:
13885 case GT:
13886 fputs ("nle", file);
13887 break;
13888 case ORDERED:
13889 fputs ("ord", file);
13890 break;
13891 default:
13892 output_operand_lossage ("operand is not a condition code, "
13893 "invalid operand code 'D'");
13894 return;
13895 }
13896 }
13897 return;
13898 case 'O':
13899 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
13900 if (ASSEMBLER_DIALECT == ASM_ATT)
13901 {
13902 switch (GET_MODE (x))
13903 {
13904 case HImode: putc ('w', file); break;
13905 case SImode:
13906 case SFmode: putc ('l', file); break;
13907 case DImode:
13908 case DFmode: putc ('q', file); break;
13909 default: gcc_unreachable ();
13910 }
13911 putc ('.', file);
13912 }
13913 #endif
13914 return;
13915 case 'C':
13916 if (!COMPARISON_P (x))
13917 {
13918 output_operand_lossage ("operand is neither a constant nor a "
13919 "condition code, invalid operand code "
13920 "'C'");
13921 return;
13922 }
13923 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 0, file);
13924 return;
13925 case 'F':
13926 if (!COMPARISON_P (x))
13927 {
13928 output_operand_lossage ("operand is neither a constant nor a "
13929 "condition code, invalid operand code "
13930 "'F'");
13931 return;
13932 }
13933 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
13934 if (ASSEMBLER_DIALECT == ASM_ATT)
13935 putc ('.', file);
13936 #endif
13937 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 1, file);
13938 return;
13939
13940 /* Like above, but reverse condition */
13941 case 'c':
13942 /* Check to see if argument to %c is really a constant
13943 and not a condition code which needs to be reversed. */
13944 if (!COMPARISON_P (x))
13945 {
13946 output_operand_lossage ("operand is neither a constant nor a "
13947 "condition code, invalid operand "
13948 "code 'c'");
13949 return;
13950 }
13951 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 0, file);
13952 return;
13953 case 'f':
13954 if (!COMPARISON_P (x))
13955 {
13956 output_operand_lossage ("operand is neither a constant nor a "
13957 "condition code, invalid operand "
13958 "code 'f'");
13959 return;
13960 }
13961 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
13962 if (ASSEMBLER_DIALECT == ASM_ATT)
13963 putc ('.', file);
13964 #endif
13965 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 1, file);
13966 return;
13967
13968 case 'H':
13969 /* It doesn't actually matter what mode we use here, as we're
13970 only going to use this for printing. */
13971 x = adjust_address_nv (x, DImode, 8);
13972 break;
13973
13974 case '+':
13975 {
13976 rtx x;
13977
13978 if (!optimize
13979 || optimize_function_for_size_p (cfun) || !TARGET_BRANCH_PREDICTION_HINTS)
13980 return;
13981
13982 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
13983 if (x)
13984 {
13985 int pred_val = INTVAL (XEXP (x, 0));
13986
13987 if (pred_val < REG_BR_PROB_BASE * 45 / 100
13988 || pred_val > REG_BR_PROB_BASE * 55 / 100)
13989 {
13990 int taken = pred_val > REG_BR_PROB_BASE / 2;
13991 int cputaken = final_forward_branch_p (current_output_insn) == 0;
13992
13993 /* Emit hints only when the default branch prediction
13994 heuristics would fail.  */
13995 if (taken != cputaken)
13996 {
13997 /* We use 3e (DS) prefix for taken branches and
13998 2e (CS) prefix for not taken branches. */
13999 if (taken)
14000 fputs ("ds ; ", file);
14001 else
14002 fputs ("cs ; ", file);
14003 }
14004 }
14005 }
14006 return;
14007 }
14008
14009 case 'Y':
14010 switch (GET_CODE (x))
14011 {
14012 case NE:
14013 fputs ("neq", file);
14014 break;
14015 case EQ:
14016 fputs ("eq", file);
14017 break;
14018 case GE:
14019 case GEU:
14020 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14021 break;
14022 case GT:
14023 case GTU:
14024 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14025 break;
14026 case LE:
14027 case LEU:
14028 fputs ("le", file);
14029 break;
14030 case LT:
14031 case LTU:
14032 fputs ("lt", file);
14033 break;
14034 case UNORDERED:
14035 fputs ("unord", file);
14036 break;
14037 case ORDERED:
14038 fputs ("ord", file);
14039 break;
14040 case UNEQ:
14041 fputs ("ueq", file);
14042 break;
14043 case UNGE:
14044 fputs ("nlt", file);
14045 break;
14046 case UNGT:
14047 fputs ("nle", file);
14048 break;
14049 case UNLE:
14050 fputs ("ule", file);
14051 break;
14052 case UNLT:
14053 fputs ("ult", file);
14054 break;
14055 case LTGT:
14056 fputs ("une", file);
14057 break;
14058 default:
14059 output_operand_lossage ("operand is not a condition code, "
14060 "invalid operand code 'Y'");
14061 return;
14062 }
14063 return;
14064
14065 case ';':
14066 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14067 putc (';', file);
14068 #endif
14069 return;
14070
14071 case '@':
14072 if (ASSEMBLER_DIALECT == ASM_ATT)
14073 putc ('%', file);
14074
14075 /* The kernel uses a different segment register for performance
14076 reasons; a system call would not have to trash the userspace
14077 segment register, which would be expensive. */
14078 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14079 fputs ("fs", file);
14080 else
14081 fputs ("gs", file);
14082 return;
14083
14084 case '~':
14085 putc (TARGET_AVX2 ? 'i' : 'f', file);
14086 return;
14087
14088 default:
14089 output_operand_lossage ("invalid operand code '%c'", code);
14090 }
14091 }
14092
14093 if (REG_P (x))
14094 print_reg (x, code, file);
14095
14096 else if (MEM_P (x))
14097 {
14098 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14099 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14100 && GET_MODE (x) != BLKmode)
14101 {
14102 const char * size;
14103 switch (GET_MODE_SIZE (GET_MODE (x)))
14104 {
14105 case 1: size = "BYTE"; break;
14106 case 2: size = "WORD"; break;
14107 case 4: size = "DWORD"; break;
14108 case 8: size = "QWORD"; break;
14109 case 12: size = "TBYTE"; break;
14110 case 16:
14111 if (GET_MODE (x) == XFmode)
14112 size = "TBYTE";
14113 else
14114 size = "XMMWORD";
14115 break;
14116 case 32: size = "YMMWORD"; break;
14117 default:
14118 gcc_unreachable ();
14119 }
14120
14121 /* Check for explicit size override (codes 'b', 'w' and 'k') */
14122 if (code == 'b')
14123 size = "BYTE";
14124 else if (code == 'w')
14125 size = "WORD";
14126 else if (code == 'k')
14127 size = "DWORD";
14128
14129 fputs (size, file);
14130 fputs (" PTR ", file);
14131 }
14132
14133 x = XEXP (x, 0);
14134 /* Avoid (%rip) for call operands. */
14135 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14136 && !CONST_INT_P (x))
14137 output_addr_const (file, x);
14138 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14139 output_operand_lossage ("invalid constraints for operand");
14140 else
14141 output_address (x);
14142 }
14143
14144 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14145 {
14146 REAL_VALUE_TYPE r;
14147 long l;
14148
14149 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14150 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14151
14152 if (ASSEMBLER_DIALECT == ASM_ATT)
14153 putc ('$', file);
14154 /* Sign extend 32bit SFmode immediate to 8 bytes. */
14155 if (code == 'q')
14156 fprintf (file, "0x%08llx", (unsigned long long) (int) l);
14157 else
14158 fprintf (file, "0x%08x", (unsigned int) l);
14159 }
14160
14161 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14162 {
14163 REAL_VALUE_TYPE r;
14164 long l[2];
14165
14166 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14167 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14168
14169 if (ASSEMBLER_DIALECT == ASM_ATT)
14170 putc ('$', file);
14171 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14172 }
14173
14174 /* These float cases don't actually occur as immediate operands. */
14175 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14176 {
14177 char dstr[30];
14178
14179 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14180 fputs (dstr, file);
14181 }
14182
14183 else
14184 {
14185 /* We have patterns that allow zero sets of memory, for instance.
14186 In 64-bit mode, we should probably support all 8-byte vectors,
14187 since we can in fact encode that into an immediate. */
14188 if (GET_CODE (x) == CONST_VECTOR)
14189 {
14190 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14191 x = const0_rtx;
14192 }
14193
14194 if (code != 'P' && code != 'p')
14195 {
14196 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14197 {
14198 if (ASSEMBLER_DIALECT == ASM_ATT)
14199 putc ('$', file);
14200 }
14201 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14202 || GET_CODE (x) == LABEL_REF)
14203 {
14204 if (ASSEMBLER_DIALECT == ASM_ATT)
14205 putc ('$', file);
14206 else
14207 fputs ("OFFSET FLAT:", file);
14208 }
14209 }
14210 if (CONST_INT_P (x))
14211 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14212 else if (flag_pic || MACHOPIC_INDIRECT)
14213 output_pic_addr_const (file, x, code);
14214 else
14215 output_addr_const (file, x);
14216 }
14217 }
14218
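 /* Return true if CODE is one of the punctuation operand codes ('@',
    '*', '+', '&', ';', '~') handled by ix86_print_operand above.  */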
14219 static bool
14220 ix86_print_operand_punct_valid_p (unsigned char code)
14221 {
14222 return (code == '@' || code == '*' || code == '+'
14223 || code == '&' || code == ';' || code == '~');
14224 }
14225 \f
14226 /* Print a memory operand whose address is ADDR. */
14227
14228 static void
14229 ix86_print_operand_address (FILE *file, rtx addr)
14230 {
14231 struct ix86_address parts;
14232 rtx base, index, disp;
14233 int scale;
14234 int ok;
14235 bool vsib = false;
14236
14237 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14238 {
14239 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14240 gcc_assert (parts.index == NULL_RTX);
14241 parts.index = XVECEXP (addr, 0, 1);
14242 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14243 addr = XVECEXP (addr, 0, 0);
14244 vsib = true;
14245 }
14246 else
14247 ok = ix86_decompose_address (addr, &parts);
14248
14249 gcc_assert (ok);
14250
14251 if (parts.base && GET_CODE (parts.base) == SUBREG)
14252 {
14253 rtx tmp = SUBREG_REG (parts.base);
14254 parts.base = simplify_subreg (GET_MODE (parts.base),
14255 tmp, GET_MODE (tmp), 0);
14256 }
14257
14258 if (parts.index && GET_CODE (parts.index) == SUBREG)
14259 {
14260 rtx tmp = SUBREG_REG (parts.index);
14261 parts.index = simplify_subreg (GET_MODE (parts.index),
14262 tmp, GET_MODE (tmp), 0);
14263 }
14264
14265 base = parts.base;
14266 index = parts.index;
14267 disp = parts.disp;
14268 scale = parts.scale;
14269
14270 switch (parts.seg)
14271 {
14272 case SEG_DEFAULT:
14273 break;
14274 case SEG_FS:
14275 case SEG_GS:
14276 if (ASSEMBLER_DIALECT == ASM_ATT)
14277 putc ('%', file);
14278 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14279 break;
14280 default:
14281 gcc_unreachable ();
14282 }
14283
14284 /* Use one byte shorter RIP relative addressing for 64bit mode. */
14285 if (TARGET_64BIT && !base && !index)
14286 {
14287 rtx symbol = disp;
14288
14289 if (GET_CODE (disp) == CONST
14290 && GET_CODE (XEXP (disp, 0)) == PLUS
14291 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14292 symbol = XEXP (XEXP (disp, 0), 0);
14293
14294 if (GET_CODE (symbol) == LABEL_REF
14295 || (GET_CODE (symbol) == SYMBOL_REF
14296 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14297 base = pc_rtx;
14298 }
14299 if (!base && !index)
14300 {
14301 /* A displacement-only address requires special attention.  */
14302
14303 if (CONST_INT_P (disp))
14304 {
14305 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14306 fputs ("ds:", file);
14307 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14308 }
14309 else if (flag_pic)
14310 output_pic_addr_const (file, disp, 0);
14311 else
14312 output_addr_const (file, disp);
14313 }
14314 else
14315 {
14316 int code = 0;
14317
14318 /* Print SImode registers for zero-extended addresses to force
14319 addr32 prefix. Otherwise print DImode registers to avoid it. */
14320 if (TARGET_64BIT)
14321 code = ((GET_CODE (addr) == ZERO_EXTEND
14322 || GET_CODE (addr) == AND)
14323 ? 'l'
14324 : 'q');
14325
14326 if (ASSEMBLER_DIALECT == ASM_ATT)
14327 {
14328 if (disp)
14329 {
14330 if (flag_pic)
14331 output_pic_addr_const (file, disp, 0);
14332 else if (GET_CODE (disp) == LABEL_REF)
14333 output_asm_label (disp);
14334 else
14335 output_addr_const (file, disp);
14336 }
14337
14338 putc ('(', file);
14339 if (base)
14340 print_reg (base, code, file);
14341 if (index)
14342 {
14343 putc (',', file);
14344 print_reg (index, vsib ? 0 : code, file);
14345 if (scale != 1 || vsib)
14346 fprintf (file, ",%d", scale);
14347 }
14348 putc (')', file);
14349 }
14350 else
14351 {
14352 rtx offset = NULL_RTX;
14353
14354 if (disp)
14355 {
14356 /* Pull out the offset of a symbol; print any symbol itself. */
14357 if (GET_CODE (disp) == CONST
14358 && GET_CODE (XEXP (disp, 0)) == PLUS
14359 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14360 {
14361 offset = XEXP (XEXP (disp, 0), 1);
14362 disp = gen_rtx_CONST (VOIDmode,
14363 XEXP (XEXP (disp, 0), 0));
14364 }
14365
14366 if (flag_pic)
14367 output_pic_addr_const (file, disp, 0);
14368 else if (GET_CODE (disp) == LABEL_REF)
14369 output_asm_label (disp);
14370 else if (CONST_INT_P (disp))
14371 offset = disp;
14372 else
14373 output_addr_const (file, disp);
14374 }
14375
14376 putc ('[', file);
14377 if (base)
14378 {
14379 print_reg (base, code, file);
14380 if (offset)
14381 {
14382 if (INTVAL (offset) >= 0)
14383 putc ('+', file);
14384 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14385 }
14386 }
14387 else if (offset)
14388 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14389 else
14390 putc ('0', file);
14391
14392 if (index)
14393 {
14394 putc ('+', file);
14395 print_reg (index, vsib ? 0 : code, file);
14396 if (scale != 1 || vsib)
14397 fprintf (file, "*%d", scale);
14398 }
14399 putc (']', file);
14400 }
14401 }
14402 }
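
/* A minimal, self-contained sketch (not part of GCC) of the two output
   syntaxes handled above: AT&T prints "disp(%base,%index,scale)" while
   Intel prints "[base+index*scale+disp]".  The struct and function
   names below are hypothetical and exist only for illustration.  */
#if 0
#include <stdio.h>

struct toy_address
{
  const char *base;   /* e.g. "rax", or NULL if absent */
  const char *index;  /* e.g. "rbx", or NULL if absent */
  int scale;          /* 1, 2, 4 or 8 */
  long disp;          /* displacement, possibly 0 */
};

static void
toy_print_address (FILE *f, const struct toy_address *a, int att_syntax)
{
  if (att_syntax)
    {
      /* AT&T: disp(%base,%index,scale) */
      if (a->disp)
        fprintf (f, "%ld", a->disp);
      fputc ('(', f);
      if (a->base)
        fprintf (f, "%%%s", a->base);
      if (a->index)
        fprintf (f, ",%%%s,%d", a->index, a->scale);
      fputc (')', f);
    }
  else
    {
      /* Intel: [base+index*scale+disp] */
      fputc ('[', f);
      if (a->base)
        fputs (a->base, f);
      else
        fputc ('0', f);
      if (a->index)
        fprintf (f, "+%s*%d", a->index, a->scale);
      if (a->disp)
        fprintf (f, "%+ld", a->disp);
      fputc (']', f);
    }
}
#endif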
14403
14404 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
14405
14406 static bool
14407 i386_asm_output_addr_const_extra (FILE *file, rtx x)
14408 {
14409 rtx op;
14410
14411 if (GET_CODE (x) != UNSPEC)
14412 return false;
14413
14414 op = XVECEXP (x, 0, 0);
14415 switch (XINT (x, 1))
14416 {
14417 case UNSPEC_GOTTPOFF:
14418 output_addr_const (file, op);
14419 /* FIXME: This might be @TPOFF in Sun ld. */
14420 fputs ("@gottpoff", file);
14421 break;
14422 case UNSPEC_TPOFF:
14423 output_addr_const (file, op);
14424 fputs ("@tpoff", file);
14425 break;
14426 case UNSPEC_NTPOFF:
14427 output_addr_const (file, op);
14428 if (TARGET_64BIT)
14429 fputs ("@tpoff", file);
14430 else
14431 fputs ("@ntpoff", file);
14432 break;
14433 case UNSPEC_DTPOFF:
14434 output_addr_const (file, op);
14435 fputs ("@dtpoff", file);
14436 break;
14437 case UNSPEC_GOTNTPOFF:
14438 output_addr_const (file, op);
14439 if (TARGET_64BIT)
14440 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
14441 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
14442 else
14443 fputs ("@gotntpoff", file);
14444 break;
14445 case UNSPEC_INDNTPOFF:
14446 output_addr_const (file, op);
14447 fputs ("@indntpoff", file);
14448 break;
14449 #if TARGET_MACHO
14450 case UNSPEC_MACHOPIC_OFFSET:
14451 output_addr_const (file, op);
14452 putc ('-', file);
14453 machopic_output_function_base_name (file);
14454 break;
14455 #endif
14456
14457 case UNSPEC_STACK_CHECK:
14458 {
14459 int offset;
14460
14461 gcc_assert (flag_split_stack);
14462
14463 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14464 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14465 #else
14466 gcc_unreachable ();
14467 #endif
14468
14469 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
14470 }
14471 break;
14472
14473 default:
14474 return false;
14475 }
14476
14477 return true;
14478 }
14479 \f
14480 /* Split one or more double-mode RTL references into pairs of half-mode
14481 references. The RTL can be REG, offsettable MEM, integer constant, or
14482 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
14483 split and "num" is its length. lo_half and hi_half are output arrays
14484 that parallel "operands". */
14485
14486 void
14487 split_double_mode (enum machine_mode mode, rtx operands[],
14488 int num, rtx lo_half[], rtx hi_half[])
14489 {
14490 enum machine_mode half_mode;
14491 unsigned int byte;
14492
14493 switch (mode)
14494 {
14495 case TImode:
14496 half_mode = DImode;
14497 break;
14498 case DImode:
14499 half_mode = SImode;
14500 break;
14501 default:
14502 gcc_unreachable ();
14503 }
14504
14505 byte = GET_MODE_SIZE (half_mode);
14506
14507 while (num--)
14508 {
14509 rtx op = operands[num];
14510
14511 /* simplify_subreg refuses to split volatile memory addresses,
14512 but we still have to handle them.  */
14513 if (MEM_P (op))
14514 {
14515 lo_half[num] = adjust_address (op, half_mode, 0);
14516 hi_half[num] = adjust_address (op, half_mode, byte);
14517 }
14518 else
14519 {
14520 lo_half[num] = simplify_gen_subreg (half_mode, op,
14521 GET_MODE (op) == VOIDmode
14522 ? mode : GET_MODE (op), 0);
14523 hi_half[num] = simplify_gen_subreg (half_mode, op,
14524 GET_MODE (op) == VOIDmode
14525 ? mode : GET_MODE (op), byte);
14526 }
14527 }
14528 }
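
/* A minimal sketch (not part of GCC): on little-endian x86, the byte
   offsets used above (0 and the size of the half mode) correspond to
   taking the low and high halves of the value.  The helper below is
   hypothetical, shown only to make the offset arithmetic concrete.  */
#if 0
#include <stdint.h>

static void
toy_split_di (uint64_t value, uint32_t *lo_half, uint32_t *hi_half)
{
  *lo_half = (uint32_t) value;          /* bytes 0..3 of the DImode value */
  *hi_half = (uint32_t) (value >> 32);  /* bytes 4..7 of the DImode value */
}
#endif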
14529 \f
14530 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
14531 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
14532 is the expression of the binary operation. The output may either be
14533 emitted here, or returned to the caller, like all output_* functions.
14534
14535 There is no guarantee that the operands are the same mode, as they
14536 might be within FLOAT or FLOAT_EXTEND expressions. */
14537
14538 #ifndef SYSV386_COMPAT
14539 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
14540 wants to fix the assemblers because that causes incompatibility
14541 with gcc. No-one wants to fix gcc because that causes
14542 incompatibility with assemblers... You can use the option of
14543 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
14544 #define SYSV386_COMPAT 1
14545 #endif
14546
14547 const char *
14548 output_387_binary_op (rtx insn, rtx *operands)
14549 {
14550 static char buf[40];
14551 const char *p;
14552 const char *ssep;
14553 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
14554
14555 #ifdef ENABLE_CHECKING
14556 /* Even if we do not want to check the inputs, this documents input
14557 constraints, which helps in understanding the following code.  */
14558 if (STACK_REG_P (operands[0])
14559 && ((REG_P (operands[1])
14560 && REGNO (operands[0]) == REGNO (operands[1])
14561 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
14562 || (REG_P (operands[2])
14563 && REGNO (operands[0]) == REGNO (operands[2])
14564 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
14565 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
14566 ; /* ok */
14567 else
14568 gcc_assert (is_sse);
14569 #endif
14570
14571 switch (GET_CODE (operands[3]))
14572 {
14573 case PLUS:
14574 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14575 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14576 p = "fiadd";
14577 else
14578 p = "fadd";
14579 ssep = "vadd";
14580 break;
14581
14582 case MINUS:
14583 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14584 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14585 p = "fisub";
14586 else
14587 p = "fsub";
14588 ssep = "vsub";
14589 break;
14590
14591 case MULT:
14592 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14593 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14594 p = "fimul";
14595 else
14596 p = "fmul";
14597 ssep = "vmul";
14598 break;
14599
14600 case DIV:
14601 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14602 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14603 p = "fidiv";
14604 else
14605 p = "fdiv";
14606 ssep = "vdiv";
14607 break;
14608
14609 default:
14610 gcc_unreachable ();
14611 }
14612
14613 if (is_sse)
14614 {
14615 if (TARGET_AVX)
14616 {
14617 strcpy (buf, ssep);
14618 if (GET_MODE (operands[0]) == SFmode)
14619 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
14620 else
14621 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
14622 }
14623 else
14624 {
14625 strcpy (buf, ssep + 1);
14626 if (GET_MODE (operands[0]) == SFmode)
14627 strcat (buf, "ss\t{%2, %0|%0, %2}");
14628 else
14629 strcat (buf, "sd\t{%2, %0|%0, %2}");
14630 }
14631 return buf;
14632 }
14633 strcpy (buf, p);
14634
14635 switch (GET_CODE (operands[3]))
14636 {
14637 case MULT:
14638 case PLUS:
14639 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
14640 {
14641 rtx temp = operands[2];
14642 operands[2] = operands[1];
14643 operands[1] = temp;
14644 }
14645
14646 /* We know operands[0] == operands[1].  */
14647
14648 if (MEM_P (operands[2]))
14649 {
14650 p = "%Z2\t%2";
14651 break;
14652 }
14653
14654 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
14655 {
14656 if (STACK_TOP_P (operands[0]))
14657 /* How is it that we are storing to a dead operand[2]?
14658 Well, presumably operands[1] is dead too. We can't
14659 store the result to st(0) as st(0) gets popped on this
14660 instruction. Instead store to operands[2] (which I
14661 think has to be st(1)). st(1) will be popped later.
14662 gcc <= 2.8.1 didn't have this check and generated
14663 assembly code that the Unixware assembler rejected. */
14664 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
14665 else
14666 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
14667 break;
14668 }
14669
14670 if (STACK_TOP_P (operands[0]))
14671 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
14672 else
14673 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
14674 break;
14675
14676 case MINUS:
14677 case DIV:
14678 if (MEM_P (operands[1]))
14679 {
14680 p = "r%Z1\t%1";
14681 break;
14682 }
14683
14684 if (MEM_P (operands[2]))
14685 {
14686 p = "%Z2\t%2";
14687 break;
14688 }
14689
14690 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
14691 {
14692 #if SYSV386_COMPAT
14693 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
14694 derived assemblers, confusingly reverse the direction of
14695 the operation for fsub{r} and fdiv{r} when the
14696 destination register is not st(0). The Intel assembler
14697 doesn't have this brain damage. Read !SYSV386_COMPAT to
14698 figure out what the hardware really does. */
14699 if (STACK_TOP_P (operands[0]))
14700 p = "{p\t%0, %2|rp\t%2, %0}";
14701 else
14702 p = "{rp\t%2, %0|p\t%0, %2}";
14703 #else
14704 if (STACK_TOP_P (operands[0]))
14705 /* As above for fmul/fadd, we can't store to st(0). */
14706 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
14707 else
14708 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
14709 #endif
14710 break;
14711 }
14712
14713 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
14714 {
14715 #if SYSV386_COMPAT
14716 if (STACK_TOP_P (operands[0]))
14717 p = "{rp\t%0, %1|p\t%1, %0}";
14718 else
14719 p = "{p\t%1, %0|rp\t%0, %1}";
14720 #else
14721 if (STACK_TOP_P (operands[0]))
14722 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
14723 else
14724 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
14725 #endif
14726 break;
14727 }
14728
14729 if (STACK_TOP_P (operands[0]))
14730 {
14731 if (STACK_TOP_P (operands[1]))
14732 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
14733 else
14734 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
14735 break;
14736 }
14737 else if (STACK_TOP_P (operands[1]))
14738 {
14739 #if SYSV386_COMPAT
14740 p = "{\t%1, %0|r\t%0, %1}";
14741 #else
14742 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
14743 #endif
14744 }
14745 else
14746 {
14747 #if SYSV386_COMPAT
14748 p = "{r\t%2, %0|\t%0, %2}";
14749 #else
14750 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
14751 #endif
14752 }
14753 break;
14754
14755 default:
14756 gcc_unreachable ();
14757 }
14758
14759 strcat (buf, p);
14760 return buf;
14761 }
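
/* A minimal sketch (not part of GCC) of how the SSE/AVX branch above
   builds its template: the AVX form keeps the leading 'v' and is
   three-operand, while the legacy SSE form drops the 'v' (ssep + 1)
   and is two-operand.  toy_sse_template is a hypothetical name.  */
#if 0
#include <stdio.h>

static void
toy_sse_template (char *buf, size_t len, const char *ssep /* e.g. "vadd" */,
                  int use_avx, int single_precision)
{
  if (use_avx)
    snprintf (buf, len, "%s%s\t{%%2, %%1, %%0|%%0, %%1, %%2}",
              ssep, single_precision ? "ss" : "sd");
  else
    snprintf (buf, len, "%s%s\t{%%2, %%0|%%0, %%2}",
              ssep + 1, single_precision ? "ss" : "sd");
}
#endif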
14762
14763 /* Return needed mode for entity in optimize_mode_switching pass. */
14764
14765 int
14766 ix86_mode_needed (int entity, rtx insn)
14767 {
14768 enum attr_i387_cw mode;
14769
14770 /* The mode UNINITIALIZED is used to store the control word after a
14771 function call or ASM pattern.  The mode ANY specifies that the function
14772 has no requirements on the control word and makes no changes to the
14773 bits we are interested in.  */
14774
14775 if (CALL_P (insn)
14776 || (NONJUMP_INSN_P (insn)
14777 && (asm_noperands (PATTERN (insn)) >= 0
14778 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
14779 return I387_CW_UNINITIALIZED;
14780
14781 if (recog_memoized (insn) < 0)
14782 return I387_CW_ANY;
14783
14784 mode = get_attr_i387_cw (insn);
14785
14786 switch (entity)
14787 {
14788 case I387_TRUNC:
14789 if (mode == I387_CW_TRUNC)
14790 return mode;
14791 break;
14792
14793 case I387_FLOOR:
14794 if (mode == I387_CW_FLOOR)
14795 return mode;
14796 break;
14797
14798 case I387_CEIL:
14799 if (mode == I387_CW_CEIL)
14800 return mode;
14801 break;
14802
14803 case I387_MASK_PM:
14804 if (mode == I387_CW_MASK_PM)
14805 return mode;
14806 break;
14807
14808 default:
14809 gcc_unreachable ();
14810 }
14811
14812 return I387_CW_ANY;
14813 }
14814
14815 /* Output code to initialize the control word copies used by trunc?f?i and
14816 rounding patterns.  MODE selects the control word variant to build; the
14817 current control word is saved and the modified copy goes to its stack slot.  */
14818
14819 void
14820 emit_i387_cw_initialization (int mode)
14821 {
14822 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
14823 rtx new_mode;
14824
14825 enum ix86_stack_slot slot;
14826
14827 rtx reg = gen_reg_rtx (HImode);
14828
14829 emit_insn (gen_x86_fnstcw_1 (stored_mode));
14830 emit_move_insn (reg, copy_rtx (stored_mode));
14831
14832 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
14833 || optimize_function_for_size_p (cfun))
14834 {
14835 switch (mode)
14836 {
14837 case I387_CW_TRUNC:
14838 /* round toward zero (truncate) */
14839 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
14840 slot = SLOT_CW_TRUNC;
14841 break;
14842
14843 case I387_CW_FLOOR:
14844 /* round down toward -oo */
14845 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
14846 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
14847 slot = SLOT_CW_FLOOR;
14848 break;
14849
14850 case I387_CW_CEIL:
14851 /* round up toward +oo */
14852 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
14853 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
14854 slot = SLOT_CW_CEIL;
14855 break;
14856
14857 case I387_CW_MASK_PM:
14858 /* mask precision exception for nearbyint() */
14859 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
14860 slot = SLOT_CW_MASK_PM;
14861 break;
14862
14863 default:
14864 gcc_unreachable ();
14865 }
14866 }
14867 else
14868 {
14869 switch (mode)
14870 {
14871 case I387_CW_TRUNC:
14872 /* round toward zero (truncate) */
14873 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
14874 slot = SLOT_CW_TRUNC;
14875 break;
14876
14877 case I387_CW_FLOOR:
14878 /* round down toward -oo */
14879 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
14880 slot = SLOT_CW_FLOOR;
14881 break;
14882
14883 case I387_CW_CEIL:
14884 /* round up toward +oo */
14885 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
14886 slot = SLOT_CW_CEIL;
14887 break;
14888
14889 case I387_CW_MASK_PM:
14890 /* mask precision exception for nearbyint() */
14891 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
14892 slot = SLOT_CW_MASK_PM;
14893 break;
14894
14895 default:
14896 gcc_unreachable ();
14897 }
14898 }
14899
14900 gcc_assert (slot < MAX_386_STACK_LOCALS);
14901
14902 new_mode = assign_386_stack_local (HImode, slot);
14903 emit_move_insn (new_mode, reg);
14904 }
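
/* A minimal sketch (not part of GCC) of what the masks above mean in
   terms of the x87 control word: bits 10-11 are the rounding control
   (01 = toward -inf, 10 = toward +inf, 11 = toward zero) and bit 5
   masks the precision exception.  toy_new_cw is a hypothetical name.  */
#if 0
#include <stdint.h>

enum toy_cw_mode { TOY_TRUNC, TOY_FLOOR, TOY_CEIL, TOY_MASK_PM };

static uint16_t
toy_new_cw (uint16_t cw, enum toy_cw_mode mode)
{
  switch (mode)
    {
    case TOY_TRUNC:   return cw | 0x0c00;              /* RC = 11 */
    case TOY_FLOOR:   return (cw & ~0x0c00) | 0x0400;  /* RC = 01 */
    case TOY_CEIL:    return (cw & ~0x0c00) | 0x0800;  /* RC = 10 */
    case TOY_MASK_PM: return cw | 0x0020;              /* PM bit  */
    }
  return cw;
}
#endif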
14905
14906 /* Output code for INSN to convert a float to a signed int. OPERANDS
14907 are the insn operands. The output may be [HSD]Imode and the input
14908 operand may be [SDX]Fmode. */
14909
14910 const char *
14911 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
14912 {
14913 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
14914 int dimode_p = GET_MODE (operands[0]) == DImode;
14915 int round_mode = get_attr_i387_cw (insn);
14916
14917 /* Jump through a hoop or two for DImode, since the hardware has no
14918 non-popping instruction. We used to do this a different way, but
14919 that was somewhat fragile and broke with post-reload splitters. */
14920 if ((dimode_p || fisttp) && !stack_top_dies)
14921 output_asm_insn ("fld\t%y1", operands);
14922
14923 gcc_assert (STACK_TOP_P (operands[1]));
14924 gcc_assert (MEM_P (operands[0]));
14925 gcc_assert (GET_MODE (operands[1]) != TFmode);
14926
14927 if (fisttp)
14928 output_asm_insn ("fisttp%Z0\t%0", operands);
14929 else
14930 {
14931 if (round_mode != I387_CW_ANY)
14932 output_asm_insn ("fldcw\t%3", operands);
14933 if (stack_top_dies || dimode_p)
14934 output_asm_insn ("fistp%Z0\t%0", operands);
14935 else
14936 output_asm_insn ("fist%Z0\t%0", operands);
14937 if (round_mode != I387_CW_ANY)
14938 output_asm_insn ("fldcw\t%2", operands);
14939 }
14940
14941 return "";
14942 }
14943
14944 /* Output code for x87 ffreep insn. The OPNO argument, which may only
14945 have the values zero or one, indicates the ffreep insn's operand
14946 from the OPERANDS array. */
14947
14948 static const char *
14949 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
14950 {
14951 if (TARGET_USE_FFREEP)
14952 #ifdef HAVE_AS_IX86_FFREEP
14953 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
14954 #else
14955 {
14956 static char retval[32];
14957 int regno = REGNO (operands[opno]);
14958
14959 gcc_assert (FP_REGNO_P (regno));
14960
14961 regno -= FIRST_STACK_REG;
14962
14963 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
14964 return retval;
14965 }
14966 #endif
14967
14968 return opno ? "fstp\t%y1" : "fstp\t%y0";
14969 }
14970
14971
14972 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
14973 should be used. UNORDERED_P is true when fucom should be used. */
14974
14975 const char *
14976 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
14977 {
14978 int stack_top_dies;
14979 rtx cmp_op0, cmp_op1;
14980 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
14981
14982 if (eflags_p)
14983 {
14984 cmp_op0 = operands[0];
14985 cmp_op1 = operands[1];
14986 }
14987 else
14988 {
14989 cmp_op0 = operands[1];
14990 cmp_op1 = operands[2];
14991 }
14992
14993 if (is_sse)
14994 {
14995 if (GET_MODE (operands[0]) == SFmode)
14996 if (unordered_p)
14997 return "%vucomiss\t{%1, %0|%0, %1}";
14998 else
14999 return "%vcomiss\t{%1, %0|%0, %1}";
15000 else
15001 if (unordered_p)
15002 return "%vucomisd\t{%1, %0|%0, %1}";
15003 else
15004 return "%vcomisd\t{%1, %0|%0, %1}";
15005 }
15006
15007 gcc_assert (STACK_TOP_P (cmp_op0));
15008
15009 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15010
15011 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15012 {
15013 if (stack_top_dies)
15014 {
15015 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15016 return output_387_ffreep (operands, 1);
15017 }
15018 else
15019 return "ftst\n\tfnstsw\t%0";
15020 }
15021
15022 if (STACK_REG_P (cmp_op1)
15023 && stack_top_dies
15024 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15025 && REGNO (cmp_op1) != FIRST_STACK_REG)
15026 {
15027 /* If the top of the 387 stack dies, and the other operand is
15028 also a stack register that dies, then this must be an
15029 `fcompp' float compare.  */
15030
15031 if (eflags_p)
15032 {
15033 /* There is no double popping fcomi variant. Fortunately,
15034 eflags is immune from the fstp's cc clobbering. */
15035 if (unordered_p)
15036 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15037 else
15038 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15039 return output_387_ffreep (operands, 0);
15040 }
15041 else
15042 {
15043 if (unordered_p)
15044 return "fucompp\n\tfnstsw\t%0";
15045 else
15046 return "fcompp\n\tfnstsw\t%0";
15047 }
15048 }
15049 else
15050 {
15051 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
15052
15053 static const char * const alt[16] =
15054 {
15055 "fcom%Z2\t%y2\n\tfnstsw\t%0",
15056 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
15057 "fucom%Z2\t%y2\n\tfnstsw\t%0",
15058 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
15059
15060 "ficom%Z2\t%y2\n\tfnstsw\t%0",
15061 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
15062 NULL,
15063 NULL,
15064
15065 "fcomi\t{%y1, %0|%0, %y1}",
15066 "fcomip\t{%y1, %0|%0, %y1}",
15067 "fucomi\t{%y1, %0|%0, %y1}",
15068 "fucomip\t{%y1, %0|%0, %y1}",
15069
15070 NULL,
15071 NULL,
15072 NULL,
15073 NULL
15074 };
15075
15076 int mask;
15077 const char *ret;
15078
15079 mask = eflags_p << 3;
15080 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15081 mask |= unordered_p << 1;
15082 mask |= stack_top_dies;
15083
15084 gcc_assert (mask < 16);
15085 ret = alt[mask];
15086 gcc_assert (ret);
15087
15088 return ret;
15089 }
15090 }
15091
15092 void
15093 ix86_output_addr_vec_elt (FILE *file, int value)
15094 {
15095 const char *directive = ASM_LONG;
15096
15097 #ifdef ASM_QUAD
15098 if (TARGET_LP64)
15099 directive = ASM_QUAD;
15100 #else
15101 gcc_assert (!TARGET_64BIT);
15102 #endif
15103
15104 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15105 }
15106
15107 void
15108 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15109 {
15110 const char *directive = ASM_LONG;
15111
15112 #ifdef ASM_QUAD
15113 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15114 directive = ASM_QUAD;
15115 #else
15116 gcc_assert (!TARGET_64BIT);
15117 #endif
15118 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
15119 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
15120 fprintf (file, "%s%s%d-%s%d\n",
15121 directive, LPREFIX, value, LPREFIX, rel);
15122 else if (HAVE_AS_GOTOFF_IN_DATA)
15123 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
15124 #if TARGET_MACHO
15125 else if (TARGET_MACHO)
15126 {
15127 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
15128 machopic_output_function_base_name (file);
15129 putc ('\n', file);
15130 }
15131 #endif
15132 else
15133 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
15134 GOT_SYMBOL_NAME, LPREFIX, value);
15135 }
15136 \f
15137 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
15138 for the target. */
15139
15140 void
15141 ix86_expand_clear (rtx dest)
15142 {
15143 rtx tmp;
15144
15145 /* We play register width games, which are only valid after reload. */
15146 gcc_assert (reload_completed);
15147
15148 /* Avoid HImode and its attendant prefix byte. */
15149 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
15150 dest = gen_rtx_REG (SImode, REGNO (dest));
15151 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
15152
15153 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
15154 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
15155 {
15156 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15157 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
15158 }
15159
15160 emit_insn (tmp);
15161 }
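
/* A minimal sketch (not part of GCC) of the trade-off above: on a
   32-bit register, "xor reg,reg" is 2 bytes but clobbers the flags
   (hence the CLOBBER of FLAGS_REG), while "mov $0,reg" is 5 bytes and
   leaves the flags alone.  toy_clear_insn is a hypothetical name.  */
#if 0
static const char *
toy_clear_insn (int prefer_mov0)
{
  return prefer_mov0
         ? "mov $0, %eax"     /* 5 bytes, flags preserved */
         : "xor %eax, %eax";  /* 2 bytes, flags clobbered */
}
#endif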
15162
15163 /* X is an unchanging MEM. If it is a constant pool reference, return
15164 the constant pool rtx, else NULL. */
15165
15166 rtx
15167 maybe_get_pool_constant (rtx x)
15168 {
15169 x = ix86_delegitimize_address (XEXP (x, 0));
15170
15171 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
15172 return get_pool_constant (x);
15173
15174 return NULL_RTX;
15175 }
15176
15177 void
15178 ix86_expand_move (enum machine_mode mode, rtx operands[])
15179 {
15180 rtx op0, op1;
15181 enum tls_model model;
15182
15183 op0 = operands[0];
15184 op1 = operands[1];
15185
15186 if (GET_CODE (op1) == SYMBOL_REF)
15187 {
15188 model = SYMBOL_REF_TLS_MODEL (op1);
15189 if (model)
15190 {
15191 op1 = legitimize_tls_address (op1, model, true);
15192 op1 = force_operand (op1, op0);
15193 if (op1 == op0)
15194 return;
15195 if (GET_MODE (op1) != mode)
15196 op1 = convert_to_mode (mode, op1, 1);
15197 }
15198 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15199 && SYMBOL_REF_DLLIMPORT_P (op1))
15200 op1 = legitimize_dllimport_symbol (op1, false);
15201 }
15202 else if (GET_CODE (op1) == CONST
15203 && GET_CODE (XEXP (op1, 0)) == PLUS
15204 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
15205 {
15206 rtx addend = XEXP (XEXP (op1, 0), 1);
15207 rtx symbol = XEXP (XEXP (op1, 0), 0);
15208 rtx tmp = NULL;
15209
15210 model = SYMBOL_REF_TLS_MODEL (symbol);
15211 if (model)
15212 tmp = legitimize_tls_address (symbol, model, true);
15213 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15214 && SYMBOL_REF_DLLIMPORT_P (symbol))
15215 tmp = legitimize_dllimport_symbol (symbol, true);
15216
15217 if (tmp)
15218 {
15219 tmp = force_operand (tmp, NULL);
15220 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
15221 op0, 1, OPTAB_DIRECT);
15222 if (tmp == op0)
15223 return;
15224 if (GET_MODE (tmp) != mode)
15225 op1 = convert_to_mode (mode, tmp, 1);
15226 }
15227 }
15228
15229 if ((flag_pic || MACHOPIC_INDIRECT)
15230 && symbolic_operand (op1, mode))
15231 {
15232 if (TARGET_MACHO && !TARGET_64BIT)
15233 {
15234 #if TARGET_MACHO
15235 /* dynamic-no-pic */
15236 if (MACHOPIC_INDIRECT)
15237 {
15238 rtx temp = ((reload_in_progress
15239 || ((op0 && REG_P (op0))
15240 && mode == Pmode))
15241 ? op0 : gen_reg_rtx (Pmode));
15242 op1 = machopic_indirect_data_reference (op1, temp);
15243 if (MACHOPIC_PURE)
15244 op1 = machopic_legitimize_pic_address (op1, mode,
15245 temp == op1 ? 0 : temp);
15246 }
15247 if (op0 != op1 && GET_CODE (op0) != MEM)
15248 {
15249 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
15250 emit_insn (insn);
15251 return;
15252 }
15253 if (GET_CODE (op0) == MEM)
15254 op1 = force_reg (Pmode, op1);
15255 else
15256 {
15257 rtx temp = op0;
15258 if (GET_CODE (temp) != REG)
15259 temp = gen_reg_rtx (Pmode);
15260 temp = legitimize_pic_address (op1, temp);
15261 if (temp == op0)
15262 return;
15263 op1 = temp;
15264 }
15265 /* dynamic-no-pic */
15266 #endif
15267 }
15268 else
15269 {
15270 if (MEM_P (op0))
15271 op1 = force_reg (mode, op1);
15272 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
15273 {
15274 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
15275 op1 = legitimize_pic_address (op1, reg);
15276 if (op0 == op1)
15277 return;
15278 if (GET_MODE (op1) != mode)
15279 op1 = convert_to_mode (mode, op1, 1);
15280 }
15281 }
15282 }
15283 else
15284 {
15285 if (MEM_P (op0)
15286 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
15287 || !push_operand (op0, mode))
15288 && MEM_P (op1))
15289 op1 = force_reg (mode, op1);
15290
15291 if (push_operand (op0, mode)
15292 && ! general_no_elim_operand (op1, mode))
15293 op1 = copy_to_mode_reg (mode, op1);
15294
15295 /* Force large constants in 64bit compilation into a register
15296 to get them CSEed.  */
15297 if (can_create_pseudo_p ()
15298 && (mode == DImode) && TARGET_64BIT
15299 && immediate_operand (op1, mode)
15300 && !x86_64_zext_immediate_operand (op1, VOIDmode)
15301 && !register_operand (op0, mode)
15302 && optimize)
15303 op1 = copy_to_mode_reg (mode, op1);
15304
15305 if (can_create_pseudo_p ()
15306 && FLOAT_MODE_P (mode)
15307 && GET_CODE (op1) == CONST_DOUBLE)
15308 {
15309 /* If we are loading a floating point constant to a register,
15310 force the value to memory now, since we'll get better code
15311 out of the back end.  */
15312
15313 op1 = validize_mem (force_const_mem (mode, op1));
15314 if (!register_operand (op0, mode))
15315 {
15316 rtx temp = gen_reg_rtx (mode);
15317 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
15318 emit_move_insn (op0, temp);
15319 return;
15320 }
15321 }
15322 }
15323
15324 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15325 }
15326
15327 void
15328 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
15329 {
15330 rtx op0 = operands[0], op1 = operands[1];
15331 unsigned int align = GET_MODE_ALIGNMENT (mode);
15332
15333 /* Force constants other than zero into memory. We do not know how
15334 the instructions used to build constants modify the upper 64 bits
15335 of the register, once we have that information we may be able
15336 to handle some of them more efficiently. */
15337 if (can_create_pseudo_p ()
15338 && register_operand (op0, mode)
15339 && (CONSTANT_P (op1)
15340 || (GET_CODE (op1) == SUBREG
15341 && CONSTANT_P (SUBREG_REG (op1))))
15342 && !standard_sse_constant_p (op1))
15343 op1 = validize_mem (force_const_mem (mode, op1));
15344
15345 /* We need to check memory alignment for SSE modes since an attribute
15346 can make operands unaligned.  */
15347 if (can_create_pseudo_p ()
15348 && SSE_REG_MODE_P (mode)
15349 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
15350 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
15351 {
15352 rtx tmp[2];
15353
15354 /* ix86_expand_vector_move_misalign() does not like constants ... */
15355 if (CONSTANT_P (op1)
15356 || (GET_CODE (op1) == SUBREG
15357 && CONSTANT_P (SUBREG_REG (op1))))
15358 op1 = validize_mem (force_const_mem (mode, op1));
15359
15360 /* ... nor both arguments in memory. */
15361 if (!register_operand (op0, mode)
15362 && !register_operand (op1, mode))
15363 op1 = force_reg (mode, op1);
15364
15365 tmp[0] = op0; tmp[1] = op1;
15366 ix86_expand_vector_move_misalign (mode, tmp);
15367 return;
15368 }
15369
15370 /* Make operand1 a register if it isn't already. */
15371 if (can_create_pseudo_p ()
15372 && !register_operand (op0, mode)
15373 && !register_operand (op1, mode))
15374 {
15375 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
15376 return;
15377 }
15378
15379 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15380 }
15381
15382 /* Split 32-byte AVX unaligned load and store if needed. */
15383
15384 static void
15385 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
15386 {
15387 rtx m;
15388 rtx (*extract) (rtx, rtx, rtx);
15389 rtx (*move_unaligned) (rtx, rtx);
15390 enum machine_mode mode;
15391
15392 switch (GET_MODE (op0))
15393 {
15394 default:
15395 gcc_unreachable ();
15396 case V32QImode:
15397 extract = gen_avx_vextractf128v32qi;
15398 move_unaligned = gen_avx_movdqu256;
15399 mode = V16QImode;
15400 break;
15401 case V8SFmode:
15402 extract = gen_avx_vextractf128v8sf;
15403 move_unaligned = gen_avx_movups256;
15404 mode = V4SFmode;
15405 break;
15406 case V4DFmode:
15407 extract = gen_avx_vextractf128v4df;
15408 move_unaligned = gen_avx_movupd256;
15409 mode = V2DFmode;
15410 break;
15411 }
15412
15413 if (MEM_P (op1) && TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
15414 {
15415 rtx r = gen_reg_rtx (mode);
15416 m = adjust_address (op1, mode, 0);
15417 emit_move_insn (r, m);
15418 m = adjust_address (op1, mode, 16);
15419 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
15420 emit_move_insn (op0, r);
15421 }
15422 else if (MEM_P (op0) && TARGET_AVX256_SPLIT_UNALIGNED_STORE)
15423 {
15424 m = adjust_address (op0, mode, 0);
15425 emit_insn (extract (m, op1, const0_rtx));
15426 m = adjust_address (op0, mode, 16);
15427 emit_insn (extract (m, op1, const1_rtx));
15428 }
15429 else
15430 emit_insn (move_unaligned (op0, op1));
15431 }
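
/* A minimal sketch (not part of GCC): at the intrinsics level, the
   split load above is roughly two 16-byte unaligned loads concatenated
   into one 256-bit value, instead of a single 32-byte unaligned load.
   toy_split_loadu_ps is a hypothetical name; the intrinsics are the
   standard AVX ones from <immintrin.h>.  */
#if 0
#include <immintrin.h>

static __m256
toy_split_loadu_ps (const float *p)
{
  __m128 lo = _mm_loadu_ps (p);      /* bytes  0..15 */
  __m128 hi = _mm_loadu_ps (p + 4);  /* bytes 16..31 */
  return _mm256_insertf128_ps (_mm256_castps128_ps256 (lo), hi, 1);
}
#endif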
15432
15433 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
15434 straight to ix86_expand_vector_move. */
15435 /* Code generation for scalar reg-reg moves of single and double precision data:
15436 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
15437 movaps reg, reg
15438 else
15439 movss reg, reg
15440 if (x86_sse_partial_reg_dependency == true)
15441 movapd reg, reg
15442 else
15443 movsd reg, reg
15444
15445 Code generation for scalar loads of double precision data:
15446 if (x86_sse_split_regs == true)
15447 movlpd mem, reg (gas syntax)
15448 else
15449 movsd mem, reg
15450
15451 Code generation for unaligned packed loads of single precision data
15452 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
15453 if (x86_sse_unaligned_move_optimal)
15454 movups mem, reg
15455
15456 if (x86_sse_partial_reg_dependency == true)
15457 {
15458 xorps reg, reg
15459 movlps mem, reg
15460 movhps mem+8, reg
15461 }
15462 else
15463 {
15464 movlps mem, reg
15465 movhps mem+8, reg
15466 }
15467
15468 Code generation for unaligned packed loads of double precision data
15469 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
15470 if (x86_sse_unaligned_move_optimal)
15471 movupd mem, reg
15472
15473 if (x86_sse_split_regs == true)
15474 {
15475 movlpd mem, reg
15476 movhpd mem+8, reg
15477 }
15478 else
15479 {
15480 movsd mem, reg
15481 movhpd mem+8, reg
15482 }
15483 */
15484
15485 void
15486 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
15487 {
15488 rtx op0, op1, m;
15489
15490 op0 = operands[0];
15491 op1 = operands[1];
15492
15493 if (TARGET_AVX)
15494 {
15495 switch (GET_MODE_CLASS (mode))
15496 {
15497 case MODE_VECTOR_INT:
15498 case MODE_INT:
15499 switch (GET_MODE_SIZE (mode))
15500 {
15501 case 16:
15502 /* If we're optimizing for size, movups is the smallest. */
15503 if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15504 {
15505 op0 = gen_lowpart (V4SFmode, op0);
15506 op1 = gen_lowpart (V4SFmode, op1);
15507 emit_insn (gen_sse_movups (op0, op1));
15508 return;
15509 }
15510 op0 = gen_lowpart (V16QImode, op0);
15511 op1 = gen_lowpart (V16QImode, op1);
15512 emit_insn (gen_sse2_movdqu (op0, op1));
15513 break;
15514 case 32:
15515 op0 = gen_lowpart (V32QImode, op0);
15516 op1 = gen_lowpart (V32QImode, op1);
15517 ix86_avx256_split_vector_move_misalign (op0, op1);
15518 break;
15519 default:
15520 gcc_unreachable ();
15521 }
15522 break;
15523 case MODE_VECTOR_FLOAT:
15524 op0 = gen_lowpart (mode, op0);
15525 op1 = gen_lowpart (mode, op1);
15526
15527 switch (mode)
15528 {
15529 case V4SFmode:
15530 emit_insn (gen_sse_movups (op0, op1));
15531 break;
15532 case V8SFmode:
15533 ix86_avx256_split_vector_move_misalign (op0, op1);
15534 break;
15535 case V2DFmode:
15536 if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15537 {
15538 op0 = gen_lowpart (V4SFmode, op0);
15539 op1 = gen_lowpart (V4SFmode, op1);
15540 emit_insn (gen_sse_movups (op0, op1));
15541 return;
15542 }
15543 emit_insn (gen_sse2_movupd (op0, op1));
15544 break;
15545 case V4DFmode:
15546 ix86_avx256_split_vector_move_misalign (op0, op1);
15547 break;
15548 default:
15549 gcc_unreachable ();
15550 }
15551 break;
15552
15553 default:
15554 gcc_unreachable ();
15555 }
15556
15557 return;
15558 }
15559
15560 if (MEM_P (op1))
15561 {
15562 /* If we're optimizing for size, movups is the smallest. */
15563 if (optimize_insn_for_size_p ()
15564 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15565 {
15566 op0 = gen_lowpart (V4SFmode, op0);
15567 op1 = gen_lowpart (V4SFmode, op1);
15568 emit_insn (gen_sse_movups (op0, op1));
15569 return;
15570 }
15571
15572 /* ??? If we have typed data, then it would appear that using
15573 movdqu is the only way to get unaligned data loaded with
15574 integer type. */
15575 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15576 {
15577 op0 = gen_lowpart (V16QImode, op0);
15578 op1 = gen_lowpart (V16QImode, op1);
15579 emit_insn (gen_sse2_movdqu (op0, op1));
15580 return;
15581 }
15582
15583 if (TARGET_SSE2 && mode == V2DFmode)
15584 {
15585 rtx zero;
15586
15587 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
15588 {
15589 op0 = gen_lowpart (V2DFmode, op0);
15590 op1 = gen_lowpart (V2DFmode, op1);
15591 emit_insn (gen_sse2_movupd (op0, op1));
15592 return;
15593 }
15594
15595 /* When SSE registers are split into halves, we can avoid
15596 writing to the top half twice. */
15597 if (TARGET_SSE_SPLIT_REGS)
15598 {
15599 emit_clobber (op0);
15600 zero = op0;
15601 }
15602 else
15603 {
15604 /* ??? Not sure about the best option for the Intel chips.
15605 The following would seem to satisfy; the register is
15606 entirely cleared, breaking the dependency chain. We
15607 then store to the upper half, with a dependency depth
15608 of one. A rumor has it that Intel recommends two movsd
15609 followed by an unpacklpd, but this is unconfirmed. And
15610 given that the dependency depth of the unpacklpd would
15611 still be one, I'm not sure why this would be better. */
15612 zero = CONST0_RTX (V2DFmode);
15613 }
15614
15615 m = adjust_address (op1, DFmode, 0);
15616 emit_insn (gen_sse2_loadlpd (op0, zero, m));
15617 m = adjust_address (op1, DFmode, 8);
15618 emit_insn (gen_sse2_loadhpd (op0, op0, m));
15619 }
15620 else
15621 {
15622 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
15623 {
15624 op0 = gen_lowpart (V4SFmode, op0);
15625 op1 = gen_lowpart (V4SFmode, op1);
15626 emit_insn (gen_sse_movups (op0, op1));
15627 return;
15628 }
15629
15630 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
15631 emit_move_insn (op0, CONST0_RTX (mode));
15632 else
15633 emit_clobber (op0);
15634
15635 if (mode != V4SFmode)
15636 op0 = gen_lowpart (V4SFmode, op0);
15637 m = adjust_address (op1, V2SFmode, 0);
15638 emit_insn (gen_sse_loadlps (op0, op0, m));
15639 m = adjust_address (op1, V2SFmode, 8);
15640 emit_insn (gen_sse_loadhps (op0, op0, m));
15641 }
15642 }
15643 else if (MEM_P (op0))
15644 {
15645 /* If we're optimizing for size, movups is the smallest. */
15646 if (optimize_insn_for_size_p ()
15647 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15648 {
15649 op0 = gen_lowpart (V4SFmode, op0);
15650 op1 = gen_lowpart (V4SFmode, op1);
15651 emit_insn (gen_sse_movups (op0, op1));
15652 return;
15653 }
15654
15655 /* ??? Similar to above, only less clear because of quote
15656 typeless stores unquote. */
15657 if (TARGET_SSE2 && !TARGET_SSE_TYPELESS_STORES
15658 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15659 {
15660 op0 = gen_lowpart (V16QImode, op0);
15661 op1 = gen_lowpart (V16QImode, op1);
15662 emit_insn (gen_sse2_movdqu (op0, op1));
15663 return;
15664 }
15665
15666 if (TARGET_SSE2 && mode == V2DFmode)
15667 {
15668 if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
15669 {
15670 op0 = gen_lowpart (V2DFmode, op0);
15671 op1 = gen_lowpart (V2DFmode, op1);
15672 emit_insn (gen_sse2_movupd (op0, op1));
15673 }
15674 else
15675 {
15676 m = adjust_address (op0, DFmode, 0);
15677 emit_insn (gen_sse2_storelpd (m, op1));
15678 m = adjust_address (op0, DFmode, 8);
15679 emit_insn (gen_sse2_storehpd (m, op1));
15680 }
15681 }
15682 else
15683 {
15684 if (mode != V4SFmode)
15685 op1 = gen_lowpart (V4SFmode, op1);
15686
15687 if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
15688 {
15689 op0 = gen_lowpart (V4SFmode, op0);
15690 emit_insn (gen_sse_movups (op0, op1));
15691 }
15692 else
15693 {
15694 m = adjust_address (op0, V2SFmode, 0);
15695 emit_insn (gen_sse_storelps (m, op1));
15696 m = adjust_address (op0, V2SFmode, 8);
15697 emit_insn (gen_sse_storehps (m, op1));
15698 }
15699 }
15700 }
15701 else
15702 gcc_unreachable ();
15703 }
15704
15705 /* Expand a push in MODE. This is some mode for which we do not support
15706 proper push instructions, at least from the registers that we expect
15707 the value to live in. */
15708
15709 void
15710 ix86_expand_push (enum machine_mode mode, rtx x)
15711 {
15712 rtx tmp;
15713
15714 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
15715 GEN_INT (-GET_MODE_SIZE (mode)),
15716 stack_pointer_rtx, 1, OPTAB_DIRECT);
15717 if (tmp != stack_pointer_rtx)
15718 emit_move_insn (stack_pointer_rtx, tmp);
15719
15720 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
15721
15722 /* When we push an operand onto the stack, it has to be aligned at least
15723 at the function argument boundary.  However, since we don't have
15724 the argument type, we can't determine the actual argument
15725 boundary. */
15726 emit_move_insn (tmp, x);
15727 }
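
/* A minimal sketch (not part of GCC) of the expansion above: a push is
   just "subtract the mode size from the stack pointer, then store the
   value through it".  toy_push operates on an ordinary byte buffer and
   is a hypothetical name used only for illustration.  */
#if 0
#include <stddef.h>
#include <string.h>

static unsigned char *
toy_push (unsigned char *sp, const void *value, size_t size)
{
  sp -= size;                /* sub $size, %sp   */
  memcpy (sp, value, size);  /* mov value, (%sp) */
  return sp;                 /* new stack pointer */
}
#endif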
15728
15729 /* Helper function of ix86_fixup_binary_operands to canonicalize
15730 operand order. Returns true if the operands should be swapped. */
15731
15732 static bool
15733 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
15734 rtx operands[])
15735 {
15736 rtx dst = operands[0];
15737 rtx src1 = operands[1];
15738 rtx src2 = operands[2];
15739
15740 /* If the operation is not commutative, we can't do anything. */
15741 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
15742 return false;
15743
15744 /* Highest priority is that src1 should match dst. */
15745 if (rtx_equal_p (dst, src1))
15746 return false;
15747 if (rtx_equal_p (dst, src2))
15748 return true;
15749
15750 /* Next highest priority is that immediate constants come second. */
15751 if (immediate_operand (src2, mode))
15752 return false;
15753 if (immediate_operand (src1, mode))
15754 return true;
15755
15756 /* Lowest priority is that memory references should come second. */
15757 if (MEM_P (src2))
15758 return false;
15759 if (MEM_P (src1))
15760 return true;
15761
15762 return false;
15763 }
15764
15765
15766 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
15767 destination to use for the operation. If different from the true
15768 destination in operands[0], a copy operation will be required. */
15769
15770 rtx
15771 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
15772 rtx operands[])
15773 {
15774 rtx dst = operands[0];
15775 rtx src1 = operands[1];
15776 rtx src2 = operands[2];
15777
15778 /* Canonicalize operand order. */
15779 if (ix86_swap_binary_operands_p (code, mode, operands))
15780 {
15781 rtx temp;
15782
15783 /* It is invalid to swap operands of different modes. */
15784 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
15785
15786 temp = src1;
15787 src1 = src2;
15788 src2 = temp;
15789 }
15790
15791 /* Both source operands cannot be in memory. */
15792 if (MEM_P (src1) && MEM_P (src2))
15793 {
15794 /* Optimization: Only read from memory once. */
15795 if (rtx_equal_p (src1, src2))
15796 {
15797 src2 = force_reg (mode, src2);
15798 src1 = src2;
15799 }
15800 else
15801 src2 = force_reg (mode, src2);
15802 }
15803
15804 /* If the destination is memory, and we do not have matching source
15805 operands, do things in registers. */
15806 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
15807 dst = gen_reg_rtx (mode);
15808
15809 /* Source 1 cannot be a constant. */
15810 if (CONSTANT_P (src1))
15811 src1 = force_reg (mode, src1);
15812
15813 /* Source 1 cannot be a non-matching memory. */
15814 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
15815 src1 = force_reg (mode, src1);
15816
15817 /* Improve address combine. */
15818 if (code == PLUS
15819 && GET_MODE_CLASS (mode) == MODE_INT
15820 && MEM_P (src2))
15821 src2 = force_reg (mode, src2);
15822
15823 operands[1] = src1;
15824 operands[2] = src2;
15825 return dst;
15826 }
15827
15828 /* Similarly, but assume that the destination has already been
15829 set up properly. */
15830
15831 void
15832 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
15833 enum machine_mode mode, rtx operands[])
15834 {
15835 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
15836 gcc_assert (dst == operands[0]);
15837 }
15838
15839 /* Attempt to expand a binary operator. Make the expansion closer to the
15840 actual machine than just general_operand, which would allow three separate
15841 memory references (one output, two inputs) in a single insn.  */
15842
15843 void
15844 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
15845 rtx operands[])
15846 {
15847 rtx src1, src2, dst, op, clob;
15848
15849 dst = ix86_fixup_binary_operands (code, mode, operands);
15850 src1 = operands[1];
15851 src2 = operands[2];
15852
15853 /* Emit the instruction. */
15854
15855 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
15856 if (reload_in_progress)
15857 {
15858 /* Reload doesn't know about the flags register, and doesn't know that
15859 it doesn't want to clobber it. We can only do this with PLUS. */
15860 gcc_assert (code == PLUS);
15861 emit_insn (op);
15862 }
15863 else if (reload_completed
15864 && code == PLUS
15865 && !rtx_equal_p (dst, src1))
15866 {
15867 /* This is going to be an LEA; avoid splitting it later. */
15868 emit_insn (op);
15869 }
15870 else
15871 {
15872 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15873 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
15874 }
15875
15876 /* Fix up the destination if needed. */
15877 if (dst != operands[0])
15878 emit_move_insn (operands[0], dst);
15879 }
15880
15881 /* Return TRUE or FALSE depending on whether the binary operator meets the
15882 appropriate constraints. */
15883
15884 bool
15885 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
15886 rtx operands[3])
15887 {
15888 rtx dst = operands[0];
15889 rtx src1 = operands[1];
15890 rtx src2 = operands[2];
15891
15892 /* Both source operands cannot be in memory. */
15893 if (MEM_P (src1) && MEM_P (src2))
15894 return false;
15895
15896 /* Canonicalize operand order for commutative operators. */
15897 if (ix86_swap_binary_operands_p (code, mode, operands))
15898 {
15899 rtx temp = src1;
15900 src1 = src2;
15901 src2 = temp;
15902 }
15903
15904 /* If the destination is memory, we must have a matching source operand. */
15905 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
15906 return false;
15907
15908 /* Source 1 cannot be a constant. */
15909 if (CONSTANT_P (src1))
15910 return false;
15911
15912 /* Source 1 cannot be a non-matching memory. */
15913 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
15914 /* Support "andhi/andsi/anddi" as a zero-extending move. */
15915 return (code == AND
15916 && (mode == HImode
15917 || mode == SImode
15918 || (TARGET_64BIT && mode == DImode))
15919 && satisfies_constraint_L (src2));
15920
15921 return true;
15922 }
15923
15924 /* Attempt to expand a unary operator. Make the expansion closer to the
15925 actual machine than just general_operand, which would allow two separate
15926 memory references (one output, one input) in a single insn.  */
15927
15928 void
15929 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
15930 rtx operands[])
15931 {
15932 int matching_memory;
15933 rtx src, dst, op, clob;
15934
15935 dst = operands[0];
15936 src = operands[1];
15937
15938 /* If the destination is memory, and we do not have matching source
15939 operands, do things in registers. */
15940 matching_memory = 0;
15941 if (MEM_P (dst))
15942 {
15943 if (rtx_equal_p (dst, src))
15944 matching_memory = 1;
15945 else
15946 dst = gen_reg_rtx (mode);
15947 }
15948
15949 /* When source operand is memory, destination must match. */
15950 if (MEM_P (src) && !matching_memory)
15951 src = force_reg (mode, src);
15952
15953 /* Emit the instruction. */
15954
15955 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
15956 if (reload_in_progress || code == NOT)
15957 {
15958 /* Reload doesn't know about the flags register, and doesn't know that
15959 it doesn't want to clobber it. */
15960 gcc_assert (code == NOT);
15961 emit_insn (op);
15962 }
15963 else
15964 {
15965 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15966 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
15967 }
15968
15969 /* Fix up the destination if needed. */
15970 if (dst != operands[0])
15971 emit_move_insn (operands[0], dst);
15972 }
15973
15974 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
15975 divisor are within the range [0-255]. */
15976
15977 void
15978 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
15979 bool signed_p)
15980 {
15981 rtx end_label, qimode_label;
15982 rtx insn, div, mod;
15983 rtx scratch, tmp0, tmp1, tmp2;
15984 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
15985 rtx (*gen_zero_extend) (rtx, rtx);
15986 rtx (*gen_test_ccno_1) (rtx, rtx);
15987
15988 switch (mode)
15989 {
15990 case SImode:
15991 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
15992 gen_test_ccno_1 = gen_testsi_ccno_1;
15993 gen_zero_extend = gen_zero_extendqisi2;
15994 break;
15995 case DImode:
15996 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
15997 gen_test_ccno_1 = gen_testdi_ccno_1;
15998 gen_zero_extend = gen_zero_extendqidi2;
15999 break;
16000 default:
16001 gcc_unreachable ();
16002 }
16003
16004 end_label = gen_label_rtx ();
16005 qimode_label = gen_label_rtx ();
16006
16007 scratch = gen_reg_rtx (mode);
16008
16009 /* Use 8bit unsigned divmod if dividend and divisor are within
16010 the range [0-255]. */
16011 emit_move_insn (scratch, operands[2]);
16012 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16013 scratch, 1, OPTAB_DIRECT);
16014 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
16015 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16016 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16017 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16018 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16019 pc_rtx);
16020 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16021 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16022 JUMP_LABEL (insn) = qimode_label;
16023
16024 /* Generate original signed/unsigned divmod.  */
16025 div = gen_divmod4_1 (operands[0], operands[1],
16026 operands[2], operands[3]);
16027 emit_insn (div);
16028
16029 /* Branch to the end. */
16030 emit_jump_insn (gen_jump (end_label));
16031 emit_barrier ();
16032
16033 /* Generate 8bit unsigned divide. */
16034 emit_label (qimode_label);
16035 /* Don't use operands[0] for result of 8bit divide since not all
16036 registers support QImode ZERO_EXTRACT. */
16037 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16038 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16039 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16040 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
16041
16042 if (signed_p)
16043 {
16044 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
16045 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
16046 }
16047 else
16048 {
16049 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
16050 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
16051 }
16052
16053 /* Extract remainder from AH. */
16054 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16055 if (REG_P (operands[1]))
16056 insn = emit_move_insn (operands[1], tmp1);
16057 else
16058 {
16059 /* Need a new scratch register since the old one has result
16060 of 8bit divide. */
16061 scratch = gen_reg_rtx (mode);
16062 emit_move_insn (scratch, tmp1);
16063 insn = emit_move_insn (operands[1], scratch);
16064 }
16065 set_unique_reg_note (insn, REG_EQUAL, mod);
16066
16067 /* Zero extend quotient from AL. */
16068 tmp1 = gen_lowpart (QImode, tmp0);
16069 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16070 set_unique_reg_note (insn, REG_EQUAL, div);
16071
16072 emit_label (end_label);
16073 }
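
/* A minimal sketch (not part of GCC) of the runtime test and fast path
   generated above, written out in plain C: when (dividend | divisor)
   fits in 8 bits, an 8-bit unsigned divide suffices (on the real
   instruction the quotient lands in AL and the remainder in AH).
   toy_divmod is a hypothetical name.  */
#if 0
#include <stdint.h>

static void
toy_divmod (uint32_t a, uint32_t b, uint32_t *quot, uint32_t *rem)
{
  if ((a | b) < 0x100)
    {
      /* 8-bit unsigned divide path (cf. gen_udivmodhiqi3 above).  */
      *quot = (uint8_t) a / (uint8_t) b;  /* AL */
      *rem  = (uint8_t) a % (uint8_t) b;  /* AH */
    }
  else
    {
      /* Full-width divide path.  */
      *quot = a / b;
      *rem  = a % b;
    }
}
#endif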
16074
16075 #define LEA_MAX_STALL (3)
16076 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
16077
16078 /* Increase given DISTANCE in half-cycles according to
16079 dependencies between PREV and NEXT instructions.
16080 Add 1 half-cycle if there is no dependency and
16081 advance to the next full cycle if there is some dependency.  */
16082
16083 static unsigned int
16084 increase_distance (rtx prev, rtx next, unsigned int distance)
16085 {
16086 df_ref *use_rec;
16087 df_ref *def_rec;
16088
16089 if (!prev || !next)
16090 return distance + (distance & 1) + 2;
16091
16092 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
16093 return distance + 1;
16094
16095 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
16096 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
16097 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
16098 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
16099 return distance + (distance & 1) + 2;
16100
16101 return distance + 1;
16102 }
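
/* A minimal sketch (not part of GCC) restating the bookkeeping above:
   distances are kept in half-cycles; an independent instruction adds
   one half-cycle, while a dependency rounds the distance up to the
   next full cycle and then adds a whole cycle.  toy_increase_distance
   is a hypothetical name.  */
#if 0
static unsigned int
toy_increase_distance (int dependent, unsigned int distance)
{
  if (dependent)
    return distance + (distance & 1) + 2;  /* round up to a cycle, add one cycle */
  return distance + 1;                     /* add one half-cycle */
}
#endif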
16103
16104 /* Function checks if instruction INSN defines register number
16105 REGNO1 or REGNO2. */
16106
16107 static bool
16108 insn_defines_reg (unsigned int regno1, unsigned int regno2,
16109 rtx insn)
16110 {
16111 df_ref *def_rec;
16112
16113 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
16114 if (DF_REF_REG_DEF_P (*def_rec)
16115 && !DF_REF_IS_ARTIFICIAL (*def_rec)
16116 && (regno1 == DF_REF_REGNO (*def_rec)
16117 || regno2 == DF_REF_REGNO (*def_rec)))
16118 {
16119 return true;
16120 }
16121
16122 return false;
16123 }
16124
16125 /* Function checks if instruction INSN uses register number
16126 REGNO as a part of address expression. */
16127
16128 static bool
16129 insn_uses_reg_mem (unsigned int regno, rtx insn)
16130 {
16131 df_ref *use_rec;
16132
16133 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
16134 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
16135 return true;
16136
16137 return false;
16138 }
16139
16140 /* Search backward for non-agu definition of register number REGNO1
16141 or register number REGNO2 in basic block starting from instruction
16142 START up to head of basic block or instruction INSN.
16143
16144 Puts true into *FOUND if a definition was found and false
16145 otherwise.
16146 
16147 The distance in half-cycles between START and the found instruction,
16148 or the head of the BB, is added to DISTANCE and returned.  */
16149
16150 static int
16151 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
16152 rtx insn, int distance,
16153 rtx start, bool *found)
16154 {
16155 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16156 rtx prev = start;
16157 rtx next = NULL;
16158 enum attr_type insn_type;
16159
16160 *found = false;
16161
16162 while (prev
16163 && prev != insn
16164 && distance < LEA_SEARCH_THRESHOLD)
16165 {
16166 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
16167 {
16168 distance = increase_distance (prev, next, distance);
16169 if (insn_defines_reg (regno1, regno2, prev))
16170 {
16171 insn_type = get_attr_type (prev);
16172 if (insn_type != TYPE_LEA)
16173 {
16174 *found = true;
16175 return distance;
16176 }
16177 }
16178
16179 next = prev;
16180 }
16181 if (prev == BB_HEAD (bb))
16182 break;
16183
16184 prev = PREV_INSN (prev);
16185 }
16186
16187 return distance;
16188 }
16189
16190 /* Search backward for non-agu definition of register number REGNO1
16191 or register number REGNO2 in INSN's basic block until
16192 1. we pass LEA_SEARCH_THRESHOLD instructions, or
16193 2. we reach a neighbouring BB boundary, or
16194 3. we reach an agu definition.
16195 Returns the distance between the non-agu definition point and INSN.
16196 If there is no definition point, returns -1.  */
16197
16198 static int
16199 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
16200 rtx insn)
16201 {
16202 basic_block bb = BLOCK_FOR_INSN (insn);
16203 int distance = 0;
16204 bool found = false;
16205
16206 if (insn != BB_HEAD (bb))
16207 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
16208 distance, PREV_INSN (insn),
16209 &found);
16210
16211 if (!found && distance < LEA_SEARCH_THRESHOLD)
16212 {
16213 edge e;
16214 edge_iterator ei;
16215 bool simple_loop = false;
16216
16217 FOR_EACH_EDGE (e, ei, bb->preds)
16218 if (e->src == bb)
16219 {
16220 simple_loop = true;
16221 break;
16222 }
16223
16224 if (simple_loop)
16225 distance = distance_non_agu_define_in_bb (regno1, regno2,
16226 insn, distance,
16227 BB_END (bb), &found);
16228 else
16229 {
16230 int shortest_dist = -1;
16231 bool found_in_bb = false;
16232
16233 FOR_EACH_EDGE (e, ei, bb->preds)
16234 {
16235 int bb_dist
16236 = distance_non_agu_define_in_bb (regno1, regno2,
16237 insn, distance,
16238 BB_END (e->src),
16239 &found_in_bb);
16240 if (found_in_bb)
16241 {
16242 if (shortest_dist < 0)
16243 shortest_dist = bb_dist;
16244 else if (bb_dist > 0)
16245 shortest_dist = MIN (bb_dist, shortest_dist);
16246
16247 found = true;
16248 }
16249 }
16250
16251 distance = shortest_dist;
16252 }
16253 }
16254
16255 /* get_attr_type may modify recog data. We want to make sure
16256 that recog data is valid for instruction INSN, on which
16257 distance_non_agu_define is called. INSN is unchanged here. */
16258 extract_insn_cached (insn);
16259
16260 if (!found)
16261 return -1;
16262
16263 return distance >> 1;
16264 }
16265
16266 /* Return the distance in half-cycles between INSN and the next
16267 insn that uses register number REGNO in memory address added
16268 to DISTANCE. Return -1 if REGNO is set.
16269
16270 Set *FOUND to true if a register use was found and to false
16271 otherwise.
16272 Set *REDEFINED to true if a register redefinition was found
16273 and to false otherwise. */
16274
16275 static int
16276 distance_agu_use_in_bb (unsigned int regno,
16277 rtx insn, int distance, rtx start,
16278 bool *found, bool *redefined)
16279 {
16280 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16281 rtx next = start;
16282 rtx prev = NULL;
16283
16284 *found = false;
16285 *redefined = false;
16286
16287 while (next
16288 && next != insn
16289 && distance < LEA_SEARCH_THRESHOLD)
16290 {
16291 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
16292 {
16293 distance = increase_distance(prev, next, distance);
16294 if (insn_uses_reg_mem (regno, next))
16295 {
16296 /* Return DISTANCE if OP0 is used in memory
16297 address in NEXT. */
16298 *found = true;
16299 return distance;
16300 }
16301
16302 if (insn_defines_reg (regno, INVALID_REGNUM, next))
16303 {
16304 /* Return -1 if OP0 is set in NEXT. */
16305 *redefined = true;
16306 return -1;
16307 }
16308
16309 prev = next;
16310 }
16311
16312 if (next == BB_END (bb))
16313 break;
16314
16315 next = NEXT_INSN (next);
16316 }
16317
16318 return distance;
16319 }
16320
16321 /* Return the distance between INSN and the next insn that uses
16322 register number REGNO0 in memory address. Return -1 if no such
16323 use is found within LEA_SEARCH_THRESHOLD or if REGNO0 is set. */
16324
16325 static int
16326 distance_agu_use (unsigned int regno0, rtx insn)
16327 {
16328 basic_block bb = BLOCK_FOR_INSN (insn);
16329 int distance = 0;
16330 bool found = false;
16331 bool redefined = false;
16332
16333 if (insn != BB_END (bb))
16334 distance = distance_agu_use_in_bb (regno0, insn, distance,
16335 NEXT_INSN (insn),
16336 &found, &redefined);
16337
16338 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
16339 {
16340 edge e;
16341 edge_iterator ei;
16342 bool simple_loop = false;
16343
16344 FOR_EACH_EDGE (e, ei, bb->succs)
16345 if (e->dest == bb)
16346 {
16347 simple_loop = true;
16348 break;
16349 }
16350
16351 if (simple_loop)
16352 distance = distance_agu_use_in_bb (regno0, insn,
16353 distance, BB_HEAD (bb),
16354 &found, &redefined);
16355 else
16356 {
16357 int shortest_dist = -1;
16358 bool found_in_bb = false;
16359 bool redefined_in_bb = false;
16360
16361 FOR_EACH_EDGE (e, ei, bb->succs)
16362 {
16363 int bb_dist
16364 = distance_agu_use_in_bb (regno0, insn,
16365 distance, BB_HEAD (e->dest),
16366 &found_in_bb, &redefined_in_bb);
16367 if (found_in_bb)
16368 {
16369 if (shortest_dist < 0)
16370 shortest_dist = bb_dist;
16371 else if (bb_dist > 0)
16372 shortest_dist = MIN (bb_dist, shortest_dist);
16373
16374 found = true;
16375 }
16376 }
16377
16378 distance = shortest_dist;
16379 }
16380 }
16381
16382 if (!found || redefined)
16383 return -1;
16384
16385 return distance >> 1;
16386 }
16387
16388 /* Define this macro to tune LEA priority vs. ADD; it takes effect when
16389 there is a dilemma of choosing between LEA and ADD.
16390 Negative value: ADD is preferred over LEA.
16391 Zero: neutral.
16392 Positive value: LEA is preferred over ADD. */
16393 #define IX86_LEA_PRIORITY 0
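/* For illustration, consider "a = b + c" on an AGU-bound core such as
   ATOM (the case this tuning targets).  If a is used soon as a memory
   address, e.g.

       lea    (%ebx,%ecx), %eax
       mov    (%eax), %edx

   the lea result feeds address generation directly and lea wins.  If a
   is only used in further arithmetic, the equivalent ALU sequence

       mov    %ebx, %eax
       add    %ecx, %eax

   is usually preferred.  The registers here are purely illustrative.  */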
16394
16395 /* Return true if using the lea INSN has a performance advantage
16396 over a sequence of instructions. The instruction sequence has
16397 SPLIT_COST cycles higher latency than the lea. */
16398
16399 bool
16400 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
16401 unsigned int regno2, unsigned int split_cost)
16402 {
16403 int dist_define, dist_use;
16404
16405 dist_define = distance_non_agu_define (regno1, regno2, insn);
16406 dist_use = distance_agu_use (regno0, insn);
16407
16408 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
16409 {
16410 /* If there is no non-AGU operand definition, no AGU
16411 operand usage and the split cost is 0, then both the lea
16412 and non-lea variants have the same priority. Currently
16413 we prefer lea for 64-bit code and non-lea for 32-bit
16414 code. */
16415 if (dist_use < 0 && split_cost == 0)
16416 return TARGET_64BIT || IX86_LEA_PRIORITY;
16417 else
16418 return true;
16419 }
16420
16421 /* With a longer definition distance, lea is preferable.
16422 Adjust it here to take the splitting cost and lea
16423 priority into account. */
16424 dist_define += split_cost + IX86_LEA_PRIORITY;
16425
16426 /* If there is no use in a memory address then we just check
16427 that the split cost does not exceed the AGU stall. */
16428 if (dist_use < 0)
16429 return dist_define >= LEA_MAX_STALL;
16430
16431 /* If this insn has both a backward non-AGU dependence and a forward
16432 AGU dependence, the one with the shorter distance takes effect. */
16433 return dist_define >= dist_use;
16434 }
16435
16436 /* Return true if it is legal to clobber flags by INSN and
16437 false otherwise. */
16438
16439 static bool
16440 ix86_ok_to_clobber_flags (rtx insn)
16441 {
16442 basic_block bb = BLOCK_FOR_INSN (insn);
16443 df_ref *use;
16444 bitmap live;
16445
16446 while (insn)
16447 {
16448 if (NONDEBUG_INSN_P (insn))
16449 {
16450 for (use = DF_INSN_USES (insn); *use; use++)
16451 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
16452 return false;
16453
16454 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
16455 return true;
16456 }
16457
16458 if (insn == BB_END (bb))
16459 break;
16460
16461 insn = NEXT_INSN (insn);
16462 }
16463
16464 live = df_get_live_out(bb);
16465 return !REGNO_REG_SET_P (live, FLAGS_REG);
16466 }
16467
16468 /* Return true if we need to split op0 = op1 + op2 into a sequence of
16469 move and add to avoid AGU stalls. */
16470
16471 bool
16472 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
16473 {
16474 unsigned int regno0 = true_regnum (operands[0]);
16475 unsigned int regno1 = true_regnum (operands[1]);
16476 unsigned int regno2 = true_regnum (operands[2]);
16477
16478 /* Check if we need to optimize. */
16479 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16480 return false;
16481
16482 /* Check it is correct to split here. */
16483 if (!ix86_ok_to_clobber_flags(insn))
16484 return false;
16485
16486 /* We need to split only adds with a non-destructive
16487 destination operand. */
16488 if (regno0 == regno1 || regno0 == regno2)
16489 return false;
16490 else
16491 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
16492 }
16493
16494 /* Return true if we need to split lea into a sequence of
16495 instructions to avoid AGU stalls. */
16496
16497 bool
16498 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
16499 {
16500 unsigned int regno0 = true_regnum (operands[0]) ;
16501 unsigned int regno1 = -1;
16502 unsigned int regno2 = -1;
16503 unsigned int split_cost = 0;
16504 struct ix86_address parts;
16505 int ok;
16506
16507 /* Check if we need to optimize. */
16508 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16509 return false;
16510
16511 /* Check it is correct to split here. */
16512 if (!ix86_ok_to_clobber_flags(insn))
16513 return false;
16514
16515 ok = ix86_decompose_address (operands[1], &parts);
16516 gcc_assert (ok);
16517
16518 /* We should not split into add if a non-legitimate PIC
16519 operand is used as the displacement. */
16520 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
16521 return false;
16522
16523 if (parts.base)
16524 regno1 = true_regnum (parts.base);
16525 if (parts.index)
16526 regno2 = true_regnum (parts.index);
16527
16528 /* Compute how many cycles we will add to the execution time
16529 if we split the lea into a sequence of instructions. */
16530 if (parts.base || parts.index)
16531 {
16532 /* Have to use a mov instruction if the non-destructive
16533 destination form is used. */
16534 if (regno1 != regno0 && regno2 != regno0)
16535 split_cost += 1;
16536
16537 /* Have to add index to base if both exist. */
16538 if (parts.base && parts.index)
16539 split_cost += 1;
16540
16541 /* Have to use shift and adds if scale is 2 or greater. */
16542 if (parts.scale > 1)
16543 {
16544 if (regno0 != regno1)
16545 split_cost += 1;
16546 else if (regno2 == regno0)
16547 split_cost += 4;
16548 else
16549 split_cost += parts.scale;
16550 }
16551
16552 /* Have to use an add instruction with an immediate if
16553 disp is nonzero. */
16554 if (parts.disp && parts.disp != const0_rtx)
16555 split_cost += 1;
16556
16557 /* Subtract the price of lea. */
16558 split_cost -= 1;
16559 }
16560
16561 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
16562 }
16563
16564 /* Emit x86 binary operand CODE in mode MODE, where the first operand
16565 matches destination. RTX includes clobber of FLAGS_REG. */
16566
16567 static void
16568 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
16569 rtx dst, rtx src)
16570 {
16571 rtx op, clob;
16572
16573 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
16574 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16575
16576 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16577 }
16578
16579 /* Split a lea instruction into a sequence of instructions
16580 which are executed on the ALU to avoid AGU stalls.
16581 It is assumed that the flags register may be clobbered
16582 at the lea position. */
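/* For illustration, with distinct registers a lea such as

       lea    0x4(%ebx,%ecx,4), %eax

   is replaced, following the cases below, by the ALU sequence

       mov    %ecx, %eax      # copy the index into the destination
       shl    $2, %eax        # apply the scale with a shift
       add    %ebx, %eax      # add the base
       add    $0x4, %eax      # add the displacement

   (the concrete registers are only an example).  */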
16583
16584 extern void
16585 ix86_split_lea_for_addr (rtx operands[], enum machine_mode mode)
16586 {
16587 unsigned int regno0 = true_regnum (operands[0]) ;
16588 unsigned int regno1 = INVALID_REGNUM;
16589 unsigned int regno2 = INVALID_REGNUM;
16590 struct ix86_address parts;
16591 rtx tmp;
16592 int ok, adds;
16593
16594 ok = ix86_decompose_address (operands[1], &parts);
16595 gcc_assert (ok);
16596
16597 if (parts.base)
16598 {
16599 if (GET_MODE (parts.base) != mode)
16600 parts.base = gen_rtx_SUBREG (mode, parts.base, 0);
16601 regno1 = true_regnum (parts.base);
16602 }
16603
16604 if (parts.index)
16605 {
16606 if (GET_MODE (parts.index) != mode)
16607 parts.index = gen_rtx_SUBREG (mode, parts.index, 0);
16608 regno2 = true_regnum (parts.index);
16609 }
16610
16611 if (parts.scale > 1)
16612 {
16613 /* Case r1 = r1 + ... */
16614 if (regno1 == regno0)
16615 {
16616 /* If we have the case r1 = r1 + C * r1 then we
16617 would have to use multiplication, which is very
16618 expensive. Assume the cost model is wrong if we
16619 end up with such a case here. */
16620 gcc_assert (regno2 != regno0);
16621
16622 for (adds = parts.scale; adds > 0; adds--)
16623 ix86_emit_binop (PLUS, mode, operands[0], parts.index);
16624 }
16625 else
16626 {
16627 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
16628 if (regno0 != regno2)
16629 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
16630
16631 /* Use shift for scaling. */
16632 ix86_emit_binop (ASHIFT, mode, operands[0],
16633 GEN_INT (exact_log2 (parts.scale)));
16634
16635 if (parts.base)
16636 ix86_emit_binop (PLUS, mode, operands[0], parts.base);
16637
16638 if (parts.disp && parts.disp != const0_rtx)
16639 ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
16640 }
16641 }
16642 else if (!parts.base && !parts.index)
16643 {
16644 gcc_assert(parts.disp);
16645 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.disp));
16646 }
16647 else
16648 {
16649 if (!parts.base)
16650 {
16651 if (regno0 != regno2)
16652 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
16653 }
16654 else if (!parts.index)
16655 {
16656 if (regno0 != regno1)
16657 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
16658 }
16659 else
16660 {
16661 if (regno0 == regno1)
16662 tmp = parts.index;
16663 else if (regno0 == regno2)
16664 tmp = parts.base;
16665 else
16666 {
16667 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
16668 tmp = parts.index;
16669 }
16670
16671 ix86_emit_binop (PLUS, mode, operands[0], tmp);
16672 }
16673
16674 if (parts.disp && parts.disp != const0_rtx)
16675 ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
16676 }
16677 }
16678
16679 /* Return true if it is ok to optimize an ADD operation to a LEA
16680 operation to avoid flags register consumption. For most processors,
16681 ADD is faster than LEA. For processors like ATOM, if the
16682 destination register of the LEA holds an actual address which will be
16683 used soon, LEA is better; otherwise ADD is better. */
16684
16685 bool
16686 ix86_lea_for_add_ok (rtx insn, rtx operands[])
16687 {
16688 unsigned int regno0 = true_regnum (operands[0]);
16689 unsigned int regno1 = true_regnum (operands[1]);
16690 unsigned int regno2 = true_regnum (operands[2]);
16691
16692 /* If a = b + c, (a!=b && a!=c), must use lea form. */
16693 if (regno0 != regno1 && regno0 != regno2)
16694 return true;
16695
16696 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16697 return false;
16698
16699 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
16700 }
16701
16702 /* Return true if destination reg of SET_BODY is shift count of
16703 USE_BODY. */
16704
16705 static bool
16706 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
16707 {
16708 rtx set_dest;
16709 rtx shift_rtx;
16710 int i;
16711
16712 /* Retrieve destination of SET_BODY. */
16713 switch (GET_CODE (set_body))
16714 {
16715 case SET:
16716 set_dest = SET_DEST (set_body);
16717 if (!set_dest || !REG_P (set_dest))
16718 return false;
16719 break;
16720 case PARALLEL:
16721 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
16722 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
16723 use_body))
16724 return true;
16725 default:
16726 return false;
16727 break;
16728 }
16729
16730 /* Retrieve shift count of USE_BODY. */
16731 switch (GET_CODE (use_body))
16732 {
16733 case SET:
16734 shift_rtx = XEXP (use_body, 1);
16735 break;
16736 case PARALLEL:
16737 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
16738 if (ix86_dep_by_shift_count_body (set_body,
16739 XVECEXP (use_body, 0, i)))
16740 return true;
16741 default:
16742 return false;
16743 break;
16744 }
16745
16746 if (shift_rtx
16747 && (GET_CODE (shift_rtx) == ASHIFT
16748 || GET_CODE (shift_rtx) == LSHIFTRT
16749 || GET_CODE (shift_rtx) == ASHIFTRT
16750 || GET_CODE (shift_rtx) == ROTATE
16751 || GET_CODE (shift_rtx) == ROTATERT))
16752 {
16753 rtx shift_count = XEXP (shift_rtx, 1);
16754
16755 /* Return true if shift count is dest of SET_BODY. */
16756 if (REG_P (shift_count)
16757 && true_regnum (set_dest) == true_regnum (shift_count))
16758 return true;
16759 }
16760
16761 return false;
16762 }
16763
16764 /* Return true if destination reg of SET_INSN is shift count of
16765 USE_INSN. */
16766
16767 bool
16768 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
16769 {
16770 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
16771 PATTERN (use_insn));
16772 }
16773
16774 /* Return TRUE or FALSE depending on whether the unary operator meets the
16775 appropriate constraints. */
16776
16777 bool
16778 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
16779 enum machine_mode mode ATTRIBUTE_UNUSED,
16780 rtx operands[2] ATTRIBUTE_UNUSED)
16781 {
16782 /* If one of operands is memory, source and destination must match. */
16783 if ((MEM_P (operands[0])
16784 || MEM_P (operands[1]))
16785 && ! rtx_equal_p (operands[0], operands[1]))
16786 return false;
16787 return true;
16788 }
16789
16790 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
16791 are ok, keeping in mind the possible movddup alternative. */
16792
16793 bool
16794 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
16795 {
16796 if (MEM_P (operands[0]))
16797 return rtx_equal_p (operands[0], operands[1 + high]);
16798 if (MEM_P (operands[1]) && MEM_P (operands[2]))
16799 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
16800 return true;
16801 }
16802
16803 /* Post-reload splitter for converting an SF or DFmode value in an
16804 SSE register into an unsigned SImode. */
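/* A sketch of the idea: values below 2**31 convert directly with the
   signed cvttps2dq/cvttpd2dq.  Values at or above 2**31 first have 2**31
   subtracted (the LE compare below builds a mask selecting them), are
   converted as signed values, and then the missing 2**31 is restored by
   XORing in the mask shifted left by 31 bits.  */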
16805
16806 void
16807 ix86_split_convert_uns_si_sse (rtx operands[])
16808 {
16809 enum machine_mode vecmode;
16810 rtx value, large, zero_or_two31, input, two31, x;
16811
16812 large = operands[1];
16813 zero_or_two31 = operands[2];
16814 input = operands[3];
16815 two31 = operands[4];
16816 vecmode = GET_MODE (large);
16817 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
16818
16819 /* Load up the value into the low element. We must ensure that the other
16820 elements are valid floats -- zero is the easiest such value. */
16821 if (MEM_P (input))
16822 {
16823 if (vecmode == V4SFmode)
16824 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
16825 else
16826 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
16827 }
16828 else
16829 {
16830 input = gen_rtx_REG (vecmode, REGNO (input));
16831 emit_move_insn (value, CONST0_RTX (vecmode));
16832 if (vecmode == V4SFmode)
16833 emit_insn (gen_sse_movss (value, value, input));
16834 else
16835 emit_insn (gen_sse2_movsd (value, value, input));
16836 }
16837
16838 emit_move_insn (large, two31);
16839 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
16840
16841 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
16842 emit_insn (gen_rtx_SET (VOIDmode, large, x));
16843
16844 x = gen_rtx_AND (vecmode, zero_or_two31, large);
16845 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
16846
16847 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
16848 emit_insn (gen_rtx_SET (VOIDmode, value, x));
16849
16850 large = gen_rtx_REG (V4SImode, REGNO (large));
16851 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
16852
16853 x = gen_rtx_REG (V4SImode, REGNO (value));
16854 if (vecmode == V4SFmode)
16855 emit_insn (gen_sse2_cvttps2dq (x, value));
16856 else
16857 emit_insn (gen_sse2_cvttpd2dq (x, value));
16858 value = x;
16859
16860 emit_insn (gen_xorv4si3 (value, value, large));
16861 }
16862
16863 /* Convert an unsigned DImode value into a DFmode, using only SSE.
16864 Expects the 64-bit DImode to be supplied in a pair of integral
16865 registers. Requires SSE2; will use SSE3 if available. For x86_32,
16866 -mfpmath=sse, !optimize_size only. */
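/* A worked example of the bias trick used below, with an input chosen
   purely for illustration: for 0x0000000500000003 the low 32 bits placed
   under the exponent word 0x43300000 read as the double 0x1.0p52 + 3,
   and the high 32 bits placed under 0x45300000 read as
   0x1.0p84 + 5*0x1.0p32.  Subtracting the 0x1.0p52 and 0x1.0p84 biases
   leaves 3 and 5*2**32, whose sum is exactly the original unsigned
   64-bit value.  */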
16867
16868 void
16869 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
16870 {
16871 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
16872 rtx int_xmm, fp_xmm;
16873 rtx biases, exponents;
16874 rtx x;
16875
16876 int_xmm = gen_reg_rtx (V4SImode);
16877 if (TARGET_INTER_UNIT_MOVES)
16878 emit_insn (gen_movdi_to_sse (int_xmm, input));
16879 else if (TARGET_SSE_SPLIT_REGS)
16880 {
16881 emit_clobber (int_xmm);
16882 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
16883 }
16884 else
16885 {
16886 x = gen_reg_rtx (V2DImode);
16887 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
16888 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
16889 }
16890
16891 x = gen_rtx_CONST_VECTOR (V4SImode,
16892 gen_rtvec (4, GEN_INT (0x43300000UL),
16893 GEN_INT (0x45300000UL),
16894 const0_rtx, const0_rtx));
16895 exponents = validize_mem (force_const_mem (V4SImode, x));
16896
16897 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
16898 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
16899
16900 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
16901 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
16902 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
16903 (0x1.0p84 + double(fp_value_hi_xmm)).
16904 Note these exponents differ by 32. */
16905
16906 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
16907
16908 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
16909 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
16910 real_ldexp (&bias_lo_rvt, &dconst1, 52);
16911 real_ldexp (&bias_hi_rvt, &dconst1, 84);
16912 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
16913 x = const_double_from_real_value (bias_hi_rvt, DFmode);
16914 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
16915 biases = validize_mem (force_const_mem (V2DFmode, biases));
16916 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
16917
16918 /* Add the upper and lower DFmode values together. */
16919 if (TARGET_SSE3)
16920 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
16921 else
16922 {
16923 x = copy_to_mode_reg (V2DFmode, fp_xmm);
16924 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
16925 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
16926 }
16927
16928 ix86_expand_vector_extract (false, target, fp_xmm, 0);
16929 }
16930
16931 /* Not used, but eases macroization of patterns. */
16932 void
16933 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
16934 rtx input ATTRIBUTE_UNUSED)
16935 {
16936 gcc_unreachable ();
16937 }
16938
16939 /* Convert an unsigned SImode value into a DFmode. Only currently used
16940 for SSE, but applicable anywhere. */
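/* Sketch of the transformation: the unsigned input u is rebased to the
   signed value u - 2**31 (the PLUS of -2147483648 below wraps modulo
   2**32), converted with the signed floatsidf, and then 2**31.0 is added
   back.  For example u = 0x80000005 becomes the signed 5, converts to
   5.0, and 5.0 + 2**31 recovers 2147483653.0 exactly.  */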
16941
16942 void
16943 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
16944 {
16945 REAL_VALUE_TYPE TWO31r;
16946 rtx x, fp;
16947
16948 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
16949 NULL, 1, OPTAB_DIRECT);
16950
16951 fp = gen_reg_rtx (DFmode);
16952 emit_insn (gen_floatsidf2 (fp, x));
16953
16954 real_ldexp (&TWO31r, &dconst1, 31);
16955 x = const_double_from_real_value (TWO31r, DFmode);
16956
16957 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
16958 if (x != target)
16959 emit_move_insn (target, x);
16960 }
16961
16962 /* Convert a signed DImode value into a DFmode. Only used for SSE in
16963 32-bit mode; otherwise we have a direct convert instruction. */
16964
16965 void
16966 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
16967 {
16968 REAL_VALUE_TYPE TWO32r;
16969 rtx fp_lo, fp_hi, x;
16970
16971 fp_lo = gen_reg_rtx (DFmode);
16972 fp_hi = gen_reg_rtx (DFmode);
16973
16974 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
16975
16976 real_ldexp (&TWO32r, &dconst1, 32);
16977 x = const_double_from_real_value (TWO32r, DFmode);
16978 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
16979
16980 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
16981
16982 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
16983 0, OPTAB_DIRECT);
16984 if (x != target)
16985 emit_move_insn (target, x);
16986 }
16987
16988 /* Convert an unsigned SImode value into a SFmode, using only SSE.
16989 For x86_32, -mfpmath=sse, !optimize_size only. */
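/* Sketch: the 32-bit input is split into 16-bit halves, each of which
   converts to SFmode exactly; the result is fp_hi * 2**16 + fp_lo, where
   the multiply by 2**16 is also exact, so only the final addition can
   round.  */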
16990 void
16991 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
16992 {
16993 REAL_VALUE_TYPE ONE16r;
16994 rtx fp_hi, fp_lo, int_hi, int_lo, x;
16995
16996 real_ldexp (&ONE16r, &dconst1, 16);
16997 x = const_double_from_real_value (ONE16r, SFmode);
16998 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
16999 NULL, 0, OPTAB_DIRECT);
17000 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
17001 NULL, 0, OPTAB_DIRECT);
17002 fp_hi = gen_reg_rtx (SFmode);
17003 fp_lo = gen_reg_rtx (SFmode);
17004 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
17005 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
17006 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
17007 0, OPTAB_DIRECT);
17008 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
17009 0, OPTAB_DIRECT);
17010 if (!rtx_equal_p (target, fp_hi))
17011 emit_move_insn (target, fp_hi);
17012 }
17013
17014 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
17015 then replicate the value for all elements of the vector
17016 register. */
17017
17018 rtx
17019 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
17020 {
17021 int i, n_elt;
17022 rtvec v;
17023 enum machine_mode scalar_mode;
17024
17025 switch (mode)
17026 {
17027 case V32QImode:
17028 case V16QImode:
17029 case V16HImode:
17030 case V8HImode:
17031 case V8SImode:
17032 case V4SImode:
17033 case V4DImode:
17034 case V2DImode:
17035 gcc_assert (vect);
17036 case V8SFmode:
17037 case V4SFmode:
17038 case V4DFmode:
17039 case V2DFmode:
17040 n_elt = GET_MODE_NUNITS (mode);
17041 v = rtvec_alloc (n_elt);
17042 scalar_mode = GET_MODE_INNER (mode);
17043
17044 RTVEC_ELT (v, 0) = value;
17045
17046 for (i = 1; i < n_elt; ++i)
17047 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
17048
17049 return gen_rtx_CONST_VECTOR (mode, v);
17050
17051 default:
17052 gcc_unreachable ();
17053 }
17054 }
17055
17056 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
17057 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
17058 for an SSE register. If VECT is true, then replicate the mask for
17059 all elements of the vector register. If INVERT is true, then create
17060 a mask excluding the sign bit. */
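/* For example, for V4SFmode with VECT true this yields the vector
   constant { 0x80000000, 0x80000000, 0x80000000, 0x80000000 } viewed as
   floats; with INVERT true each element becomes 0x7fffffff instead.  */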
17061
17062 rtx
17063 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
17064 {
17065 enum machine_mode vec_mode, imode;
17066 HOST_WIDE_INT hi, lo;
17067 int shift = 63;
17068 rtx v;
17069 rtx mask;
17070
17071 /* Find the sign bit, sign extended to 2*HWI. */
17072 switch (mode)
17073 {
17074 case V8SImode:
17075 case V4SImode:
17076 case V8SFmode:
17077 case V4SFmode:
17078 vec_mode = mode;
17079 mode = GET_MODE_INNER (mode);
17080 imode = SImode;
17081 lo = 0x80000000, hi = lo < 0;
17082 break;
17083
17084 case V4DImode:
17085 case V2DImode:
17086 case V4DFmode:
17087 case V2DFmode:
17088 vec_mode = mode;
17089 mode = GET_MODE_INNER (mode);
17090 imode = DImode;
17091 if (HOST_BITS_PER_WIDE_INT >= 64)
17092 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
17093 else
17094 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17095 break;
17096
17097 case TImode:
17098 case TFmode:
17099 vec_mode = VOIDmode;
17100 if (HOST_BITS_PER_WIDE_INT >= 64)
17101 {
17102 imode = TImode;
17103 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
17104 }
17105 else
17106 {
17107 rtvec vec;
17108
17109 imode = DImode;
17110 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17111
17112 if (invert)
17113 {
17114 lo = ~lo, hi = ~hi;
17115 v = constm1_rtx;
17116 }
17117 else
17118 v = const0_rtx;
17119
17120 mask = immed_double_const (lo, hi, imode);
17121
17122 vec = gen_rtvec (2, v, mask);
17123 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
17124 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
17125
17126 return v;
17127 }
17128 break;
17129
17130 default:
17131 gcc_unreachable ();
17132 }
17133
17134 if (invert)
17135 lo = ~lo, hi = ~hi;
17136
17137 /* Force this value into the low part of a fp vector constant. */
17138 mask = immed_double_const (lo, hi, imode);
17139 mask = gen_lowpart (mode, mask);
17140
17141 if (vec_mode == VOIDmode)
17142 return force_reg (mode, mask);
17143
17144 v = ix86_build_const_vector (vec_mode, vect, mask);
17145 return force_reg (vec_mode, v);
17146 }
17147
17148 /* Generate code for floating point ABS or NEG. */
17149
17150 void
17151 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
17152 rtx operands[])
17153 {
17154 rtx mask, set, dst, src;
17155 bool use_sse = false;
17156 bool vector_mode = VECTOR_MODE_P (mode);
17157 enum machine_mode vmode = mode;
17158
17159 if (vector_mode)
17160 use_sse = true;
17161 else if (mode == TFmode)
17162 use_sse = true;
17163 else if (TARGET_SSE_MATH)
17164 {
17165 use_sse = SSE_FLOAT_MODE_P (mode);
17166 if (mode == SFmode)
17167 vmode = V4SFmode;
17168 else if (mode == DFmode)
17169 vmode = V2DFmode;
17170 }
17171
17172 /* NEG and ABS performed with SSE use bitwise mask operations.
17173 Create the appropriate mask now. */
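/* Bitwise, abs(x) = x & ~signbit and neg(x) = x ^ signbit, which is why
   ABS requests the inverted mask.  */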
17174 if (use_sse)
17175 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
17176 else
17177 mask = NULL_RTX;
17178
17179 dst = operands[0];
17180 src = operands[1];
17181
17182 set = gen_rtx_fmt_e (code, mode, src);
17183 set = gen_rtx_SET (VOIDmode, dst, set);
17184
17185 if (mask)
17186 {
17187 rtx use, clob;
17188 rtvec par;
17189
17190 use = gen_rtx_USE (VOIDmode, mask);
17191 if (vector_mode)
17192 par = gen_rtvec (2, set, use);
17193 else
17194 {
17195 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17196 par = gen_rtvec (3, set, use, clob);
17197 }
17198 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
17199 }
17200 else
17201 emit_insn (set);
17202 }
17203
17204 /* Expand a copysign operation. Special case operand 0 being a constant. */
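/* Bitwise, copysign(x,y) = (x & ~signmask) | (y & signmask), so the
   expansion only needs the sign-bit mask, its complement and a couple of
   logic operations.  */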
17205
17206 void
17207 ix86_expand_copysign (rtx operands[])
17208 {
17209 enum machine_mode mode, vmode;
17210 rtx dest, op0, op1, mask, nmask;
17211
17212 dest = operands[0];
17213 op0 = operands[1];
17214 op1 = operands[2];
17215
17216 mode = GET_MODE (dest);
17217
17218 if (mode == SFmode)
17219 vmode = V4SFmode;
17220 else if (mode == DFmode)
17221 vmode = V2DFmode;
17222 else
17223 vmode = mode;
17224
17225 if (GET_CODE (op0) == CONST_DOUBLE)
17226 {
17227 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
17228
17229 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
17230 op0 = simplify_unary_operation (ABS, mode, op0, mode);
17231
17232 if (mode == SFmode || mode == DFmode)
17233 {
17234 if (op0 == CONST0_RTX (mode))
17235 op0 = CONST0_RTX (vmode);
17236 else
17237 {
17238 rtx v = ix86_build_const_vector (vmode, false, op0);
17239
17240 op0 = force_reg (vmode, v);
17241 }
17242 }
17243 else if (op0 != CONST0_RTX (mode))
17244 op0 = force_reg (mode, op0);
17245
17246 mask = ix86_build_signbit_mask (vmode, 0, 0);
17247
17248 if (mode == SFmode)
17249 copysign_insn = gen_copysignsf3_const;
17250 else if (mode == DFmode)
17251 copysign_insn = gen_copysigndf3_const;
17252 else
17253 copysign_insn = gen_copysigntf3_const;
17254
17255 emit_insn (copysign_insn (dest, op0, op1, mask));
17256 }
17257 else
17258 {
17259 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
17260
17261 nmask = ix86_build_signbit_mask (vmode, 0, 1);
17262 mask = ix86_build_signbit_mask (vmode, 0, 0);
17263
17264 if (mode == SFmode)
17265 copysign_insn = gen_copysignsf3_var;
17266 else if (mode == DFmode)
17267 copysign_insn = gen_copysigndf3_var;
17268 else
17269 copysign_insn = gen_copysigntf3_var;
17270
17271 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
17272 }
17273 }
17274
17275 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
17276 be a constant, and so has already been expanded into a vector constant. */
17277
17278 void
17279 ix86_split_copysign_const (rtx operands[])
17280 {
17281 enum machine_mode mode, vmode;
17282 rtx dest, op0, mask, x;
17283
17284 dest = operands[0];
17285 op0 = operands[1];
17286 mask = operands[3];
17287
17288 mode = GET_MODE (dest);
17289 vmode = GET_MODE (mask);
17290
17291 dest = simplify_gen_subreg (vmode, dest, mode, 0);
17292 x = gen_rtx_AND (vmode, dest, mask);
17293 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17294
17295 if (op0 != CONST0_RTX (vmode))
17296 {
17297 x = gen_rtx_IOR (vmode, dest, op0);
17298 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17299 }
17300 }
17301
17302 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
17303 so we have to do two masks. */
17304
17305 void
17306 ix86_split_copysign_var (rtx operands[])
17307 {
17308 enum machine_mode mode, vmode;
17309 rtx dest, scratch, op0, op1, mask, nmask, x;
17310
17311 dest = operands[0];
17312 scratch = operands[1];
17313 op0 = operands[2];
17314 op1 = operands[3];
17315 nmask = operands[4];
17316 mask = operands[5];
17317
17318 mode = GET_MODE (dest);
17319 vmode = GET_MODE (mask);
17320
17321 if (rtx_equal_p (op0, op1))
17322 {
17323 /* Shouldn't happen often (it's useless, obviously), but when it does
17324 we'd generate incorrect code if we continue below. */
17325 emit_move_insn (dest, op0);
17326 return;
17327 }
17328
17329 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
17330 {
17331 gcc_assert (REGNO (op1) == REGNO (scratch));
17332
17333 x = gen_rtx_AND (vmode, scratch, mask);
17334 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17335
17336 dest = mask;
17337 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17338 x = gen_rtx_NOT (vmode, dest);
17339 x = gen_rtx_AND (vmode, x, op0);
17340 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17341 }
17342 else
17343 {
17344 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
17345 {
17346 x = gen_rtx_AND (vmode, scratch, mask);
17347 }
17348 else /* alternative 2,4 */
17349 {
17350 gcc_assert (REGNO (mask) == REGNO (scratch));
17351 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
17352 x = gen_rtx_AND (vmode, scratch, op1);
17353 }
17354 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17355
17356 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
17357 {
17358 dest = simplify_gen_subreg (vmode, op0, mode, 0);
17359 x = gen_rtx_AND (vmode, dest, nmask);
17360 }
17361 else /* alternative 3,4 */
17362 {
17363 gcc_assert (REGNO (nmask) == REGNO (dest));
17364 dest = nmask;
17365 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17366 x = gen_rtx_AND (vmode, dest, op0);
17367 }
17368 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17369 }
17370
17371 x = gen_rtx_IOR (vmode, dest, scratch);
17372 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17373 }
17374
17375 /* Return TRUE or FALSE depending on whether the first SET in INSN
17376 has source and destination with matching CC modes, and that the
17377 CC mode is at least as constrained as REQ_MODE. */
17378
17379 bool
17380 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
17381 {
17382 rtx set;
17383 enum machine_mode set_mode;
17384
17385 set = PATTERN (insn);
17386 if (GET_CODE (set) == PARALLEL)
17387 set = XVECEXP (set, 0, 0);
17388 gcc_assert (GET_CODE (set) == SET);
17389 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
17390
17391 set_mode = GET_MODE (SET_DEST (set));
17392 switch (set_mode)
17393 {
17394 case CCNOmode:
17395 if (req_mode != CCNOmode
17396 && (req_mode != CCmode
17397 || XEXP (SET_SRC (set), 1) != const0_rtx))
17398 return false;
17399 break;
17400 case CCmode:
17401 if (req_mode == CCGCmode)
17402 return false;
17403 /* FALLTHRU */
17404 case CCGCmode:
17405 if (req_mode == CCGOCmode || req_mode == CCNOmode)
17406 return false;
17407 /* FALLTHRU */
17408 case CCGOCmode:
17409 if (req_mode == CCZmode)
17410 return false;
17411 /* FALLTHRU */
17412 case CCZmode:
17413 break;
17414
17415 case CCAmode:
17416 case CCCmode:
17417 case CCOmode:
17418 case CCSmode:
17419 if (set_mode != req_mode)
17420 return false;
17421 break;
17422
17423 default:
17424 gcc_unreachable ();
17425 }
17426
17427 return GET_MODE (SET_SRC (set)) == set_mode;
17428 }
17429
17430 /* Generate insn patterns to do an integer compare of OPERANDS. */
17431
17432 static rtx
17433 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
17434 {
17435 enum machine_mode cmpmode;
17436 rtx tmp, flags;
17437
17438 cmpmode = SELECT_CC_MODE (code, op0, op1);
17439 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
17440
17441 /* This is very simple, but making the interface the same as in the
17442 FP case makes the rest of the code easier. */
17443 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
17444 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
17445
17446 /* Return the test that should be put into the flags user, i.e.
17447 the bcc, scc, or cmov instruction. */
17448 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
17449 }
17450
17451 /* Figure out whether to use ordered or unordered fp comparisons.
17452 Return the appropriate mode to use. */
17453
17454 enum machine_mode
17455 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
17456 {
17457 /* ??? In order to make all comparisons reversible, we do all comparisons
17458 non-trapping when compiling for IEEE. Once gcc is able to distinguish
17459 all forms of trapping and nontrapping comparisons, we can make inequality
17460 comparisons trapping again, since it results in better code when using
17461 FCOM based compares. */
17462 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
17463 }
17464
17465 enum machine_mode
17466 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
17467 {
17468 enum machine_mode mode = GET_MODE (op0);
17469
17470 if (SCALAR_FLOAT_MODE_P (mode))
17471 {
17472 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
17473 return ix86_fp_compare_mode (code);
17474 }
17475
17476 switch (code)
17477 {
17478 /* Only zero flag is needed. */
17479 case EQ: /* ZF=0 */
17480 case NE: /* ZF!=0 */
17481 return CCZmode;
17482 /* Codes needing carry flag. */
17483 case GEU: /* CF=0 */
17484 case LTU: /* CF=1 */
17485 /* Detect overflow checks. They need just the carry flag. */
17486 if (GET_CODE (op0) == PLUS
17487 && rtx_equal_p (op1, XEXP (op0, 0)))
17488 return CCCmode;
17489 else
17490 return CCmode;
17491 case GTU: /* CF=0 & ZF=0 */
17492 case LEU: /* CF=1 | ZF=1 */
17493 /* Detect overflow checks. They need just the carry flag. */
17494 if (GET_CODE (op0) == MINUS
17495 && rtx_equal_p (op1, XEXP (op0, 0)))
17496 return CCCmode;
17497 else
17498 return CCmode;
17499 /* Codes possibly doable only with sign flag when
17500 comparing against zero. */
17501 case GE: /* SF=OF or SF=0 */
17502 case LT: /* SF<>OF or SF=1 */
17503 if (op1 == const0_rtx)
17504 return CCGOCmode;
17505 else
17506 /* For other cases Carry flag is not required. */
17507 return CCGCmode;
17508 /* Codes doable only with the sign flag when comparing
17509 against zero, but we miss the jump instruction for it
17510 so we need to use relational tests against overflow,
17511 which thus needs to be zero. */
17512 case GT: /* ZF=0 & SF=OF */
17513 case LE: /* ZF=1 | SF<>OF */
17514 if (op1 == const0_rtx)
17515 return CCNOmode;
17516 else
17517 return CCGCmode;
17518 /* The strcmp pattern does (use flags) and combine may ask us for the
17519 proper mode. */
17520 case USE:
17521 return CCmode;
17522 default:
17523 gcc_unreachable ();
17524 }
17525 }
17526
17527 /* Return the fixed registers used for condition codes. */
17528
17529 static bool
17530 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
17531 {
17532 *p1 = FLAGS_REG;
17533 *p2 = FPSR_REG;
17534 return true;
17535 }
17536
17537 /* If two condition code modes are compatible, return a condition code
17538 mode which is compatible with both. Otherwise, return
17539 VOIDmode. */
17540
17541 static enum machine_mode
17542 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
17543 {
17544 if (m1 == m2)
17545 return m1;
17546
17547 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
17548 return VOIDmode;
17549
17550 if ((m1 == CCGCmode && m2 == CCGOCmode)
17551 || (m1 == CCGOCmode && m2 == CCGCmode))
17552 return CCGCmode;
17553
17554 switch (m1)
17555 {
17556 default:
17557 gcc_unreachable ();
17558
17559 case CCmode:
17560 case CCGCmode:
17561 case CCGOCmode:
17562 case CCNOmode:
17563 case CCAmode:
17564 case CCCmode:
17565 case CCOmode:
17566 case CCSmode:
17567 case CCZmode:
17568 switch (m2)
17569 {
17570 default:
17571 return VOIDmode;
17572
17573 case CCmode:
17574 case CCGCmode:
17575 case CCGOCmode:
17576 case CCNOmode:
17577 case CCAmode:
17578 case CCCmode:
17579 case CCOmode:
17580 case CCSmode:
17581 case CCZmode:
17582 return CCmode;
17583 }
17584
17585 case CCFPmode:
17586 case CCFPUmode:
17587 /* These are only compatible with themselves, which we already
17588 checked above. */
17589 return VOIDmode;
17590 }
17591 }
17592
17593
17594 /* Return a comparison we can do that is equivalent to
17595 swap_condition (code), apart possibly from orderedness.
17596 Never change orderedness if TARGET_IEEE_FP, returning
17597 UNKNOWN in that case if necessary. */
17598
17599 static enum rtx_code
17600 ix86_fp_swap_condition (enum rtx_code code)
17601 {
17602 switch (code)
17603 {
17604 case GT: /* GTU - CF=0 & ZF=0 */
17605 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
17606 case GE: /* GEU - CF=0 */
17607 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
17608 case UNLT: /* LTU - CF=1 */
17609 return TARGET_IEEE_FP ? UNKNOWN : GT;
17610 case UNLE: /* LEU - CF=1 | ZF=1 */
17611 return TARGET_IEEE_FP ? UNKNOWN : GE;
17612 default:
17613 return swap_condition (code);
17614 }
17615 }
17616
17617 /* Return the cost of comparison CODE using the best strategy for performance.
17618 All of the following functions use the number of instructions as the cost metric.
17619 In the future this should be tweaked to compute bytes for optimize_size and to
17620 take the performance of various instructions on various CPUs into account. */
17621
17622 static int
17623 ix86_fp_comparison_cost (enum rtx_code code)
17624 {
17625 int arith_cost;
17626
17627 /* The cost of code using bit-twiddling on %ah. */
17628 switch (code)
17629 {
17630 case UNLE:
17631 case UNLT:
17632 case LTGT:
17633 case GT:
17634 case GE:
17635 case UNORDERED:
17636 case ORDERED:
17637 case UNEQ:
17638 arith_cost = 4;
17639 break;
17640 case LT:
17641 case NE:
17642 case EQ:
17643 case UNGE:
17644 arith_cost = TARGET_IEEE_FP ? 5 : 4;
17645 break;
17646 case LE:
17647 case UNGT:
17648 arith_cost = TARGET_IEEE_FP ? 6 : 4;
17649 break;
17650 default:
17651 gcc_unreachable ();
17652 }
17653
17654 switch (ix86_fp_comparison_strategy (code))
17655 {
17656 case IX86_FPCMP_COMI:
17657 return arith_cost > 4 ? 3 : 2;
17658 case IX86_FPCMP_SAHF:
17659 return arith_cost > 4 ? 4 : 3;
17660 default:
17661 return arith_cost;
17662 }
17663 }
17664
17665 /* Return the strategy to use for floating-point comparisons. We assume that
17666 fcomi is always preferable where available, since that is also true when
17667 looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
17668
17669 enum ix86_fpcmp_strategy
17670 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
17671 {
17672 /* Do fcomi/sahf based test when profitable. */
17673
17674 if (TARGET_CMOVE)
17675 return IX86_FPCMP_COMI;
17676
17677 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
17678 return IX86_FPCMP_SAHF;
17679
17680 return IX86_FPCMP_ARITH;
17681 }
17682
17683 /* Swap, force into registers, or otherwise massage the two operands
17684 to a fp comparison. The operands are updated in place; the new
17685 comparison code is returned. */
17686
17687 static enum rtx_code
17688 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
17689 {
17690 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
17691 rtx op0 = *pop0, op1 = *pop1;
17692 enum machine_mode op_mode = GET_MODE (op0);
17693 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
17694
17695 /* All of the unordered compare instructions only work on registers.
17696 The same is true of the fcomi compare instructions. The XFmode
17697 compare instructions require registers except when comparing
17698 against zero or when converting operand 1 from fixed point to
17699 floating point. */
17700
17701 if (!is_sse
17702 && (fpcmp_mode == CCFPUmode
17703 || (op_mode == XFmode
17704 && ! (standard_80387_constant_p (op0) == 1
17705 || standard_80387_constant_p (op1) == 1)
17706 && GET_CODE (op1) != FLOAT)
17707 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
17708 {
17709 op0 = force_reg (op_mode, op0);
17710 op1 = force_reg (op_mode, op1);
17711 }
17712 else
17713 {
17714 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
17715 things around if they appear profitable, otherwise force op0
17716 into a register. */
17717
17718 if (standard_80387_constant_p (op0) == 0
17719 || (MEM_P (op0)
17720 && ! (standard_80387_constant_p (op1) == 0
17721 || MEM_P (op1))))
17722 {
17723 enum rtx_code new_code = ix86_fp_swap_condition (code);
17724 if (new_code != UNKNOWN)
17725 {
17726 rtx tmp;
17727 tmp = op0, op0 = op1, op1 = tmp;
17728 code = new_code;
17729 }
17730 }
17731
17732 if (!REG_P (op0))
17733 op0 = force_reg (op_mode, op0);
17734
17735 if (CONSTANT_P (op1))
17736 {
17737 int tmp = standard_80387_constant_p (op1);
17738 if (tmp == 0)
17739 op1 = validize_mem (force_const_mem (op_mode, op1));
17740 else if (tmp == 1)
17741 {
17742 if (TARGET_CMOVE)
17743 op1 = force_reg (op_mode, op1);
17744 }
17745 else
17746 op1 = force_reg (op_mode, op1);
17747 }
17748 }
17749
17750 /* Try to rearrange the comparison to make it cheaper. */
17751 if (ix86_fp_comparison_cost (code)
17752 > ix86_fp_comparison_cost (swap_condition (code))
17753 && (REG_P (op1) || can_create_pseudo_p ()))
17754 {
17755 rtx tmp;
17756 tmp = op0, op0 = op1, op1 = tmp;
17757 code = swap_condition (code);
17758 if (!REG_P (op0))
17759 op0 = force_reg (op_mode, op0);
17760 }
17761
17762 *pop0 = op0;
17763 *pop1 = op1;
17764 return code;
17765 }
17766
17767 /* Convert comparison codes we use to represent FP comparison to integer
17768 code that will result in proper branch. Return UNKNOWN if no such code
17769 is available. */
17770
17771 enum rtx_code
17772 ix86_fp_compare_code_to_integer (enum rtx_code code)
17773 {
17774 switch (code)
17775 {
17776 case GT:
17777 return GTU;
17778 case GE:
17779 return GEU;
17780 case ORDERED:
17781 case UNORDERED:
17782 return code;
17783 break;
17784 case UNEQ:
17785 return EQ;
17786 break;
17787 case UNLT:
17788 return LTU;
17789 break;
17790 case UNLE:
17791 return LEU;
17792 break;
17793 case LTGT:
17794 return NE;
17795 break;
17796 default:
17797 return UNKNOWN;
17798 }
17799 }
17800
17801 /* Generate insn patterns to do a floating point compare of OPERANDS. */
17802
17803 static rtx
17804 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
17805 {
17806 enum machine_mode fpcmp_mode, intcmp_mode;
17807 rtx tmp, tmp2;
17808
17809 fpcmp_mode = ix86_fp_compare_mode (code);
17810 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
17811
17812 /* Do fcomi/sahf based test when profitable. */
17813 switch (ix86_fp_comparison_strategy (code))
17814 {
17815 case IX86_FPCMP_COMI:
17816 intcmp_mode = fpcmp_mode;
17817 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
17818 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
17819 tmp);
17820 emit_insn (tmp);
17821 break;
17822
17823 case IX86_FPCMP_SAHF:
17824 intcmp_mode = fpcmp_mode;
17825 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
17826 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
17827 tmp);
17828
17829 if (!scratch)
17830 scratch = gen_reg_rtx (HImode);
17831 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
17832 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
17833 break;
17834
17835 case IX86_FPCMP_ARITH:
17836 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
17837 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
17838 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
17839 if (!scratch)
17840 scratch = gen_reg_rtx (HImode);
17841 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
17842
17843 /* In the unordered case, we have to check C2 for NaN's, which
17844 doesn't happen to work out to anything nice combination-wise.
17845 So do some bit twiddling on the value we've got in AH to come
17846 up with an appropriate set of condition codes. */
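/* The magic constants index the x87 condition bits as they appear in AH
   after fnstsw: C0 is 0x01, C2 is 0x04, C3 is 0x40, and 0x45 masks all
   three.  A compare leaves none of them set for "greater", C0 for
   "less", C3 for "equal" and all three for "unordered".  */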
17847
17848 intcmp_mode = CCNOmode;
17849 switch (code)
17850 {
17851 case GT:
17852 case UNGT:
17853 if (code == GT || !TARGET_IEEE_FP)
17854 {
17855 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
17856 code = EQ;
17857 }
17858 else
17859 {
17860 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
17861 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
17862 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
17863 intcmp_mode = CCmode;
17864 code = GEU;
17865 }
17866 break;
17867 case LT:
17868 case UNLT:
17869 if (code == LT && TARGET_IEEE_FP)
17870 {
17871 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
17872 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
17873 intcmp_mode = CCmode;
17874 code = EQ;
17875 }
17876 else
17877 {
17878 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
17879 code = NE;
17880 }
17881 break;
17882 case GE:
17883 case UNGE:
17884 if (code == GE || !TARGET_IEEE_FP)
17885 {
17886 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
17887 code = EQ;
17888 }
17889 else
17890 {
17891 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
17892 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
17893 code = NE;
17894 }
17895 break;
17896 case LE:
17897 case UNLE:
17898 if (code == LE && TARGET_IEEE_FP)
17899 {
17900 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
17901 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
17902 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
17903 intcmp_mode = CCmode;
17904 code = LTU;
17905 }
17906 else
17907 {
17908 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
17909 code = NE;
17910 }
17911 break;
17912 case EQ:
17913 case UNEQ:
17914 if (code == EQ && TARGET_IEEE_FP)
17915 {
17916 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
17917 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
17918 intcmp_mode = CCmode;
17919 code = EQ;
17920 }
17921 else
17922 {
17923 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
17924 code = NE;
17925 }
17926 break;
17927 case NE:
17928 case LTGT:
17929 if (code == NE && TARGET_IEEE_FP)
17930 {
17931 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
17932 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
17933 GEN_INT (0x40)));
17934 code = NE;
17935 }
17936 else
17937 {
17938 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
17939 code = EQ;
17940 }
17941 break;
17942
17943 case UNORDERED:
17944 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
17945 code = NE;
17946 break;
17947 case ORDERED:
17948 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
17949 code = EQ;
17950 break;
17951
17952 default:
17953 gcc_unreachable ();
17954 }
17955 break;
17956
17957 default:
17958 gcc_unreachable();
17959 }
17960
17961 /* Return the test that should be put into the flags user, i.e.
17962 the bcc, scc, or cmov instruction. */
17963 return gen_rtx_fmt_ee (code, VOIDmode,
17964 gen_rtx_REG (intcmp_mode, FLAGS_REG),
17965 const0_rtx);
17966 }
17967
17968 static rtx
17969 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
17970 {
17971 rtx ret;
17972
17973 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
17974 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
17975
17976 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
17977 {
17978 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
17979 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
17980 }
17981 else
17982 ret = ix86_expand_int_compare (code, op0, op1);
17983
17984 return ret;
17985 }
17986
17987 void
17988 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
17989 {
17990 enum machine_mode mode = GET_MODE (op0);
17991 rtx tmp;
17992
17993 switch (mode)
17994 {
17995 case SFmode:
17996 case DFmode:
17997 case XFmode:
17998 case QImode:
17999 case HImode:
18000 case SImode:
18001 simple:
18002 tmp = ix86_expand_compare (code, op0, op1);
18003 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18004 gen_rtx_LABEL_REF (VOIDmode, label),
18005 pc_rtx);
18006 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
18007 return;
18008
18009 case DImode:
18010 if (TARGET_64BIT)
18011 goto simple;
18012 case TImode:
18013 /* Expand DImode branch into multiple compare+branch. */
18014 {
18015 rtx lo[2], hi[2], label2;
18016 enum rtx_code code1, code2, code3;
18017 enum machine_mode submode;
18018
18019 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
18020 {
18021 tmp = op0, op0 = op1, op1 = tmp;
18022 code = swap_condition (code);
18023 }
18024
18025 split_double_mode (mode, &op0, 1, lo+0, hi+0);
18026 split_double_mode (mode, &op1, 1, lo+1, hi+1);
18027
18028 submode = mode == DImode ? SImode : DImode;
18029
18030 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
18031 avoid two branches. This costs one extra insn, so disable when
18032 optimizing for size. */
18033
18034 if ((code == EQ || code == NE)
18035 && (!optimize_insn_for_size_p ()
18036 || hi[1] == const0_rtx || lo[1] == const0_rtx))
18037 {
18038 rtx xor0, xor1;
18039
18040 xor1 = hi[0];
18041 if (hi[1] != const0_rtx)
18042 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
18043 NULL_RTX, 0, OPTAB_WIDEN);
18044
18045 xor0 = lo[0];
18046 if (lo[1] != const0_rtx)
18047 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
18048 NULL_RTX, 0, OPTAB_WIDEN);
18049
18050 tmp = expand_binop (submode, ior_optab, xor1, xor0,
18051 NULL_RTX, 0, OPTAB_WIDEN);
18052
18053 ix86_expand_branch (code, tmp, const0_rtx, label);
18054 return;
18055 }
18056
18057 /* Otherwise, if we are doing less-than or greater-or-equal-than,
18058 op1 is a constant and its low word is zero, then we can just
18059 examine the high word. Similarly for a low word of -1 and
18060 less-or-equal-than or greater-than. */
18061
18062 if (CONST_INT_P (hi[1]))
18063 switch (code)
18064 {
18065 case LT: case LTU: case GE: case GEU:
18066 if (lo[1] == const0_rtx)
18067 {
18068 ix86_expand_branch (code, hi[0], hi[1], label);
18069 return;
18070 }
18071 break;
18072 case LE: case LEU: case GT: case GTU:
18073 if (lo[1] == constm1_rtx)
18074 {
18075 ix86_expand_branch (code, hi[0], hi[1], label);
18076 return;
18077 }
18078 break;
18079 default:
18080 break;
18081 }
18082
18083 /* Otherwise, we need two or three jumps. */
18084
18085 label2 = gen_label_rtx ();
18086
18087 code1 = code;
18088 code2 = swap_condition (code);
18089 code3 = unsigned_condition (code);
18090
18091 switch (code)
18092 {
18093 case LT: case GT: case LTU: case GTU:
18094 break;
18095
18096 case LE: code1 = LT; code2 = GT; break;
18097 case GE: code1 = GT; code2 = LT; break;
18098 case LEU: code1 = LTU; code2 = GTU; break;
18099 case GEU: code1 = GTU; code2 = LTU; break;
18100
18101 case EQ: code1 = UNKNOWN; code2 = NE; break;
18102 case NE: code2 = UNKNOWN; break;
18103
18104 default:
18105 gcc_unreachable ();
18106 }
18107
18108 /*
18109 * a < b =>
18110 * if (hi(a) < hi(b)) goto true;
18111 * if (hi(a) > hi(b)) goto false;
18112 * if (lo(a) < lo(b)) goto true;
18113 * false:
18114 */
18115
18116 if (code1 != UNKNOWN)
18117 ix86_expand_branch (code1, hi[0], hi[1], label);
18118 if (code2 != UNKNOWN)
18119 ix86_expand_branch (code2, hi[0], hi[1], label2);
18120
18121 ix86_expand_branch (code3, lo[0], lo[1], label);
18122
18123 if (code2 != UNKNOWN)
18124 emit_label (label2);
18125 return;
18126 }
18127
18128 default:
18129 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
18130 goto simple;
18131 }
18132 }
18133
18134 /* Split branch based on floating point condition. */
18135 void
18136 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
18137 rtx target1, rtx target2, rtx tmp, rtx pushed)
18138 {
18139 rtx condition;
18140 rtx i;
18141
18142 if (target2 != pc_rtx)
18143 {
18144 rtx tmp = target2;
18145 code = reverse_condition_maybe_unordered (code);
18146 target2 = target1;
18147 target1 = tmp;
18148 }
18149
18150 condition = ix86_expand_fp_compare (code, op1, op2,
18151 tmp);
18152
18153 /* Remove pushed operand from stack. */
18154 if (pushed)
18155 ix86_free_from_memory (GET_MODE (pushed));
18156
18157 i = emit_jump_insn (gen_rtx_SET
18158 (VOIDmode, pc_rtx,
18159 gen_rtx_IF_THEN_ELSE (VOIDmode,
18160 condition, target1, target2)));
18161 if (split_branch_probability >= 0)
18162 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
18163 }
18164
18165 void
18166 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
18167 {
18168 rtx ret;
18169
18170 gcc_assert (GET_MODE (dest) == QImode);
18171
18172 ret = ix86_expand_compare (code, op0, op1);
18173 PUT_MODE (ret, QImode);
18174 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
18175 }
18176
18177 /* Expand comparison setting or clearing carry flag. Return true when
18178 successful and set pop for the operation. */
18179 static bool
18180 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
18181 {
18182 enum machine_mode mode =
18183 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
18184
18185 /* Do not handle double-mode compares that go through special path. */
18186 if (mode == (TARGET_64BIT ? TImode : DImode))
18187 return false;
18188
18189 if (SCALAR_FLOAT_MODE_P (mode))
18190 {
18191 rtx compare_op, compare_seq;
18192
18193 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18194
18195 /* Shortcut: the following common codes never translate
18196 into carry-flag compares. */
18197 if (code == EQ || code == NE || code == UNEQ || code == LTGT
18198 || code == ORDERED || code == UNORDERED)
18199 return false;
18200
18201 /* These comparisons require zero flag; swap operands so they won't. */
18202 if ((code == GT || code == UNLE || code == LE || code == UNGT)
18203 && !TARGET_IEEE_FP)
18204 {
18205 rtx tmp = op0;
18206 op0 = op1;
18207 op1 = tmp;
18208 code = swap_condition (code);
18209 }
18210
18211 /* Try to expand the comparison and verify that we end up with
18212 a carry-flag based comparison. This fails only when we decide
18213 to expand the comparison using arithmetic, which is not a
18214 common scenario. */
18215 start_sequence ();
18216 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18217 compare_seq = get_insns ();
18218 end_sequence ();
18219
18220 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
18221 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
18222 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
18223 else
18224 code = GET_CODE (compare_op);
18225
18226 if (code != LTU && code != GEU)
18227 return false;
18228
18229 emit_insn (compare_seq);
18230 *pop = compare_op;
18231 return true;
18232 }
18233
18234 if (!INTEGRAL_MODE_P (mode))
18235 return false;
18236
18237 switch (code)
18238 {
18239 case LTU:
18240 case GEU:
18241 break;
18242
18243 /* Convert a==0 into (unsigned)a<1. */
18244 case EQ:
18245 case NE:
18246 if (op1 != const0_rtx)
18247 return false;
18248 op1 = const1_rtx;
18249 code = (code == EQ ? LTU : GEU);
18250 break;
18251
18252 /* Convert a>b into b<a or a>=b+1. */
18253 case GTU:
18254 case LEU:
18255 if (CONST_INT_P (op1))
18256 {
18257 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
18258 /* Bail out on overflow. We could still swap the operands, but that
18259 would force loading the constant into a register. */
18260 if (op1 == const0_rtx
18261 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
18262 return false;
18263 code = (code == GTU ? GEU : LTU);
18264 }
18265 else
18266 {
18267 rtx tmp = op1;
18268 op1 = op0;
18269 op0 = tmp;
18270 code = (code == GTU ? LTU : GEU);
18271 }
18272 break;
18273
18274 /* Convert a>=0 into (unsigned)a<0x80000000. */
18275 case LT:
18276 case GE:
18277 if (mode == DImode || op1 != const0_rtx)
18278 return false;
18279 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18280 code = (code == LT ? GEU : LTU);
18281 break;
18282 case LE:
18283 case GT:
18284 if (mode == DImode || op1 != constm1_rtx)
18285 return false;
18286 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18287 code = (code == LE ? GEU : LTU);
18288 break;
18289
18290 default:
18291 return false;
18292 }
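/* A few concrete instances of the canonicalizations above:
     a == 0   becomes  (unsigned) a < 1            (LTU: carry set iff a == 0)
     a >u 42  becomes  a >=u 43                    (GEU)
     a >= 0   becomes  (unsigned) a < 0x80000000   (LTU, SImode) */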
18293 /* Swapping operands may cause a constant to appear as the first operand. */
18294 if (!nonimmediate_operand (op0, VOIDmode))
18295 {
18296 if (!can_create_pseudo_p ())
18297 return false;
18298 op0 = force_reg (mode, op0);
18299 }
18300 *pop = ix86_expand_compare (code, op0, op1);
18301 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
18302 return true;
18303 }
18304
18305 bool
18306 ix86_expand_int_movcc (rtx operands[])
18307 {
18308 enum rtx_code code = GET_CODE (operands[1]), compare_code;
18309 rtx compare_seq, compare_op;
18310 enum machine_mode mode = GET_MODE (operands[0]);
18311 bool sign_bit_compare_p = false;
18312 rtx op0 = XEXP (operands[1], 0);
18313 rtx op1 = XEXP (operands[1], 1);
18314
18315 start_sequence ();
18316 compare_op = ix86_expand_compare (code, op0, op1);
18317 compare_seq = get_insns ();
18318 end_sequence ();
18319
18320 compare_code = GET_CODE (compare_op);
18321
18322 if ((op1 == const0_rtx && (code == GE || code == LT))
18323 || (op1 == constm1_rtx && (code == GT || code == LE)))
18324 sign_bit_compare_p = true;
18325
18326 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
18327 HImode insns, we'd be swallowed in word prefix ops. */
18328
18329 if ((mode != HImode || TARGET_FAST_PREFIX)
18330 && (mode != (TARGET_64BIT ? TImode : DImode))
18331 && CONST_INT_P (operands[2])
18332 && CONST_INT_P (operands[3]))
18333 {
18334 rtx out = operands[0];
18335 HOST_WIDE_INT ct = INTVAL (operands[2]);
18336 HOST_WIDE_INT cf = INTVAL (operands[3]);
18337 HOST_WIDE_INT diff;
18338
18339 diff = ct - cf;
18340 /* Sign bit compares are better done using shifts than by using
18341 sbb. */
18342 if (sign_bit_compare_p
18343 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
18344 {
18345 /* Detect overlap between destination and compare sources. */
18346 rtx tmp = out;
18347
18348 if (!sign_bit_compare_p)
18349 {
18350 rtx flags;
18351 bool fpcmp = false;
18352
18353 compare_code = GET_CODE (compare_op);
18354
18355 flags = XEXP (compare_op, 0);
18356
18357 if (GET_MODE (flags) == CCFPmode
18358 || GET_MODE (flags) == CCFPUmode)
18359 {
18360 fpcmp = true;
18361 compare_code
18362 = ix86_fp_compare_code_to_integer (compare_code);
18363 }
18364
18365 /* To simplify the rest of the code, restrict to the GEU case. */
18366 if (compare_code == LTU)
18367 {
18368 HOST_WIDE_INT tmp = ct;
18369 ct = cf;
18370 cf = tmp;
18371 compare_code = reverse_condition (compare_code);
18372 code = reverse_condition (code);
18373 }
18374 else
18375 {
18376 if (fpcmp)
18377 PUT_CODE (compare_op,
18378 reverse_condition_maybe_unordered
18379 (GET_CODE (compare_op)));
18380 else
18381 PUT_CODE (compare_op,
18382 reverse_condition (GET_CODE (compare_op)));
18383 }
18384 diff = ct - cf;
18385
18386 if (reg_overlap_mentioned_p (out, op0)
18387 || reg_overlap_mentioned_p (out, op1))
18388 tmp = gen_reg_rtx (mode);
18389
18390 if (mode == DImode)
18391 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
18392 else
18393 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
18394 flags, compare_op));
18395 }
18396 else
18397 {
18398 if (code == GT || code == GE)
18399 code = reverse_condition (code);
18400 else
18401 {
18402 HOST_WIDE_INT tmp = ct;
18403 ct = cf;
18404 cf = tmp;
18405 diff = ct - cf;
18406 }
18407 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
18408 }
18409
18410 if (diff == 1)
18411 {
18412 /*
18413 * cmpl op0,op1
18414 * sbbl dest,dest
18415 * [addl dest, ct]
18416 *
18417 * Size 5 - 8.
18418 */
18419 if (ct)
18420 tmp = expand_simple_binop (mode, PLUS,
18421 tmp, GEN_INT (ct),
18422 copy_rtx (tmp), 1, OPTAB_DIRECT);
18423 }
18424 else if (cf == -1)
18425 {
18426 /*
18427 * cmpl op0,op1
18428 * sbbl dest,dest
18429 * orl $ct, dest
18430 *
18431 * Size 8.
18432 */
18433 tmp = expand_simple_binop (mode, IOR,
18434 tmp, GEN_INT (ct),
18435 copy_rtx (tmp), 1, OPTAB_DIRECT);
18436 }
18437 else if (diff == -1 && ct)
18438 {
18439 /*
18440 * cmpl op0,op1
18441 * sbbl dest,dest
18442 * notl dest
18443 * [addl dest, cf]
18444 *
18445 * Size 8 - 11.
18446 */
18447 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
18448 if (cf)
18449 tmp = expand_simple_binop (mode, PLUS,
18450 copy_rtx (tmp), GEN_INT (cf),
18451 copy_rtx (tmp), 1, OPTAB_DIRECT);
18452 }
18453 else
18454 {
18455 /*
18456 * cmpl op0,op1
18457 * sbbl dest,dest
18458 * [notl dest]
18459 * andl cf - ct, dest
18460 * [addl dest, ct]
18461 *
18462 * Size 8 - 11.
18463 */
18464
18465 if (cf == 0)
18466 {
18467 cf = ct;
18468 ct = 0;
18469 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
18470 }
18471
18472 tmp = expand_simple_binop (mode, AND,
18473 copy_rtx (tmp),
18474 gen_int_mode (cf - ct, mode),
18475 copy_rtx (tmp), 1, OPTAB_DIRECT);
18476 if (ct)
18477 tmp = expand_simple_binop (mode, PLUS,
18478 copy_rtx (tmp), GEN_INT (ct),
18479 copy_rtx (tmp), 1, OPTAB_DIRECT);
18480 }
18481
18482 if (!rtx_equal_p (tmp, out))
18483 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
18484
18485 return true;
18486 }
18487
18488 if (diff < 0)
18489 {
18490 enum machine_mode cmp_mode = GET_MODE (op0);
18491
18492 HOST_WIDE_INT tmp;
18493 tmp = ct, ct = cf, cf = tmp;
18494 diff = -diff;
18495
18496 if (SCALAR_FLOAT_MODE_P (cmp_mode))
18497 {
18498 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
18499
18500 /* We may be reversing an unordered compare to a normal compare, which
18501 is not valid in general (we may convert a non-trapping condition
18502 into a trapping one); however, on i386 we currently emit all
18503 comparisons unordered. */
18504 compare_code = reverse_condition_maybe_unordered (compare_code);
18505 code = reverse_condition_maybe_unordered (code);
18506 }
18507 else
18508 {
18509 compare_code = reverse_condition (compare_code);
18510 code = reverse_condition (code);
18511 }
18512 }
18513
18514 compare_code = UNKNOWN;
18515 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
18516 && CONST_INT_P (op1))
18517 {
18518 if (op1 == const0_rtx
18519 && (code == LT || code == GE))
18520 compare_code = code;
18521 else if (op1 == constm1_rtx)
18522 {
18523 if (code == LE)
18524 compare_code = LT;
18525 else if (code == GT)
18526 compare_code = GE;
18527 }
18528 }
18529
18530 /* Optimize dest = (op0 < 0) ? -1 : cf. */
18531 if (compare_code != UNKNOWN
18532 && GET_MODE (op0) == GET_MODE (out)
18533 && (cf == -1 || ct == -1))
18534 {
18535 /* If the lea code below could be used, only optimize
18536 if it results in a 2-insn sequence. */
18537
18538 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
18539 || diff == 3 || diff == 5 || diff == 9)
18540 || (compare_code == LT && ct == -1)
18541 || (compare_code == GE && cf == -1))
18542 {
18543 /*
18544 * notl op1 (if necessary)
18545 * sarl $31, op1
18546 * orl cf, op1
18547 */
18548 if (ct != -1)
18549 {
18550 cf = ct;
18551 ct = -1;
18552 code = reverse_condition (code);
18553 }
18554
18555 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
18556
18557 out = expand_simple_binop (mode, IOR,
18558 out, GEN_INT (cf),
18559 out, 1, OPTAB_DIRECT);
18560 if (out != operands[0])
18561 emit_move_insn (operands[0], out);
18562
18563 return true;
18564 }
18565 }
18566
18567
18568 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
18569 || diff == 3 || diff == 5 || diff == 9)
18570 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
18571 && (mode != DImode
18572 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
18573 {
18574 /*
18575 * xorl dest,dest
18576 * cmpl op1,op2
18577 * setcc dest
18578 * lea cf(dest*(ct-cf)),dest
18579 *
18580 * Size 14.
18581 *
18582 * This also catches the degenerate setcc-only case.
18583 */
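/* E.g. for dest = (a < b) ? 7 : 3, ct = 7, cf = 3 and diff = 4; setcc
   leaves 0 or 1 in dest and lea 3(,dest,4) then selects 3 or 7.  */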
18584
18585 rtx tmp;
18586 int nops;
18587
18588 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
18589
18590 nops = 0;
18591 /* On x86_64 the lea instruction operates on Pmode, so we need
18592 to get the arithmetic done in the proper mode to match. */
18593 if (diff == 1)
18594 tmp = copy_rtx (out);
18595 else
18596 {
18597 rtx out1;
18598 out1 = copy_rtx (out);
18599 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
18600 nops++;
18601 if (diff & 1)
18602 {
18603 tmp = gen_rtx_PLUS (mode, tmp, out1);
18604 nops++;
18605 }
18606 }
18607 if (cf != 0)
18608 {
18609 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
18610 nops++;
18611 }
18612 if (!rtx_equal_p (tmp, out))
18613 {
18614 if (nops == 1)
18615 out = force_operand (tmp, copy_rtx (out));
18616 else
18617 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
18618 }
18619 if (!rtx_equal_p (out, operands[0]))
18620 emit_move_insn (operands[0], copy_rtx (out));
18621
18622 return true;
18623 }
18624
18625 /*
18626 * General case: Jumpful:
18627 * xorl dest,dest cmpl op1, op2
18628 * cmpl op1, op2 movl ct, dest
18629 * setcc dest jcc 1f
18630 * decl dest movl cf, dest
18631 * andl (cf-ct),dest 1:
18632 * addl ct,dest
18633 *
18634 * Size 20. Size 14.
18635 *
18636 * This is reasonably steep, but branch mispredict costs are
18637 * high on modern cpus, so consider failing only if optimizing
18638 * for space.
18639 */
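/* Worked example for the jumpless sequence above, with ct = 5 and cf = 12:
   setcc gives 1/0, decl gives 0/-1, andl with (12 - 5) gives 0/7, and
   addl 5 gives 5/12, i.e. ct when the condition holds and cf otherwise.  */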
18640
18641 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
18642 && BRANCH_COST (optimize_insn_for_speed_p (),
18643 false) >= 2)
18644 {
18645 if (cf == 0)
18646 {
18647 enum machine_mode cmp_mode = GET_MODE (op0);
18648
18649 cf = ct;
18650 ct = 0;
18651
18652 if (SCALAR_FLOAT_MODE_P (cmp_mode))
18653 {
18654 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
18655
18656 /* We may be reversing an unordered compare to a normal compare,
18657 which is not valid in general (we may convert a non-trapping
18658 condition into a trapping one); however, on i386 we currently
18659 emit all comparisons unordered. */
18660 code = reverse_condition_maybe_unordered (code);
18661 }
18662 else
18663 {
18664 code = reverse_condition (code);
18665 if (compare_code != UNKNOWN)
18666 compare_code = reverse_condition (compare_code);
18667 }
18668 }
18669
18670 if (compare_code != UNKNOWN)
18671 {
18672 /* notl op1 (if needed)
18673 sarl $31, op1
18674 andl (cf-ct), op1
18675 addl ct, op1
18676
18677 For x < 0 (resp. x <= -1) there will be no notl,
18678 so if possible swap the constants to get rid of the
18679 complement.
18680 True/false will be -1/0 while code below (store flag
18681 followed by decrement) is 0/-1, so the constants need
18682 to be exchanged once more. */
18683
18684 if (compare_code == GE || !cf)
18685 {
18686 code = reverse_condition (code);
18687 compare_code = LT;
18688 }
18689 else
18690 {
18691 HOST_WIDE_INT tmp = cf;
18692 cf = ct;
18693 ct = tmp;
18694 }
18695
18696 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
18697 }
18698 else
18699 {
18700 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
18701
18702 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
18703 constm1_rtx,
18704 copy_rtx (out), 1, OPTAB_DIRECT);
18705 }
18706
18707 out = expand_simple_binop (mode, AND, copy_rtx (out),
18708 gen_int_mode (cf - ct, mode),
18709 copy_rtx (out), 1, OPTAB_DIRECT);
18710 if (ct)
18711 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
18712 copy_rtx (out), 1, OPTAB_DIRECT);
18713 if (!rtx_equal_p (out, operands[0]))
18714 emit_move_insn (operands[0], copy_rtx (out));
18715
18716 return true;
18717 }
18718 }
18719
18720 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
18721 {
18722 /* Try a few things more with specific constants and a variable. */
18723
18724 optab op;
18725 rtx var, orig_out, out, tmp;
18726
18727 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
18728 return false;
18729
18730 /* If one of the two operands is an interesting constant (0 or -1), load
18731 it via the recursion below and mask the variable in with a logical op. */
18732
18733 if (CONST_INT_P (operands[2]))
18734 {
18735 var = operands[3];
18736 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
18737 operands[3] = constm1_rtx, op = and_optab;
18738 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
18739 operands[3] = const0_rtx, op = ior_optab;
18740 else
18741 return false;
18742 }
18743 else if (CONST_INT_P (operands[3]))
18744 {
18745 var = operands[2];
18746 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
18747 operands[2] = constm1_rtx, op = and_optab;
18748 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
18749 operands[2] = const0_rtx, op = ior_optab;
18750 else
18751 return false;
18752 }
18753 else
18754 return false;
18755
18756 orig_out = operands[0];
18757 tmp = gen_reg_rtx (mode);
18758 operands[0] = tmp;
18759
18760 /* Recurse to get the constant loaded. */
18761 if (ix86_expand_int_movcc (operands) == 0)
18762 return false;
18763
18764 /* Mask in the interesting variable. */
18765 out = expand_binop (mode, op, var, tmp, orig_out, 0,
18766 OPTAB_WIDEN);
18767 if (!rtx_equal_p (out, orig_out))
18768 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
18769
18770 return true;
18771 }
18772
18773 /*
18774 * For comparison with above,
18775 *
18776 * movl cf,dest
18777 * movl ct,tmp
18778 * cmpl op1,op2
18779 * cmovcc tmp,dest
18780 *
18781 * Size 15.
18782 */
18783
18784 if (! nonimmediate_operand (operands[2], mode))
18785 operands[2] = force_reg (mode, operands[2]);
18786 if (! nonimmediate_operand (operands[3], mode))
18787 operands[3] = force_reg (mode, operands[3]);
18788
18789 if (! register_operand (operands[2], VOIDmode)
18790 && (mode == QImode
18791 || ! register_operand (operands[3], VOIDmode)))
18792 operands[2] = force_reg (mode, operands[2]);
18793
18794 if (mode == QImode
18795 && ! register_operand (operands[3], VOIDmode))
18796 operands[3] = force_reg (mode, operands[3]);
18797
18798 emit_insn (compare_seq);
18799 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
18800 gen_rtx_IF_THEN_ELSE (mode,
18801 compare_op, operands[2],
18802 operands[3])));
18803 return true;
18804 }
18805
18806 /* Swap, force into registers, or otherwise massage the two operands
18807 to an sse comparison with a mask result. Thus we differ a bit from
18808 ix86_prepare_fp_compare_args which expects to produce a flags result.
18809
18810 The DEST operand exists to help determine whether to commute commutative
18811 operators. The POP0/POP1 operands are updated in place. The new
18812 comparison code is returned, or UNKNOWN if not implementable. */
18813
18814 static enum rtx_code
18815 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
18816 rtx *pop0, rtx *pop1)
18817 {
18818 rtx tmp;
18819
18820 switch (code)
18821 {
18822 case LTGT:
18823 case UNEQ:
18824 /* AVX supports all the needed comparisons. */
18825 if (TARGET_AVX)
18826 break;
18827 /* We have no LTGT as an operator. We could implement it with
18828 NE & ORDERED, but this requires an extra temporary. It's
18829 not clear that it's worth it. */
18830 return UNKNOWN;
18831
18832 case LT:
18833 case LE:
18834 case UNGT:
18835 case UNGE:
18836 /* These are supported directly. */
18837 break;
18838
18839 case EQ:
18840 case NE:
18841 case UNORDERED:
18842 case ORDERED:
18843 /* AVX has 3 operand comparisons, no need to swap anything. */
18844 if (TARGET_AVX)
18845 break;
18846 /* For commutative operators, try to canonicalize the destination
18847 operand to be first in the comparison - this helps reload to
18848 avoid extra moves. */
18849 if (!dest || !rtx_equal_p (dest, *pop1))
18850 break;
18851 /* FALLTHRU */
18852
18853 case GE:
18854 case GT:
18855 case UNLE:
18856 case UNLT:
18857 /* These are not supported directly before AVX, and furthermore
18858 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
18859 comparison operands to transform into something that is
18860 supported. */
18861 tmp = *pop0;
18862 *pop0 = *pop1;
18863 *pop1 = tmp;
18864 code = swap_condition (code);
18865 break;
18866
18867 default:
18868 gcc_unreachable ();
18869 }
18870
18871 return code;
18872 }
18873
18874 /* Detect conditional moves that exactly match min/max operational
18875 semantics. Note that this is IEEE safe, as long as we don't
18876 interchange the operands.
18877
18878 Returns FALSE if this conditional move doesn't match a MIN/MAX,
18879 and TRUE if the operation is successful and instructions are emitted. */
18880
18881 static bool
18882 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
18883 rtx cmp_op1, rtx if_true, rtx if_false)
18884 {
18885 enum machine_mode mode;
18886 bool is_min;
18887 rtx tmp;
18888
18889 if (code == LT)
18890 ;
18891 else if (code == UNGE)
18892 {
18893 tmp = if_true;
18894 if_true = if_false;
18895 if_false = tmp;
18896 }
18897 else
18898 return false;
18899
18900 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
18901 is_min = true;
18902 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
18903 is_min = false;
18904 else
18905 return false;
18906
18907 mode = GET_MODE (dest);
18908
18909 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
18910 but MODE may be a vector mode and thus not appropriate. */
18911 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
18912 {
18913 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
18914 rtvec v;
18915
18916 if_true = force_reg (mode, if_true);
18917 v = gen_rtvec (2, if_true, if_false);
18918 tmp = gen_rtx_UNSPEC (mode, v, u);
18919 }
18920 else
18921 {
18922 code = is_min ? SMIN : SMAX;
18923 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
18924 }
18925
18926 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
18927 return true;
18928 }
18929
18930 /* Expand an sse vector comparison. Return the register with the result. */
18931
18932 static rtx
18933 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
18934 rtx op_true, rtx op_false)
18935 {
18936 enum machine_mode mode = GET_MODE (dest);
18937 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
18938 rtx x;
18939
18940 cmp_op0 = force_reg (cmp_mode, cmp_op0);
18941 if (!nonimmediate_operand (cmp_op1, cmp_mode))
18942 cmp_op1 = force_reg (cmp_mode, cmp_op1);
18943
18944 if (optimize
18945 || reg_overlap_mentioned_p (dest, op_true)
18946 || reg_overlap_mentioned_p (dest, op_false))
18947 dest = gen_reg_rtx (mode);
18948
18949 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
18950 if (cmp_mode != mode)
18951 {
18952 x = force_reg (cmp_mode, x);
18953 convert_move (dest, x, false);
18954 }
18955 else
18956 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18957
18958 return dest;
18959 }
18960
18961 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
18962 operations. This is used for both scalar and vector conditional moves. */
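/* In outline, with CMP holding all-ones or all-zeros per element, this
   computes DEST = (CMP & OP_TRUE) | (~CMP & OP_FALSE), using a single
   blend or vpcmov instruction when one is available.  */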
18963
18964 static void
18965 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
18966 {
18967 enum machine_mode mode = GET_MODE (dest);
18968 rtx t2, t3, x;
18969
18970 if (vector_all_ones_operand (op_true, mode)
18971 && rtx_equal_p (op_false, CONST0_RTX (mode)))
18972 {
18973 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
18974 }
18975 else if (op_false == CONST0_RTX (mode))
18976 {
18977 op_true = force_reg (mode, op_true);
18978 x = gen_rtx_AND (mode, cmp, op_true);
18979 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18980 }
18981 else if (op_true == CONST0_RTX (mode))
18982 {
18983 op_false = force_reg (mode, op_false);
18984 x = gen_rtx_NOT (mode, cmp);
18985 x = gen_rtx_AND (mode, x, op_false);
18986 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18987 }
18988 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
18989 {
18990 op_false = force_reg (mode, op_false);
18991 x = gen_rtx_IOR (mode, cmp, op_false);
18992 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18993 }
18994 else if (TARGET_XOP)
18995 {
18996 op_true = force_reg (mode, op_true);
18997
18998 if (!nonimmediate_operand (op_false, mode))
18999 op_false = force_reg (mode, op_false);
19000
19001 emit_insn (gen_rtx_SET (mode, dest,
19002 gen_rtx_IF_THEN_ELSE (mode, cmp,
19003 op_true,
19004 op_false)));
19005 }
19006 else
19007 {
19008 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
19009
19010 if (!nonimmediate_operand (op_true, mode))
19011 op_true = force_reg (mode, op_true);
19012
19013 op_false = force_reg (mode, op_false);
19014
19015 switch (mode)
19016 {
19017 case V4SFmode:
19018 if (TARGET_SSE4_1)
19019 gen = gen_sse4_1_blendvps;
19020 break;
19021 case V2DFmode:
19022 if (TARGET_SSE4_1)
19023 gen = gen_sse4_1_blendvpd;
19024 break;
19025 case V16QImode:
19026 case V8HImode:
19027 case V4SImode:
19028 case V2DImode:
19029 if (TARGET_SSE4_1)
19030 {
19031 gen = gen_sse4_1_pblendvb;
19032 dest = gen_lowpart (V16QImode, dest);
19033 op_false = gen_lowpart (V16QImode, op_false);
19034 op_true = gen_lowpart (V16QImode, op_true);
19035 cmp = gen_lowpart (V16QImode, cmp);
19036 }
19037 break;
19038 case V8SFmode:
19039 if (TARGET_AVX)
19040 gen = gen_avx_blendvps256;
19041 break;
19042 case V4DFmode:
19043 if (TARGET_AVX)
19044 gen = gen_avx_blendvpd256;
19045 break;
19046 case V32QImode:
19047 case V16HImode:
19048 case V8SImode:
19049 case V4DImode:
19050 if (TARGET_AVX2)
19051 {
19052 gen = gen_avx2_pblendvb;
19053 dest = gen_lowpart (V32QImode, dest);
19054 op_false = gen_lowpart (V32QImode, op_false);
19055 op_true = gen_lowpart (V32QImode, op_true);
19056 cmp = gen_lowpart (V32QImode, cmp);
19057 }
19058 break;
19059 default:
19060 break;
19061 }
19062
19063 if (gen != NULL)
19064 emit_insn (gen (dest, op_false, op_true, cmp));
19065 else
19066 {
19067 op_true = force_reg (mode, op_true);
19068
19069 t2 = gen_reg_rtx (mode);
19070 if (optimize)
19071 t3 = gen_reg_rtx (mode);
19072 else
19073 t3 = dest;
19074
19075 x = gen_rtx_AND (mode, op_true, cmp);
19076 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
19077
19078 x = gen_rtx_NOT (mode, cmp);
19079 x = gen_rtx_AND (mode, x, op_false);
19080 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
19081
19082 x = gen_rtx_IOR (mode, t3, t2);
19083 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19084 }
19085 }
19086 }
19087
19088 /* Expand a floating-point conditional move. Return true if successful. */
19089
19090 bool
19091 ix86_expand_fp_movcc (rtx operands[])
19092 {
19093 enum machine_mode mode = GET_MODE (operands[0]);
19094 enum rtx_code code = GET_CODE (operands[1]);
19095 rtx tmp, compare_op;
19096 rtx op0 = XEXP (operands[1], 0);
19097 rtx op1 = XEXP (operands[1], 1);
19098
19099 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
19100 {
19101 enum machine_mode cmode;
19102
19103 /* Since we've no cmove for sse registers, don't force bad register
19104 allocation just to gain access to it. Deny movcc when the
19105 comparison mode doesn't match the move mode. */
19106 cmode = GET_MODE (op0);
19107 if (cmode == VOIDmode)
19108 cmode = GET_MODE (op1);
19109 if (cmode != mode)
19110 return false;
19111
19112 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
19113 if (code == UNKNOWN)
19114 return false;
19115
19116 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
19117 operands[2], operands[3]))
19118 return true;
19119
19120 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
19121 operands[2], operands[3]);
19122 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
19123 return true;
19124 }
19125
19126 /* The floating point conditional move instructions don't directly
19127 support conditions resulting from a signed integer comparison. */
19128
19129 compare_op = ix86_expand_compare (code, op0, op1);
19130 if (!fcmov_comparison_operator (compare_op, VOIDmode))
19131 {
19132 tmp = gen_reg_rtx (QImode);
19133 ix86_expand_setcc (tmp, code, op0, op1);
19134
19135 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
19136 }
19137
19138 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19139 gen_rtx_IF_THEN_ELSE (mode, compare_op,
19140 operands[2], operands[3])));
19141
19142 return true;
19143 }
19144
19145 /* Expand a floating-point vector conditional move; a vcond operation
19146 rather than a movcc operation. */
19147
19148 bool
19149 ix86_expand_fp_vcond (rtx operands[])
19150 {
19151 enum rtx_code code = GET_CODE (operands[3]);
19152 rtx cmp;
19153
19154 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
19155 &operands[4], &operands[5]);
19156 if (code == UNKNOWN)
19157 {
19158 rtx temp;
19159 switch (GET_CODE (operands[3]))
19160 {
19161 case LTGT:
19162 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
19163 operands[5], operands[0], operands[0]);
19164 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
19165 operands[5], operands[1], operands[2]);
19166 code = AND;
19167 break;
19168 case UNEQ:
19169 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
19170 operands[5], operands[0], operands[0]);
19171 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
19172 operands[5], operands[1], operands[2]);
19173 code = IOR;
19174 break;
19175 default:
19176 gcc_unreachable ();
19177 }
19178 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
19179 OPTAB_DIRECT);
19180 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19181 return true;
19182 }
19183
19184 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
19185 operands[5], operands[1], operands[2]))
19186 return true;
19187
19188 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
19189 operands[1], operands[2]);
19190 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19191 return true;
19192 }
19193
19194 /* Expand a signed/unsigned integral vector conditional move. */
19195
19196 bool
19197 ix86_expand_int_vcond (rtx operands[])
19198 {
19199 enum machine_mode data_mode = GET_MODE (operands[0]);
19200 enum machine_mode mode = GET_MODE (operands[4]);
19201 enum rtx_code code = GET_CODE (operands[3]);
19202 bool negate = false;
19203 rtx x, cop0, cop1;
19204
19205 cop0 = operands[4];
19206 cop1 = operands[5];
19207
19208 /* XOP supports all of the comparisons on all vector int types. */
19209 if (!TARGET_XOP)
19210 {
19211 /* Canonicalize the comparison to EQ, GT, GTU. */
19212 switch (code)
19213 {
19214 case EQ:
19215 case GT:
19216 case GTU:
19217 break;
19218
19219 case NE:
19220 case LE:
19221 case LEU:
19222 code = reverse_condition (code);
19223 negate = true;
19224 break;
19225
19226 case GE:
19227 case GEU:
19228 code = reverse_condition (code);
19229 negate = true;
19230 /* FALLTHRU */
19231
19232 case LT:
19233 case LTU:
19234 code = swap_condition (code);
19235 x = cop0, cop0 = cop1, cop1 = x;
19236 break;
19237
19238 default:
19239 gcc_unreachable ();
19240 }
19241
19242 /* Only SSE4.1/SSE4.2 supports V2DImode. */
19243 if (mode == V2DImode)
19244 {
19245 switch (code)
19246 {
19247 case EQ:
19248 /* SSE4.1 supports EQ. */
19249 if (!TARGET_SSE4_1)
19250 return false;
19251 break;
19252
19253 case GT:
19254 case GTU:
19255 /* SSE4.2 supports GT/GTU. */
19256 if (!TARGET_SSE4_2)
19257 return false;
19258 break;
19259
19260 default:
19261 gcc_unreachable ();
19262 }
19263 }
19264
19265 /* Unsigned parallel compare is not supported by the hardware.
19266 Play some tricks to turn this into a signed comparison
19267 against 0. */
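/* E.g. for V4SImode elements, a >u b is turned into
   (a - 0x80000000) >s (b - 0x80000000); with a = 0xffffffff and b = 1 the
   biased values are INT_MAX and INT_MIN + 1, so the signed compare agrees
   with the unsigned one. For byte and word elements a saturating subtract
   is used instead: a >u b iff (a -us b) != 0.  */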
19268 if (code == GTU)
19269 {
19270 cop0 = force_reg (mode, cop0);
19271
19272 switch (mode)
19273 {
19274 case V8SImode:
19275 case V4DImode:
19276 case V4SImode:
19277 case V2DImode:
19278 {
19279 rtx t1, t2, mask;
19280 rtx (*gen_sub3) (rtx, rtx, rtx);
19281
19282 switch (mode)
19283 {
19284 case V8SImode: gen_sub3 = gen_subv8si3; break;
19285 case V4DImode: gen_sub3 = gen_subv4di3; break;
19286 case V4SImode: gen_sub3 = gen_subv4si3; break;
19287 case V2DImode: gen_sub3 = gen_subv2di3; break;
19288 default:
19289 gcc_unreachable ();
19290 }
19291 /* Subtract (-(INT_MAX) - 1), i.e. INT_MIN, from both operands
19292 to make them signed. */
19293 mask = ix86_build_signbit_mask (mode, true, false);
19294 t1 = gen_reg_rtx (mode);
19295 emit_insn (gen_sub3 (t1, cop0, mask));
19296
19297 t2 = gen_reg_rtx (mode);
19298 emit_insn (gen_sub3 (t2, cop1, mask));
19299
19300 cop0 = t1;
19301 cop1 = t2;
19302 code = GT;
19303 }
19304 break;
19305
19306 case V32QImode:
19307 case V16HImode:
19308 case V16QImode:
19309 case V8HImode:
19310 /* Perform a parallel unsigned saturating subtraction. */
19311 x = gen_reg_rtx (mode);
19312 emit_insn (gen_rtx_SET (VOIDmode, x,
19313 gen_rtx_US_MINUS (mode, cop0, cop1)));
19314
19315 cop0 = x;
19316 cop1 = CONST0_RTX (mode);
19317 code = EQ;
19318 negate = !negate;
19319 break;
19320
19321 default:
19322 gcc_unreachable ();
19323 }
19324 }
19325 }
19326
19327 /* Allow the comparison to be done in one mode, but the movcc to
19328 happen in another mode. */
19329 if (data_mode == mode)
19330 {
19331 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
19332 operands[1+negate], operands[2-negate]);
19333 }
19334 else
19335 {
19336 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
19337 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
19338 code, cop0, cop1,
19339 operands[1+negate], operands[2-negate]);
19340 x = gen_lowpart (data_mode, x);
19341 }
19342
19343 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
19344 operands[2-negate]);
19345 return true;
19346 }
19347
19348 /* Expand a variable vector permutation. */
19349
19350 void
19351 ix86_expand_vec_perm (rtx operands[])
19352 {
19353 rtx target = operands[0];
19354 rtx op0 = operands[1];
19355 rtx op1 = operands[2];
19356 rtx mask = operands[3];
19357 rtx t1, t2, t3, t4, vt, vt2, vec[32];
19358 enum machine_mode mode = GET_MODE (op0);
19359 enum machine_mode maskmode = GET_MODE (mask);
19360 int w, e, i;
19361 bool one_operand_shuffle = rtx_equal_p (op0, op1);
19362
19363 /* Number of elements in the vector. */
19364 w = GET_MODE_NUNITS (mode);
19365 e = GET_MODE_UNIT_SIZE (mode);
19366 gcc_assert (w <= 32);
19367
19368 if (TARGET_AVX2)
19369 {
19370 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
19371 {
19372 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
19373 a constant shuffle operand. With a tiny bit of effort we can
19374 use VPERMD instead. A re-interpretation stall for V4DFmode is
19375 unfortunate but there's no avoiding it.
19376 Similarly for V16HImode we don't have instructions for variable
19377 shuffling, while for V32QImode we can, after preparing suitable
19378 masks, use vpshufb; vpshufb; vpermq; vpor. */
19379
19380 if (mode == V16HImode)
19381 {
19382 maskmode = mode = V32QImode;
19383 w = 32;
19384 e = 1;
19385 }
19386 else
19387 {
19388 maskmode = mode = V8SImode;
19389 w = 8;
19390 e = 4;
19391 }
19392 t1 = gen_reg_rtx (maskmode);
19393
19394 /* Replicate the low bits of the V4DImode mask into V8SImode:
19395 mask = { A B C D }
19396 t1 = { A A B B C C D D }. */
19397 for (i = 0; i < w / 2; ++i)
19398 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
19399 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19400 vt = force_reg (maskmode, vt);
19401 mask = gen_lowpart (maskmode, mask);
19402 if (maskmode == V8SImode)
19403 emit_insn (gen_avx2_permvarv8si (t1, vt, mask));
19404 else
19405 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
19406
19407 /* Multiply the shuffle indices by two. */
19408 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
19409 OPTAB_DIRECT);
19410
19411 /* Add one to the odd shuffle indices:
19412 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
19413 for (i = 0; i < w / 2; ++i)
19414 {
19415 vec[i * 2] = const0_rtx;
19416 vec[i * 2 + 1] = const1_rtx;
19417 }
19418 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19419 vt = force_const_mem (maskmode, vt);
19420 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
19421 OPTAB_DIRECT);
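/* E.g. a V4DImode mask { 1 3 0 2 } becomes { 1 1 3 3 0 0 2 2 } after the
   replication above, { 2 2 6 6 0 0 4 4 } after doubling, and
   { 2 3 6 7 0 1 4 5 } after the odd-index increment - the V8SImode
   permutation that moves the corresponding dword pairs.  */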
19422
19423 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
19424 operands[3] = mask = t1;
19425 target = gen_lowpart (mode, target);
19426 op0 = gen_lowpart (mode, op0);
19427 op1 = gen_lowpart (mode, op1);
19428 }
19429
19430 switch (mode)
19431 {
19432 case V8SImode:
19433 /* The VPERMD and VPERMPS instructions already properly ignore
19434 the high bits of the shuffle elements. No need for us to
19435 perform an AND ourselves. */
19436 if (one_operand_shuffle)
19437 emit_insn (gen_avx2_permvarv8si (target, mask, op0));
19438 else
19439 {
19440 t1 = gen_reg_rtx (V8SImode);
19441 t2 = gen_reg_rtx (V8SImode);
19442 emit_insn (gen_avx2_permvarv8si (t1, mask, op0));
19443 emit_insn (gen_avx2_permvarv8si (t2, mask, op1));
19444 goto merge_two;
19445 }
19446 return;
19447
19448 case V8SFmode:
19449 mask = gen_lowpart (V8SFmode, mask);
19450 if (one_operand_shuffle)
19451 emit_insn (gen_avx2_permvarv8sf (target, mask, op0));
19452 else
19453 {
19454 t1 = gen_reg_rtx (V8SFmode);
19455 t2 = gen_reg_rtx (V8SFmode);
19456 emit_insn (gen_avx2_permvarv8sf (t1, mask, op0));
19457 emit_insn (gen_avx2_permvarv8sf (t2, mask, op1));
19458 goto merge_two;
19459 }
19460 return;
19461
19462 case V4SImode:
19463 /* By combining the two 128-bit input vectors into one 256-bit
19464 input vector, we can use VPERMD and VPERMPS for the full
19465 two-operand shuffle. */
19466 t1 = gen_reg_rtx (V8SImode);
19467 t2 = gen_reg_rtx (V8SImode);
19468 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
19469 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
19470 emit_insn (gen_avx2_permvarv8si (t1, t2, t1));
19471 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
19472 return;
19473
19474 case V4SFmode:
19475 t1 = gen_reg_rtx (V8SFmode);
19476 t2 = gen_reg_rtx (V8SFmode);
19477 mask = gen_lowpart (V4SFmode, mask);
19478 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
19479 emit_insn (gen_avx_vec_concatv8sf (t2, mask, mask));
19480 emit_insn (gen_avx2_permvarv8sf (t1, t2, t1));
19481 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
19482 return;
19483
19484 case V32QImode:
19485 t1 = gen_reg_rtx (V32QImode);
19486 t2 = gen_reg_rtx (V32QImode);
19487 t3 = gen_reg_rtx (V32QImode);
19488 vt2 = GEN_INT (128);
19489 for (i = 0; i < 32; i++)
19490 vec[i] = vt2;
19491 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
19492 vt = force_reg (V32QImode, vt);
19493 for (i = 0; i < 32; i++)
19494 vec[i] = i < 16 ? vt2 : const0_rtx;
19495 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
19496 vt2 = force_reg (V32QImode, vt2);
19497 /* From mask create two adjusted masks, which contain the same
19498 bits as mask in the low 7 bits of each vector element.
19499 The first mask will have the most significant bit clear
19500 if it requests element from the same 128-bit lane
19501 and MSB set if it requests element from the other 128-bit lane.
19502 The second mask will have the opposite values of the MSB,
19503 and additionally will have its 128-bit lanes swapped.
19504 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
19505 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
19506 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
19507 stands for the other 12 bytes. */
19508 /* The bit that says whether an element is from the same lane or the
19509 other lane is bit 4, so shift it up by 3 to the MSB position. */
19510 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
19511 gen_lowpart (V4DImode, mask),
19512 GEN_INT (3)));
19513 /* Clear MSB bits from the mask just in case it had them set. */
19514 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
19515 /* After this t1 will have MSB set for elements from other lane. */
19516 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
19517 /* Clear bits other than MSB. */
19518 emit_insn (gen_andv32qi3 (t1, t1, vt));
19519 /* Or in the lower bits from mask into t3. */
19520 emit_insn (gen_iorv32qi3 (t3, t1, t2));
19521 /* And invert MSB bits in t1, so MSB is set for elements from the same
19522 lane. */
19523 emit_insn (gen_xorv32qi3 (t1, t1, vt));
19524 /* Swap 128-bit lanes in t3. */
19525 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19526 gen_lowpart (V4DImode, t3),
19527 const2_rtx, GEN_INT (3),
19528 const0_rtx, const1_rtx));
19529 /* And or in the lower bits from mask into t1. */
19530 emit_insn (gen_iorv32qi3 (t1, t1, t2));
19531 if (one_operand_shuffle)
19532 {
19533 /* Each of these shuffles will put 0s in places where
19534 element from the other 128-bit lane is needed, otherwise
19535 will shuffle in the requested value. */
19536 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
19537 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
19538 /* For t3 the 128-bit lanes are swapped again. */
19539 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19540 gen_lowpart (V4DImode, t3),
19541 const2_rtx, GEN_INT (3),
19542 const0_rtx, const1_rtx));
19543 /* And oring both together leads to the result. */
19544 emit_insn (gen_iorv32qi3 (target, t1, t3));
19545 return;
19546 }
19547
19548 t4 = gen_reg_rtx (V32QImode);
19549 /* Similarly to the above one_operand_shuffle code,
19550 just repeated twice, once for each operand. The merge_two:
19551 code below will merge the two results together. */
19552 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
19553 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
19554 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
19555 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
19556 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
19557 gen_lowpart (V4DImode, t4),
19558 const2_rtx, GEN_INT (3),
19559 const0_rtx, const1_rtx));
19560 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19561 gen_lowpart (V4DImode, t3),
19562 const2_rtx, GEN_INT (3),
19563 const0_rtx, const1_rtx));
19564 emit_insn (gen_iorv32qi3 (t4, t2, t4));
19565 emit_insn (gen_iorv32qi3 (t3, t1, t3));
19566 t1 = t4;
19567 t2 = t3;
19568 goto merge_two;
19569
19570 default:
19571 gcc_assert (GET_MODE_SIZE (mode) <= 16);
19572 break;
19573 }
19574 }
19575
19576 if (TARGET_XOP)
19577 {
19578 /* The XOP VPPERM insn supports three inputs. By ignoring the
19579 one_operand_shuffle special case, we avoid creating another
19580 set of constant vectors in memory. */
19581 one_operand_shuffle = false;
19582
19583 /* mask = mask & {2*w-1, ...} */
19584 vt = GEN_INT (2*w - 1);
19585 }
19586 else
19587 {
19588 /* mask = mask & {w-1, ...} */
19589 vt = GEN_INT (w - 1);
19590 }
19591
19592 for (i = 0; i < w; i++)
19593 vec[i] = vt;
19594 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19595 mask = expand_simple_binop (maskmode, AND, mask, vt,
19596 NULL_RTX, 0, OPTAB_DIRECT);
19597
19598 /* For non-QImode operations, convert the word permutation control
19599 into a byte permutation control. */
19600 if (mode != V16QImode)
19601 {
19602 mask = expand_simple_binop (maskmode, ASHIFT, mask,
19603 GEN_INT (exact_log2 (e)),
19604 NULL_RTX, 0, OPTAB_DIRECT);
19605
19606 /* Convert mask to vector of chars. */
19607 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
19608
19609 /* Replicate each of the input bytes into byte positions:
19610 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
19611 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
19612 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
19613 for (i = 0; i < 16; ++i)
19614 vec[i] = GEN_INT (i/e * e);
19615 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
19616 vt = force_const_mem (V16QImode, vt);
19617 if (TARGET_XOP)
19618 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
19619 else
19620 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
19621
19622 /* Convert it into the byte positions by doing
19623 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
19624 for (i = 0; i < 16; ++i)
19625 vec[i] = GEN_INT (i % e);
19626 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
19627 vt = force_const_mem (V16QImode, vt);
19628 emit_insn (gen_addv16qi3 (mask, mask, vt));
19629 }
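/* E.g. for V4SImode (e == 4) a word index of 2 becomes 8 after the shift,
   { 8 8 8 8 } within its element after the pshufb above, and { 8 9 10 11 }
   after adding { 0 1 2 3 } - the byte positions of dword 2.  */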
19630
19631 /* The actual shuffle operations all operate on V16QImode. */
19632 op0 = gen_lowpart (V16QImode, op0);
19633 op1 = gen_lowpart (V16QImode, op1);
19634 target = gen_lowpart (V16QImode, target);
19635
19636 if (TARGET_XOP)
19637 {
19638 emit_insn (gen_xop_pperm (target, op0, op1, mask));
19639 }
19640 else if (one_operand_shuffle)
19641 {
19642 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
19643 }
19644 else
19645 {
19646 rtx xops[6];
19647 bool ok;
19648
19649 /* Shuffle the two input vectors independently. */
19650 t1 = gen_reg_rtx (V16QImode);
19651 t2 = gen_reg_rtx (V16QImode);
19652 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
19653 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
19654
19655 merge_two:
19656 /* Then merge them together. The key is whether any given control
19657 element contained a bit set that indicates the second word. */
19658 mask = operands[3];
19659 vt = GEN_INT (w);
19660 if (maskmode == V2DImode && !TARGET_SSE4_1)
19661 {
19662 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
19663 more shuffle to convert the V2DI input mask into a V4SI
19664 input mask. At that point the masking done by
19665 ix86_expand_int_vcond will work as desired. */
19666 rtx t3 = gen_reg_rtx (V4SImode);
19667 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
19668 const0_rtx, const0_rtx,
19669 const2_rtx, const2_rtx));
19670 mask = t3;
19671 maskmode = V4SImode;
19672 e = w = 4;
19673 }
19674
19675 for (i = 0; i < w; i++)
19676 vec[i] = vt;
19677 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19678 vt = force_reg (maskmode, vt);
19679 mask = expand_simple_binop (maskmode, AND, mask, vt,
19680 NULL_RTX, 0, OPTAB_DIRECT);
19681
19682 xops[0] = gen_lowpart (mode, operands[0]);
19683 xops[1] = gen_lowpart (mode, t2);
19684 xops[2] = gen_lowpart (mode, t1);
19685 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
19686 xops[4] = mask;
19687 xops[5] = vt;
19688 ok = ix86_expand_int_vcond (xops);
19689 gcc_assert (ok);
19690 }
19691 }
19692
19693 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
19694 true if we should do zero extension, else sign extension. HIGH_P is
19695 true if we want the N/2 high elements, else the low elements. */
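/* E.g. unpacking the high half of a V8HImode { a0 ... a7 } with UNSIGNED_P
   set yields a V4SImode { a4 a5 a6 a7 } with each element zero-extended.  */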
19696
19697 void
19698 ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p)
19699 {
19700 enum machine_mode imode = GET_MODE (operands[1]);
19701 rtx tmp, dest;
19702
19703 if (TARGET_SSE4_1)
19704 {
19705 rtx (*unpack)(rtx, rtx);
19706 rtx (*extract)(rtx, rtx) = NULL;
19707 enum machine_mode halfmode = BLKmode;
19708
19709 switch (imode)
19710 {
19711 case V32QImode:
19712 if (unsigned_p)
19713 unpack = gen_avx2_zero_extendv16qiv16hi2;
19714 else
19715 unpack = gen_avx2_sign_extendv16qiv16hi2;
19716 halfmode = V16QImode;
19717 extract
19718 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
19719 break;
19720 case V16HImode:
19721 if (unsigned_p)
19722 unpack = gen_avx2_zero_extendv8hiv8si2;
19723 else
19724 unpack = gen_avx2_sign_extendv8hiv8si2;
19725 halfmode = V8HImode;
19726 extract
19727 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
19728 break;
19729 case V8SImode:
19730 if (unsigned_p)
19731 unpack = gen_avx2_zero_extendv4siv4di2;
19732 else
19733 unpack = gen_avx2_sign_extendv4siv4di2;
19734 halfmode = V4SImode;
19735 extract
19736 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
19737 break;
19738 case V16QImode:
19739 if (unsigned_p)
19740 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
19741 else
19742 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
19743 break;
19744 case V8HImode:
19745 if (unsigned_p)
19746 unpack = gen_sse4_1_zero_extendv4hiv4si2;
19747 else
19748 unpack = gen_sse4_1_sign_extendv4hiv4si2;
19749 break;
19750 case V4SImode:
19751 if (unsigned_p)
19752 unpack = gen_sse4_1_zero_extendv2siv2di2;
19753 else
19754 unpack = gen_sse4_1_sign_extendv2siv2di2;
19755 break;
19756 default:
19757 gcc_unreachable ();
19758 }
19759
19760 if (GET_MODE_SIZE (imode) == 32)
19761 {
19762 tmp = gen_reg_rtx (halfmode);
19763 emit_insn (extract (tmp, operands[1]));
19764 }
19765 else if (high_p)
19766 {
19767 /* Shift the higher 8 bytes down to the lower 8 bytes. */
19768 tmp = gen_reg_rtx (imode);
19769 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
19770 gen_lowpart (V1TImode, operands[1]),
19771 GEN_INT (64)));
19772 }
19773 else
19774 tmp = operands[1];
19775
19776 emit_insn (unpack (operands[0], tmp));
19777 }
19778 else
19779 {
19780 rtx (*unpack)(rtx, rtx, rtx);
19781
19782 switch (imode)
19783 {
19784 case V16QImode:
19785 if (high_p)
19786 unpack = gen_vec_interleave_highv16qi;
19787 else
19788 unpack = gen_vec_interleave_lowv16qi;
19789 break;
19790 case V8HImode:
19791 if (high_p)
19792 unpack = gen_vec_interleave_highv8hi;
19793 else
19794 unpack = gen_vec_interleave_lowv8hi;
19795 break;
19796 case V4SImode:
19797 if (high_p)
19798 unpack = gen_vec_interleave_highv4si;
19799 else
19800 unpack = gen_vec_interleave_lowv4si;
19801 break;
19802 default:
19803 gcc_unreachable ();
19804 }
19805
19806 dest = gen_lowpart (imode, operands[0]);
19807
19808 if (unsigned_p)
19809 tmp = force_reg (imode, CONST0_RTX (imode));
19810 else
19811 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
19812 operands[1], pc_rtx, pc_rtx);
19813
19814 emit_insn (unpack (dest, operands[1], tmp));
19815 }
19816 }
19817
19818 /* Expand conditional increment or decrement using adc/sbb instructions.
19819 The default case using setcc followed by a conditional move can be
19820 done by generic code. */
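/* Roughly, x = x + (a < b) for unsigned operands becomes
     cmpl b, a   (sets the carry flag when a < b)
     adcl $0, x
   and x = x - (a < b) uses sbbl $0, x instead.  */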
19821 bool
19822 ix86_expand_int_addcc (rtx operands[])
19823 {
19824 enum rtx_code code = GET_CODE (operands[1]);
19825 rtx flags;
19826 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
19827 rtx compare_op;
19828 rtx val = const0_rtx;
19829 bool fpcmp = false;
19830 enum machine_mode mode;
19831 rtx op0 = XEXP (operands[1], 0);
19832 rtx op1 = XEXP (operands[1], 1);
19833
19834 if (operands[3] != const1_rtx
19835 && operands[3] != constm1_rtx)
19836 return false;
19837 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
19838 return false;
19839 code = GET_CODE (compare_op);
19840
19841 flags = XEXP (compare_op, 0);
19842
19843 if (GET_MODE (flags) == CCFPmode
19844 || GET_MODE (flags) == CCFPUmode)
19845 {
19846 fpcmp = true;
19847 code = ix86_fp_compare_code_to_integer (code);
19848 }
19849
19850 if (code != LTU)
19851 {
19852 val = constm1_rtx;
19853 if (fpcmp)
19854 PUT_CODE (compare_op,
19855 reverse_condition_maybe_unordered
19856 (GET_CODE (compare_op)));
19857 else
19858 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
19859 }
19860
19861 mode = GET_MODE (operands[0]);
19862
19863 /* Construct either adc or sbb insn. */
19864 if ((code == LTU) == (operands[3] == constm1_rtx))
19865 {
19866 switch (mode)
19867 {
19868 case QImode:
19869 insn = gen_subqi3_carry;
19870 break;
19871 case HImode:
19872 insn = gen_subhi3_carry;
19873 break;
19874 case SImode:
19875 insn = gen_subsi3_carry;
19876 break;
19877 case DImode:
19878 insn = gen_subdi3_carry;
19879 break;
19880 default:
19881 gcc_unreachable ();
19882 }
19883 }
19884 else
19885 {
19886 switch (mode)
19887 {
19888 case QImode:
19889 insn = gen_addqi3_carry;
19890 break;
19891 case HImode:
19892 insn = gen_addhi3_carry;
19893 break;
19894 case SImode:
19895 insn = gen_addsi3_carry;
19896 break;
19897 case DImode:
19898 insn = gen_adddi3_carry;
19899 break;
19900 default:
19901 gcc_unreachable ();
19902 }
19903 }
19904 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
19905
19906 return true;
19907 }
19908
19909
19910 /* Split OPERAND into half-mode parts stored in PARTS. Similar to
19911 split_double_mode, but works for floating-point parameters and
19912 non-offsettable memories. For pushes, it returns just stack offsets;
19913 the values will be saved in the right order. At most four parts are generated. */
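/* E.g. on a 32-bit target a DImode value splits into two SImode parts,
   XFmode into three and TFmode into four; on a 64-bit target XFmode and
   TFmode each split into two parts.  */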
19914
19915 static int
19916 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
19917 {
19918 int size;
19919
19920 if (!TARGET_64BIT)
19921 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
19922 else
19923 size = (GET_MODE_SIZE (mode) + 4) / 8;
19924
19925 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
19926 gcc_assert (size >= 2 && size <= 4);
19927
19928 /* Optimize constant pool references to immediates. This is used by fp
19929 moves, which force all constants to memory to allow combining. */
19930 if (MEM_P (operand) && MEM_READONLY_P (operand))
19931 {
19932 rtx tmp = maybe_get_pool_constant (operand);
19933 if (tmp)
19934 operand = tmp;
19935 }
19936
19937 if (MEM_P (operand) && !offsettable_memref_p (operand))
19938 {
19939 /* The only non-offsettable memories we handle are pushes. */
19940 int ok = push_operand (operand, VOIDmode);
19941
19942 gcc_assert (ok);
19943
19944 operand = copy_rtx (operand);
19945 PUT_MODE (operand, Pmode);
19946 parts[0] = parts[1] = parts[2] = parts[3] = operand;
19947 return size;
19948 }
19949
19950 if (GET_CODE (operand) == CONST_VECTOR)
19951 {
19952 enum machine_mode imode = int_mode_for_mode (mode);
19953 /* Caution: if we looked through a constant pool memory above,
19954 the operand may actually have a different mode now. That's
19955 ok, since we want to pun this all the way back to an integer. */
19956 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
19957 gcc_assert (operand != NULL);
19958 mode = imode;
19959 }
19960
19961 if (!TARGET_64BIT)
19962 {
19963 if (mode == DImode)
19964 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
19965 else
19966 {
19967 int i;
19968
19969 if (REG_P (operand))
19970 {
19971 gcc_assert (reload_completed);
19972 for (i = 0; i < size; i++)
19973 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
19974 }
19975 else if (offsettable_memref_p (operand))
19976 {
19977 operand = adjust_address (operand, SImode, 0);
19978 parts[0] = operand;
19979 for (i = 1; i < size; i++)
19980 parts[i] = adjust_address (operand, SImode, 4 * i);
19981 }
19982 else if (GET_CODE (operand) == CONST_DOUBLE)
19983 {
19984 REAL_VALUE_TYPE r;
19985 long l[4];
19986
19987 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
19988 switch (mode)
19989 {
19990 case TFmode:
19991 real_to_target (l, &r, mode);
19992 parts[3] = gen_int_mode (l[3], SImode);
19993 parts[2] = gen_int_mode (l[2], SImode);
19994 break;
19995 case XFmode:
19996 REAL_VALUE_TO_TARGET_LONG_DOUBLE (r, l);
19997 parts[2] = gen_int_mode (l[2], SImode);
19998 break;
19999 case DFmode:
20000 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
20001 break;
20002 default:
20003 gcc_unreachable ();
20004 }
20005 parts[1] = gen_int_mode (l[1], SImode);
20006 parts[0] = gen_int_mode (l[0], SImode);
20007 }
20008 else
20009 gcc_unreachable ();
20010 }
20011 }
20012 else
20013 {
20014 if (mode == TImode)
20015 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20016 if (mode == XFmode || mode == TFmode)
20017 {
20018 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
20019 if (REG_P (operand))
20020 {
20021 gcc_assert (reload_completed);
20022 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
20023 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
20024 }
20025 else if (offsettable_memref_p (operand))
20026 {
20027 operand = adjust_address (operand, DImode, 0);
20028 parts[0] = operand;
20029 parts[1] = adjust_address (operand, upper_mode, 8);
20030 }
20031 else if (GET_CODE (operand) == CONST_DOUBLE)
20032 {
20033 REAL_VALUE_TYPE r;
20034 long l[4];
20035
20036 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20037 real_to_target (l, &r, mode);
20038
20039 /* Do not use shift by 32 to avoid warning on 32bit systems. */
20040 if (HOST_BITS_PER_WIDE_INT >= 64)
20041 parts[0]
20042 = gen_int_mode
20043 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
20044 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
20045 DImode);
20046 else
20047 parts[0] = immed_double_const (l[0], l[1], DImode);
20048
20049 if (upper_mode == SImode)
20050 parts[1] = gen_int_mode (l[2], SImode);
20051 else if (HOST_BITS_PER_WIDE_INT >= 64)
20052 parts[1]
20053 = gen_int_mode
20054 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
20055 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
20056 DImode);
20057 else
20058 parts[1] = immed_double_const (l[2], l[3], DImode);
20059 }
20060 else
20061 gcc_unreachable ();
20062 }
20063 }
20064
20065 return size;
20066 }
20067
20068 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
20069 Operands 0 and 1 are the destination and source; they are split into
20070 word-sized parts (at most four) and the parts are moved or pushed in
20071 an order that avoids clobbering a source part before it is used. */
20072
20073 void
20074 ix86_split_long_move (rtx operands[])
20075 {
20076 rtx part[2][4];
20077 int nparts, i, j;
20078 int push = 0;
20079 int collisions = 0;
20080 enum machine_mode mode = GET_MODE (operands[0]);
20081 bool collisionparts[4];
20082
20083 /* The DFmode expanders may ask us to move a double.
20084 For a 64-bit target this is a single move. By hiding that fact
20085 here we simplify the i386.md splitters. */
20086 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
20087 {
20088 /* Optimize constant pool references to immediates. This is used by
20089 fp moves, which force all constants to memory to allow combining. */
20090
20091 if (MEM_P (operands[1])
20092 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
20093 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
20094 operands[1] = get_pool_constant (XEXP (operands[1], 0));
20095 if (push_operand (operands[0], VOIDmode))
20096 {
20097 operands[0] = copy_rtx (operands[0]);
20098 PUT_MODE (operands[0], Pmode);
20099 }
20100 else
20101 operands[0] = gen_lowpart (DImode, operands[0]);
20102 operands[1] = gen_lowpart (DImode, operands[1]);
20103 emit_move_insn (operands[0], operands[1]);
20104 return;
20105 }
20106
20107 /* The only non-offsettable memory we handle is push. */
20108 if (push_operand (operands[0], VOIDmode))
20109 push = 1;
20110 else
20111 gcc_assert (!MEM_P (operands[0])
20112 || offsettable_memref_p (operands[0]));
20113
20114 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
20115 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
20116
20117 /* When emitting a push, be careful with source operands on the stack. */
20118 if (push && MEM_P (operands[1])
20119 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
20120 {
20121 rtx src_base = XEXP (part[1][nparts - 1], 0);
20122
20123 /* Compensate for the stack decrement by 4. */
20124 if (!TARGET_64BIT && nparts == 3
20125 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
20126 src_base = plus_constant (src_base, 4);
20127
20128 /* src_base refers to the stack pointer and is
20129 automatically decreased by emitted push. */
20130 for (i = 0; i < nparts; i++)
20131 part[1][i] = change_address (part[1][i],
20132 GET_MODE (part[1][i]), src_base);
20133 }
20134
20135 /* We need to do the copy in the right order in case an address register
20136 of the source overlaps the destination. */
20137 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
20138 {
20139 rtx tmp;
20140
20141 for (i = 0; i < nparts; i++)
20142 {
20143 collisionparts[i]
20144 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
20145 if (collisionparts[i])
20146 collisions++;
20147 }
20148
20149 /* Collision in the middle part can be handled by reordering. */
20150 if (collisions == 1 && nparts == 3 && collisionparts [1])
20151 {
20152 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20153 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20154 }
20155 else if (collisions == 1
20156 && nparts == 4
20157 && (collisionparts [1] || collisionparts [2]))
20158 {
20159 if (collisionparts [1])
20160 {
20161 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20162 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20163 }
20164 else
20165 {
20166 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
20167 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
20168 }
20169 }
20170
20171 /* If there are more collisions, we can't handle them by reordering.
20172 Do an lea to the last part and use only one colliding move. */
20173 else if (collisions > 1)
20174 {
20175 rtx base;
20176
20177 collisions = 1;
20178
20179 base = part[0][nparts - 1];
20180
20181 /* Handle the case when the last part isn't valid for lea.
20182 Happens in 64-bit mode storing the 12-byte XFmode. */
20183 if (GET_MODE (base) != Pmode)
20184 base = gen_rtx_REG (Pmode, REGNO (base));
20185
20186 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
20187 part[1][0] = replace_equiv_address (part[1][0], base);
20188 for (i = 1; i < nparts; i++)
20189 {
20190 tmp = plus_constant (base, UNITS_PER_WORD * i);
20191 part[1][i] = replace_equiv_address (part[1][i], tmp);
20192 }
20193 }
20194 }
20195
20196 if (push)
20197 {
20198 if (!TARGET_64BIT)
20199 {
20200 if (nparts == 3)
20201 {
20202 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
20203 emit_insn (gen_addsi3 (stack_pointer_rtx,
20204 stack_pointer_rtx, GEN_INT (-4)));
20205 emit_move_insn (part[0][2], part[1][2]);
20206 }
20207 else if (nparts == 4)
20208 {
20209 emit_move_insn (part[0][3], part[1][3]);
20210 emit_move_insn (part[0][2], part[1][2]);
20211 }
20212 }
20213 else
20214 {
20215 /* In 64-bit mode we don't have a 32-bit push available. If this is
20216 a register, that is OK; we will just use the larger counterpart.
20217 We also retype memory; this comes from an attempt to avoid the REX
20218 prefix when moving the second half of a TFmode value. */
20219 if (GET_MODE (part[1][1]) == SImode)
20220 {
20221 switch (GET_CODE (part[1][1]))
20222 {
20223 case MEM:
20224 part[1][1] = adjust_address (part[1][1], DImode, 0);
20225 break;
20226
20227 case REG:
20228 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
20229 break;
20230
20231 default:
20232 gcc_unreachable ();
20233 }
20234
20235 if (GET_MODE (part[1][0]) == SImode)
20236 part[1][0] = part[1][1];
20237 }
20238 }
20239 emit_move_insn (part[0][1], part[1][1]);
20240 emit_move_insn (part[0][0], part[1][0]);
20241 return;
20242 }
20243
20244 /* Choose the correct order so the source is not overwritten before it is copied. */
20245 if ((REG_P (part[0][0])
20246 && REG_P (part[1][1])
20247 && (REGNO (part[0][0]) == REGNO (part[1][1])
20248 || (nparts == 3
20249 && REGNO (part[0][0]) == REGNO (part[1][2]))
20250 || (nparts == 4
20251 && REGNO (part[0][0]) == REGNO (part[1][3]))))
20252 || (collisions > 0
20253 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
20254 {
20255 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
20256 {
20257 operands[2 + i] = part[0][j];
20258 operands[6 + i] = part[1][j];
20259 }
20260 }
20261 else
20262 {
20263 for (i = 0; i < nparts; i++)
20264 {
20265 operands[2 + i] = part[0][i];
20266 operands[6 + i] = part[1][i];
20267 }
20268 }
20269
20270 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
20271 if (optimize_insn_for_size_p ())
20272 {
20273 for (j = 0; j < nparts - 1; j++)
20274 if (CONST_INT_P (operands[6 + j])
20275 && operands[6 + j] != const0_rtx
20276 && REG_P (operands[2 + j]))
20277 for (i = j; i < nparts - 1; i++)
20278 if (CONST_INT_P (operands[7 + i])
20279 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
20280 operands[7 + i] = operands[2 + j];
20281 }
20282
20283 for (i = 0; i < nparts; i++)
20284 emit_move_insn (operands[2 + i], operands[6 + i]);
20285
20286 return;
20287 }
20288
20289 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
20290 left shift by a constant, either using a single shift or
20291 a sequence of add instructions. */
20292
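/* For instance, with a 32-bit half and COUNT == 2 this emits either two
   "add reg,reg" self-additions (when twice the add cost does not exceed
   the constant-shift cost and we optimize for speed) or a single shift
   by 2; COUNT == 1 is always emitted as one self-addition.  */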
20293 static void
20294 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
20295 {
20296 rtx (*insn)(rtx, rtx, rtx);
20297
20298 if (count == 1
20299 || (count * ix86_cost->add <= ix86_cost->shift_const
20300 && !optimize_insn_for_size_p ()))
20301 {
20302 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
20303 while (count-- > 0)
20304 emit_insn (insn (operand, operand, operand));
20305 }
20306 else
20307 {
20308 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20309 emit_insn (insn (operand, operand, GEN_INT (count)));
20310 }
20311 }
20312
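/* Split a double-word left shift (operands[0] = operands[1] << operands[2])
   into operations on the low and high word halves.  For a constant count of
   at least half the width, the low source word simply becomes the high
   result word (shifted further if needed) and the low result word is
   cleared; smaller constant counts use SHLD plus a shift of the low word.
   For a variable count, SHLD and a shift are followed by a fixup for counts
   of half the width or more, using CMOV when SCRATCH is available.  */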
20313 void
20314 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
20315 {
20316 rtx (*gen_ashl3)(rtx, rtx, rtx);
20317 rtx (*gen_shld)(rtx, rtx, rtx);
20318 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20319
20320 rtx low[2], high[2];
20321 int count;
20322
20323 if (CONST_INT_P (operands[2]))
20324 {
20325 split_double_mode (mode, operands, 2, low, high);
20326 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20327
20328 if (count >= half_width)
20329 {
20330 emit_move_insn (high[0], low[1]);
20331 emit_move_insn (low[0], const0_rtx);
20332
20333 if (count > half_width)
20334 ix86_expand_ashl_const (high[0], count - half_width, mode);
20335 }
20336 else
20337 {
20338 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20339
20340 if (!rtx_equal_p (operands[0], operands[1]))
20341 emit_move_insn (operands[0], operands[1]);
20342
20343 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
20344 ix86_expand_ashl_const (low[0], count, mode);
20345 }
20346 return;
20347 }
20348
20349 split_double_mode (mode, operands, 1, low, high);
20350
20351 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20352
20353 if (operands[1] == const1_rtx)
20354 {
20355 /* Assuming we've chosen QImode-capable registers, 1 << N
20356 can be done with two 32/64-bit shifts, no branches, no cmoves. */
20357 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
20358 {
20359 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
20360
20361 ix86_expand_clear (low[0]);
20362 ix86_expand_clear (high[0]);
20363 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
20364
20365 d = gen_lowpart (QImode, low[0]);
20366 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20367 s = gen_rtx_EQ (QImode, flags, const0_rtx);
20368 emit_insn (gen_rtx_SET (VOIDmode, d, s));
20369
20370 d = gen_lowpart (QImode, high[0]);
20371 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20372 s = gen_rtx_NE (QImode, flags, const0_rtx);
20373 emit_insn (gen_rtx_SET (VOIDmode, d, s));
20374 }
20375
20376 /* Otherwise, we can get the same results by manually performing
20377 a bit extract operation on bit 5/6, and then performing the two
20378 shifts. The two methods of getting 0/1 into low/high are exactly
20379 the same size. Avoiding the shift in the bit extract case helps
20380 pentium4 a bit; no one else seems to care much either way. */
20381 else
20382 {
20383 enum machine_mode half_mode;
20384 rtx (*gen_lshr3)(rtx, rtx, rtx);
20385 rtx (*gen_and3)(rtx, rtx, rtx);
20386 rtx (*gen_xor3)(rtx, rtx, rtx);
20387 HOST_WIDE_INT bits;
20388 rtx x;
20389
20390 if (mode == DImode)
20391 {
20392 half_mode = SImode;
20393 gen_lshr3 = gen_lshrsi3;
20394 gen_and3 = gen_andsi3;
20395 gen_xor3 = gen_xorsi3;
20396 bits = 5;
20397 }
20398 else
20399 {
20400 half_mode = DImode;
20401 gen_lshr3 = gen_lshrdi3;
20402 gen_and3 = gen_anddi3;
20403 gen_xor3 = gen_xordi3;
20404 bits = 6;
20405 }
20406
20407 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
20408 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
20409 else
20410 x = gen_lowpart (half_mode, operands[2]);
20411 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
20412
20413 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
20414 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
20415 emit_move_insn (low[0], high[0]);
20416 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
20417 }
20418
20419 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
20420 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
20421 return;
20422 }
20423
20424 if (operands[1] == constm1_rtx)
20425 {
20426 /* For -1 << N, we can avoid the shld instruction, because we
20427 know that we're shifting 0...31/63 ones into a -1. */
20428 emit_move_insn (low[0], constm1_rtx);
20429 if (optimize_insn_for_size_p ())
20430 emit_move_insn (high[0], low[0]);
20431 else
20432 emit_move_insn (high[0], constm1_rtx);
20433 }
20434 else
20435 {
20436 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20437
20438 if (!rtx_equal_p (operands[0], operands[1]))
20439 emit_move_insn (operands[0], operands[1]);
20440
20441 split_double_mode (mode, operands, 1, low, high);
20442 emit_insn (gen_shld (high[0], low[0], operands[2]));
20443 }
20444
20445 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
20446
20447 if (TARGET_CMOVE && scratch)
20448 {
20449 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20450 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20451
20452 ix86_expand_clear (scratch);
20453 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
20454 }
20455 else
20456 {
20457 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
20458 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
20459
20460 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
20461 }
20462 }
20463
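/* Split a double-word arithmetic right shift into operations on the two
   word halves.  For a constant count of at least half the width, the high
   source word becomes the low result word (shifted further if needed) and
   the high result word is filled with copies of the sign bit; smaller
   constant counts use SHRD plus an arithmetic shift of the high word.
   Variable counts get a fixup afterwards for counts of half the width or
   more, preferably via CMOV when a SCRATCH register is available.  */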
20464 void
20465 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
20466 {
20467 rtx (*gen_ashr3)(rtx, rtx, rtx)
20468 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
20469 rtx (*gen_shrd)(rtx, rtx, rtx);
20470 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20471
20472 rtx low[2], high[2];
20473 int count;
20474
20475 if (CONST_INT_P (operands[2]))
20476 {
20477 split_double_mode (mode, operands, 2, low, high);
20478 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20479
20480 if (count == GET_MODE_BITSIZE (mode) - 1)
20481 {
20482 emit_move_insn (high[0], high[1]);
20483 emit_insn (gen_ashr3 (high[0], high[0],
20484 GEN_INT (half_width - 1)));
20485 emit_move_insn (low[0], high[0]);
20486
20487 }
20488 else if (count >= half_width)
20489 {
20490 emit_move_insn (low[0], high[1]);
20491 emit_move_insn (high[0], low[0]);
20492 emit_insn (gen_ashr3 (high[0], high[0],
20493 GEN_INT (half_width - 1)));
20494
20495 if (count > half_width)
20496 emit_insn (gen_ashr3 (low[0], low[0],
20497 GEN_INT (count - half_width)));
20498 }
20499 else
20500 {
20501 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20502
20503 if (!rtx_equal_p (operands[0], operands[1]))
20504 emit_move_insn (operands[0], operands[1]);
20505
20506 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
20507 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
20508 }
20509 }
20510 else
20511 {
20512 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20513
20514 if (!rtx_equal_p (operands[0], operands[1]))
20515 emit_move_insn (operands[0], operands[1]);
20516
20517 split_double_mode (mode, operands, 1, low, high);
20518
20519 emit_insn (gen_shrd (low[0], high[0], operands[2]));
20520 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
20521
20522 if (TARGET_CMOVE && scratch)
20523 {
20524 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20525 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20526
20527 emit_move_insn (scratch, high[0]);
20528 emit_insn (gen_ashr3 (scratch, scratch,
20529 GEN_INT (half_width - 1)));
20530 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
20531 scratch));
20532 }
20533 else
20534 {
20535 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
20536 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
20537
20538 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
20539 }
20540 }
20541 }
20542
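/* Split a double-word logical right shift into operations on the two word
   halves.  For a constant count of at least half the width, the high source
   word becomes the low result word (shifted further if needed) and the high
   result word is cleared; smaller constant counts use SHRD plus a logical
   shift of the high word.  Variable counts are fixed up afterwards as in
   ix86_split_ashr, but with a cleared scratch rather than a sign copy.  */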
20543 void
20544 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
20545 {
20546 rtx (*gen_lshr3)(rtx, rtx, rtx)
20547 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
20548 rtx (*gen_shrd)(rtx, rtx, rtx);
20549 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20550
20551 rtx low[2], high[2];
20552 int count;
20553
20554 if (CONST_INT_P (operands[2]))
20555 {
20556 split_double_mode (mode, operands, 2, low, high);
20557 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20558
20559 if (count >= half_width)
20560 {
20561 emit_move_insn (low[0], high[1]);
20562 ix86_expand_clear (high[0]);
20563
20564 if (count > half_width)
20565 emit_insn (gen_lshr3 (low[0], low[0],
20566 GEN_INT (count - half_width)));
20567 }
20568 else
20569 {
20570 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20571
20572 if (!rtx_equal_p (operands[0], operands[1]))
20573 emit_move_insn (operands[0], operands[1]);
20574
20575 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
20576 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
20577 }
20578 }
20579 else
20580 {
20581 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20582
20583 if (!rtx_equal_p (operands[0], operands[1]))
20584 emit_move_insn (operands[0], operands[1]);
20585
20586 split_double_mode (mode, operands, 1, low, high);
20587
20588 emit_insn (gen_shrd (low[0], high[0], operands[2]));
20589 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
20590
20591 if (TARGET_CMOVE && scratch)
20592 {
20593 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20594 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20595
20596 ix86_expand_clear (scratch);
20597 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
20598 scratch));
20599 }
20600 else
20601 {
20602 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
20603 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
20604
20605 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
20606 }
20607 }
20608 }
20609
20610 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
20611 static void
20612 predict_jump (int prob)
20613 {
20614 rtx insn = get_last_insn ();
20615 gcc_assert (JUMP_P (insn));
20616 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
20617 }
20618
20619 /* Helper function for the string operations below. Test whether VARIABLE
20620 is aligned to VALUE bytes; if so, jump to the returned label. */
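/* For example, ix86_expand_aligntest (destptr, 4, false) emits (roughly) a
   test of bit 2 of DESTPTR followed by a conditional jump to the returned
   label when that bit is clear; the caller places the label after the code
   that handles the not-yet-aligned case, so that code is skipped when the
   pointer already has the tested alignment.  */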
20621 static rtx
20622 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
20623 {
20624 rtx label = gen_label_rtx ();
20625 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
20626 if (GET_MODE (variable) == DImode)
20627 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
20628 else
20629 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
20630 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
20631 1, label);
20632 if (epilogue)
20633 predict_jump (REG_BR_PROB_BASE * 50 / 100);
20634 else
20635 predict_jump (REG_BR_PROB_BASE * 90 / 100);
20636 return label;
20637 }
20638
20639 /* Decrease COUNTREG by VALUE. */
20640 static void
20641 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
20642 {
20643 rtx (*gen_add)(rtx, rtx, rtx)
20644 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
20645
20646 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
20647 }
20648
20649 /* Zero extend the possibly SImode EXP into a Pmode register. */
20650 rtx
20651 ix86_zero_extend_to_Pmode (rtx exp)
20652 {
20653 rtx r;
20654 if (GET_MODE (exp) == VOIDmode)
20655 return force_reg (Pmode, exp);
20656 if (GET_MODE (exp) == Pmode)
20657 return copy_to_mode_reg (Pmode, exp);
20658 r = gen_reg_rtx (Pmode);
20659 emit_insn (gen_zero_extendsidi2 (r, exp));
20660 return r;
20661 }
20662
20663 /* Divide COUNTREG by SCALE. */
20664 static rtx
20665 scale_counter (rtx countreg, int scale)
20666 {
20667 rtx sc;
20668
20669 if (scale == 1)
20670 return countreg;
20671 if (CONST_INT_P (countreg))
20672 return GEN_INT (INTVAL (countreg) / scale);
20673 gcc_assert (REG_P (countreg));
20674
20675 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
20676 GEN_INT (exact_log2 (scale)),
20677 NULL, 1, OPTAB_DIRECT);
20678 return sc;
20679 }
20680
20681 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
20682 DImode for constant loop counts. */
20683
20684 static enum machine_mode
20685 counter_mode (rtx count_exp)
20686 {
20687 if (GET_MODE (count_exp) != VOIDmode)
20688 return GET_MODE (count_exp);
20689 if (!CONST_INT_P (count_exp))
20690 return Pmode;
20691 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
20692 return DImode;
20693 return SImode;
20694 }
20695
20696 /* When SRCPTR is non-NULL, output a simple loop to copy memory pointed
20697 to by SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times;
20698 the overall size is COUNT bytes. When SRCPTR is NULL, output the
20699 equivalent loop to set memory to VALUE (assumed to be in MODE).
20700
20701 The size is rounded down to a whole number of chunks moved at once.
20702 SRCMEM and DESTMEM provide the MEM rtxes to feed proper aliasing info. */
20703
20704
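/* The emitted structure is roughly:

       size = count & -(chunk_size * unroll);
       iter = 0;
     top:
       copy (or store VALUE into) UNROLL chunks of MODE at dest + iter
         (reading from src + iter for a copy);
       iter += chunk_size * unroll;
       if (iter < size) goto top;
       dest += iter;  src += iter;
     out:  */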
20705 static void
20706 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
20707 rtx destptr, rtx srcptr, rtx value,
20708 rtx count, enum machine_mode mode, int unroll,
20709 int expected_size)
20710 {
20711 rtx out_label, top_label, iter, tmp;
20712 enum machine_mode iter_mode = counter_mode (count);
20713 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
20714 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
20715 rtx size;
20716 rtx x_addr;
20717 rtx y_addr;
20718 int i;
20719
20720 top_label = gen_label_rtx ();
20721 out_label = gen_label_rtx ();
20722 iter = gen_reg_rtx (iter_mode);
20723
20724 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
20725 NULL, 1, OPTAB_DIRECT);
20726 /* Those two should combine. */
20727 if (piece_size == const1_rtx)
20728 {
20729 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
20730 true, out_label);
20731 predict_jump (REG_BR_PROB_BASE * 10 / 100);
20732 }
20733 emit_move_insn (iter, const0_rtx);
20734
20735 emit_label (top_label);
20736
20737 tmp = convert_modes (Pmode, iter_mode, iter, true);
20738 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
20739 destmem = change_address (destmem, mode, x_addr);
20740
20741 if (srcmem)
20742 {
20743 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
20744 srcmem = change_address (srcmem, mode, y_addr);
20745
20746 /* When unrolling for chips that reorder memory reads and writes,
20747 we can save registers by using a single temporary.
20748 Also, using 4 temporaries is overkill in 32-bit mode. */
20749 if (!TARGET_64BIT && 0)
20750 {
20751 for (i = 0; i < unroll; i++)
20752 {
20753 if (i)
20754 {
20755 destmem =
20756 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
20757 srcmem =
20758 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
20759 }
20760 emit_move_insn (destmem, srcmem);
20761 }
20762 }
20763 else
20764 {
20765 rtx tmpreg[4];
20766 gcc_assert (unroll <= 4);
20767 for (i = 0; i < unroll; i++)
20768 {
20769 tmpreg[i] = gen_reg_rtx (mode);
20770 if (i)
20771 {
20772 srcmem =
20773 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
20774 }
20775 emit_move_insn (tmpreg[i], srcmem);
20776 }
20777 for (i = 0; i < unroll; i++)
20778 {
20779 if (i)
20780 {
20781 destmem =
20782 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
20783 }
20784 emit_move_insn (destmem, tmpreg[i]);
20785 }
20786 }
20787 }
20788 else
20789 for (i = 0; i < unroll; i++)
20790 {
20791 if (i)
20792 destmem =
20793 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
20794 emit_move_insn (destmem, value);
20795 }
20796
20797 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
20798 true, OPTAB_LIB_WIDEN);
20799 if (tmp != iter)
20800 emit_move_insn (iter, tmp);
20801
20802 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
20803 true, top_label);
20804 if (expected_size != -1)
20805 {
20806 expected_size /= GET_MODE_SIZE (mode) * unroll;
20807 if (expected_size == 0)
20808 predict_jump (0);
20809 else if (expected_size > REG_BR_PROB_BASE)
20810 predict_jump (REG_BR_PROB_BASE - 1);
20811 else
20812 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
20813 }
20814 else
20815 predict_jump (REG_BR_PROB_BASE * 80 / 100);
20816 iter = ix86_zero_extend_to_Pmode (iter);
20817 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
20818 true, OPTAB_LIB_WIDEN);
20819 if (tmp != destptr)
20820 emit_move_insn (destptr, tmp);
20821 if (srcptr)
20822 {
20823 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
20824 true, OPTAB_LIB_WIDEN);
20825 if (tmp != srcptr)
20826 emit_move_insn (srcptr, tmp);
20827 }
20828 emit_label (out_label);
20829 }
20830
20831 /* Output a "rep; mov" instruction.
20832 Arguments have the same meaning as for the previous function. */
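/* For example, with SImode chunks the count register holds the byte count
   shifted right by 2, and DESTEXP/SRCEXP describe the final pointer values
   destptr + (countreg << 2) and srcptr + (countreg << 2) left behind by the
   rep move.  */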
20833 static void
20834 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
20835 rtx destptr, rtx srcptr,
20836 rtx count,
20837 enum machine_mode mode)
20838 {
20839 rtx destexp;
20840 rtx srcexp;
20841 rtx countreg;
20842 HOST_WIDE_INT rounded_count;
20843
20844 /* If the size is known to be a multiple of 4, move in SImode chunks
(rep movsl) rather than bytes. */
20845 if (mode == QImode && CONST_INT_P (count)
20846 && !(INTVAL (count) & 3))
20847 mode = SImode;
20848
20849 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
20850 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
20851 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
20852 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
20853 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
20854 if (mode != QImode)
20855 {
20856 destexp = gen_rtx_ASHIFT (Pmode, countreg,
20857 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
20858 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
20859 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
20860 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
20861 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
20862 }
20863 else
20864 {
20865 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
20866 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
20867 }
20868 if (CONST_INT_P (count))
20869 {
20870 rounded_count = (INTVAL (count)
20871 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
20872 destmem = shallow_copy_rtx (destmem);
20873 srcmem = shallow_copy_rtx (srcmem);
20874 set_mem_size (destmem, rounded_count);
20875 set_mem_size (srcmem, rounded_count);
20876 }
20877 else
20878 {
20879 if (MEM_SIZE_KNOWN_P (destmem))
20880 clear_mem_size (destmem);
20881 if (MEM_SIZE_KNOWN_P (srcmem))
20882 clear_mem_size (srcmem);
20883 }
20884 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
20885 destexp, srcexp));
20886 }
20887
20888 /* Output a "rep; stos" instruction.
20889 Arguments have the same meaning as for the previous function. */
20890 static void
20891 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
20892 rtx count, enum machine_mode mode,
20893 rtx orig_value)
20894 {
20895 rtx destexp;
20896 rtx countreg;
20897 HOST_WIDE_INT rounded_count;
20898
20899 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
20900 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
20901 value = force_reg (mode, gen_lowpart (mode, value));
20902 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
20903 if (mode != QImode)
20904 {
20905 destexp = gen_rtx_ASHIFT (Pmode, countreg,
20906 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
20907 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
20908 }
20909 else
20910 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
20911 if (orig_value == const0_rtx && CONST_INT_P (count))
20912 {
20913 rounded_count = (INTVAL (count)
20914 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
20915 destmem = shallow_copy_rtx (destmem);
20916 set_mem_size (destmem, rounded_count);
20917 }
20918 else if (MEM_SIZE_KNOWN_P (destmem))
20919 clear_mem_size (destmem);
20920 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
20921 }
20922
20923 static void
20924 emit_strmov (rtx destmem, rtx srcmem,
20925 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
20926 {
20927 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
20928 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
20929 emit_insn (gen_strmov (destptr, dest, srcptr, src));
20930 }
20931
20932 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
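/* For a constant COUNT the leftover bytes are copied by a fixed sequence
   driven by the low bits of COUNT: e.g. on a 64-bit target a remainder of
   13 bytes becomes one 8-byte, one 4-byte and one 1-byte move.  For a
   variable COUNT, either a small byte loop or a chain of single moves,
   each guarded by a test of the corresponding bit of COUNT, is emitted.  */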
20933 static void
20934 expand_movmem_epilogue (rtx destmem, rtx srcmem,
20935 rtx destptr, rtx srcptr, rtx count, int max_size)
20936 {
20937 rtx src, dest;
20938 if (CONST_INT_P (count))
20939 {
20940 HOST_WIDE_INT countval = INTVAL (count);
20941 int offset = 0;
20942
20943 if ((countval & 0x10) && max_size > 16)
20944 {
20945 if (TARGET_64BIT)
20946 {
20947 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
20948 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
20949 }
20950 else
20951 gcc_unreachable ();
20952 offset += 16;
20953 }
20954 if ((countval & 0x08) && max_size > 8)
20955 {
20956 if (TARGET_64BIT)
20957 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
20958 else
20959 {
20960 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
20961 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
20962 }
20963 offset += 8;
20964 }
20965 if ((countval & 0x04) && max_size > 4)
20966 {
20967 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
20968 offset += 4;
20969 }
20970 if ((countval & 0x02) && max_size > 2)
20971 {
20972 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
20973 offset += 2;
20974 }
20975 if ((countval & 0x01) && max_size > 1)
20976 {
20977 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
20978 offset += 1;
20979 }
20980 return;
20981 }
20982 if (max_size > 8)
20983 {
20984 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
20985 count, 1, OPTAB_DIRECT);
20986 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
20987 count, QImode, 1, 4);
20988 return;
20989 }
20990
20991 /* When single-instruction stringops are available, we can cheaply advance
20992 the dest and src pointers. Otherwise we save code size by maintaining an
20993 offset (zero is readily available from the preceding rep operation) and
20994 using x86 addressing modes. */
20995 if (TARGET_SINGLE_STRINGOP)
20996 {
20997 if (max_size > 4)
20998 {
20999 rtx label = ix86_expand_aligntest (count, 4, true);
21000 src = change_address (srcmem, SImode, srcptr);
21001 dest = change_address (destmem, SImode, destptr);
21002 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21003 emit_label (label);
21004 LABEL_NUSES (label) = 1;
21005 }
21006 if (max_size > 2)
21007 {
21008 rtx label = ix86_expand_aligntest (count, 2, true);
21009 src = change_address (srcmem, HImode, srcptr);
21010 dest = change_address (destmem, HImode, destptr);
21011 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21012 emit_label (label);
21013 LABEL_NUSES (label) = 1;
21014 }
21015 if (max_size > 1)
21016 {
21017 rtx label = ix86_expand_aligntest (count, 1, true);
21018 src = change_address (srcmem, QImode, srcptr);
21019 dest = change_address (destmem, QImode, destptr);
21020 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21021 emit_label (label);
21022 LABEL_NUSES (label) = 1;
21023 }
21024 }
21025 else
21026 {
21027 rtx offset = force_reg (Pmode, const0_rtx);
21028 rtx tmp;
21029
21030 if (max_size > 4)
21031 {
21032 rtx label = ix86_expand_aligntest (count, 4, true);
21033 src = change_address (srcmem, SImode, srcptr);
21034 dest = change_address (destmem, SImode, destptr);
21035 emit_move_insn (dest, src);
21036 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
21037 true, OPTAB_LIB_WIDEN);
21038 if (tmp != offset)
21039 emit_move_insn (offset, tmp);
21040 emit_label (label);
21041 LABEL_NUSES (label) = 1;
21042 }
21043 if (max_size > 2)
21044 {
21045 rtx label = ix86_expand_aligntest (count, 2, true);
21046 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21047 src = change_address (srcmem, HImode, tmp);
21048 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21049 dest = change_address (destmem, HImode, tmp);
21050 emit_move_insn (dest, src);
21051 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
21052 true, OPTAB_LIB_WIDEN);
21053 if (tmp != offset)
21054 emit_move_insn (offset, tmp);
21055 emit_label (label);
21056 LABEL_NUSES (label) = 1;
21057 }
21058 if (max_size > 1)
21059 {
21060 rtx label = ix86_expand_aligntest (count, 1, true);
21061 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21062 src = change_address (srcmem, QImode, tmp);
21063 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21064 dest = change_address (destmem, QImode, tmp);
21065 emit_move_insn (dest, src);
21066 emit_label (label);
21067 LABEL_NUSES (label) = 1;
21068 }
21069 }
21070 }
21071
21072 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
21073 static void
21074 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
21075 rtx count, int max_size)
21076 {
21077 count =
21078 expand_simple_binop (counter_mode (count), AND, count,
21079 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
21080 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
21081 gen_lowpart (QImode, value), count, QImode,
21082 1, max_size / 2);
21083 }
21084
21085 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
21086 static void
21087 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
21088 {
21089 rtx dest;
21090
21091 if (CONST_INT_P (count))
21092 {
21093 HOST_WIDE_INT countval = INTVAL (count);
21094 int offset = 0;
21095
21096 if ((countval & 0x10) && max_size > 16)
21097 {
21098 if (TARGET_64BIT)
21099 {
21100 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21101 emit_insn (gen_strset (destptr, dest, value));
21102 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
21103 emit_insn (gen_strset (destptr, dest, value));
21104 }
21105 else
21106 gcc_unreachable ();
21107 offset += 16;
21108 }
21109 if ((countval & 0x08) && max_size > 8)
21110 {
21111 if (TARGET_64BIT)
21112 {
21113 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21114 emit_insn (gen_strset (destptr, dest, value));
21115 }
21116 else
21117 {
21118 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21119 emit_insn (gen_strset (destptr, dest, value));
21120 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
21121 emit_insn (gen_strset (destptr, dest, value));
21122 }
21123 offset += 8;
21124 }
21125 if ((countval & 0x04) && max_size > 4)
21126 {
21127 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21128 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21129 offset += 4;
21130 }
21131 if ((countval & 0x02) && max_size > 2)
21132 {
21133 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
21134 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21135 offset += 2;
21136 }
21137 if ((countval & 0x01) && max_size > 1)
21138 {
21139 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
21140 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21141 offset += 1;
21142 }
21143 return;
21144 }
21145 if (max_size > 32)
21146 {
21147 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
21148 return;
21149 }
21150 if (max_size > 16)
21151 {
21152 rtx label = ix86_expand_aligntest (count, 16, true);
21153 if (TARGET_64BIT)
21154 {
21155 dest = change_address (destmem, DImode, destptr);
21156 emit_insn (gen_strset (destptr, dest, value));
21157 emit_insn (gen_strset (destptr, dest, value));
21158 }
21159 else
21160 {
21161 dest = change_address (destmem, SImode, destptr);
21162 emit_insn (gen_strset (destptr, dest, value));
21163 emit_insn (gen_strset (destptr, dest, value));
21164 emit_insn (gen_strset (destptr, dest, value));
21165 emit_insn (gen_strset (destptr, dest, value));
21166 }
21167 emit_label (label);
21168 LABEL_NUSES (label) = 1;
21169 }
21170 if (max_size > 8)
21171 {
21172 rtx label = ix86_expand_aligntest (count, 8, true);
21173 if (TARGET_64BIT)
21174 {
21175 dest = change_address (destmem, DImode, destptr);
21176 emit_insn (gen_strset (destptr, dest, value));
21177 }
21178 else
21179 {
21180 dest = change_address (destmem, SImode, destptr);
21181 emit_insn (gen_strset (destptr, dest, value));
21182 emit_insn (gen_strset (destptr, dest, value));
21183 }
21184 emit_label (label);
21185 LABEL_NUSES (label) = 1;
21186 }
21187 if (max_size > 4)
21188 {
21189 rtx label = ix86_expand_aligntest (count, 4, true);
21190 dest = change_address (destmem, SImode, destptr);
21191 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21192 emit_label (label);
21193 LABEL_NUSES (label) = 1;
21194 }
21195 if (max_size > 2)
21196 {
21197 rtx label = ix86_expand_aligntest (count, 2, true);
21198 dest = change_address (destmem, HImode, destptr);
21199 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21200 emit_label (label);
21201 LABEL_NUSES (label) = 1;
21202 }
21203 if (max_size > 1)
21204 {
21205 rtx label = ix86_expand_aligntest (count, 1, true);
21206 dest = change_address (destmem, QImode, destptr);
21207 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21208 emit_label (label);
21209 LABEL_NUSES (label) = 1;
21210 }
21211 }
21212
21213 /* Copy enough bytes from SRC to DEST to align DEST, which is known to be
21214 aligned to ALIGN, up to DESIRED_ALIGNMENT. */
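/* For instance, with ALIGN == 1 and DESIRED_ALIGNMENT == 8 this emits up to
   three guarded copies: a byte copy if bit 0 of DESTPTR is set, a 2-byte
   copy if bit 1 is set, then a 4-byte copy if bit 2 is set, decrementing
   COUNT accordingly after each one.  */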
21215 static void
21216 expand_movmem_prologue (rtx destmem, rtx srcmem,
21217 rtx destptr, rtx srcptr, rtx count,
21218 int align, int desired_alignment)
21219 {
21220 if (align <= 1 && desired_alignment > 1)
21221 {
21222 rtx label = ix86_expand_aligntest (destptr, 1, false);
21223 srcmem = change_address (srcmem, QImode, srcptr);
21224 destmem = change_address (destmem, QImode, destptr);
21225 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21226 ix86_adjust_counter (count, 1);
21227 emit_label (label);
21228 LABEL_NUSES (label) = 1;
21229 }
21230 if (align <= 2 && desired_alignment > 2)
21231 {
21232 rtx label = ix86_expand_aligntest (destptr, 2, false);
21233 srcmem = change_address (srcmem, HImode, srcptr);
21234 destmem = change_address (destmem, HImode, destptr);
21235 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21236 ix86_adjust_counter (count, 2);
21237 emit_label (label);
21238 LABEL_NUSES (label) = 1;
21239 }
21240 if (align <= 4 && desired_alignment > 4)
21241 {
21242 rtx label = ix86_expand_aligntest (destptr, 4, false);
21243 srcmem = change_address (srcmem, SImode, srcptr);
21244 destmem = change_address (destmem, SImode, destptr);
21245 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21246 ix86_adjust_counter (count, 4);
21247 emit_label (label);
21248 LABEL_NUSES (label) = 1;
21249 }
21250 gcc_assert (desired_alignment <= 8);
21251 }
21252
21253 /* Copy enough bytes from SRC to DST to align DST to DESIRED_ALIGN.
21254 ALIGN_BYTES is how many bytes need to be copied. */
21255 static rtx
21256 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
21257 int desired_align, int align_bytes)
21258 {
21259 rtx src = *srcp;
21260 rtx orig_dst = dst;
21261 rtx orig_src = src;
21262 int off = 0;
21263 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
21264 if (src_align_bytes >= 0)
21265 src_align_bytes = desired_align - src_align_bytes;
21266 if (align_bytes & 1)
21267 {
21268 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21269 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
21270 off = 1;
21271 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21272 }
21273 if (align_bytes & 2)
21274 {
21275 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21276 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
21277 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21278 set_mem_align (dst, 2 * BITS_PER_UNIT);
21279 if (src_align_bytes >= 0
21280 && (src_align_bytes & 1) == (align_bytes & 1)
21281 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
21282 set_mem_align (src, 2 * BITS_PER_UNIT);
21283 off = 2;
21284 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21285 }
21286 if (align_bytes & 4)
21287 {
21288 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21289 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
21290 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21291 set_mem_align (dst, 4 * BITS_PER_UNIT);
21292 if (src_align_bytes >= 0)
21293 {
21294 unsigned int src_align = 0;
21295 if ((src_align_bytes & 3) == (align_bytes & 3))
21296 src_align = 4;
21297 else if ((src_align_bytes & 1) == (align_bytes & 1))
21298 src_align = 2;
21299 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21300 set_mem_align (src, src_align * BITS_PER_UNIT);
21301 }
21302 off = 4;
21303 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21304 }
21305 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21306 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
21307 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21308 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21309 if (src_align_bytes >= 0)
21310 {
21311 unsigned int src_align = 0;
21312 if ((src_align_bytes & 7) == (align_bytes & 7))
21313 src_align = 8;
21314 else if ((src_align_bytes & 3) == (align_bytes & 3))
21315 src_align = 4;
21316 else if ((src_align_bytes & 1) == (align_bytes & 1))
21317 src_align = 2;
21318 if (src_align > (unsigned int) desired_align)
21319 src_align = desired_align;
21320 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21321 set_mem_align (src, src_align * BITS_PER_UNIT);
21322 }
21323 if (MEM_SIZE_KNOWN_P (orig_dst))
21324 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21325 if (MEM_SIZE_KNOWN_P (orig_src))
21326 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
21327 *srcp = src;
21328 return dst;
21329 }
21330
21331 /* Store enough bytes at DEST to align DEST, which is known to be aligned
21332 to ALIGN, up to DESIRED_ALIGNMENT. */
21333 static void
21334 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
21335 int align, int desired_alignment)
21336 {
21337 if (align <= 1 && desired_alignment > 1)
21338 {
21339 rtx label = ix86_expand_aligntest (destptr, 1, false);
21340 destmem = change_address (destmem, QImode, destptr);
21341 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
21342 ix86_adjust_counter (count, 1);
21343 emit_label (label);
21344 LABEL_NUSES (label) = 1;
21345 }
21346 if (align <= 2 && desired_alignment > 2)
21347 {
21348 rtx label = ix86_expand_aligntest (destptr, 2, false);
21349 destmem = change_address (destmem, HImode, destptr);
21350 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
21351 ix86_adjust_counter (count, 2);
21352 emit_label (label);
21353 LABEL_NUSES (label) = 1;
21354 }
21355 if (align <= 4 && desired_alignment > 4)
21356 {
21357 rtx label = ix86_expand_aligntest (destptr, 4, false);
21358 destmem = change_address (destmem, SImode, destptr);
21359 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
21360 ix86_adjust_counter (count, 4);
21361 emit_label (label);
21362 LABEL_NUSES (label) = 1;
21363 }
21364 gcc_assert (desired_alignment <= 8);
21365 }
21366
21367 /* Store enough bytes at DST to align DST, which is known to be aligned to
21368 ALIGN, up to DESIRED_ALIGN. ALIGN_BYTES is how many bytes need to be stored. */
21369 static rtx
21370 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
21371 int desired_align, int align_bytes)
21372 {
21373 int off = 0;
21374 rtx orig_dst = dst;
21375 if (align_bytes & 1)
21376 {
21377 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21378 off = 1;
21379 emit_insn (gen_strset (destreg, dst,
21380 gen_lowpart (QImode, value)));
21381 }
21382 if (align_bytes & 2)
21383 {
21384 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21385 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21386 set_mem_align (dst, 2 * BITS_PER_UNIT);
21387 off = 2;
21388 emit_insn (gen_strset (destreg, dst,
21389 gen_lowpart (HImode, value)));
21390 }
21391 if (align_bytes & 4)
21392 {
21393 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21394 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21395 set_mem_align (dst, 4 * BITS_PER_UNIT);
21396 off = 4;
21397 emit_insn (gen_strset (destreg, dst,
21398 gen_lowpart (SImode, value)));
21399 }
21400 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21401 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21402 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21403 if (MEM_SIZE_KNOWN_P (orig_dst))
21404 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21405 return dst;
21406 }
21407
21408 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
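/* For example, when optimizing for size with a byte count known to be a
   nonzero multiple of 4, rep_prefix_4_byte is chosen if the rep-prefix
   registers are usable and a word-sized loop otherwise; when optimizing
   for speed, very small expected sizes (< 4 bytes) use the byte loop,
   since setting up a REP prefix would dominate the cost.  */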
21409 static enum stringop_alg
21410 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
21411 int *dynamic_check)
21412 {
21413 const struct stringop_algs * algs;
21414 bool optimize_for_speed;
21415 /* Algorithms using the rep prefix want at least edi and ecx;
21416 additionally, memset wants eax and memcpy wants esi. Don't
21417 consider such algorithms if the user has appropriated those
21418 registers for their own purposes. */
21419 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
21420 || (memset
21421 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
21422
21423 #define ALG_USABLE_P(alg) (rep_prefix_usable \
21424 || (alg != rep_prefix_1_byte \
21425 && alg != rep_prefix_4_byte \
21426 && alg != rep_prefix_8_byte))
21427 const struct processor_costs *cost;
21428
21429 /* Even if the string operation call is cold, we still might spend a lot
21430 of time processing large blocks. */
21431 if (optimize_function_for_size_p (cfun)
21432 || (optimize_insn_for_size_p ()
21433 && expected_size != -1 && expected_size < 256))
21434 optimize_for_speed = false;
21435 else
21436 optimize_for_speed = true;
21437
21438 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
21439
21440 *dynamic_check = -1;
21441 if (memset)
21442 algs = &cost->memset[TARGET_64BIT != 0];
21443 else
21444 algs = &cost->memcpy[TARGET_64BIT != 0];
21445 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
21446 return ix86_stringop_alg;
21447 /* rep; movq or rep; movl is the smallest variant. */
21448 else if (!optimize_for_speed)
21449 {
21450 if (!count || (count & 3))
21451 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
21452 else
21453 return rep_prefix_usable ? rep_prefix_4_byte : loop;
21454 }
21455 /* Very tiny blocks are best handled via the loop; REP is expensive
21456 to set up. */
21457 else if (expected_size != -1 && expected_size < 4)
21458 return loop_1_byte;
21459 else if (expected_size != -1)
21460 {
21461 unsigned int i;
21462 enum stringop_alg alg = libcall;
21463 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
21464 {
21465 /* We get here if the algorithms that were not libcall-based
21466 were rep-prefix based and we are unable to use rep prefixes
21467 based on global register usage. Break out of the loop and
21468 use the heuristic below. */
21469 if (algs->size[i].max == 0)
21470 break;
21471 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
21472 {
21473 enum stringop_alg candidate = algs->size[i].alg;
21474
21475 if (candidate != libcall && ALG_USABLE_P (candidate))
21476 alg = candidate;
21477 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
21478 the last non-libcall inline algorithm. */
21479 if (TARGET_INLINE_ALL_STRINGOPS)
21480 {
21481 /* When the current size is best copied by a libcall but we
21482 are still forced to inline, run the heuristic below that
21483 picks code for medium-sized blocks. */
21484 if (alg != libcall)
21485 return alg;
21486 break;
21487 }
21488 else if (ALG_USABLE_P (candidate))
21489 return candidate;
21490 }
21491 }
21492 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
21493 }
21494 /* When asked to inline the call anyway, try to pick a meaningful choice.
21495 We look for the maximal block size that is faster to copy by hand and
21496 take blocks of at most that size, guessing that the average size will
21497 be roughly half of the maximum.
21498
21499 If this turns out to be bad, we might simply specify the preferred
21500 choice in ix86_costs. */
21501 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21502 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
21503 {
21504 int max = -1;
21505 enum stringop_alg alg;
21506 int i;
21507 bool any_alg_usable_p = true;
21508
21509 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
21510 {
21511 enum stringop_alg candidate = algs->size[i].alg;
21512 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
21513
21514 if (candidate != libcall && candidate
21515 && ALG_USABLE_P (candidate))
21516 max = algs->size[i].max;
21517 }
21518 /* If there aren't any usable algorithms, then recursing on
21519 smaller sizes isn't going to find anything. Just return the
21520 simple byte-at-a-time copy loop. */
21521 if (!any_alg_usable_p)
21522 {
21523 /* Pick something reasonable. */
21524 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21525 *dynamic_check = 128;
21526 return loop_1_byte;
21527 }
21528 if (max == -1)
21529 max = 4096;
21530 alg = decide_alg (count, max / 2, memset, dynamic_check);
21531 gcc_assert (*dynamic_check == -1);
21532 gcc_assert (alg != libcall);
21533 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21534 *dynamic_check = max;
21535 return alg;
21536 }
21537 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
21538 #undef ALG_USABLE_P
21539 }
21540
21541 /* Decide on alignment. We know that the operand is already aligned to ALIGN
21542 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
21543 static int
21544 decide_alignment (int align,
21545 enum stringop_alg alg,
21546 int expected_size)
21547 {
21548 int desired_align = 0;
21549 switch (alg)
21550 {
21551 case no_stringop:
21552 gcc_unreachable ();
21553 case loop:
21554 case unrolled_loop:
21555 desired_align = GET_MODE_SIZE (Pmode);
21556 break;
21557 case rep_prefix_8_byte:
21558 desired_align = 8;
21559 break;
21560 case rep_prefix_4_byte:
21561 /* PentiumPro has special logic that triggers for 8-byte-aligned blocks,
21562 copying a whole cache line at once. */
21563 if (TARGET_PENTIUMPRO)
21564 desired_align = 8;
21565 else
21566 desired_align = 4;
21567 break;
21568 case rep_prefix_1_byte:
21569 /* PentiumPro has special logic that triggers for 8-byte-aligned blocks,
21570 copying a whole cache line at once. */
21571 if (TARGET_PENTIUMPRO)
21572 desired_align = 8;
21573 else
21574 desired_align = 1;
21575 break;
21576 case loop_1_byte:
21577 desired_align = 1;
21578 break;
21579 case libcall:
21580 return 0;
21581 }
21582
21583 if (optimize_size)
21584 desired_align = 1;
21585 if (desired_align < align)
21586 desired_align = align;
21587 if (expected_size != -1 && expected_size < 4)
21588 desired_align = align;
21589 return desired_align;
21590 }
21591
21592 /* Return the smallest power of 2 greater than VAL. */
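/* E.g. 0 -> 1, 4 -> 8 and 7 -> 8; the result is strictly greater than VAL.  */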
21593 static int
21594 smallest_pow2_greater_than (int val)
21595 {
21596 int ret = 1;
21597 while (ret <= val)
21598 ret <<= 1;
21599 return ret;
21600 }
21601
21602 /* Expand string move (memcpy) operation. Use i386 string operations
21603 when profitable. expand_setmem contains similar code. The code
21604 depends upon architecture, block size and alignment, but always has
21605 the same overall structure:
21606
21607 1) Prologue guard: a conditional that jumps ahead to the epilogue for
21608 small blocks that can be handled by the epilogue alone. This is
21609 faster but also needed for correctness, since the prologue assumes
21610 the block is larger than the desired alignment.
21611
21612 An optional dynamic check for size and a libcall for large
21613 blocks are emitted here too, with -minline-stringops-dynamically.
21614
21615 2) Prologue: copy the first few bytes in order to get the destination
21616 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
21617 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
21618 copied. We emit either a jump tree of power-of-two sized
21619 blocks, or a byte loop.
21620
21621 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
21622 with specified algorithm.
21623
21624 4) Epilogue: code copying tail of the block that is too small to be
21625 handled by main body (or up to size guarded by prologue guard). */
21626
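/* As a rough example, an unknown-length copy using the unrolled_loop
   algorithm on a 64-bit target expands to: a guard that branches to the
   epilogue when the count is below 32 bytes, an alignment prologue of
   byte/word/dword copies up to 8-byte alignment, a main loop moving
   4 * 8 bytes per iteration, and an epilogue handling the remaining
   count & 31 bytes.  */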
21627 bool
21628 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
21629 rtx expected_align_exp, rtx expected_size_exp)
21630 {
21631 rtx destreg;
21632 rtx srcreg;
21633 rtx label = NULL;
21634 rtx tmp;
21635 rtx jump_around_label = NULL;
21636 HOST_WIDE_INT align = 1;
21637 unsigned HOST_WIDE_INT count = 0;
21638 HOST_WIDE_INT expected_size = -1;
21639 int size_needed = 0, epilogue_size_needed;
21640 int desired_align = 0, align_bytes = 0;
21641 enum stringop_alg alg;
21642 int dynamic_check;
21643 bool need_zero_guard = false;
21644
21645 if (CONST_INT_P (align_exp))
21646 align = INTVAL (align_exp);
21647 /* i386 can do misaligned access at a reasonably increased cost. */
21648 if (CONST_INT_P (expected_align_exp)
21649 && INTVAL (expected_align_exp) > align)
21650 align = INTVAL (expected_align_exp);
21651 /* ALIGN is the minimum of destination and source alignment, but we care here
21652 just about destination alignment. */
21653 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
21654 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
21655
21656 if (CONST_INT_P (count_exp))
21657 count = expected_size = INTVAL (count_exp);
21658 if (CONST_INT_P (expected_size_exp) && count == 0)
21659 expected_size = INTVAL (expected_size_exp);
21660
21661 /* Make sure we don't need to care about overflow later on. */
21662 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
21663 return false;
21664
21665 /* Step 0: Decide on preferred algorithm, desired alignment and
21666 size of chunks to be copied by main loop. */
21667
21668 alg = decide_alg (count, expected_size, false, &dynamic_check);
21669 desired_align = decide_alignment (align, alg, expected_size);
21670
21671 if (!TARGET_ALIGN_STRINGOPS)
21672 align = desired_align;
21673
21674 if (alg == libcall)
21675 return false;
21676 gcc_assert (alg != no_stringop);
21677 if (!count)
21678 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
21679 destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
21680 srcreg = copy_to_mode_reg (Pmode, XEXP (src, 0));
21681 switch (alg)
21682 {
21683 case libcall:
21684 case no_stringop:
21685 gcc_unreachable ();
21686 case loop:
21687 need_zero_guard = true;
21688 size_needed = GET_MODE_SIZE (Pmode);
21689 break;
21690 case unrolled_loop:
21691 need_zero_guard = true;
21692 size_needed = GET_MODE_SIZE (Pmode) * (TARGET_64BIT ? 4 : 2);
21693 break;
21694 case rep_prefix_8_byte:
21695 size_needed = 8;
21696 break;
21697 case rep_prefix_4_byte:
21698 size_needed = 4;
21699 break;
21700 case rep_prefix_1_byte:
21701 size_needed = 1;
21702 break;
21703 case loop_1_byte:
21704 need_zero_guard = true;
21705 size_needed = 1;
21706 break;
21707 }
21708
21709 epilogue_size_needed = size_needed;
21710
21711 /* Step 1: Prologue guard. */
21712
21713 /* Alignment code needs count to be in register. */
21714 if (CONST_INT_P (count_exp) && desired_align > align)
21715 {
21716 if (INTVAL (count_exp) > desired_align
21717 && INTVAL (count_exp) > size_needed)
21718 {
21719 align_bytes
21720 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
21721 if (align_bytes <= 0)
21722 align_bytes = 0;
21723 else
21724 align_bytes = desired_align - align_bytes;
21725 }
21726 if (align_bytes == 0)
21727 count_exp = force_reg (counter_mode (count_exp), count_exp);
21728 }
21729 gcc_assert (desired_align >= 1 && align >= 1);
21730
21731 /* Ensure that alignment prologue won't copy past end of block. */
21732 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
21733 {
21734 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
21735 /* The epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
21736 Make sure it is a power of 2. */
21737 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
21738
21739 if (count)
21740 {
21741 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
21742 {
21743 /* If the main algorithm works on QImode, no epilogue is needed.
21744 For small sizes, just don't align anything. */
21745 if (size_needed == 1)
21746 desired_align = align;
21747 else
21748 goto epilogue;
21749 }
21750 }
21751 else
21752 {
21753 label = gen_label_rtx ();
21754 emit_cmp_and_jump_insns (count_exp,
21755 GEN_INT (epilogue_size_needed),
21756 LTU, 0, counter_mode (count_exp), 1, label);
21757 if (expected_size == -1 || expected_size < epilogue_size_needed)
21758 predict_jump (REG_BR_PROB_BASE * 60 / 100);
21759 else
21760 predict_jump (REG_BR_PROB_BASE * 20 / 100);
21761 }
21762 }
21763
21764 /* Emit code to decide at run time whether a library call or inline code
21765 should be used. */
21766 if (dynamic_check != -1)
21767 {
21768 if (CONST_INT_P (count_exp))
21769 {
21770 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
21771 {
21772 emit_block_move_via_libcall (dst, src, count_exp, false);
21773 count_exp = const0_rtx;
21774 goto epilogue;
21775 }
21776 }
21777 else
21778 {
21779 rtx hot_label = gen_label_rtx ();
21780 jump_around_label = gen_label_rtx ();
21781 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
21782 LEU, 0, GET_MODE (count_exp), 1, hot_label);
21783 predict_jump (REG_BR_PROB_BASE * 90 / 100);
21784 emit_block_move_via_libcall (dst, src, count_exp, false);
21785 emit_jump (jump_around_label);
21786 emit_label (hot_label);
21787 }
21788 }
21789
21790 /* Step 2: Alignment prologue. */
21791
21792 if (desired_align > align)
21793 {
21794 if (align_bytes == 0)
21795 {
21796 /* Except for the first move in the epilogue, we no longer know
21797 the constant offset in the aliasing info. It doesn't seem worth
21798 the pain to maintain it for the first move, so throw away
21799 the info early. */
21800 src = change_address (src, BLKmode, srcreg);
21801 dst = change_address (dst, BLKmode, destreg);
21802 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
21803 desired_align);
21804 }
21805 else
21806 {
21807 /* If we know how many bytes need to be stored before dst is
21808 sufficiently aligned, maintain aliasing info accurately. */
21809 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
21810 desired_align, align_bytes);
21811 count_exp = plus_constant (count_exp, -align_bytes);
21812 count -= align_bytes;
21813 }
21814 if (need_zero_guard
21815 && (count < (unsigned HOST_WIDE_INT) size_needed
21816 || (align_bytes == 0
21817 && count < ((unsigned HOST_WIDE_INT) size_needed
21818 + desired_align - align))))
21819 {
21820 /* It is possible that we copied enough so the main loop will not
21821 execute. */
21822 gcc_assert (size_needed > 1);
21823 if (label == NULL_RTX)
21824 label = gen_label_rtx ();
21825 emit_cmp_and_jump_insns (count_exp,
21826 GEN_INT (size_needed),
21827 LTU, 0, counter_mode (count_exp), 1, label);
21828 if (expected_size == -1
21829 || expected_size < (desired_align - align) / 2 + size_needed)
21830 predict_jump (REG_BR_PROB_BASE * 20 / 100);
21831 else
21832 predict_jump (REG_BR_PROB_BASE * 60 / 100);
21833 }
21834 }
21835 if (label && size_needed == 1)
21836 {
21837 emit_label (label);
21838 LABEL_NUSES (label) = 1;
21839 label = NULL;
21840 epilogue_size_needed = 1;
21841 }
21842 else if (label == NULL_RTX)
21843 epilogue_size_needed = size_needed;
21844
21845 /* Step 3: Main loop. */
21846
21847 switch (alg)
21848 {
21849 case libcall:
21850 case no_stringop:
21851 gcc_unreachable ();
21852 case loop_1_byte:
21853 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
21854 count_exp, QImode, 1, expected_size);
21855 break;
21856 case loop:
21857 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
21858 count_exp, Pmode, 1, expected_size);
21859 break;
21860 case unrolled_loop:
21861 	  /* Unroll only by a factor of 2 in 32-bit mode, since we don't have enough
21862 	     registers for 4 temporaries anyway.  */
21863 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
21864 count_exp, Pmode, TARGET_64BIT ? 4 : 2,
21865 expected_size);
21866 break;
21867 case rep_prefix_8_byte:
21868 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
21869 DImode);
21870 break;
21871 case rep_prefix_4_byte:
21872 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
21873 SImode);
21874 break;
21875 case rep_prefix_1_byte:
21876 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
21877 QImode);
21878 break;
21879 }
21880 	  /* Properly adjust the offsets of the src and dest memory for aliasing.  */
21881 if (CONST_INT_P (count_exp))
21882 {
21883 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
21884 (count / size_needed) * size_needed);
21885 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
21886 (count / size_needed) * size_needed);
21887 }
21888 else
21889 {
21890 src = change_address (src, BLKmode, srcreg);
21891 dst = change_address (dst, BLKmode, destreg);
21892 }
21893
21894 /* Step 4: Epilogue to copy the remaining bytes. */
21895 epilogue:
21896 if (label)
21897 {
21898 	  /* When the main loop is done, COUNT_EXP might hold the original count,
21899 	     while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
21900 	     Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
21901 	     bytes.  Compensate if needed.  */
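	  /* A worked example (hypothetical numbers, for illustration only):
	     with size_needed == 4 and epilogue_size_needed == 8, a runtime
	     count of 23 leaves 23 & 3 == 3 bytes after the main loop, but an
	     unmasked epilogue would look at 23 & 7 == 7, so the code below
	     masks COUNT_EXP with size_needed - 1 to compensate.  */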
21902
21903 if (size_needed < epilogue_size_needed)
21904 {
21905 tmp =
21906 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
21907 GEN_INT (size_needed - 1), count_exp, 1,
21908 OPTAB_DIRECT);
21909 if (tmp != count_exp)
21910 emit_move_insn (count_exp, tmp);
21911 }
21912 emit_label (label);
21913 LABEL_NUSES (label) = 1;
21914 }
21915
21916 if (count_exp != const0_rtx && epilogue_size_needed > 1)
21917 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
21918 epilogue_size_needed);
21919 if (jump_around_label)
21920 emit_label (jump_around_label);
21921 return true;
21922 }
21923
21924 /* Helper function for memset.  For the QImode value 0xXY produce
21925    0xXYXYXYXY of the width specified by MODE.  This is essentially
21926    a * 0x01010101, but we can do slightly better than
21927    synth_mult by unwinding the sequence by hand on CPUs with
21928    a slow multiply.  */
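/* A worked example of the constant path below (illustration only): promoting
   the QImode value 0x5A to SImode computes
       v = 0x5A;  v |= v << 8;   -> 0x5A5A
                  v |= v << 16;  -> 0x5A5A5A5A
   and for DImode additionally v |= (v << 16) << 16, giving
   0x5A5A5A5A5A5A5A5A.  The non-constant paths emit an equivalent multiply
   or shift/ior sequence at run time.  */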
21929 static rtx
21930 promote_duplicated_reg (enum machine_mode mode, rtx val)
21931 {
21932 enum machine_mode valmode = GET_MODE (val);
21933 rtx tmp;
21934 int nops = mode == DImode ? 3 : 2;
21935
21936 gcc_assert (mode == SImode || mode == DImode);
21937 if (val == const0_rtx)
21938 return copy_to_mode_reg (mode, const0_rtx);
21939 if (CONST_INT_P (val))
21940 {
21941 HOST_WIDE_INT v = INTVAL (val) & 255;
21942
21943 v |= v << 8;
21944 v |= v << 16;
21945 if (mode == DImode)
21946 v |= (v << 16) << 16;
21947 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
21948 }
21949
21950 if (valmode == VOIDmode)
21951 valmode = QImode;
21952 if (valmode != QImode)
21953 val = gen_lowpart (QImode, val);
21954 if (mode == QImode)
21955 return val;
21956 if (!TARGET_PARTIAL_REG_STALL)
21957 nops--;
21958 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
21959 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
21960 <= (ix86_cost->shift_const + ix86_cost->add) * nops
21961 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
21962 {
21963 rtx reg = convert_modes (mode, QImode, val, true);
21964 tmp = promote_duplicated_reg (mode, const1_rtx);
21965 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
21966 OPTAB_DIRECT);
21967 }
21968 else
21969 {
21970 rtx reg = convert_modes (mode, QImode, val, true);
21971
21972 if (!TARGET_PARTIAL_REG_STALL)
21973 if (mode == SImode)
21974 emit_insn (gen_movsi_insv_1 (reg, reg));
21975 else
21976 emit_insn (gen_movdi_insv_1 (reg, reg));
21977 else
21978 {
21979 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
21980 NULL, 1, OPTAB_DIRECT);
21981 reg =
21982 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
21983 }
21984 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
21985 NULL, 1, OPTAB_DIRECT);
21986 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
21987 if (mode == SImode)
21988 return reg;
21989 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
21990 NULL, 1, OPTAB_DIRECT);
21991 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
21992 return reg;
21993 }
21994 }
21995
21996 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
21997    will be needed by the main loop copying SIZE_NEEDED chunks and by the
21998    prologue raising the alignment from ALIGN to DESIRED_ALIGN.  */
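/* For illustration: on x86-64, size_needed == 8 duplicates the value into
   DImode, while size_needed == 4 with no extra alignment work only needs
   SImode, since the main loop never stores more than 4 bytes at a time in
   that case.  */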
21999 static rtx
22000 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
22001 {
22002 rtx promoted_val;
22003
22004 if (TARGET_64BIT
22005 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
22006 promoted_val = promote_duplicated_reg (DImode, val);
22007 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
22008 promoted_val = promote_duplicated_reg (SImode, val);
22009 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
22010 promoted_val = promote_duplicated_reg (HImode, val);
22011 else
22012 promoted_val = val;
22013
22014 return promoted_val;
22015 }
22016
22017 /* Expand string set operation (memset; bzero is the special case of storing
22018    zero).  Use i386 string operations when profitable.  See the expand_movmem
22019    comment for an explanation of the individual steps performed.  */
22020 bool
22021 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
22022 rtx expected_align_exp, rtx expected_size_exp)
22023 {
22024 rtx destreg;
22025 rtx label = NULL;
22026 rtx tmp;
22027 rtx jump_around_label = NULL;
22028 HOST_WIDE_INT align = 1;
22029 unsigned HOST_WIDE_INT count = 0;
22030 HOST_WIDE_INT expected_size = -1;
22031 int size_needed = 0, epilogue_size_needed;
22032 int desired_align = 0, align_bytes = 0;
22033 enum stringop_alg alg;
22034 rtx promoted_val = NULL;
22035 bool force_loopy_epilogue = false;
22036 int dynamic_check;
22037 bool need_zero_guard = false;
22038
22039 if (CONST_INT_P (align_exp))
22040 align = INTVAL (align_exp);
22041   /* i386 can do misaligned access at a reasonably small increase in cost.  */
22042 if (CONST_INT_P (expected_align_exp)
22043 && INTVAL (expected_align_exp) > align)
22044 align = INTVAL (expected_align_exp);
22045 if (CONST_INT_P (count_exp))
22046 count = expected_size = INTVAL (count_exp);
22047 if (CONST_INT_P (expected_size_exp) && count == 0)
22048 expected_size = INTVAL (expected_size_exp);
22049
22050 /* Make sure we don't need to care about overflow later on. */
22051 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22052 return false;
22053
22054 /* Step 0: Decide on preferred algorithm, desired alignment and
22055 size of chunks to be copied by main loop. */
22056
22057 alg = decide_alg (count, expected_size, true, &dynamic_check);
22058 desired_align = decide_alignment (align, alg, expected_size);
22059
22060 if (!TARGET_ALIGN_STRINGOPS)
22061 align = desired_align;
22062
22063 if (alg == libcall)
22064 return false;
22065 gcc_assert (alg != no_stringop);
22066 if (!count)
22067 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
22068 destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
22069 switch (alg)
22070 {
22071 case libcall:
22072 case no_stringop:
22073 gcc_unreachable ();
22074 case loop:
22075 need_zero_guard = true;
22076 size_needed = GET_MODE_SIZE (Pmode);
22077 break;
22078 case unrolled_loop:
22079 need_zero_guard = true;
22080 size_needed = GET_MODE_SIZE (Pmode) * 4;
22081 break;
22082 case rep_prefix_8_byte:
22083 size_needed = 8;
22084 break;
22085 case rep_prefix_4_byte:
22086 size_needed = 4;
22087 break;
22088 case rep_prefix_1_byte:
22089 size_needed = 1;
22090 break;
22091 case loop_1_byte:
22092 need_zero_guard = true;
22093 size_needed = 1;
22094 break;
22095 }
22096 epilogue_size_needed = size_needed;
22097
22098 /* Step 1: Prologue guard. */
22099
22100 /* Alignment code needs count to be in register. */
22101 if (CONST_INT_P (count_exp) && desired_align > align)
22102 {
22103 if (INTVAL (count_exp) > desired_align
22104 && INTVAL (count_exp) > size_needed)
22105 {
22106 align_bytes
22107 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22108 if (align_bytes <= 0)
22109 align_bytes = 0;
22110 else
22111 align_bytes = desired_align - align_bytes;
22112 }
22113 if (align_bytes == 0)
22114 {
22115 enum machine_mode mode = SImode;
22116 if (TARGET_64BIT && (count & ~0xffffffff))
22117 mode = DImode;
22118 count_exp = force_reg (mode, count_exp);
22119 }
22120 }
22121   /* Do the cheap promotion to allow better CSE across the
22122      main loop and epilogue (i.e. one load of the big constant in
22123      front of all the code).  */
22124 if (CONST_INT_P (val_exp))
22125 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22126 desired_align, align);
22127 /* Ensure that alignment prologue won't copy past end of block. */
22128 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22129 {
22130 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22131 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22132 Make sure it is power of 2. */
22133 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22134
22135       /* To improve performance on small blocks, we jump around the VAL
22136 	 promoting code.  This means that if the promoted VAL is not constant,
22137 	 we might not use it in the epilogue and have to use the byte
22138 	 loop variant.  */
22139 if (epilogue_size_needed > 2 && !promoted_val)
22140 force_loopy_epilogue = true;
22141 if (count)
22142 {
22143 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22144 {
22145 /* If main algorithm works on QImode, no epilogue is needed.
22146 For small sizes just don't align anything. */
22147 if (size_needed == 1)
22148 desired_align = align;
22149 else
22150 goto epilogue;
22151 }
22152 }
22153 else
22154 {
22155 label = gen_label_rtx ();
22156 emit_cmp_and_jump_insns (count_exp,
22157 GEN_INT (epilogue_size_needed),
22158 LTU, 0, counter_mode (count_exp), 1, label);
22159 if (expected_size == -1 || expected_size <= epilogue_size_needed)
22160 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22161 else
22162 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22163 }
22164 }
22165 if (dynamic_check != -1)
22166 {
22167 rtx hot_label = gen_label_rtx ();
22168 jump_around_label = gen_label_rtx ();
22169 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22170 LEU, 0, counter_mode (count_exp), 1, hot_label);
22171 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22172 set_storage_via_libcall (dst, count_exp, val_exp, false);
22173 emit_jump (jump_around_label);
22174 emit_label (hot_label);
22175 }
22176
22177 /* Step 2: Alignment prologue. */
22178
22179   /* Do the expensive promotion once we have branched off the small blocks.  */
22180 if (!promoted_val)
22181 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22182 desired_align, align);
22183 gcc_assert (desired_align >= 1 && align >= 1);
22184
22185 if (desired_align > align)
22186 {
22187 if (align_bytes == 0)
22188 {
22189 	  /* Except for the first move in the epilogue, we no longer know
22190 	     the constant offset in the aliasing info.  It doesn't seem worth
22191 	     the pain to maintain it for the first move, so throw away
22192 	     the info early.  */
22193 dst = change_address (dst, BLKmode, destreg);
22194 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
22195 desired_align);
22196 }
22197 else
22198 {
22199 /* If we know how many bytes need to be stored before dst is
22200 sufficiently aligned, maintain aliasing info accurately. */
22201 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
22202 desired_align, align_bytes);
22203 count_exp = plus_constant (count_exp, -align_bytes);
22204 count -= align_bytes;
22205 }
22206 if (need_zero_guard
22207 && (count < (unsigned HOST_WIDE_INT) size_needed
22208 || (align_bytes == 0
22209 && count < ((unsigned HOST_WIDE_INT) size_needed
22210 + desired_align - align))))
22211 {
22212 /* It is possible that we copied enough so the main loop will not
22213 execute. */
22214 gcc_assert (size_needed > 1);
22215 if (label == NULL_RTX)
22216 label = gen_label_rtx ();
22217 emit_cmp_and_jump_insns (count_exp,
22218 GEN_INT (size_needed),
22219 LTU, 0, counter_mode (count_exp), 1, label);
22220 if (expected_size == -1
22221 || expected_size < (desired_align - align) / 2 + size_needed)
22222 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22223 else
22224 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22225 }
22226 }
22227 if (label && size_needed == 1)
22228 {
22229 emit_label (label);
22230 LABEL_NUSES (label) = 1;
22231 label = NULL;
22232 promoted_val = val_exp;
22233 epilogue_size_needed = 1;
22234 }
22235 else if (label == NULL_RTX)
22236 epilogue_size_needed = size_needed;
22237
22238 /* Step 3: Main loop. */
22239
22240 switch (alg)
22241 {
22242 case libcall:
22243 case no_stringop:
22244 gcc_unreachable ();
22245 case loop_1_byte:
22246 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22247 count_exp, QImode, 1, expected_size);
22248 break;
22249 case loop:
22250 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22251 count_exp, Pmode, 1, expected_size);
22252 break;
22253 case unrolled_loop:
22254 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22255 count_exp, Pmode, 4, expected_size);
22256 break;
22257 case rep_prefix_8_byte:
22258 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22259 DImode, val_exp);
22260 break;
22261 case rep_prefix_4_byte:
22262 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22263 SImode, val_exp);
22264 break;
22265 case rep_prefix_1_byte:
22266 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22267 QImode, val_exp);
22268 break;
22269 }
22270   /* Properly adjust the offset of the dest memory for aliasing.  */
22271 if (CONST_INT_P (count_exp))
22272 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22273 (count / size_needed) * size_needed);
22274 else
22275 dst = change_address (dst, BLKmode, destreg);
22276
22277 /* Step 4: Epilogue to copy the remaining bytes. */
22278
22279 if (label)
22280 {
22281       /* When the main loop is done, COUNT_EXP might hold the original count,
22282 	 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22283 	 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22284 	 bytes.  Compensate if needed.  */
22285
22286 if (size_needed < epilogue_size_needed)
22287 {
22288 tmp =
22289 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22290 GEN_INT (size_needed - 1), count_exp, 1,
22291 OPTAB_DIRECT);
22292 if (tmp != count_exp)
22293 emit_move_insn (count_exp, tmp);
22294 }
22295 emit_label (label);
22296 LABEL_NUSES (label) = 1;
22297 }
22298 epilogue:
22299 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22300 {
22301 if (force_loopy_epilogue)
22302 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
22303 epilogue_size_needed);
22304 else
22305 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
22306 epilogue_size_needed);
22307 }
22308 if (jump_around_label)
22309 emit_label (jump_around_label);
22310 return true;
22311 }
22312
22313 /* Expand the appropriate insns for doing strlen if not just doing
22314 repnz; scasb
22315
22316 out = result, initialized with the start address
22317 align_rtx = alignment of the address.
22318    scratch = scratch register, initialized with the start address when
22319 not aligned, otherwise undefined
22320
22321 This is just the body. It needs the initializations mentioned above and
22322 some address computing at the end. These things are done in i386.md. */
22323
22324 static void
22325 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
22326 {
22327 int align;
22328 rtx tmp;
22329 rtx align_2_label = NULL_RTX;
22330 rtx align_3_label = NULL_RTX;
22331 rtx align_4_label = gen_label_rtx ();
22332 rtx end_0_label = gen_label_rtx ();
22333 rtx mem;
22334 rtx tmpreg = gen_reg_rtx (SImode);
22335 rtx scratch = gen_reg_rtx (SImode);
22336 rtx cmp;
22337
22338 align = 0;
22339 if (CONST_INT_P (align_rtx))
22340 align = INTVAL (align_rtx);
22341
22342 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
22343
22344 /* Is there a known alignment and is it less than 4? */
22345 if (align < 4)
22346 {
22347 rtx scratch1 = gen_reg_rtx (Pmode);
22348 emit_move_insn (scratch1, out);
22349 /* Is there a known alignment and is it not 2? */
22350 if (align != 2)
22351 {
22352 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
22353 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
22354
22355 	  /* Leave just the 2 lower bits.  */
22356 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
22357 NULL_RTX, 0, OPTAB_WIDEN);
22358
22359 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22360 Pmode, 1, align_4_label);
22361 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
22362 Pmode, 1, align_2_label);
22363 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
22364 Pmode, 1, align_3_label);
22365 }
22366 else
22367 {
22368 /* Since the alignment is 2, we have to check 2 or 0 bytes;
22369 	     check whether it is aligned to a 4-byte boundary.  */
22370
22371 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
22372 NULL_RTX, 0, OPTAB_WIDEN);
22373
22374 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22375 Pmode, 1, align_4_label);
22376 }
22377
22378 mem = change_address (src, QImode, out);
22379
22380 /* Now compare the bytes. */
22381
22382       /* Compare the first n unaligned bytes on a byte-by-byte basis.  */
22383 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
22384 QImode, 1, end_0_label);
22385
22386 /* Increment the address. */
22387 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22388
22389 /* Not needed with an alignment of 2 */
22390 if (align != 2)
22391 {
22392 emit_label (align_2_label);
22393
22394 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
22395 end_0_label);
22396
22397 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22398
22399 emit_label (align_3_label);
22400 }
22401
22402 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
22403 end_0_label);
22404
22405 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22406 }
22407
22408   /* Generate a loop to check 4 bytes at a time.  It is not a good idea to
22409      align this loop; it only makes the program larger and does not help
22410      speed it up.  */
22411 emit_label (align_4_label);
22412
22413 mem = change_address (src, SImode, out);
22414 emit_move_insn (scratch, mem);
22415 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
22416
22417   /* This formula yields a nonzero result iff one of the bytes is zero.
22418      This saves three branches inside the loop and many cycles.  */
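  /* A scalar sketch of the test emitted below (the helper name is
     hypothetical; illustration only):

       static inline int has_zero_byte (unsigned int x)
       {
         return ((x - 0x01010101U) & ~x & 0x80808080U) != 0;
       }

     E.g. x = 0x41004242 gives (0x3FFF4141 & 0xBEFFBDBD) & 0x80808080
     == 0x00800000, flagging the zero byte, while x = 0x41424344 gives 0.  */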
22419
22420 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
22421 emit_insn (gen_one_cmplsi2 (scratch, scratch));
22422 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
22423 emit_insn (gen_andsi3 (tmpreg, tmpreg,
22424 gen_int_mode (0x80808080, SImode)));
22425 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
22426 align_4_label);
22427
22428 if (TARGET_CMOVE)
22429 {
22430 rtx reg = gen_reg_rtx (SImode);
22431 rtx reg2 = gen_reg_rtx (Pmode);
22432 emit_move_insn (reg, tmpreg);
22433 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
22434
22435 /* If zero is not in the first two bytes, move two bytes forward. */
22436 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
22437 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22438 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
22439 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
22440 gen_rtx_IF_THEN_ELSE (SImode, tmp,
22441 reg,
22442 tmpreg)));
22443       /* Emit the lea manually to avoid clobbering the flags.  */
22444 emit_insn (gen_rtx_SET (SImode, reg2,
22445 gen_rtx_PLUS (Pmode, out, const2_rtx)));
22446
22447 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22448 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
22449 emit_insn (gen_rtx_SET (VOIDmode, out,
22450 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
22451 reg2,
22452 out)));
22453 }
22454 else
22455 {
22456 rtx end_2_label = gen_label_rtx ();
22457 /* Is zero in the first two bytes? */
22458
22459 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
22460 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22461 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
22462 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22463 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
22464 pc_rtx);
22465 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
22466 JUMP_LABEL (tmp) = end_2_label;
22467
22468 /* Not in the first two. Move two bytes forward. */
22469 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
22470 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
22471
22472 emit_label (end_2_label);
22473
22474 }
22475
22476 /* Avoid branch in fixing the byte. */
22477 tmpreg = gen_lowpart (QImode, tmpreg);
22478 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
22479 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
22480 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
22481 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
22482
22483 emit_label (end_0_label);
22484 }
22485
22486 /* Expand strlen. */
22487
22488 bool
22489 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
22490 {
22491 rtx addr, scratch1, scratch2, scratch3, scratch4;
22492
22493   /* The generic case of the strlen expander is long.  Avoid expanding it
22494      unless TARGET_INLINE_ALL_STRINGOPS.  */
22495
22496 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
22497 && !TARGET_INLINE_ALL_STRINGOPS
22498 && !optimize_insn_for_size_p ()
22499 && (!CONST_INT_P (align) || INTVAL (align) < 4))
22500 return false;
22501
22502 addr = force_reg (Pmode, XEXP (src, 0));
22503 scratch1 = gen_reg_rtx (Pmode);
22504
22505 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
22506 && !optimize_insn_for_size_p ())
22507 {
22508 /* Well it seems that some optimizer does not combine a call like
22509 foo(strlen(bar), strlen(bar));
22510 	 when the move and the subtraction are done here.  It does calculate
22511 the length just once when these instructions are done inside of
22512 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
22513 often used and I use one fewer register for the lifetime of
22514 	 output_strlen_unroll(), this is better.  */
22515
22516 emit_move_insn (out, addr);
22517
22518 ix86_expand_strlensi_unroll_1 (out, src, align);
22519
22520 /* strlensi_unroll_1 returns the address of the zero at the end of
22521 the string, like memchr(), so compute the length by subtracting
22522 the start address. */
22523 emit_insn (ix86_gen_sub3 (out, out, addr));
22524 }
22525 else
22526 {
22527 rtx unspec;
22528
22529 /* Can't use this if the user has appropriated eax, ecx, or edi. */
22530 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
22531 return false;
22532
22533 scratch2 = gen_reg_rtx (Pmode);
22534 scratch3 = gen_reg_rtx (Pmode);
22535 scratch4 = force_reg (Pmode, constm1_rtx);
22536
22537 emit_move_insn (scratch3, addr);
22538 eoschar = force_reg (QImode, eoschar);
22539
22540 src = replace_equiv_address_nv (src, scratch3);
22541
22542 /* If .md starts supporting :P, this can be done in .md. */
22543 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
22544 scratch4), UNSPEC_SCAS);
22545 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
22546 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
22547 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
22548 }
22549 return true;
22550 }
22551
22552 /* For a given symbol (function), construct code to compute the address of its
22553    PLT entry in the large x86-64 PIC model.  */
22554 rtx
22555 construct_plt_address (rtx symbol)
22556 {
22557 rtx tmp = gen_reg_rtx (Pmode);
22558 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
22559
22560 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
22561 gcc_assert (ix86_cmodel == CM_LARGE_PIC);
22562
22563 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
22564 emit_insn (gen_adddi3 (tmp, tmp, pic_offset_table_rtx));
22565 return tmp;
22566 }
22567
22568 rtx
22569 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
22570 rtx callarg2,
22571 rtx pop, bool sibcall)
22572 {
22573 /* We need to represent that SI and DI registers are clobbered
22574 by SYSV calls. */
22575 static int clobbered_registers[] = {
22576 XMM6_REG, XMM7_REG, XMM8_REG,
22577 XMM9_REG, XMM10_REG, XMM11_REG,
22578 XMM12_REG, XMM13_REG, XMM14_REG,
22579 XMM15_REG, SI_REG, DI_REG
22580 };
22581 rtx vec[ARRAY_SIZE (clobbered_registers) + 3];
22582 rtx use = NULL, call;
22583 unsigned int vec_len;
22584
22585 if (pop == const0_rtx)
22586 pop = NULL;
22587 gcc_assert (!TARGET_64BIT || !pop);
22588
22589 if (TARGET_MACHO && !TARGET_64BIT)
22590 {
22591 #if TARGET_MACHO
22592 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
22593 fnaddr = machopic_indirect_call_target (fnaddr);
22594 #endif
22595 }
22596 else
22597 {
22598 /* Static functions and indirect calls don't need the pic register. */
22599 if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
22600 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
22601 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
22602 use_reg (&use, pic_offset_table_rtx);
22603 }
22604
22605 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
22606 {
22607 rtx al = gen_rtx_REG (QImode, AX_REG);
22608 emit_move_insn (al, callarg2);
22609 use_reg (&use, al);
22610 }
22611
22612 if (ix86_cmodel == CM_LARGE_PIC
22613 && MEM_P (fnaddr)
22614 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
22615 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
22616 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
22617 else if (sibcall
22618 ? !sibcall_insn_operand (XEXP (fnaddr, 0), Pmode)
22619 : !call_insn_operand (XEXP (fnaddr, 0), Pmode))
22620 {
22621 fnaddr = XEXP (fnaddr, 0);
22622 if (GET_MODE (fnaddr) != Pmode)
22623 fnaddr = convert_to_mode (Pmode, fnaddr, 1);
22624 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (Pmode, fnaddr));
22625 }
22626
22627 vec_len = 0;
22628 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
22629 if (retval)
22630 call = gen_rtx_SET (VOIDmode, retval, call);
22631 vec[vec_len++] = call;
22632
22633 if (pop)
22634 {
22635 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
22636 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
22637 vec[vec_len++] = pop;
22638 }
22639
22640 if (TARGET_64BIT_MS_ABI
22641 && (!callarg2 || INTVAL (callarg2) != -2))
22642 {
22643 unsigned i;
22644
22645 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
22646 UNSPEC_MS_TO_SYSV_CALL);
22647
22648 for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
22649 vec[vec_len++]
22650 = gen_rtx_CLOBBER (SSE_REGNO_P (clobbered_registers[i])
22651 ? TImode : DImode,
22652 gen_rtx_REG (SSE_REGNO_P (clobbered_registers[i])
22653 ? TImode : DImode,
22654 clobbered_registers[i]));
22655 }
22656
22657 /* Add UNSPEC_CALL_NEEDS_VZEROUPPER decoration. */
22658 if (TARGET_VZEROUPPER)
22659 {
22660 int avx256;
22661 if (cfun->machine->callee_pass_avx256_p)
22662 {
22663 if (cfun->machine->callee_return_avx256_p)
22664 avx256 = callee_return_pass_avx256;
22665 else
22666 avx256 = callee_pass_avx256;
22667 }
22668 else if (cfun->machine->callee_return_avx256_p)
22669 avx256 = callee_return_avx256;
22670 else
22671 avx256 = call_no_avx256;
22672
22673 if (reload_completed)
22674 emit_insn (gen_avx_vzeroupper (GEN_INT (avx256)));
22675 else
22676 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode,
22677 gen_rtvec (1, GEN_INT (avx256)),
22678 UNSPEC_CALL_NEEDS_VZEROUPPER);
22679 }
22680
22681 if (vec_len > 1)
22682 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
22683 call = emit_call_insn (call);
22684 if (use)
22685 CALL_INSN_FUNCTION_USAGE (call) = use;
22686
22687 return call;
22688 }
22689
22690 void
22691 ix86_split_call_vzeroupper (rtx insn, rtx vzeroupper)
22692 {
22693 rtx pat = PATTERN (insn);
22694 rtvec vec = XVEC (pat, 0);
22695 int len = GET_NUM_ELEM (vec) - 1;
22696
22697 /* Strip off the last entry of the parallel. */
22698 gcc_assert (GET_CODE (RTVEC_ELT (vec, len)) == UNSPEC);
22699 gcc_assert (XINT (RTVEC_ELT (vec, len), 1) == UNSPEC_CALL_NEEDS_VZEROUPPER);
22700 if (len == 1)
22701 pat = RTVEC_ELT (vec, 0);
22702 else
22703 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (len, &RTVEC_ELT (vec, 0)));
22704
22705 emit_insn (gen_avx_vzeroupper (vzeroupper));
22706 emit_call_insn (pat);
22707 }
22708
22709 /* Output the assembly for a call instruction. */
22710
22711 const char *
22712 ix86_output_call_insn (rtx insn, rtx call_op)
22713 {
22714 bool direct_p = constant_call_address_operand (call_op, Pmode);
22715 bool seh_nop_p = false;
22716 const char *xasm;
22717
22718 if (SIBLING_CALL_P (insn))
22719 {
22720 if (direct_p)
22721 xasm = "jmp\t%P0";
22722 /* SEH epilogue detection requires the indirect branch case
22723 to include REX.W. */
22724 else if (TARGET_SEH)
22725 xasm = "rex.W jmp %A0";
22726 else
22727 xasm = "jmp\t%A0";
22728
22729 output_asm_insn (xasm, &call_op);
22730 return "";
22731 }
22732
22733 /* SEH unwinding can require an extra nop to be emitted in several
22734 circumstances. Determine if we have one of those. */
22735 if (TARGET_SEH)
22736 {
22737 rtx i;
22738
22739 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
22740 {
22741 /* If we get to another real insn, we don't need the nop. */
22742 if (INSN_P (i))
22743 break;
22744
22745 /* If we get to the epilogue note, prevent a catch region from
22746 being adjacent to the standard epilogue sequence. If non-
22747 call-exceptions, we'll have done this during epilogue emission. */
22748 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
22749 && !flag_non_call_exceptions
22750 && !can_throw_internal (insn))
22751 {
22752 seh_nop_p = true;
22753 break;
22754 }
22755 }
22756
22757 /* If we didn't find a real insn following the call, prevent the
22758 unwinder from looking into the next function. */
22759 if (i == NULL)
22760 seh_nop_p = true;
22761 }
22762
22763 if (direct_p)
22764 xasm = "call\t%P0";
22765 else
22766 xasm = "call\t%A0";
22767
22768 output_asm_insn (xasm, &call_op);
22769
22770 if (seh_nop_p)
22771 return "nop";
22772
22773 return "";
22774 }
22775 \f
22776 /* Clear stack slot assignments remembered from previous functions.
22777 This is called from INIT_EXPANDERS once before RTL is emitted for each
22778 function. */
22779
22780 static struct machine_function *
22781 ix86_init_machine_status (void)
22782 {
22783 struct machine_function *f;
22784
22785 f = ggc_alloc_cleared_machine_function ();
22786 f->use_fast_prologue_epilogue_nregs = -1;
22787 f->tls_descriptor_call_expanded_p = 0;
22788 f->call_abi = ix86_abi;
22789
22790 return f;
22791 }
22792
22793 /* Return a MEM corresponding to a stack slot with mode MODE.
22794 Allocate a new slot if necessary.
22795
22796 The RTL for a function can have several slots available: N is
22797 which slot to use. */
22798
22799 rtx
22800 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
22801 {
22802 struct stack_local_entry *s;
22803
22804 gcc_assert (n < MAX_386_STACK_LOCALS);
22805
22806 /* Virtual slot is valid only before vregs are instantiated. */
22807 gcc_assert ((n == SLOT_VIRTUAL) == !virtuals_instantiated);
22808
22809 for (s = ix86_stack_locals; s; s = s->next)
22810 if (s->mode == mode && s->n == n)
22811 return validize_mem (copy_rtx (s->rtl));
22812
22813 s = ggc_alloc_stack_local_entry ();
22814 s->n = n;
22815 s->mode = mode;
22816 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
22817
22818 s->next = ix86_stack_locals;
22819 ix86_stack_locals = s;
22820 return validize_mem (s->rtl);
22821 }
22822 \f
22823 /* Calculate the length of the memory address in the instruction encoding.
22824 Includes addr32 prefix, does not include the one-byte modrm, opcode,
22825 or other prefixes. */
22826
22827 int
22828 memory_address_length (rtx addr)
22829 {
22830 struct ix86_address parts;
22831 rtx base, index, disp;
22832 int len;
22833 int ok;
22834
22835 if (GET_CODE (addr) == PRE_DEC
22836 || GET_CODE (addr) == POST_INC
22837 || GET_CODE (addr) == PRE_MODIFY
22838 || GET_CODE (addr) == POST_MODIFY)
22839 return 0;
22840
22841 ok = ix86_decompose_address (addr, &parts);
22842 gcc_assert (ok);
22843
22844 if (parts.base && GET_CODE (parts.base) == SUBREG)
22845 parts.base = SUBREG_REG (parts.base);
22846 if (parts.index && GET_CODE (parts.index) == SUBREG)
22847 parts.index = SUBREG_REG (parts.index);
22848
22849 base = parts.base;
22850 index = parts.index;
22851 disp = parts.disp;
22852
22853 /* Add length of addr32 prefix. */
22854 len = (GET_CODE (addr) == ZERO_EXTEND
22855 || GET_CODE (addr) == AND);
22856
22857 /* Rule of thumb:
22858 - esp as the base always wants an index,
22859 - ebp as the base always wants a displacement,
22860 - r12 as the base always wants an index,
22861 - r13 as the base always wants a displacement. */
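  /* A few illustrative cases, as counted by this function (modrm and opcode
     bytes excluded): (%eax) needs nothing extra -> 0; (%esp) needs a SIB
     byte -> 1; 8(%ebp) needs a disp8 -> 1; 8(%esp,%eax,4) needs a SIB byte
     plus a disp8 -> 2.  */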
22862
22863 /* Register Indirect. */
22864 if (base && !index && !disp)
22865 {
22866 /* esp (for its index) and ebp (for its displacement) need
22867 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
22868 code. */
22869 if (REG_P (addr)
22870 && (addr == arg_pointer_rtx
22871 || addr == frame_pointer_rtx
22872 || REGNO (addr) == SP_REG
22873 || REGNO (addr) == BP_REG
22874 || REGNO (addr) == R12_REG
22875 || REGNO (addr) == R13_REG))
22876 len = 1;
22877 }
22878
22879   /* Direct Addressing.  In 64-bit mode mod 00 r/m 5
22880      is not disp32, but disp32(%rip), so for plain disp32 a
22881      SIB byte is needed, unless print_operand_address
22882      optimizes it into disp32(%rip) or (%rip) is implied
22883      by an UNSPEC.  */
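  /* Illustrative: "mov foo(%rip), %eax" encodes just a 4-byte displacement
     (len = 4 below), while an absolute "mov foo, %eax" in 64-bit mode must
     use mod 00, r/m 4 with a SIB byte whose base field is 5, which is what
     the len += 1 below accounts for.  */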
22884 else if (disp && !base && !index)
22885 {
22886 len = 4;
22887 if (TARGET_64BIT)
22888 {
22889 rtx symbol = disp;
22890
22891 if (GET_CODE (disp) == CONST)
22892 symbol = XEXP (disp, 0);
22893 if (GET_CODE (symbol) == PLUS
22894 && CONST_INT_P (XEXP (symbol, 1)))
22895 symbol = XEXP (symbol, 0);
22896
22897 if (GET_CODE (symbol) != LABEL_REF
22898 && (GET_CODE (symbol) != SYMBOL_REF
22899 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
22900 && (GET_CODE (symbol) != UNSPEC
22901 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
22902 && XINT (symbol, 1) != UNSPEC_PCREL
22903 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
22904 len += 1;
22905 }
22906 }
22907
22908 else
22909 {
22910 /* Find the length of the displacement constant. */
22911 if (disp)
22912 {
22913 if (base && satisfies_constraint_K (disp))
22914 len = 1;
22915 else
22916 len = 4;
22917 }
22918 /* ebp always wants a displacement. Similarly r13. */
22919 else if (base && REG_P (base)
22920 && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
22921 len = 1;
22922
22923 /* An index requires the two-byte modrm form.... */
22924 if (index
22925 /* ...like esp (or r12), which always wants an index. */
22926 || base == arg_pointer_rtx
22927 || base == frame_pointer_rtx
22928 || (base && REG_P (base)
22929 && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
22930 len += 1;
22931 }
22932
22933 switch (parts.seg)
22934 {
22935 case SEG_FS:
22936 case SEG_GS:
22937 len += 1;
22938 break;
22939 default:
22940 break;
22941 }
22942
22943 return len;
22944 }
22945
22946 /* Compute the default value for the "length_immediate" attribute.  When
22947    SHORTFORM is set, expect that the insn has an 8-bit immediate alternative.  */
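/* Illustrative: "add $3, %eax" qualifies for the short form, so the
   immediate is counted as 1 byte; "add $1000, %eax" falls outside the
   -128..127 range and is counted as a 4-byte SImode immediate.  */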
22948 int
22949 ix86_attr_length_immediate_default (rtx insn, bool shortform)
22950 {
22951 int len = 0;
22952 int i;
22953 extract_insn_cached (insn);
22954 for (i = recog_data.n_operands - 1; i >= 0; --i)
22955 if (CONSTANT_P (recog_data.operand[i]))
22956 {
22957 enum attr_mode mode = get_attr_mode (insn);
22958
22959 gcc_assert (!len);
22960 if (shortform && CONST_INT_P (recog_data.operand[i]))
22961 {
22962 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
22963 switch (mode)
22964 {
22965 case MODE_QI:
22966 len = 1;
22967 continue;
22968 case MODE_HI:
22969 ival = trunc_int_for_mode (ival, HImode);
22970 break;
22971 case MODE_SI:
22972 ival = trunc_int_for_mode (ival, SImode);
22973 break;
22974 default:
22975 break;
22976 }
22977 if (IN_RANGE (ival, -128, 127))
22978 {
22979 len = 1;
22980 continue;
22981 }
22982 }
22983 switch (mode)
22984 {
22985 case MODE_QI:
22986 len = 1;
22987 break;
22988 case MODE_HI:
22989 len = 2;
22990 break;
22991 case MODE_SI:
22992 len = 4;
22993 break;
22994 	/* Immediates for DImode instructions are encoded as 32-bit sign-extended values.  */
22995 case MODE_DI:
22996 len = 4;
22997 break;
22998 default:
22999 fatal_insn ("unknown insn mode", insn);
23000 }
23001 }
23002 return len;
23003 }
23004 /* Compute default value for "length_address" attribute. */
23005 int
23006 ix86_attr_length_address_default (rtx insn)
23007 {
23008 int i;
23009
23010 if (get_attr_type (insn) == TYPE_LEA)
23011 {
23012 rtx set = PATTERN (insn), addr;
23013
23014 if (GET_CODE (set) == PARALLEL)
23015 set = XVECEXP (set, 0, 0);
23016
23017 gcc_assert (GET_CODE (set) == SET);
23018
23019 addr = SET_SRC (set);
23020 if (TARGET_64BIT && get_attr_mode (insn) == MODE_SI)
23021 {
23022 if (GET_CODE (addr) == ZERO_EXTEND)
23023 addr = XEXP (addr, 0);
23024 if (GET_CODE (addr) == SUBREG)
23025 addr = SUBREG_REG (addr);
23026 }
23027
23028 return memory_address_length (addr);
23029 }
23030
23031 extract_insn_cached (insn);
23032 for (i = recog_data.n_operands - 1; i >= 0; --i)
23033 if (MEM_P (recog_data.operand[i]))
23034 {
23035 constrain_operands_cached (reload_completed);
23036 if (which_alternative != -1)
23037 {
23038 const char *constraints = recog_data.constraints[i];
23039 int alt = which_alternative;
23040
23041 while (*constraints == '=' || *constraints == '+')
23042 constraints++;
23043 while (alt-- > 0)
23044 while (*constraints++ != ',')
23045 ;
23046 /* Skip ignored operands. */
23047 if (*constraints == 'X')
23048 continue;
23049 }
23050 return memory_address_length (XEXP (recog_data.operand[i], 0));
23051 }
23052 return 0;
23053 }
23054
23055 /* Compute the default value for the "length_vex" attribute.  It includes
23056    the 2- or 3-byte VEX prefix and 1 opcode byte.  */
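/* Illustrative: "vaddps %xmm1, %xmm2, %xmm0" fits the 2-byte (C5) VEX
   prefix, so the result is 2 + 1 = 3; a DImode general-register operand
   (REX.W) or an extended register mentioned in a memory operand
   (REX.X/REX.B) forces the 3-byte (C4) prefix, giving 3 + 1 = 4.  */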
23057
23058 int
23059 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
23060 {
23061 int i;
23062
23063   /* Only the 0f opcode map can use the 2-byte VEX prefix, and the VEX.W bit
23064      uses the 3-byte VEX prefix.  */
23065 if (!has_0f_opcode || has_vex_w)
23066 return 3 + 1;
23067
23068   /* We can always use the 2-byte VEX prefix in 32-bit mode.  */
23069 if (!TARGET_64BIT)
23070 return 2 + 1;
23071
23072 extract_insn_cached (insn);
23073
23074 for (i = recog_data.n_operands - 1; i >= 0; --i)
23075 if (REG_P (recog_data.operand[i]))
23076 {
23077 /* REX.W bit uses 3 byte VEX prefix. */
23078 if (GET_MODE (recog_data.operand[i]) == DImode
23079 && GENERAL_REG_P (recog_data.operand[i]))
23080 return 3 + 1;
23081 }
23082 else
23083 {
23084 /* REX.X or REX.B bits use 3 byte VEX prefix. */
23085 if (MEM_P (recog_data.operand[i])
23086 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
23087 return 3 + 1;
23088 }
23089
23090 return 2 + 1;
23091 }
23092 \f
23093 /* Return the maximum number of instructions a cpu can issue. */
23094
23095 static int
23096 ix86_issue_rate (void)
23097 {
23098 switch (ix86_tune)
23099 {
23100 case PROCESSOR_PENTIUM:
23101 case PROCESSOR_ATOM:
23102 case PROCESSOR_K6:
23103 return 2;
23104
23105 case PROCESSOR_PENTIUMPRO:
23106 case PROCESSOR_PENTIUM4:
23107 case PROCESSOR_CORE2_32:
23108 case PROCESSOR_CORE2_64:
23109 case PROCESSOR_COREI7_32:
23110 case PROCESSOR_COREI7_64:
23111 case PROCESSOR_ATHLON:
23112 case PROCESSOR_K8:
23113 case PROCESSOR_AMDFAM10:
23114 case PROCESSOR_NOCONA:
23115 case PROCESSOR_GENERIC32:
23116 case PROCESSOR_GENERIC64:
23117 case PROCESSOR_BDVER1:
23118 case PROCESSOR_BDVER2:
23119 case PROCESSOR_BTVER1:
23120 return 3;
23121
23122 default:
23123 return 1;
23124 }
23125 }
23126
23127 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
23128    by DEP_INSN and nothing else set by DEP_INSN.  */
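/* Illustrative: for "cmp %eax, %ebx" followed by "je .L1", the jump reads
   only the flags produced by the compare, so this returns true and
   ix86_adjust_cost treats the dependence as free (cost 0), modelling
   compare/jump pairing on Pentium.  */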
23129
23130 static bool
23131 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
23132 {
23133 rtx set, set2;
23134
23135 /* Simplify the test for uninteresting insns. */
23136 if (insn_type != TYPE_SETCC
23137 && insn_type != TYPE_ICMOV
23138 && insn_type != TYPE_FCMOV
23139 && insn_type != TYPE_IBR)
23140 return false;
23141
23142 if ((set = single_set (dep_insn)) != 0)
23143 {
23144 set = SET_DEST (set);
23145 set2 = NULL_RTX;
23146 }
23147 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
23148 && XVECLEN (PATTERN (dep_insn), 0) == 2
23149 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
23150 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
23151 {
23152 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
23153       set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
23154 }
23155 else
23156 return false;
23157
23158 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
23159 return false;
23160
23161 /* This test is true if the dependent insn reads the flags but
23162 not any other potentially set register. */
23163 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
23164 return false;
23165
23166 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
23167 return false;
23168
23169 return true;
23170 }
23171
23172 /* Return true iff USE_INSN has a memory address with operands set by
23173 SET_INSN. */
23174
23175 bool
23176 ix86_agi_dependent (rtx set_insn, rtx use_insn)
23177 {
23178 int i;
23179 extract_insn_cached (use_insn);
23180 for (i = recog_data.n_operands - 1; i >= 0; --i)
23181 if (MEM_P (recog_data.operand[i]))
23182 {
23183 rtx addr = XEXP (recog_data.operand[i], 0);
23184 return modified_in_p (addr, set_insn) != 0;
23185 }
23186 return false;
23187 }
23188
23189 static int
23190 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
23191 {
23192 enum attr_type insn_type, dep_insn_type;
23193 enum attr_memory memory;
23194 rtx set, set2;
23195 int dep_insn_code_number;
23196
23197 /* Anti and output dependencies have zero cost on all CPUs. */
23198 if (REG_NOTE_KIND (link) != 0)
23199 return 0;
23200
23201 dep_insn_code_number = recog_memoized (dep_insn);
23202
23203 /* If we can't recognize the insns, we can't really do anything. */
23204 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
23205 return cost;
23206
23207 insn_type = get_attr_type (insn);
23208 dep_insn_type = get_attr_type (dep_insn);
23209
23210 switch (ix86_tune)
23211 {
23212 case PROCESSOR_PENTIUM:
23213 /* Address Generation Interlock adds a cycle of latency. */
23214 if (insn_type == TYPE_LEA)
23215 {
23216 rtx addr = PATTERN (insn);
23217
23218 if (GET_CODE (addr) == PARALLEL)
23219 addr = XVECEXP (addr, 0, 0);
23220
23221 gcc_assert (GET_CODE (addr) == SET);
23222
23223 addr = SET_SRC (addr);
23224 if (modified_in_p (addr, dep_insn))
23225 cost += 1;
23226 }
23227 else if (ix86_agi_dependent (dep_insn, insn))
23228 cost += 1;
23229
23230 /* ??? Compares pair with jump/setcc. */
23231 if (ix86_flags_dependent (insn, dep_insn, insn_type))
23232 cost = 0;
23233
23234 /* Floating point stores require value to be ready one cycle earlier. */
23235 if (insn_type == TYPE_FMOV
23236 && get_attr_memory (insn) == MEMORY_STORE
23237 && !ix86_agi_dependent (dep_insn, insn))
23238 cost += 1;
23239 break;
23240
23241 case PROCESSOR_PENTIUMPRO:
23242 memory = get_attr_memory (insn);
23243
23244 /* INT->FP conversion is expensive. */
23245 if (get_attr_fp_int_src (dep_insn))
23246 cost += 5;
23247
23248 /* There is one cycle extra latency between an FP op and a store. */
23249 if (insn_type == TYPE_FMOV
23250 && (set = single_set (dep_insn)) != NULL_RTX
23251 && (set2 = single_set (insn)) != NULL_RTX
23252 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
23253 && MEM_P (SET_DEST (set2)))
23254 cost += 1;
23255
23256       /* Show the ability of the reorder buffer to hide the latency of a load
23257 	 by executing it in parallel with the previous instruction when the
23258 	 previous instruction is not needed to compute the address.  */
23259 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23260 && !ix86_agi_dependent (dep_insn, insn))
23261 {
23262 	  /* Claim moves to take one cycle, as the core can issue one load
23263 	     at a time and the next load can start a cycle later.  */
23264 if (dep_insn_type == TYPE_IMOV
23265 || dep_insn_type == TYPE_FMOV)
23266 cost = 1;
23267 else if (cost > 1)
23268 cost--;
23269 }
23270 break;
23271
23272 case PROCESSOR_K6:
23273 memory = get_attr_memory (insn);
23274
23275 /* The esp dependency is resolved before the instruction is really
23276 finished. */
23277 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
23278 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
23279 return 1;
23280
23281 /* INT->FP conversion is expensive. */
23282 if (get_attr_fp_int_src (dep_insn))
23283 cost += 5;
23284
23285       /* Show the ability of the reorder buffer to hide the latency of a load
23286 	 by executing it in parallel with the previous instruction when the
23287 	 previous instruction is not needed to compute the address.  */
23288 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23289 && !ix86_agi_dependent (dep_insn, insn))
23290 {
23291 	  /* Claim moves to take one cycle, as the core can issue one load
23292 	     at a time and the next load can start a cycle later.  */
23293 if (dep_insn_type == TYPE_IMOV
23294 || dep_insn_type == TYPE_FMOV)
23295 cost = 1;
23296 else if (cost > 2)
23297 cost -= 2;
23298 else
23299 cost = 1;
23300 }
23301 break;
23302
23303 case PROCESSOR_ATHLON:
23304 case PROCESSOR_K8:
23305 case PROCESSOR_AMDFAM10:
23306 case PROCESSOR_BDVER1:
23307 case PROCESSOR_BDVER2:
23308 case PROCESSOR_BTVER1:
23309 case PROCESSOR_ATOM:
23310 case PROCESSOR_GENERIC32:
23311 case PROCESSOR_GENERIC64:
23312 memory = get_attr_memory (insn);
23313
23314       /* Show the ability of the reorder buffer to hide the latency of a load
23315 	 by executing it in parallel with the previous instruction when the
23316 	 previous instruction is not needed to compute the address.  */
23317 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23318 && !ix86_agi_dependent (dep_insn, insn))
23319 {
23320 enum attr_unit unit = get_attr_unit (insn);
23321 int loadcost = 3;
23322
23323 /* Because of the difference between the length of integer and
23324 floating unit pipeline preparation stages, the memory operands
23325 for floating point are cheaper.
23326
23327 	     ??? For Athlon the difference is most probably 2.  */
23328 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
23329 loadcost = 3;
23330 else
23331 loadcost = TARGET_ATHLON ? 2 : 0;
23332
23333 if (cost >= loadcost)
23334 cost -= loadcost;
23335 else
23336 cost = 0;
23337 }
23338
23339 default:
23340 break;
23341 }
23342
23343 return cost;
23344 }
23345
23346 /* How many alternative schedules to try. This should be as wide as the
23347 scheduling freedom in the DFA, but no wider. Making this value too
23348    large results in extra work for the scheduler.  */
23349
23350 static int
23351 ia32_multipass_dfa_lookahead (void)
23352 {
23353 switch (ix86_tune)
23354 {
23355 case PROCESSOR_PENTIUM:
23356 return 2;
23357
23358 case PROCESSOR_PENTIUMPRO:
23359 case PROCESSOR_K6:
23360 return 1;
23361
23362 case PROCESSOR_CORE2_32:
23363 case PROCESSOR_CORE2_64:
23364 case PROCESSOR_COREI7_32:
23365 case PROCESSOR_COREI7_64:
23366       /* Generally, we want haifa-sched:max_issue() to look ahead as far
23367 	 as the number of instructions that can be executed in one cycle, i.e.,
23368 	 issue_rate.  I wonder why tuning for many CPUs does not do this.  */
23369 return ix86_issue_rate ();
23370
23371 default:
23372 return 0;
23373 }
23374 }
23375
23376 \f
23377
23378 /* Model decoder of Core 2/i7.
23379 Below hooks for multipass scheduling (see haifa-sched.c:max_issue)
23380 track the instruction fetch block boundaries and make sure that long
23381 (9+ bytes) instructions are assigned to D0. */
23382
23383 /* Maximum length of an insn that can be handled by
23384 a secondary decoder unit. '8' for Core 2/i7. */
23385 static int core2i7_secondary_decoder_max_insn_size;
23386
23387 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
23388 '16' for Core 2/i7. */
23389 static int core2i7_ifetch_block_size;
23390
23391 /* Maximum number of instructions decoder can handle per cycle.
23392 '6' for Core 2/i7. */
23393 static int core2i7_ifetch_block_max_insns;
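/* Illustrative: with the Core 2/i7 parameters set in ix86_sched_init_global
   (16-byte fetch block, at most 6 insns per cycle, 8-byte limit for the
   secondary decoders), a 9-byte ready insn is only considered as the first
   insn of a cycle, and any insn that would push the running block length
   past 16 bytes is masked out of ready_try for this cycle.  */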
23394
23395 typedef struct ix86_first_cycle_multipass_data_ *
23396 ix86_first_cycle_multipass_data_t;
23397 typedef const struct ix86_first_cycle_multipass_data_ *
23398 const_ix86_first_cycle_multipass_data_t;
23399
23400 /* A variable to store target state across calls to max_issue within
23401 one cycle. */
23402 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
23403 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
23404
23405 /* Initialize DATA. */
23406 static void
23407 core2i7_first_cycle_multipass_init (void *_data)
23408 {
23409 ix86_first_cycle_multipass_data_t data
23410 = (ix86_first_cycle_multipass_data_t) _data;
23411
23412 data->ifetch_block_len = 0;
23413 data->ifetch_block_n_insns = 0;
23414 data->ready_try_change = NULL;
23415 data->ready_try_change_size = 0;
23416 }
23417
23418 /* Advancing the cycle; reset ifetch block counts. */
23419 static void
23420 core2i7_dfa_post_advance_cycle (void)
23421 {
23422 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
23423
23424 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
23425
23426 data->ifetch_block_len = 0;
23427 data->ifetch_block_n_insns = 0;
23428 }
23429
23430 static int min_insn_size (rtx);
23431
23432 /* Filter out insns from ready_try that the core will not be able to issue
23433 on current cycle due to decoder. */
23434 static void
23435 core2i7_first_cycle_multipass_filter_ready_try
23436 (const_ix86_first_cycle_multipass_data_t data,
23437 char *ready_try, int n_ready, bool first_cycle_insn_p)
23438 {
23439 while (n_ready--)
23440 {
23441 rtx insn;
23442 int insn_size;
23443
23444 if (ready_try[n_ready])
23445 continue;
23446
23447 insn = get_ready_element (n_ready);
23448 insn_size = min_insn_size (insn);
23449
23450       if (/* If this insn is too long for a secondary decoder ... */
23451 (!first_cycle_insn_p
23452 && insn_size > core2i7_secondary_decoder_max_insn_size)
23453 /* ... or it would not fit into the ifetch block ... */
23454 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
23455 /* ... or the decoder is full already ... */
23456 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
23457 /* ... mask the insn out. */
23458 {
23459 ready_try[n_ready] = 1;
23460
23461 if (data->ready_try_change)
23462 SET_BIT (data->ready_try_change, n_ready);
23463 }
23464 }
23465 }
23466
23467 /* Prepare for a new round of multipass lookahead scheduling. */
23468 static void
23469 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
23470 bool first_cycle_insn_p)
23471 {
23472 ix86_first_cycle_multipass_data_t data
23473 = (ix86_first_cycle_multipass_data_t) _data;
23474 const_ix86_first_cycle_multipass_data_t prev_data
23475 = ix86_first_cycle_multipass_data;
23476
23477 /* Restore the state from the end of the previous round. */
23478 data->ifetch_block_len = prev_data->ifetch_block_len;
23479 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
23480
23481 /* Filter instructions that cannot be issued on current cycle due to
23482 decoder restrictions. */
23483 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
23484 first_cycle_insn_p);
23485 }
23486
23487 /* INSN is being issued in current solution. Account for its impact on
23488 the decoder model. */
23489 static void
23490 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
23491 rtx insn, const void *_prev_data)
23492 {
23493 ix86_first_cycle_multipass_data_t data
23494 = (ix86_first_cycle_multipass_data_t) _data;
23495 const_ix86_first_cycle_multipass_data_t prev_data
23496 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
23497
23498 int insn_size = min_insn_size (insn);
23499
23500 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
23501 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
23502 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
23503 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
23504
23505 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
23506 if (!data->ready_try_change)
23507 {
23508 data->ready_try_change = sbitmap_alloc (n_ready);
23509 data->ready_try_change_size = n_ready;
23510 }
23511 else if (data->ready_try_change_size < n_ready)
23512 {
23513 data->ready_try_change = sbitmap_resize (data->ready_try_change,
23514 n_ready, 0);
23515 data->ready_try_change_size = n_ready;
23516 }
23517 sbitmap_zero (data->ready_try_change);
23518
23519 /* Filter out insns from ready_try that the core will not be able to issue
23520 on current cycle due to decoder. */
23521 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
23522 false);
23523 }
23524
23525 /* Revert the effect on ready_try. */
23526 static void
23527 core2i7_first_cycle_multipass_backtrack (const void *_data,
23528 char *ready_try,
23529 int n_ready ATTRIBUTE_UNUSED)
23530 {
23531 const_ix86_first_cycle_multipass_data_t data
23532 = (const_ix86_first_cycle_multipass_data_t) _data;
23533 unsigned int i = 0;
23534 sbitmap_iterator sbi;
23535
23536 gcc_assert (sbitmap_last_set_bit (data->ready_try_change) < n_ready);
23537 EXECUTE_IF_SET_IN_SBITMAP (data->ready_try_change, 0, i, sbi)
23538 {
23539 ready_try[i] = 0;
23540 }
23541 }
23542
23543 /* Save the result of multipass lookahead scheduling for the next round. */
23544 static void
23545 core2i7_first_cycle_multipass_end (const void *_data)
23546 {
23547 const_ix86_first_cycle_multipass_data_t data
23548 = (const_ix86_first_cycle_multipass_data_t) _data;
23549 ix86_first_cycle_multipass_data_t next_data
23550 = ix86_first_cycle_multipass_data;
23551
23552 if (data != NULL)
23553 {
23554 next_data->ifetch_block_len = data->ifetch_block_len;
23555 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
23556 }
23557 }
23558
23559 /* Deallocate target data. */
23560 static void
23561 core2i7_first_cycle_multipass_fini (void *_data)
23562 {
23563 ix86_first_cycle_multipass_data_t data
23564 = (ix86_first_cycle_multipass_data_t) _data;
23565
23566 if (data->ready_try_change)
23567 {
23568 sbitmap_free (data->ready_try_change);
23569 data->ready_try_change = NULL;
23570 data->ready_try_change_size = 0;
23571 }
23572 }
23573
23574 /* Prepare for scheduling pass. */
23575 static void
23576 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
23577 int verbose ATTRIBUTE_UNUSED,
23578 int max_uid ATTRIBUTE_UNUSED)
23579 {
23580 /* Install scheduling hooks for current CPU. Some of these hooks are used
23581 in time-critical parts of the scheduler, so we only set them up when
23582 they are actually used. */
23583 switch (ix86_tune)
23584 {
23585 case PROCESSOR_CORE2_32:
23586 case PROCESSOR_CORE2_64:
23587 case PROCESSOR_COREI7_32:
23588 case PROCESSOR_COREI7_64:
23589 targetm.sched.dfa_post_advance_cycle
23590 = core2i7_dfa_post_advance_cycle;
23591 targetm.sched.first_cycle_multipass_init
23592 = core2i7_first_cycle_multipass_init;
23593 targetm.sched.first_cycle_multipass_begin
23594 = core2i7_first_cycle_multipass_begin;
23595 targetm.sched.first_cycle_multipass_issue
23596 = core2i7_first_cycle_multipass_issue;
23597 targetm.sched.first_cycle_multipass_backtrack
23598 = core2i7_first_cycle_multipass_backtrack;
23599 targetm.sched.first_cycle_multipass_end
23600 = core2i7_first_cycle_multipass_end;
23601 targetm.sched.first_cycle_multipass_fini
23602 = core2i7_first_cycle_multipass_fini;
23603
23604 /* Set decoder parameters. */
23605 core2i7_secondary_decoder_max_insn_size = 8;
23606 core2i7_ifetch_block_size = 16;
23607 core2i7_ifetch_block_max_insns = 6;
23608 break;
23609
23610 default:
23611 targetm.sched.dfa_post_advance_cycle = NULL;
23612 targetm.sched.first_cycle_multipass_init = NULL;
23613 targetm.sched.first_cycle_multipass_begin = NULL;
23614 targetm.sched.first_cycle_multipass_issue = NULL;
23615 targetm.sched.first_cycle_multipass_backtrack = NULL;
23616 targetm.sched.first_cycle_multipass_end = NULL;
23617 targetm.sched.first_cycle_multipass_fini = NULL;
23618 break;
23619 }
23620 }
23621
23622 \f
23623 /* Compute the alignment given to a constant that is being placed in memory.
23624 EXP is the constant and ALIGN is the alignment that the object would
23625 ordinarily have.
23626 The value of this function is used instead of that alignment to align
23627 the object. */
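/* Concretely, as the body below shows: DFmode constants get at least 64-bit
   alignment, constants whose mode satisfies ALIGN_MODE_128 get at least
   128-bit alignment, and long string constants are word-aligned unless
   optimizing for size.  */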
23628
23629 int
23630 ix86_constant_alignment (tree exp, int align)
23631 {
23632 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
23633 || TREE_CODE (exp) == INTEGER_CST)
23634 {
23635 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
23636 return 64;
23637 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
23638 return 128;
23639 }
23640 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
23641 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
23642 return BITS_PER_WORD;
23643
23644 return align;
23645 }
23646
23647 /* Compute the alignment for a static variable.
23648 TYPE is the data type, and ALIGN is the alignment that
23649 the object would ordinarily have. The value of this function is used
23650 instead of that alignment to align the object. */
23651
23652 int
23653 ix86_data_alignment (tree type, int align)
23654 {
23655 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
23656
23657 if (AGGREGATE_TYPE_P (type)
23658 && TYPE_SIZE (type)
23659 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
23660 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
23661 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
23662 && align < max_align)
23663 align = max_align;
23664
23665   /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
23666      to a 16-byte boundary.  */
23667 if (TARGET_64BIT)
23668 {
23669 if (AGGREGATE_TYPE_P (type)
23670 && TYPE_SIZE (type)
23671 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
23672 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
23673 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
23674 return 128;
23675 }
23676
23677 if (TREE_CODE (type) == ARRAY_TYPE)
23678 {
23679 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
23680 return 64;
23681 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
23682 return 128;
23683 }
23684 else if (TREE_CODE (type) == COMPLEX_TYPE)
23685 {
23686
23687 if (TYPE_MODE (type) == DCmode && align < 64)
23688 return 64;
23689 if ((TYPE_MODE (type) == XCmode
23690 || TYPE_MODE (type) == TCmode) && align < 128)
23691 return 128;
23692 }
23693 else if ((TREE_CODE (type) == RECORD_TYPE
23694 || TREE_CODE (type) == UNION_TYPE
23695 || TREE_CODE (type) == QUAL_UNION_TYPE)
23696 && TYPE_FIELDS (type))
23697 {
23698 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
23699 return 64;
23700 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
23701 return 128;
23702 }
23703 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
23704 || TREE_CODE (type) == INTEGER_TYPE)
23705 {
23706 if (TYPE_MODE (type) == DFmode && align < 64)
23707 return 64;
23708 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
23709 return 128;
23710 }
23711
23712 return align;
23713 }
23714
23715 /* Compute the alignment for a local variable or a stack slot. EXP is
23716 the data type or decl itself, MODE is the widest mode available and
23717 ALIGN is the alignment that the object would ordinarily have. The
23718 value of this macro is used instead of that alignment to align the
23719 object. */
23720
23721 unsigned int
23722 ix86_local_alignment (tree exp, enum machine_mode mode,
23723 unsigned int align)
23724 {
23725 tree type, decl;
23726
23727 if (exp && DECL_P (exp))
23728 {
23729 type = TREE_TYPE (exp);
23730 decl = exp;
23731 }
23732 else
23733 {
23734 type = exp;
23735 decl = NULL;
23736 }
23737
23738 /* Don't do dynamic stack realignment for long long objects with
23739 -mpreferred-stack-boundary=2. */
23740 if (!TARGET_64BIT
23741 && align == 64
23742 && ix86_preferred_stack_boundary < 64
23743 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
23744 && (!type || !TYPE_USER_ALIGN (type))
23745 && (!decl || !DECL_USER_ALIGN (decl)))
23746 align = 32;
23747
23748   /* If TYPE is NULL, we are allocating a stack slot for a caller-save
23749      register in MODE.  We will return the larger of the XFmode and
23750      DFmode alignments.  */
23751 if (!type)
23752 {
23753 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
23754 align = GET_MODE_ALIGNMENT (DFmode);
23755 return align;
23756 }
23757
23758   /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
23759      to a 16-byte boundary.  The exact wording is:
23760
23761 An array uses the same alignment as its elements, except that a local or
23762 global array variable of length at least 16 bytes or
23763 a C99 variable-length array variable always has alignment of at least 16 bytes.
23764
23765      This was added to allow use of aligned SSE instructions on arrays.  The
23766      rule is meant for static storage (where the compiler cannot do the
23767      analysis by itself).  We follow it for automatic variables only when
23768      convenient: we fully control everything in the function being compiled,
23769      and functions from other units cannot rely on the alignment.
23770 
23771      Exclude the va_list type.  It is the common case of a local array where
23772      we cannot benefit from the alignment.  */
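  /* For example, a local "double buf[4]" (32 bytes) is therefore given
     128-bit alignment here when compiling x86-64 for speed with SSE enabled,
     even though its element type only needs 64-bit alignment.  */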
23773 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
23774 && TARGET_SSE)
23775 {
23776 if (AGGREGATE_TYPE_P (type)
23777 && (va_list_type_node == NULL_TREE
23778 || (TYPE_MAIN_VARIANT (type)
23779 != TYPE_MAIN_VARIANT (va_list_type_node)))
23780 && TYPE_SIZE (type)
23781 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
23782 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
23783 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
23784 return 128;
23785 }
23786 if (TREE_CODE (type) == ARRAY_TYPE)
23787 {
23788 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
23789 return 64;
23790 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
23791 return 128;
23792 }
23793 else if (TREE_CODE (type) == COMPLEX_TYPE)
23794 {
23795 if (TYPE_MODE (type) == DCmode && align < 64)
23796 return 64;
23797 if ((TYPE_MODE (type) == XCmode
23798 || TYPE_MODE (type) == TCmode) && align < 128)
23799 return 128;
23800 }
23801 else if ((TREE_CODE (type) == RECORD_TYPE
23802 || TREE_CODE (type) == UNION_TYPE
23803 || TREE_CODE (type) == QUAL_UNION_TYPE)
23804 && TYPE_FIELDS (type))
23805 {
23806 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
23807 return 64;
23808 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
23809 return 128;
23810 }
23811 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
23812 || TREE_CODE (type) == INTEGER_TYPE)
23813 {
23814
23815 if (TYPE_MODE (type) == DFmode && align < 64)
23816 return 64;
23817 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
23818 return 128;
23819 }
23820 return align;
23821 }
23822
23823 /* Compute the minimum required alignment for dynamic stack realignment
23824 purposes for a local variable, parameter or a stack slot. EXP is
23825 the data type or decl itself, MODE is its mode and ALIGN is the
23826 alignment that the object would ordinarily have. */
23827
23828 unsigned int
23829 ix86_minimum_alignment (tree exp, enum machine_mode mode,
23830 unsigned int align)
23831 {
23832 tree type, decl;
23833
23834 if (exp && DECL_P (exp))
23835 {
23836 type = TREE_TYPE (exp);
23837 decl = exp;
23838 }
23839 else
23840 {
23841 type = exp;
23842 decl = NULL;
23843 }
23844
23845 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
23846 return align;
23847
23848 /* Don't do dynamic stack realignment for long long objects with
23849 -mpreferred-stack-boundary=2. */
23850 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
23851 && (!type || !TYPE_USER_ALIGN (type))
23852 && (!decl || !DECL_USER_ALIGN (decl)))
23853 return 32;
23854
23855 return align;
23856 }
23857 \f
23858 /* Find a location for the static chain incoming to a nested function.
23859 This is a register, unless all free registers are used by arguments. */
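/* In short: 64-bit targets always use R10.  32-bit targets use ECX by
   default, EAX for fastcall/thiscall functions (whose arguments occupy ECX),
   and for regparm(3) functions the incoming chain lives on the stack, with
   direct calls going through an alternate entry point that pushes ESI, as
   described in the body below.  */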
23860
23861 static rtx
23862 ix86_static_chain (const_tree fndecl, bool incoming_p)
23863 {
23864 unsigned regno;
23865
23866 if (!DECL_STATIC_CHAIN (fndecl))
23867 return NULL;
23868
23869 if (TARGET_64BIT)
23870 {
23871 /* We always use R10 in 64-bit mode. */
23872 regno = R10_REG;
23873 }
23874 else
23875 {
23876 tree fntype;
23877 unsigned int ccvt;
23878
23879 /* By default in 32-bit mode we use ECX to pass the static chain. */
23880 regno = CX_REG;
23881
23882 fntype = TREE_TYPE (fndecl);
23883 ccvt = ix86_get_callcvt (fntype);
23884 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
23885 {
23886 /* Fastcall functions use ecx/edx for arguments, which leaves
23887 us with EAX for the static chain.
23888 Thiscall functions use ecx for arguments, which also
23889 leaves us with EAX for the static chain. */
23890 regno = AX_REG;
23891 }
23892 else if (ix86_function_regparm (fntype, fndecl) == 3)
23893 {
23894 /* For regparm 3, we have no free call-clobbered registers in
23895 which to store the static chain. In order to implement this,
23896 we have the trampoline push the static chain to the stack.
23897 However, we can't push a value below the return address when
23898 we call the nested function directly, so we have to use an
23899 alternate entry point. For this we use ESI, and have the
23900 alternate entry point push ESI, so that things appear the
23901 same once we're executing the nested function. */
23902 if (incoming_p)
23903 {
23904 if (fndecl == current_function_decl)
23905 ix86_static_chain_on_stack = true;
23906 return gen_frame_mem (SImode,
23907 plus_constant (arg_pointer_rtx, -8));
23908 }
23909 regno = SI_REG;
23910 }
23911 }
23912
23913 return gen_rtx_REG (Pmode, regno);
23914 }
23915
23916 /* Emit RTL insns to initialize the variable parts of a trampoline.
23917 FNDECL is the decl of the target address; M_TRAMP is a MEM for
23918 the trampoline, and CHAIN_VALUE is an RTX for the static chain
23919 to be passed to the target function. */
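/* For reference, the byte sequences emitted below decode as follows (a
   reading aid derived from the opcodes used in this function; the
   immediates are filled in at run time):

     64-bit, zero-extendable address:  41 bb imm32   movl   $fnaddr, %r11d
     64-bit, general address:          49 bb imm64   movabs $fnaddr, %r11
     static chain, x32:                41 ba imm32   movl   $chain, %r10d
     static chain, 64-bit:             49 ba imm64   movabs $chain, %r10
     tail:                             49 ff e3 90   jmp *%r11; nop

     32-bit:                           b8/b9 imm32   movl  $chain, %eax/%ecx
                                  or   68 imm32      pushl $chain
                                       e9 rel32      jmp   <fnaddr>  */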
23920
23921 static void
23922 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
23923 {
23924 rtx mem, fnaddr;
23925 int opcode;
23926 int offset = 0;
23927
23928 fnaddr = XEXP (DECL_RTL (fndecl), 0);
23929
23930 if (TARGET_64BIT)
23931 {
23932 int size;
23933
23934       /* Load the function address into r11.  Try to load the address using
23935 	 the shorter movl instead of movabs.  We may want to support movq for
23936 	 kernel mode, but the kernel does not use trampolines at the
23937 	 moment.  */
23938 if (x86_64_zext_immediate_operand (fnaddr, VOIDmode))
23939 {
23940 fnaddr = copy_to_mode_reg (DImode, fnaddr);
23941
23942 mem = adjust_address (m_tramp, HImode, offset);
23943 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
23944
23945 mem = adjust_address (m_tramp, SImode, offset + 2);
23946 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
23947 offset += 6;
23948 }
23949 else
23950 {
23951 mem = adjust_address (m_tramp, HImode, offset);
23952 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
23953
23954 mem = adjust_address (m_tramp, DImode, offset + 2);
23955 emit_move_insn (mem, fnaddr);
23956 offset += 10;
23957 }
23958
23959       /* Load the static chain into r10 using movabs.  Use the shorter
23960 	 movl instead of movabs for x32.  */
23961 if (TARGET_X32)
23962 {
23963 opcode = 0xba41;
23964 size = 6;
23965 }
23966 else
23967 {
23968 opcode = 0xba49;
23969 size = 10;
23970 }
23971
23972 mem = adjust_address (m_tramp, HImode, offset);
23973 emit_move_insn (mem, gen_int_mode (opcode, HImode));
23974
23975 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
23976 emit_move_insn (mem, chain_value);
23977 offset += size;
23978
23979 /* Jump to r11; the last (unused) byte is a nop, only there to
23980 pad the write out to a single 32-bit store. */
23981 mem = adjust_address (m_tramp, SImode, offset);
23982 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
23983 offset += 4;
23984 }
23985 else
23986 {
23987 rtx disp, chain;
23988
23989 /* Depending on the static chain location, either load a register
23990 with a constant, or push the constant to the stack. All of the
23991 instructions are the same size. */
23992 chain = ix86_static_chain (fndecl, true);
23993 if (REG_P (chain))
23994 {
23995 switch (REGNO (chain))
23996 {
23997 case AX_REG:
23998 opcode = 0xb8; break;
23999 case CX_REG:
24000 opcode = 0xb9; break;
24001 default:
24002 gcc_unreachable ();
24003 }
24004 }
24005 else
24006 opcode = 0x68;
24007
24008 mem = adjust_address (m_tramp, QImode, offset);
24009 emit_move_insn (mem, gen_int_mode (opcode, QImode));
24010
24011 mem = adjust_address (m_tramp, SImode, offset + 1);
24012 emit_move_insn (mem, chain_value);
24013 offset += 5;
24014
24015 mem = adjust_address (m_tramp, QImode, offset);
24016 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
24017
24018 mem = adjust_address (m_tramp, SImode, offset + 1);
24019
24020       /* Compute the offset from the end of the jmp to the target function.
24021 	 In the case in which the trampoline stores the static chain on the
24022 	 stack, we need to skip the first insn of the target, which pushes the
24023 	 (call-saved) static-chain register; this push is 1 byte.  */
24024 offset += 5;
24025 disp = expand_binop (SImode, sub_optab, fnaddr,
24026 plus_constant (XEXP (m_tramp, 0),
24027 offset - (MEM_P (chain) ? 1 : 0)),
24028 NULL_RTX, 1, OPTAB_DIRECT);
24029 emit_move_insn (mem, disp);
24030 }
24031
24032 gcc_assert (offset <= TRAMPOLINE_SIZE);
24033
24034 #ifdef HAVE_ENABLE_EXECUTE_STACK
24035 #ifdef CHECK_EXECUTE_STACK_ENABLED
24036 if (CHECK_EXECUTE_STACK_ENABLED)
24037 #endif
24038 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
24039 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
24040 #endif
24041 }
24042 \f
24043 /* The following file contains several enumerations and data structures
24044 built from the definitions in i386-builtin-types.def. */
24045
24046 #include "i386-builtin-types.inc"
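/* Among other things, that generated file is expected to provide the
   enum ix86_builtin_type and enum ix86_builtin_func_type codes together
   with the lookup tables (ix86_builtin_type_vect_base,
   ix86_builtin_type_vect_mode, ix86_builtin_type_ptr_base,
   ix86_builtin_func_start, ix86_builtin_func_args,
   ix86_builtin_func_alias_base) used by the accessor functions below.  */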
24047
24048 /* Table for the ix86 builtin non-function types. */
24049 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
24050
24051 /* Retrieve an element from the above table, building some of
24052 the types lazily. */
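/* Vector type codes (those between IX86_BT_LAST_PRIM and IX86_BT_LAST_VECT)
   are built from an element type and a machine mode looked up in
   ix86_builtin_type_vect_base / ix86_builtin_type_vect_mode; pointer type
   codes wrap (and, past IX86_BT_LAST_PTR, const-qualify) the base type from
   ix86_builtin_type_ptr_base.  */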
24053
24054 static tree
24055 ix86_get_builtin_type (enum ix86_builtin_type tcode)
24056 {
24057 unsigned int index;
24058 tree type, itype;
24059
24060 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
24061
24062 type = ix86_builtin_type_tab[(int) tcode];
24063 if (type != NULL)
24064 return type;
24065
24066 gcc_assert (tcode > IX86_BT_LAST_PRIM);
24067 if (tcode <= IX86_BT_LAST_VECT)
24068 {
24069 enum machine_mode mode;
24070
24071 index = tcode - IX86_BT_LAST_PRIM - 1;
24072 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
24073 mode = ix86_builtin_type_vect_mode[index];
24074
24075 type = build_vector_type_for_mode (itype, mode);
24076 }
24077 else
24078 {
24079 int quals;
24080
24081 index = tcode - IX86_BT_LAST_VECT - 1;
24082 if (tcode <= IX86_BT_LAST_PTR)
24083 quals = TYPE_UNQUALIFIED;
24084 else
24085 quals = TYPE_QUAL_CONST;
24086
24087 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
24088 if (quals != TYPE_UNQUALIFIED)
24089 itype = build_qualified_type (itype, quals);
24090
24091 type = build_pointer_type (itype);
24092 }
24093
24094 ix86_builtin_type_tab[(int) tcode] = type;
24095 return type;
24096 }
24097
24098 /* Table for the ix86 builtin function types. */
24099 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
24100
24101 /* Retrieve an element from the above table, building some of
24102 the types lazily. */
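/* For a non-alias code, ix86_builtin_func_args[start] holds the return type
   code and the entries after it (up to the next start index) hold the
   argument type codes; the loop below walks them in reverse to build the
   argument list.  Alias codes simply reuse the type of their base code.  */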
24103
24104 static tree
24105 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
24106 {
24107 tree type;
24108
24109 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
24110
24111 type = ix86_builtin_func_type_tab[(int) tcode];
24112 if (type != NULL)
24113 return type;
24114
24115 if (tcode <= IX86_BT_LAST_FUNC)
24116 {
24117 unsigned start = ix86_builtin_func_start[(int) tcode];
24118 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
24119 tree rtype, atype, args = void_list_node;
24120 unsigned i;
24121
24122 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
24123 for (i = after - 1; i > start; --i)
24124 {
24125 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
24126 args = tree_cons (NULL, atype, args);
24127 }
24128
24129 type = build_function_type (rtype, args);
24130 }
24131 else
24132 {
24133 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
24134 enum ix86_builtin_func_type icode;
24135
24136 icode = ix86_builtin_func_alias_base[index];
24137 type = ix86_get_builtin_func_type (icode);
24138 }
24139
24140 ix86_builtin_func_type_tab[(int) tcode] = type;
24141 return type;
24142 }
24143
24144
24145 /* Codes for all the SSE/MMX builtins. */
24146 enum ix86_builtins
24147 {
24148 IX86_BUILTIN_ADDPS,
24149 IX86_BUILTIN_ADDSS,
24150 IX86_BUILTIN_DIVPS,
24151 IX86_BUILTIN_DIVSS,
24152 IX86_BUILTIN_MULPS,
24153 IX86_BUILTIN_MULSS,
24154 IX86_BUILTIN_SUBPS,
24155 IX86_BUILTIN_SUBSS,
24156
24157 IX86_BUILTIN_CMPEQPS,
24158 IX86_BUILTIN_CMPLTPS,
24159 IX86_BUILTIN_CMPLEPS,
24160 IX86_BUILTIN_CMPGTPS,
24161 IX86_BUILTIN_CMPGEPS,
24162 IX86_BUILTIN_CMPNEQPS,
24163 IX86_BUILTIN_CMPNLTPS,
24164 IX86_BUILTIN_CMPNLEPS,
24165 IX86_BUILTIN_CMPNGTPS,
24166 IX86_BUILTIN_CMPNGEPS,
24167 IX86_BUILTIN_CMPORDPS,
24168 IX86_BUILTIN_CMPUNORDPS,
24169 IX86_BUILTIN_CMPEQSS,
24170 IX86_BUILTIN_CMPLTSS,
24171 IX86_BUILTIN_CMPLESS,
24172 IX86_BUILTIN_CMPNEQSS,
24173 IX86_BUILTIN_CMPNLTSS,
24174 IX86_BUILTIN_CMPNLESS,
24175 IX86_BUILTIN_CMPNGTSS,
24176 IX86_BUILTIN_CMPNGESS,
24177 IX86_BUILTIN_CMPORDSS,
24178 IX86_BUILTIN_CMPUNORDSS,
24179
24180 IX86_BUILTIN_COMIEQSS,
24181 IX86_BUILTIN_COMILTSS,
24182 IX86_BUILTIN_COMILESS,
24183 IX86_BUILTIN_COMIGTSS,
24184 IX86_BUILTIN_COMIGESS,
24185 IX86_BUILTIN_COMINEQSS,
24186 IX86_BUILTIN_UCOMIEQSS,
24187 IX86_BUILTIN_UCOMILTSS,
24188 IX86_BUILTIN_UCOMILESS,
24189 IX86_BUILTIN_UCOMIGTSS,
24190 IX86_BUILTIN_UCOMIGESS,
24191 IX86_BUILTIN_UCOMINEQSS,
24192
24193 IX86_BUILTIN_CVTPI2PS,
24194 IX86_BUILTIN_CVTPS2PI,
24195 IX86_BUILTIN_CVTSI2SS,
24196 IX86_BUILTIN_CVTSI642SS,
24197 IX86_BUILTIN_CVTSS2SI,
24198 IX86_BUILTIN_CVTSS2SI64,
24199 IX86_BUILTIN_CVTTPS2PI,
24200 IX86_BUILTIN_CVTTSS2SI,
24201 IX86_BUILTIN_CVTTSS2SI64,
24202
24203 IX86_BUILTIN_MAXPS,
24204 IX86_BUILTIN_MAXSS,
24205 IX86_BUILTIN_MINPS,
24206 IX86_BUILTIN_MINSS,
24207
24208 IX86_BUILTIN_LOADUPS,
24209 IX86_BUILTIN_STOREUPS,
24210 IX86_BUILTIN_MOVSS,
24211
24212 IX86_BUILTIN_MOVHLPS,
24213 IX86_BUILTIN_MOVLHPS,
24214 IX86_BUILTIN_LOADHPS,
24215 IX86_BUILTIN_LOADLPS,
24216 IX86_BUILTIN_STOREHPS,
24217 IX86_BUILTIN_STORELPS,
24218
24219 IX86_BUILTIN_MASKMOVQ,
24220 IX86_BUILTIN_MOVMSKPS,
24221 IX86_BUILTIN_PMOVMSKB,
24222
24223 IX86_BUILTIN_MOVNTPS,
24224 IX86_BUILTIN_MOVNTQ,
24225
24226 IX86_BUILTIN_LOADDQU,
24227 IX86_BUILTIN_STOREDQU,
24228
24229 IX86_BUILTIN_PACKSSWB,
24230 IX86_BUILTIN_PACKSSDW,
24231 IX86_BUILTIN_PACKUSWB,
24232
24233 IX86_BUILTIN_PADDB,
24234 IX86_BUILTIN_PADDW,
24235 IX86_BUILTIN_PADDD,
24236 IX86_BUILTIN_PADDQ,
24237 IX86_BUILTIN_PADDSB,
24238 IX86_BUILTIN_PADDSW,
24239 IX86_BUILTIN_PADDUSB,
24240 IX86_BUILTIN_PADDUSW,
24241 IX86_BUILTIN_PSUBB,
24242 IX86_BUILTIN_PSUBW,
24243 IX86_BUILTIN_PSUBD,
24244 IX86_BUILTIN_PSUBQ,
24245 IX86_BUILTIN_PSUBSB,
24246 IX86_BUILTIN_PSUBSW,
24247 IX86_BUILTIN_PSUBUSB,
24248 IX86_BUILTIN_PSUBUSW,
24249
24250 IX86_BUILTIN_PAND,
24251 IX86_BUILTIN_PANDN,
24252 IX86_BUILTIN_POR,
24253 IX86_BUILTIN_PXOR,
24254
24255 IX86_BUILTIN_PAVGB,
24256 IX86_BUILTIN_PAVGW,
24257
24258 IX86_BUILTIN_PCMPEQB,
24259 IX86_BUILTIN_PCMPEQW,
24260 IX86_BUILTIN_PCMPEQD,
24261 IX86_BUILTIN_PCMPGTB,
24262 IX86_BUILTIN_PCMPGTW,
24263 IX86_BUILTIN_PCMPGTD,
24264
24265 IX86_BUILTIN_PMADDWD,
24266
24267 IX86_BUILTIN_PMAXSW,
24268 IX86_BUILTIN_PMAXUB,
24269 IX86_BUILTIN_PMINSW,
24270 IX86_BUILTIN_PMINUB,
24271
24272 IX86_BUILTIN_PMULHUW,
24273 IX86_BUILTIN_PMULHW,
24274 IX86_BUILTIN_PMULLW,
24275
24276 IX86_BUILTIN_PSADBW,
24277 IX86_BUILTIN_PSHUFW,
24278
24279 IX86_BUILTIN_PSLLW,
24280 IX86_BUILTIN_PSLLD,
24281 IX86_BUILTIN_PSLLQ,
24282 IX86_BUILTIN_PSRAW,
24283 IX86_BUILTIN_PSRAD,
24284 IX86_BUILTIN_PSRLW,
24285 IX86_BUILTIN_PSRLD,
24286 IX86_BUILTIN_PSRLQ,
24287 IX86_BUILTIN_PSLLWI,
24288 IX86_BUILTIN_PSLLDI,
24289 IX86_BUILTIN_PSLLQI,
24290 IX86_BUILTIN_PSRAWI,
24291 IX86_BUILTIN_PSRADI,
24292 IX86_BUILTIN_PSRLWI,
24293 IX86_BUILTIN_PSRLDI,
24294 IX86_BUILTIN_PSRLQI,
24295
24296 IX86_BUILTIN_PUNPCKHBW,
24297 IX86_BUILTIN_PUNPCKHWD,
24298 IX86_BUILTIN_PUNPCKHDQ,
24299 IX86_BUILTIN_PUNPCKLBW,
24300 IX86_BUILTIN_PUNPCKLWD,
24301 IX86_BUILTIN_PUNPCKLDQ,
24302
24303 IX86_BUILTIN_SHUFPS,
24304
24305 IX86_BUILTIN_RCPPS,
24306 IX86_BUILTIN_RCPSS,
24307 IX86_BUILTIN_RSQRTPS,
24308 IX86_BUILTIN_RSQRTPS_NR,
24309 IX86_BUILTIN_RSQRTSS,
24310 IX86_BUILTIN_RSQRTF,
24311 IX86_BUILTIN_SQRTPS,
24312 IX86_BUILTIN_SQRTPS_NR,
24313 IX86_BUILTIN_SQRTSS,
24314
24315 IX86_BUILTIN_UNPCKHPS,
24316 IX86_BUILTIN_UNPCKLPS,
24317
24318 IX86_BUILTIN_ANDPS,
24319 IX86_BUILTIN_ANDNPS,
24320 IX86_BUILTIN_ORPS,
24321 IX86_BUILTIN_XORPS,
24322
24323 IX86_BUILTIN_EMMS,
24324 IX86_BUILTIN_LDMXCSR,
24325 IX86_BUILTIN_STMXCSR,
24326 IX86_BUILTIN_SFENCE,
24327
24328 /* 3DNow! Original */
24329 IX86_BUILTIN_FEMMS,
24330 IX86_BUILTIN_PAVGUSB,
24331 IX86_BUILTIN_PF2ID,
24332 IX86_BUILTIN_PFACC,
24333 IX86_BUILTIN_PFADD,
24334 IX86_BUILTIN_PFCMPEQ,
24335 IX86_BUILTIN_PFCMPGE,
24336 IX86_BUILTIN_PFCMPGT,
24337 IX86_BUILTIN_PFMAX,
24338 IX86_BUILTIN_PFMIN,
24339 IX86_BUILTIN_PFMUL,
24340 IX86_BUILTIN_PFRCP,
24341 IX86_BUILTIN_PFRCPIT1,
24342 IX86_BUILTIN_PFRCPIT2,
24343 IX86_BUILTIN_PFRSQIT1,
24344 IX86_BUILTIN_PFRSQRT,
24345 IX86_BUILTIN_PFSUB,
24346 IX86_BUILTIN_PFSUBR,
24347 IX86_BUILTIN_PI2FD,
24348 IX86_BUILTIN_PMULHRW,
24349
24350 /* 3DNow! Athlon Extensions */
24351 IX86_BUILTIN_PF2IW,
24352 IX86_BUILTIN_PFNACC,
24353 IX86_BUILTIN_PFPNACC,
24354 IX86_BUILTIN_PI2FW,
24355 IX86_BUILTIN_PSWAPDSI,
24356 IX86_BUILTIN_PSWAPDSF,
24357
24358 /* SSE2 */
24359 IX86_BUILTIN_ADDPD,
24360 IX86_BUILTIN_ADDSD,
24361 IX86_BUILTIN_DIVPD,
24362 IX86_BUILTIN_DIVSD,
24363 IX86_BUILTIN_MULPD,
24364 IX86_BUILTIN_MULSD,
24365 IX86_BUILTIN_SUBPD,
24366 IX86_BUILTIN_SUBSD,
24367
24368 IX86_BUILTIN_CMPEQPD,
24369 IX86_BUILTIN_CMPLTPD,
24370 IX86_BUILTIN_CMPLEPD,
24371 IX86_BUILTIN_CMPGTPD,
24372 IX86_BUILTIN_CMPGEPD,
24373 IX86_BUILTIN_CMPNEQPD,
24374 IX86_BUILTIN_CMPNLTPD,
24375 IX86_BUILTIN_CMPNLEPD,
24376 IX86_BUILTIN_CMPNGTPD,
24377 IX86_BUILTIN_CMPNGEPD,
24378 IX86_BUILTIN_CMPORDPD,
24379 IX86_BUILTIN_CMPUNORDPD,
24380 IX86_BUILTIN_CMPEQSD,
24381 IX86_BUILTIN_CMPLTSD,
24382 IX86_BUILTIN_CMPLESD,
24383 IX86_BUILTIN_CMPNEQSD,
24384 IX86_BUILTIN_CMPNLTSD,
24385 IX86_BUILTIN_CMPNLESD,
24386 IX86_BUILTIN_CMPORDSD,
24387 IX86_BUILTIN_CMPUNORDSD,
24388
24389 IX86_BUILTIN_COMIEQSD,
24390 IX86_BUILTIN_COMILTSD,
24391 IX86_BUILTIN_COMILESD,
24392 IX86_BUILTIN_COMIGTSD,
24393 IX86_BUILTIN_COMIGESD,
24394 IX86_BUILTIN_COMINEQSD,
24395 IX86_BUILTIN_UCOMIEQSD,
24396 IX86_BUILTIN_UCOMILTSD,
24397 IX86_BUILTIN_UCOMILESD,
24398 IX86_BUILTIN_UCOMIGTSD,
24399 IX86_BUILTIN_UCOMIGESD,
24400 IX86_BUILTIN_UCOMINEQSD,
24401
24402 IX86_BUILTIN_MAXPD,
24403 IX86_BUILTIN_MAXSD,
24404 IX86_BUILTIN_MINPD,
24405 IX86_BUILTIN_MINSD,
24406
24407 IX86_BUILTIN_ANDPD,
24408 IX86_BUILTIN_ANDNPD,
24409 IX86_BUILTIN_ORPD,
24410 IX86_BUILTIN_XORPD,
24411
24412 IX86_BUILTIN_SQRTPD,
24413 IX86_BUILTIN_SQRTSD,
24414
24415 IX86_BUILTIN_UNPCKHPD,
24416 IX86_BUILTIN_UNPCKLPD,
24417
24418 IX86_BUILTIN_SHUFPD,
24419
24420 IX86_BUILTIN_LOADUPD,
24421 IX86_BUILTIN_STOREUPD,
24422 IX86_BUILTIN_MOVSD,
24423
24424 IX86_BUILTIN_LOADHPD,
24425 IX86_BUILTIN_LOADLPD,
24426
24427 IX86_BUILTIN_CVTDQ2PD,
24428 IX86_BUILTIN_CVTDQ2PS,
24429
24430 IX86_BUILTIN_CVTPD2DQ,
24431 IX86_BUILTIN_CVTPD2PI,
24432 IX86_BUILTIN_CVTPD2PS,
24433 IX86_BUILTIN_CVTTPD2DQ,
24434 IX86_BUILTIN_CVTTPD2PI,
24435
24436 IX86_BUILTIN_CVTPI2PD,
24437 IX86_BUILTIN_CVTSI2SD,
24438 IX86_BUILTIN_CVTSI642SD,
24439
24440 IX86_BUILTIN_CVTSD2SI,
24441 IX86_BUILTIN_CVTSD2SI64,
24442 IX86_BUILTIN_CVTSD2SS,
24443 IX86_BUILTIN_CVTSS2SD,
24444 IX86_BUILTIN_CVTTSD2SI,
24445 IX86_BUILTIN_CVTTSD2SI64,
24446
24447 IX86_BUILTIN_CVTPS2DQ,
24448 IX86_BUILTIN_CVTPS2PD,
24449 IX86_BUILTIN_CVTTPS2DQ,
24450
24451 IX86_BUILTIN_MOVNTI,
24452 IX86_BUILTIN_MOVNTPD,
24453 IX86_BUILTIN_MOVNTDQ,
24454
24455 IX86_BUILTIN_MOVQ128,
24456
24457 /* SSE2 MMX */
24458 IX86_BUILTIN_MASKMOVDQU,
24459 IX86_BUILTIN_MOVMSKPD,
24460 IX86_BUILTIN_PMOVMSKB128,
24461
24462 IX86_BUILTIN_PACKSSWB128,
24463 IX86_BUILTIN_PACKSSDW128,
24464 IX86_BUILTIN_PACKUSWB128,
24465
24466 IX86_BUILTIN_PADDB128,
24467 IX86_BUILTIN_PADDW128,
24468 IX86_BUILTIN_PADDD128,
24469 IX86_BUILTIN_PADDQ128,
24470 IX86_BUILTIN_PADDSB128,
24471 IX86_BUILTIN_PADDSW128,
24472 IX86_BUILTIN_PADDUSB128,
24473 IX86_BUILTIN_PADDUSW128,
24474 IX86_BUILTIN_PSUBB128,
24475 IX86_BUILTIN_PSUBW128,
24476 IX86_BUILTIN_PSUBD128,
24477 IX86_BUILTIN_PSUBQ128,
24478 IX86_BUILTIN_PSUBSB128,
24479 IX86_BUILTIN_PSUBSW128,
24480 IX86_BUILTIN_PSUBUSB128,
24481 IX86_BUILTIN_PSUBUSW128,
24482
24483 IX86_BUILTIN_PAND128,
24484 IX86_BUILTIN_PANDN128,
24485 IX86_BUILTIN_POR128,
24486 IX86_BUILTIN_PXOR128,
24487
24488 IX86_BUILTIN_PAVGB128,
24489 IX86_BUILTIN_PAVGW128,
24490
24491 IX86_BUILTIN_PCMPEQB128,
24492 IX86_BUILTIN_PCMPEQW128,
24493 IX86_BUILTIN_PCMPEQD128,
24494 IX86_BUILTIN_PCMPGTB128,
24495 IX86_BUILTIN_PCMPGTW128,
24496 IX86_BUILTIN_PCMPGTD128,
24497
24498 IX86_BUILTIN_PMADDWD128,
24499
24500 IX86_BUILTIN_PMAXSW128,
24501 IX86_BUILTIN_PMAXUB128,
24502 IX86_BUILTIN_PMINSW128,
24503 IX86_BUILTIN_PMINUB128,
24504
24505 IX86_BUILTIN_PMULUDQ,
24506 IX86_BUILTIN_PMULUDQ128,
24507 IX86_BUILTIN_PMULHUW128,
24508 IX86_BUILTIN_PMULHW128,
24509 IX86_BUILTIN_PMULLW128,
24510
24511 IX86_BUILTIN_PSADBW128,
24512 IX86_BUILTIN_PSHUFHW,
24513 IX86_BUILTIN_PSHUFLW,
24514 IX86_BUILTIN_PSHUFD,
24515
24516 IX86_BUILTIN_PSLLDQI128,
24517 IX86_BUILTIN_PSLLWI128,
24518 IX86_BUILTIN_PSLLDI128,
24519 IX86_BUILTIN_PSLLQI128,
24520 IX86_BUILTIN_PSRAWI128,
24521 IX86_BUILTIN_PSRADI128,
24522 IX86_BUILTIN_PSRLDQI128,
24523 IX86_BUILTIN_PSRLWI128,
24524 IX86_BUILTIN_PSRLDI128,
24525 IX86_BUILTIN_PSRLQI128,
24526
24527 IX86_BUILTIN_PSLLDQ128,
24528 IX86_BUILTIN_PSLLW128,
24529 IX86_BUILTIN_PSLLD128,
24530 IX86_BUILTIN_PSLLQ128,
24531 IX86_BUILTIN_PSRAW128,
24532 IX86_BUILTIN_PSRAD128,
24533 IX86_BUILTIN_PSRLW128,
24534 IX86_BUILTIN_PSRLD128,
24535 IX86_BUILTIN_PSRLQ128,
24536
24537 IX86_BUILTIN_PUNPCKHBW128,
24538 IX86_BUILTIN_PUNPCKHWD128,
24539 IX86_BUILTIN_PUNPCKHDQ128,
24540 IX86_BUILTIN_PUNPCKHQDQ128,
24541 IX86_BUILTIN_PUNPCKLBW128,
24542 IX86_BUILTIN_PUNPCKLWD128,
24543 IX86_BUILTIN_PUNPCKLDQ128,
24544 IX86_BUILTIN_PUNPCKLQDQ128,
24545
24546 IX86_BUILTIN_CLFLUSH,
24547 IX86_BUILTIN_MFENCE,
24548 IX86_BUILTIN_LFENCE,
24549 IX86_BUILTIN_PAUSE,
24550
24551 IX86_BUILTIN_BSRSI,
24552 IX86_BUILTIN_BSRDI,
24553 IX86_BUILTIN_RDPMC,
24554 IX86_BUILTIN_RDTSC,
24555 IX86_BUILTIN_RDTSCP,
24556 IX86_BUILTIN_ROLQI,
24557 IX86_BUILTIN_ROLHI,
24558 IX86_BUILTIN_RORQI,
24559 IX86_BUILTIN_RORHI,
24560
24561 /* SSE3. */
24562 IX86_BUILTIN_ADDSUBPS,
24563 IX86_BUILTIN_HADDPS,
24564 IX86_BUILTIN_HSUBPS,
24565 IX86_BUILTIN_MOVSHDUP,
24566 IX86_BUILTIN_MOVSLDUP,
24567 IX86_BUILTIN_ADDSUBPD,
24568 IX86_BUILTIN_HADDPD,
24569 IX86_BUILTIN_HSUBPD,
24570 IX86_BUILTIN_LDDQU,
24571
24572 IX86_BUILTIN_MONITOR,
24573 IX86_BUILTIN_MWAIT,
24574
24575 /* SSSE3. */
24576 IX86_BUILTIN_PHADDW,
24577 IX86_BUILTIN_PHADDD,
24578 IX86_BUILTIN_PHADDSW,
24579 IX86_BUILTIN_PHSUBW,
24580 IX86_BUILTIN_PHSUBD,
24581 IX86_BUILTIN_PHSUBSW,
24582 IX86_BUILTIN_PMADDUBSW,
24583 IX86_BUILTIN_PMULHRSW,
24584 IX86_BUILTIN_PSHUFB,
24585 IX86_BUILTIN_PSIGNB,
24586 IX86_BUILTIN_PSIGNW,
24587 IX86_BUILTIN_PSIGND,
24588 IX86_BUILTIN_PALIGNR,
24589 IX86_BUILTIN_PABSB,
24590 IX86_BUILTIN_PABSW,
24591 IX86_BUILTIN_PABSD,
24592
24593 IX86_BUILTIN_PHADDW128,
24594 IX86_BUILTIN_PHADDD128,
24595 IX86_BUILTIN_PHADDSW128,
24596 IX86_BUILTIN_PHSUBW128,
24597 IX86_BUILTIN_PHSUBD128,
24598 IX86_BUILTIN_PHSUBSW128,
24599 IX86_BUILTIN_PMADDUBSW128,
24600 IX86_BUILTIN_PMULHRSW128,
24601 IX86_BUILTIN_PSHUFB128,
24602 IX86_BUILTIN_PSIGNB128,
24603 IX86_BUILTIN_PSIGNW128,
24604 IX86_BUILTIN_PSIGND128,
24605 IX86_BUILTIN_PALIGNR128,
24606 IX86_BUILTIN_PABSB128,
24607 IX86_BUILTIN_PABSW128,
24608 IX86_BUILTIN_PABSD128,
24609
24610 /* AMDFAM10 - SSE4A New Instructions. */
24611 IX86_BUILTIN_MOVNTSD,
24612 IX86_BUILTIN_MOVNTSS,
24613 IX86_BUILTIN_EXTRQI,
24614 IX86_BUILTIN_EXTRQ,
24615 IX86_BUILTIN_INSERTQI,
24616 IX86_BUILTIN_INSERTQ,
24617
24618 /* SSE4.1. */
24619 IX86_BUILTIN_BLENDPD,
24620 IX86_BUILTIN_BLENDPS,
24621 IX86_BUILTIN_BLENDVPD,
24622 IX86_BUILTIN_BLENDVPS,
24623 IX86_BUILTIN_PBLENDVB128,
24624 IX86_BUILTIN_PBLENDW128,
24625
24626 IX86_BUILTIN_DPPD,
24627 IX86_BUILTIN_DPPS,
24628
24629 IX86_BUILTIN_INSERTPS128,
24630
24631 IX86_BUILTIN_MOVNTDQA,
24632 IX86_BUILTIN_MPSADBW128,
24633 IX86_BUILTIN_PACKUSDW128,
24634 IX86_BUILTIN_PCMPEQQ,
24635 IX86_BUILTIN_PHMINPOSUW128,
24636
24637 IX86_BUILTIN_PMAXSB128,
24638 IX86_BUILTIN_PMAXSD128,
24639 IX86_BUILTIN_PMAXUD128,
24640 IX86_BUILTIN_PMAXUW128,
24641
24642 IX86_BUILTIN_PMINSB128,
24643 IX86_BUILTIN_PMINSD128,
24644 IX86_BUILTIN_PMINUD128,
24645 IX86_BUILTIN_PMINUW128,
24646
24647 IX86_BUILTIN_PMOVSXBW128,
24648 IX86_BUILTIN_PMOVSXBD128,
24649 IX86_BUILTIN_PMOVSXBQ128,
24650 IX86_BUILTIN_PMOVSXWD128,
24651 IX86_BUILTIN_PMOVSXWQ128,
24652 IX86_BUILTIN_PMOVSXDQ128,
24653
24654 IX86_BUILTIN_PMOVZXBW128,
24655 IX86_BUILTIN_PMOVZXBD128,
24656 IX86_BUILTIN_PMOVZXBQ128,
24657 IX86_BUILTIN_PMOVZXWD128,
24658 IX86_BUILTIN_PMOVZXWQ128,
24659 IX86_BUILTIN_PMOVZXDQ128,
24660
24661 IX86_BUILTIN_PMULDQ128,
24662 IX86_BUILTIN_PMULLD128,
24663
24664 IX86_BUILTIN_ROUNDPD,
24665 IX86_BUILTIN_ROUNDPS,
24666 IX86_BUILTIN_ROUNDSD,
24667 IX86_BUILTIN_ROUNDSS,
24668
24669 IX86_BUILTIN_FLOORPD,
24670 IX86_BUILTIN_CEILPD,
24671 IX86_BUILTIN_TRUNCPD,
24672 IX86_BUILTIN_RINTPD,
24673 IX86_BUILTIN_ROUNDPD_AZ,
24674 IX86_BUILTIN_FLOORPS,
24675 IX86_BUILTIN_CEILPS,
24676 IX86_BUILTIN_TRUNCPS,
24677 IX86_BUILTIN_RINTPS,
24678 IX86_BUILTIN_ROUNDPS_AZ,
24679
24680 IX86_BUILTIN_PTESTZ,
24681 IX86_BUILTIN_PTESTC,
24682 IX86_BUILTIN_PTESTNZC,
24683
24684 IX86_BUILTIN_VEC_INIT_V2SI,
24685 IX86_BUILTIN_VEC_INIT_V4HI,
24686 IX86_BUILTIN_VEC_INIT_V8QI,
24687 IX86_BUILTIN_VEC_EXT_V2DF,
24688 IX86_BUILTIN_VEC_EXT_V2DI,
24689 IX86_BUILTIN_VEC_EXT_V4SF,
24690 IX86_BUILTIN_VEC_EXT_V4SI,
24691 IX86_BUILTIN_VEC_EXT_V8HI,
24692 IX86_BUILTIN_VEC_EXT_V2SI,
24693 IX86_BUILTIN_VEC_EXT_V4HI,
24694 IX86_BUILTIN_VEC_EXT_V16QI,
24695 IX86_BUILTIN_VEC_SET_V2DI,
24696 IX86_BUILTIN_VEC_SET_V4SF,
24697 IX86_BUILTIN_VEC_SET_V4SI,
24698 IX86_BUILTIN_VEC_SET_V8HI,
24699 IX86_BUILTIN_VEC_SET_V4HI,
24700 IX86_BUILTIN_VEC_SET_V16QI,
24701
24702 IX86_BUILTIN_VEC_PACK_SFIX,
24703
24704 /* SSE4.2. */
24705 IX86_BUILTIN_CRC32QI,
24706 IX86_BUILTIN_CRC32HI,
24707 IX86_BUILTIN_CRC32SI,
24708 IX86_BUILTIN_CRC32DI,
24709
24710 IX86_BUILTIN_PCMPESTRI128,
24711 IX86_BUILTIN_PCMPESTRM128,
24712 IX86_BUILTIN_PCMPESTRA128,
24713 IX86_BUILTIN_PCMPESTRC128,
24714 IX86_BUILTIN_PCMPESTRO128,
24715 IX86_BUILTIN_PCMPESTRS128,
24716 IX86_BUILTIN_PCMPESTRZ128,
24717 IX86_BUILTIN_PCMPISTRI128,
24718 IX86_BUILTIN_PCMPISTRM128,
24719 IX86_BUILTIN_PCMPISTRA128,
24720 IX86_BUILTIN_PCMPISTRC128,
24721 IX86_BUILTIN_PCMPISTRO128,
24722 IX86_BUILTIN_PCMPISTRS128,
24723 IX86_BUILTIN_PCMPISTRZ128,
24724
24725 IX86_BUILTIN_PCMPGTQ,
24726
24727 /* AES instructions */
24728 IX86_BUILTIN_AESENC128,
24729 IX86_BUILTIN_AESENCLAST128,
24730 IX86_BUILTIN_AESDEC128,
24731 IX86_BUILTIN_AESDECLAST128,
24732 IX86_BUILTIN_AESIMC128,
24733 IX86_BUILTIN_AESKEYGENASSIST128,
24734
24735 /* PCLMUL instruction */
24736 IX86_BUILTIN_PCLMULQDQ128,
24737
24738 /* AVX */
24739 IX86_BUILTIN_ADDPD256,
24740 IX86_BUILTIN_ADDPS256,
24741 IX86_BUILTIN_ADDSUBPD256,
24742 IX86_BUILTIN_ADDSUBPS256,
24743 IX86_BUILTIN_ANDPD256,
24744 IX86_BUILTIN_ANDPS256,
24745 IX86_BUILTIN_ANDNPD256,
24746 IX86_BUILTIN_ANDNPS256,
24747 IX86_BUILTIN_BLENDPD256,
24748 IX86_BUILTIN_BLENDPS256,
24749 IX86_BUILTIN_BLENDVPD256,
24750 IX86_BUILTIN_BLENDVPS256,
24751 IX86_BUILTIN_DIVPD256,
24752 IX86_BUILTIN_DIVPS256,
24753 IX86_BUILTIN_DPPS256,
24754 IX86_BUILTIN_HADDPD256,
24755 IX86_BUILTIN_HADDPS256,
24756 IX86_BUILTIN_HSUBPD256,
24757 IX86_BUILTIN_HSUBPS256,
24758 IX86_BUILTIN_MAXPD256,
24759 IX86_BUILTIN_MAXPS256,
24760 IX86_BUILTIN_MINPD256,
24761 IX86_BUILTIN_MINPS256,
24762 IX86_BUILTIN_MULPD256,
24763 IX86_BUILTIN_MULPS256,
24764 IX86_BUILTIN_ORPD256,
24765 IX86_BUILTIN_ORPS256,
24766 IX86_BUILTIN_SHUFPD256,
24767 IX86_BUILTIN_SHUFPS256,
24768 IX86_BUILTIN_SUBPD256,
24769 IX86_BUILTIN_SUBPS256,
24770 IX86_BUILTIN_XORPD256,
24771 IX86_BUILTIN_XORPS256,
24772 IX86_BUILTIN_CMPSD,
24773 IX86_BUILTIN_CMPSS,
24774 IX86_BUILTIN_CMPPD,
24775 IX86_BUILTIN_CMPPS,
24776 IX86_BUILTIN_CMPPD256,
24777 IX86_BUILTIN_CMPPS256,
24778 IX86_BUILTIN_CVTDQ2PD256,
24779 IX86_BUILTIN_CVTDQ2PS256,
24780 IX86_BUILTIN_CVTPD2PS256,
24781 IX86_BUILTIN_CVTPS2DQ256,
24782 IX86_BUILTIN_CVTPS2PD256,
24783 IX86_BUILTIN_CVTTPD2DQ256,
24784 IX86_BUILTIN_CVTPD2DQ256,
24785 IX86_BUILTIN_CVTTPS2DQ256,
24786 IX86_BUILTIN_EXTRACTF128PD256,
24787 IX86_BUILTIN_EXTRACTF128PS256,
24788 IX86_BUILTIN_EXTRACTF128SI256,
24789 IX86_BUILTIN_VZEROALL,
24790 IX86_BUILTIN_VZEROUPPER,
24791 IX86_BUILTIN_VPERMILVARPD,
24792 IX86_BUILTIN_VPERMILVARPS,
24793 IX86_BUILTIN_VPERMILVARPD256,
24794 IX86_BUILTIN_VPERMILVARPS256,
24795 IX86_BUILTIN_VPERMILPD,
24796 IX86_BUILTIN_VPERMILPS,
24797 IX86_BUILTIN_VPERMILPD256,
24798 IX86_BUILTIN_VPERMILPS256,
24799 IX86_BUILTIN_VPERMIL2PD,
24800 IX86_BUILTIN_VPERMIL2PS,
24801 IX86_BUILTIN_VPERMIL2PD256,
24802 IX86_BUILTIN_VPERMIL2PS256,
24803 IX86_BUILTIN_VPERM2F128PD256,
24804 IX86_BUILTIN_VPERM2F128PS256,
24805 IX86_BUILTIN_VPERM2F128SI256,
24806 IX86_BUILTIN_VBROADCASTSS,
24807 IX86_BUILTIN_VBROADCASTSD256,
24808 IX86_BUILTIN_VBROADCASTSS256,
24809 IX86_BUILTIN_VBROADCASTPD256,
24810 IX86_BUILTIN_VBROADCASTPS256,
24811 IX86_BUILTIN_VINSERTF128PD256,
24812 IX86_BUILTIN_VINSERTF128PS256,
24813 IX86_BUILTIN_VINSERTF128SI256,
24814 IX86_BUILTIN_LOADUPD256,
24815 IX86_BUILTIN_LOADUPS256,
24816 IX86_BUILTIN_STOREUPD256,
24817 IX86_BUILTIN_STOREUPS256,
24818 IX86_BUILTIN_LDDQU256,
24819 IX86_BUILTIN_MOVNTDQ256,
24820 IX86_BUILTIN_MOVNTPD256,
24821 IX86_BUILTIN_MOVNTPS256,
24822 IX86_BUILTIN_LOADDQU256,
24823 IX86_BUILTIN_STOREDQU256,
24824 IX86_BUILTIN_MASKLOADPD,
24825 IX86_BUILTIN_MASKLOADPS,
24826 IX86_BUILTIN_MASKSTOREPD,
24827 IX86_BUILTIN_MASKSTOREPS,
24828 IX86_BUILTIN_MASKLOADPD256,
24829 IX86_BUILTIN_MASKLOADPS256,
24830 IX86_BUILTIN_MASKSTOREPD256,
24831 IX86_BUILTIN_MASKSTOREPS256,
24832 IX86_BUILTIN_MOVSHDUP256,
24833 IX86_BUILTIN_MOVSLDUP256,
24834 IX86_BUILTIN_MOVDDUP256,
24835
24836 IX86_BUILTIN_SQRTPD256,
24837 IX86_BUILTIN_SQRTPS256,
24838 IX86_BUILTIN_SQRTPS_NR256,
24839 IX86_BUILTIN_RSQRTPS256,
24840 IX86_BUILTIN_RSQRTPS_NR256,
24841
24842 IX86_BUILTIN_RCPPS256,
24843
24844 IX86_BUILTIN_ROUNDPD256,
24845 IX86_BUILTIN_ROUNDPS256,
24846
24847 IX86_BUILTIN_FLOORPD256,
24848 IX86_BUILTIN_CEILPD256,
24849 IX86_BUILTIN_TRUNCPD256,
24850 IX86_BUILTIN_RINTPD256,
24851 IX86_BUILTIN_ROUNDPD_AZ256,
24852 IX86_BUILTIN_FLOORPS256,
24853 IX86_BUILTIN_CEILPS256,
24854 IX86_BUILTIN_TRUNCPS256,
24855 IX86_BUILTIN_RINTPS256,
24856 IX86_BUILTIN_ROUNDPS_AZ256,
24857
24858 IX86_BUILTIN_UNPCKHPD256,
24859 IX86_BUILTIN_UNPCKLPD256,
24860 IX86_BUILTIN_UNPCKHPS256,
24861 IX86_BUILTIN_UNPCKLPS256,
24862
24863 IX86_BUILTIN_SI256_SI,
24864 IX86_BUILTIN_PS256_PS,
24865 IX86_BUILTIN_PD256_PD,
24866 IX86_BUILTIN_SI_SI256,
24867 IX86_BUILTIN_PS_PS256,
24868 IX86_BUILTIN_PD_PD256,
24869
24870 IX86_BUILTIN_VTESTZPD,
24871 IX86_BUILTIN_VTESTCPD,
24872 IX86_BUILTIN_VTESTNZCPD,
24873 IX86_BUILTIN_VTESTZPS,
24874 IX86_BUILTIN_VTESTCPS,
24875 IX86_BUILTIN_VTESTNZCPS,
24876 IX86_BUILTIN_VTESTZPD256,
24877 IX86_BUILTIN_VTESTCPD256,
24878 IX86_BUILTIN_VTESTNZCPD256,
24879 IX86_BUILTIN_VTESTZPS256,
24880 IX86_BUILTIN_VTESTCPS256,
24881 IX86_BUILTIN_VTESTNZCPS256,
24882 IX86_BUILTIN_PTESTZ256,
24883 IX86_BUILTIN_PTESTC256,
24884 IX86_BUILTIN_PTESTNZC256,
24885
24886 IX86_BUILTIN_MOVMSKPD256,
24887 IX86_BUILTIN_MOVMSKPS256,
24888
24889 /* AVX2 */
24890 IX86_BUILTIN_MPSADBW256,
24891 IX86_BUILTIN_PABSB256,
24892 IX86_BUILTIN_PABSW256,
24893 IX86_BUILTIN_PABSD256,
24894 IX86_BUILTIN_PACKSSDW256,
24895 IX86_BUILTIN_PACKSSWB256,
24896 IX86_BUILTIN_PACKUSDW256,
24897 IX86_BUILTIN_PACKUSWB256,
24898 IX86_BUILTIN_PADDB256,
24899 IX86_BUILTIN_PADDW256,
24900 IX86_BUILTIN_PADDD256,
24901 IX86_BUILTIN_PADDQ256,
24902 IX86_BUILTIN_PADDSB256,
24903 IX86_BUILTIN_PADDSW256,
24904 IX86_BUILTIN_PADDUSB256,
24905 IX86_BUILTIN_PADDUSW256,
24906 IX86_BUILTIN_PALIGNR256,
24907 IX86_BUILTIN_AND256I,
24908 IX86_BUILTIN_ANDNOT256I,
24909 IX86_BUILTIN_PAVGB256,
24910 IX86_BUILTIN_PAVGW256,
24911 IX86_BUILTIN_PBLENDVB256,
24912 IX86_BUILTIN_PBLENDVW256,
24913 IX86_BUILTIN_PCMPEQB256,
24914 IX86_BUILTIN_PCMPEQW256,
24915 IX86_BUILTIN_PCMPEQD256,
24916 IX86_BUILTIN_PCMPEQQ256,
24917 IX86_BUILTIN_PCMPGTB256,
24918 IX86_BUILTIN_PCMPGTW256,
24919 IX86_BUILTIN_PCMPGTD256,
24920 IX86_BUILTIN_PCMPGTQ256,
24921 IX86_BUILTIN_PHADDW256,
24922 IX86_BUILTIN_PHADDD256,
24923 IX86_BUILTIN_PHADDSW256,
24924 IX86_BUILTIN_PHSUBW256,
24925 IX86_BUILTIN_PHSUBD256,
24926 IX86_BUILTIN_PHSUBSW256,
24927 IX86_BUILTIN_PMADDUBSW256,
24928 IX86_BUILTIN_PMADDWD256,
24929 IX86_BUILTIN_PMAXSB256,
24930 IX86_BUILTIN_PMAXSW256,
24931 IX86_BUILTIN_PMAXSD256,
24932 IX86_BUILTIN_PMAXUB256,
24933 IX86_BUILTIN_PMAXUW256,
24934 IX86_BUILTIN_PMAXUD256,
24935 IX86_BUILTIN_PMINSB256,
24936 IX86_BUILTIN_PMINSW256,
24937 IX86_BUILTIN_PMINSD256,
24938 IX86_BUILTIN_PMINUB256,
24939 IX86_BUILTIN_PMINUW256,
24940 IX86_BUILTIN_PMINUD256,
24941 IX86_BUILTIN_PMOVMSKB256,
24942 IX86_BUILTIN_PMOVSXBW256,
24943 IX86_BUILTIN_PMOVSXBD256,
24944 IX86_BUILTIN_PMOVSXBQ256,
24945 IX86_BUILTIN_PMOVSXWD256,
24946 IX86_BUILTIN_PMOVSXWQ256,
24947 IX86_BUILTIN_PMOVSXDQ256,
24948 IX86_BUILTIN_PMOVZXBW256,
24949 IX86_BUILTIN_PMOVZXBD256,
24950 IX86_BUILTIN_PMOVZXBQ256,
24951 IX86_BUILTIN_PMOVZXWD256,
24952 IX86_BUILTIN_PMOVZXWQ256,
24953 IX86_BUILTIN_PMOVZXDQ256,
24954 IX86_BUILTIN_PMULDQ256,
24955 IX86_BUILTIN_PMULHRSW256,
24956 IX86_BUILTIN_PMULHUW256,
24957 IX86_BUILTIN_PMULHW256,
24958 IX86_BUILTIN_PMULLW256,
24959 IX86_BUILTIN_PMULLD256,
24960 IX86_BUILTIN_PMULUDQ256,
24961 IX86_BUILTIN_POR256,
24962 IX86_BUILTIN_PSADBW256,
24963 IX86_BUILTIN_PSHUFB256,
24964 IX86_BUILTIN_PSHUFD256,
24965 IX86_BUILTIN_PSHUFHW256,
24966 IX86_BUILTIN_PSHUFLW256,
24967 IX86_BUILTIN_PSIGNB256,
24968 IX86_BUILTIN_PSIGNW256,
24969 IX86_BUILTIN_PSIGND256,
24970 IX86_BUILTIN_PSLLDQI256,
24971 IX86_BUILTIN_PSLLWI256,
24972 IX86_BUILTIN_PSLLW256,
24973 IX86_BUILTIN_PSLLDI256,
24974 IX86_BUILTIN_PSLLD256,
24975 IX86_BUILTIN_PSLLQI256,
24976 IX86_BUILTIN_PSLLQ256,
24977 IX86_BUILTIN_PSRAWI256,
24978 IX86_BUILTIN_PSRAW256,
24979 IX86_BUILTIN_PSRADI256,
24980 IX86_BUILTIN_PSRAD256,
24981 IX86_BUILTIN_PSRLDQI256,
24982 IX86_BUILTIN_PSRLWI256,
24983 IX86_BUILTIN_PSRLW256,
24984 IX86_BUILTIN_PSRLDI256,
24985 IX86_BUILTIN_PSRLD256,
24986 IX86_BUILTIN_PSRLQI256,
24987 IX86_BUILTIN_PSRLQ256,
24988 IX86_BUILTIN_PSUBB256,
24989 IX86_BUILTIN_PSUBW256,
24990 IX86_BUILTIN_PSUBD256,
24991 IX86_BUILTIN_PSUBQ256,
24992 IX86_BUILTIN_PSUBSB256,
24993 IX86_BUILTIN_PSUBSW256,
24994 IX86_BUILTIN_PSUBUSB256,
24995 IX86_BUILTIN_PSUBUSW256,
24996 IX86_BUILTIN_PUNPCKHBW256,
24997 IX86_BUILTIN_PUNPCKHWD256,
24998 IX86_BUILTIN_PUNPCKHDQ256,
24999 IX86_BUILTIN_PUNPCKHQDQ256,
25000 IX86_BUILTIN_PUNPCKLBW256,
25001 IX86_BUILTIN_PUNPCKLWD256,
25002 IX86_BUILTIN_PUNPCKLDQ256,
25003 IX86_BUILTIN_PUNPCKLQDQ256,
25004 IX86_BUILTIN_PXOR256,
25005 IX86_BUILTIN_MOVNTDQA256,
25006 IX86_BUILTIN_VBROADCASTSS_PS,
25007 IX86_BUILTIN_VBROADCASTSS_PS256,
25008 IX86_BUILTIN_VBROADCASTSD_PD256,
25009 IX86_BUILTIN_VBROADCASTSI256,
25010 IX86_BUILTIN_PBLENDD256,
25011 IX86_BUILTIN_PBLENDD128,
25012 IX86_BUILTIN_PBROADCASTB256,
25013 IX86_BUILTIN_PBROADCASTW256,
25014 IX86_BUILTIN_PBROADCASTD256,
25015 IX86_BUILTIN_PBROADCASTQ256,
25016 IX86_BUILTIN_PBROADCASTB128,
25017 IX86_BUILTIN_PBROADCASTW128,
25018 IX86_BUILTIN_PBROADCASTD128,
25019 IX86_BUILTIN_PBROADCASTQ128,
25020 IX86_BUILTIN_VPERMVARSI256,
25021 IX86_BUILTIN_VPERMDF256,
25022 IX86_BUILTIN_VPERMVARSF256,
25023 IX86_BUILTIN_VPERMDI256,
25024 IX86_BUILTIN_VPERMTI256,
25025 IX86_BUILTIN_VEXTRACT128I256,
25026 IX86_BUILTIN_VINSERT128I256,
25027 IX86_BUILTIN_MASKLOADD,
25028 IX86_BUILTIN_MASKLOADQ,
25029 IX86_BUILTIN_MASKLOADD256,
25030 IX86_BUILTIN_MASKLOADQ256,
25031 IX86_BUILTIN_MASKSTORED,
25032 IX86_BUILTIN_MASKSTOREQ,
25033 IX86_BUILTIN_MASKSTORED256,
25034 IX86_BUILTIN_MASKSTOREQ256,
25035 IX86_BUILTIN_PSLLVV4DI,
25036 IX86_BUILTIN_PSLLVV2DI,
25037 IX86_BUILTIN_PSLLVV8SI,
25038 IX86_BUILTIN_PSLLVV4SI,
25039 IX86_BUILTIN_PSRAVV8SI,
25040 IX86_BUILTIN_PSRAVV4SI,
25041 IX86_BUILTIN_PSRLVV4DI,
25042 IX86_BUILTIN_PSRLVV2DI,
25043 IX86_BUILTIN_PSRLVV8SI,
25044 IX86_BUILTIN_PSRLVV4SI,
25045
25046 IX86_BUILTIN_GATHERSIV2DF,
25047 IX86_BUILTIN_GATHERSIV4DF,
25048 IX86_BUILTIN_GATHERDIV2DF,
25049 IX86_BUILTIN_GATHERDIV4DF,
25050 IX86_BUILTIN_GATHERSIV4SF,
25051 IX86_BUILTIN_GATHERSIV8SF,
25052 IX86_BUILTIN_GATHERDIV4SF,
25053 IX86_BUILTIN_GATHERDIV8SF,
25054 IX86_BUILTIN_GATHERSIV2DI,
25055 IX86_BUILTIN_GATHERSIV4DI,
25056 IX86_BUILTIN_GATHERDIV2DI,
25057 IX86_BUILTIN_GATHERDIV4DI,
25058 IX86_BUILTIN_GATHERSIV4SI,
25059 IX86_BUILTIN_GATHERSIV8SI,
25060 IX86_BUILTIN_GATHERDIV4SI,
25061 IX86_BUILTIN_GATHERDIV8SI,
25062
25063 /* TFmode support builtins. */
25064 IX86_BUILTIN_INFQ,
25065 IX86_BUILTIN_HUGE_VALQ,
25066 IX86_BUILTIN_FABSQ,
25067 IX86_BUILTIN_COPYSIGNQ,
25068
25069 /* Vectorizer support builtins. */
25070 IX86_BUILTIN_CPYSGNPS,
25071 IX86_BUILTIN_CPYSGNPD,
25072 IX86_BUILTIN_CPYSGNPS256,
25073 IX86_BUILTIN_CPYSGNPD256,
25074
25075 IX86_BUILTIN_CVTUDQ2PS,
25076
25077 /* FMA4 instructions. */
25078 IX86_BUILTIN_VFMADDSS,
25079 IX86_BUILTIN_VFMADDSD,
25080 IX86_BUILTIN_VFMADDPS,
25081 IX86_BUILTIN_VFMADDPD,
25082 IX86_BUILTIN_VFMADDPS256,
25083 IX86_BUILTIN_VFMADDPD256,
25084 IX86_BUILTIN_VFMADDSUBPS,
25085 IX86_BUILTIN_VFMADDSUBPD,
25086 IX86_BUILTIN_VFMADDSUBPS256,
25087 IX86_BUILTIN_VFMADDSUBPD256,
25088
25089 /* FMA3 instructions. */
25090 IX86_BUILTIN_VFMADDSS3,
25091 IX86_BUILTIN_VFMADDSD3,
25092
25093 /* XOP instructions. */
25094 IX86_BUILTIN_VPCMOV,
25095 IX86_BUILTIN_VPCMOV_V2DI,
25096 IX86_BUILTIN_VPCMOV_V4SI,
25097 IX86_BUILTIN_VPCMOV_V8HI,
25098 IX86_BUILTIN_VPCMOV_V16QI,
25099 IX86_BUILTIN_VPCMOV_V4SF,
25100 IX86_BUILTIN_VPCMOV_V2DF,
25101 IX86_BUILTIN_VPCMOV256,
25102 IX86_BUILTIN_VPCMOV_V4DI256,
25103 IX86_BUILTIN_VPCMOV_V8SI256,
25104 IX86_BUILTIN_VPCMOV_V16HI256,
25105 IX86_BUILTIN_VPCMOV_V32QI256,
25106 IX86_BUILTIN_VPCMOV_V8SF256,
25107 IX86_BUILTIN_VPCMOV_V4DF256,
25108
25109 IX86_BUILTIN_VPPERM,
25110
25111 IX86_BUILTIN_VPMACSSWW,
25112 IX86_BUILTIN_VPMACSWW,
25113 IX86_BUILTIN_VPMACSSWD,
25114 IX86_BUILTIN_VPMACSWD,
25115 IX86_BUILTIN_VPMACSSDD,
25116 IX86_BUILTIN_VPMACSDD,
25117 IX86_BUILTIN_VPMACSSDQL,
25118 IX86_BUILTIN_VPMACSSDQH,
25119 IX86_BUILTIN_VPMACSDQL,
25120 IX86_BUILTIN_VPMACSDQH,
25121 IX86_BUILTIN_VPMADCSSWD,
25122 IX86_BUILTIN_VPMADCSWD,
25123
25124 IX86_BUILTIN_VPHADDBW,
25125 IX86_BUILTIN_VPHADDBD,
25126 IX86_BUILTIN_VPHADDBQ,
25127 IX86_BUILTIN_VPHADDWD,
25128 IX86_BUILTIN_VPHADDWQ,
25129 IX86_BUILTIN_VPHADDDQ,
25130 IX86_BUILTIN_VPHADDUBW,
25131 IX86_BUILTIN_VPHADDUBD,
25132 IX86_BUILTIN_VPHADDUBQ,
25133 IX86_BUILTIN_VPHADDUWD,
25134 IX86_BUILTIN_VPHADDUWQ,
25135 IX86_BUILTIN_VPHADDUDQ,
25136 IX86_BUILTIN_VPHSUBBW,
25137 IX86_BUILTIN_VPHSUBWD,
25138 IX86_BUILTIN_VPHSUBDQ,
25139
25140 IX86_BUILTIN_VPROTB,
25141 IX86_BUILTIN_VPROTW,
25142 IX86_BUILTIN_VPROTD,
25143 IX86_BUILTIN_VPROTQ,
25144 IX86_BUILTIN_VPROTB_IMM,
25145 IX86_BUILTIN_VPROTW_IMM,
25146 IX86_BUILTIN_VPROTD_IMM,
25147 IX86_BUILTIN_VPROTQ_IMM,
25148
25149 IX86_BUILTIN_VPSHLB,
25150 IX86_BUILTIN_VPSHLW,
25151 IX86_BUILTIN_VPSHLD,
25152 IX86_BUILTIN_VPSHLQ,
25153 IX86_BUILTIN_VPSHAB,
25154 IX86_BUILTIN_VPSHAW,
25155 IX86_BUILTIN_VPSHAD,
25156 IX86_BUILTIN_VPSHAQ,
25157
25158 IX86_BUILTIN_VFRCZSS,
25159 IX86_BUILTIN_VFRCZSD,
25160 IX86_BUILTIN_VFRCZPS,
25161 IX86_BUILTIN_VFRCZPD,
25162 IX86_BUILTIN_VFRCZPS256,
25163 IX86_BUILTIN_VFRCZPD256,
25164
25165 IX86_BUILTIN_VPCOMEQUB,
25166 IX86_BUILTIN_VPCOMNEUB,
25167 IX86_BUILTIN_VPCOMLTUB,
25168 IX86_BUILTIN_VPCOMLEUB,
25169 IX86_BUILTIN_VPCOMGTUB,
25170 IX86_BUILTIN_VPCOMGEUB,
25171 IX86_BUILTIN_VPCOMFALSEUB,
25172 IX86_BUILTIN_VPCOMTRUEUB,
25173
25174 IX86_BUILTIN_VPCOMEQUW,
25175 IX86_BUILTIN_VPCOMNEUW,
25176 IX86_BUILTIN_VPCOMLTUW,
25177 IX86_BUILTIN_VPCOMLEUW,
25178 IX86_BUILTIN_VPCOMGTUW,
25179 IX86_BUILTIN_VPCOMGEUW,
25180 IX86_BUILTIN_VPCOMFALSEUW,
25181 IX86_BUILTIN_VPCOMTRUEUW,
25182
25183 IX86_BUILTIN_VPCOMEQUD,
25184 IX86_BUILTIN_VPCOMNEUD,
25185 IX86_BUILTIN_VPCOMLTUD,
25186 IX86_BUILTIN_VPCOMLEUD,
25187 IX86_BUILTIN_VPCOMGTUD,
25188 IX86_BUILTIN_VPCOMGEUD,
25189 IX86_BUILTIN_VPCOMFALSEUD,
25190 IX86_BUILTIN_VPCOMTRUEUD,
25191
25192 IX86_BUILTIN_VPCOMEQUQ,
25193 IX86_BUILTIN_VPCOMNEUQ,
25194 IX86_BUILTIN_VPCOMLTUQ,
25195 IX86_BUILTIN_VPCOMLEUQ,
25196 IX86_BUILTIN_VPCOMGTUQ,
25197 IX86_BUILTIN_VPCOMGEUQ,
25198 IX86_BUILTIN_VPCOMFALSEUQ,
25199 IX86_BUILTIN_VPCOMTRUEUQ,
25200
25201 IX86_BUILTIN_VPCOMEQB,
25202 IX86_BUILTIN_VPCOMNEB,
25203 IX86_BUILTIN_VPCOMLTB,
25204 IX86_BUILTIN_VPCOMLEB,
25205 IX86_BUILTIN_VPCOMGTB,
25206 IX86_BUILTIN_VPCOMGEB,
25207 IX86_BUILTIN_VPCOMFALSEB,
25208 IX86_BUILTIN_VPCOMTRUEB,
25209
25210 IX86_BUILTIN_VPCOMEQW,
25211 IX86_BUILTIN_VPCOMNEW,
25212 IX86_BUILTIN_VPCOMLTW,
25213 IX86_BUILTIN_VPCOMLEW,
25214 IX86_BUILTIN_VPCOMGTW,
25215 IX86_BUILTIN_VPCOMGEW,
25216 IX86_BUILTIN_VPCOMFALSEW,
25217 IX86_BUILTIN_VPCOMTRUEW,
25218
25219 IX86_BUILTIN_VPCOMEQD,
25220 IX86_BUILTIN_VPCOMNED,
25221 IX86_BUILTIN_VPCOMLTD,
25222 IX86_BUILTIN_VPCOMLED,
25223 IX86_BUILTIN_VPCOMGTD,
25224 IX86_BUILTIN_VPCOMGED,
25225 IX86_BUILTIN_VPCOMFALSED,
25226 IX86_BUILTIN_VPCOMTRUED,
25227
25228 IX86_BUILTIN_VPCOMEQQ,
25229 IX86_BUILTIN_VPCOMNEQ,
25230 IX86_BUILTIN_VPCOMLTQ,
25231 IX86_BUILTIN_VPCOMLEQ,
25232 IX86_BUILTIN_VPCOMGTQ,
25233 IX86_BUILTIN_VPCOMGEQ,
25234 IX86_BUILTIN_VPCOMFALSEQ,
25235 IX86_BUILTIN_VPCOMTRUEQ,
25236
25237 /* LWP instructions. */
25238 IX86_BUILTIN_LLWPCB,
25239 IX86_BUILTIN_SLWPCB,
25240 IX86_BUILTIN_LWPVAL32,
25241 IX86_BUILTIN_LWPVAL64,
25242 IX86_BUILTIN_LWPINS32,
25243 IX86_BUILTIN_LWPINS64,
25244
25245 IX86_BUILTIN_CLZS,
25246
25247 /* BMI instructions. */
25248 IX86_BUILTIN_BEXTR32,
25249 IX86_BUILTIN_BEXTR64,
25250 IX86_BUILTIN_CTZS,
25251
25252 /* TBM instructions. */
25253 IX86_BUILTIN_BEXTRI32,
25254 IX86_BUILTIN_BEXTRI64,
25255
25256 /* BMI2 instructions. */
25257 IX86_BUILTIN_BZHI32,
25258 IX86_BUILTIN_BZHI64,
25259 IX86_BUILTIN_PDEP32,
25260 IX86_BUILTIN_PDEP64,
25261 IX86_BUILTIN_PEXT32,
25262 IX86_BUILTIN_PEXT64,
25263
25264 /* FSGSBASE instructions. */
25265 IX86_BUILTIN_RDFSBASE32,
25266 IX86_BUILTIN_RDFSBASE64,
25267 IX86_BUILTIN_RDGSBASE32,
25268 IX86_BUILTIN_RDGSBASE64,
25269 IX86_BUILTIN_WRFSBASE32,
25270 IX86_BUILTIN_WRFSBASE64,
25271 IX86_BUILTIN_WRGSBASE32,
25272 IX86_BUILTIN_WRGSBASE64,
25273
25274 /* RDRND instructions. */
25275 IX86_BUILTIN_RDRAND16_STEP,
25276 IX86_BUILTIN_RDRAND32_STEP,
25277 IX86_BUILTIN_RDRAND64_STEP,
25278
25279 /* F16C instructions. */
25280 IX86_BUILTIN_CVTPH2PS,
25281 IX86_BUILTIN_CVTPH2PS256,
25282 IX86_BUILTIN_CVTPS2PH,
25283 IX86_BUILTIN_CVTPS2PH256,
25284
25285 /* CFString built-in for darwin */
25286 IX86_BUILTIN_CFSTRING,
25287
25288 IX86_BUILTIN_MAX
25289 };
25290
25291 /* Table for the ix86 builtin decls. */
25292 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
25293
25294 /* Table of all of the builtin functions that are possible with different ISAs
25295    but are waiting to be built until a function is declared to use that
25296    ISA.  */
25297 struct builtin_isa {
25298 const char *name; /* function name */
25299 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
25300 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
25301 bool const_p; /* true if the declaration is constant */
25302   bool set_and_not_built_p;	/* true if the builtin has been deferred and not yet built */
25303 };
25304
25305 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
25306
25307
25308 /* Add an ix86 target builtin function with NAME and CODE, using the function
25309    type described by TCODE.  Save the MASK of which isa_flags to use in the
25310    ix86_builtins_isa array.  Store the function decl in the ix86_builtins
25311    array.  Return the function decl, or NULL_TREE if the builtin was not added.
25312 
25313    If the front end has a special hook for builtin functions, delay adding
25314    builtin functions that aren't in the current ISA until the ISA is changed
25315    with function-specific optimization.  Doing so can save about 300K for the
25316    default compiler.  When the builtin is expanded, check at that time whether
25317    it is valid.
25318 
25319    If the front end doesn't have a special hook, record all builtins, even if
25320    they aren't in the current ISA, in case the user uses function-specific
25321    options for a different ISA, so that we don't get scope errors if a builtin
25322    is added in the middle of a function scope.  */
25323
25324 static inline tree
25325 def_builtin (HOST_WIDE_INT mask, const char *name,
25326 enum ix86_builtin_func_type tcode,
25327 enum ix86_builtins code)
25328 {
25329 tree decl = NULL_TREE;
25330
25331 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
25332 {
25333 ix86_builtins_isa[(int) code].isa = mask;
25334
25335 mask &= ~OPTION_MASK_ISA_64BIT;
25336 if (mask == 0
25337 || (mask & ix86_isa_flags) != 0
25338 || (lang_hooks.builtin_function
25339 == lang_hooks.builtin_function_ext_scope))
25340
25341 {
25342 tree type = ix86_get_builtin_func_type (tcode);
25343 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
25344 NULL, NULL_TREE);
25345 ix86_builtins[(int) code] = decl;
25346 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
25347 }
25348 else
25349 {
25350 ix86_builtins[(int) code] = NULL_TREE;
25351 ix86_builtins_isa[(int) code].tcode = tcode;
25352 ix86_builtins_isa[(int) code].name = name;
25353 ix86_builtins_isa[(int) code].const_p = false;
25354 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
25355 }
25356 }
25357
25358 return decl;
25359 }
25360
25361 /* Like def_builtin, but also marks the function decl "const". */
25362
25363 static inline tree
25364 def_builtin_const (HOST_WIDE_INT mask, const char *name,
25365 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
25366 {
25367 tree decl = def_builtin (mask, name, tcode, code);
25368 if (decl)
25369 TREE_READONLY (decl) = 1;
25370 else
25371 ix86_builtins_isa[(int) code].const_p = true;
25372
25373 return decl;
25374 }
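/* A hypothetical but representative use of these helpers (the real
   registrations later in this file are mostly table-driven, so the exact
   call sites differ):

     def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_addps",
			V4SF_FTYPE_V4SF_V4SF, IX86_BUILTIN_ADDPS);  */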
25375
25376 /* Add any new builtin functions for a given ISA that may not have been
25377    declared.  This saves a bit of space compared to adding all of the
25378    declarations to the tree up front, whether or not they are ever used.  */
25379
25380 static void
25381 ix86_add_new_builtins (HOST_WIDE_INT isa)
25382 {
25383 int i;
25384
25385 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
25386 {
25387 if ((ix86_builtins_isa[i].isa & isa) != 0
25388 && ix86_builtins_isa[i].set_and_not_built_p)
25389 {
25390 tree decl, type;
25391
25392 /* Don't define the builtin again. */
25393 ix86_builtins_isa[i].set_and_not_built_p = false;
25394
25395 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
25396 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
25397 type, i, BUILT_IN_MD, NULL,
25398 NULL_TREE);
25399
25400 ix86_builtins[i] = decl;
25401 if (ix86_builtins_isa[i].const_p)
25402 TREE_READONLY (decl) = 1;
25403 }
25404 }
25405 }
25406
25407 /* Bits for builtin_description.flag. */
25408
25409 /* Set when we don't support the comparison natively, and should
25410    swap the comparison operands in order to support it.  */
25411 #define BUILTIN_DESC_SWAP_OPERANDS 1
25412
25413 struct builtin_description
25414 {
25415 const HOST_WIDE_INT mask;
25416 const enum insn_code icode;
25417 const char *const name;
25418 const enum ix86_builtins code;
25419 const enum rtx_code comparison;
25420 const int flag;
25421 };
25422
25423 static const struct builtin_description bdesc_comi[] =
25424 {
25425 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
25426 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
25427 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
25428 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
25429 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
25430 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
25431 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
25432 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
25433 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
25434 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
25435 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
25436 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
25437 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
25438 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
25439 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
25440 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
25441 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
25442 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
25443 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
25444 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
25445 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
25446 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
25447 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
25448 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
25449 };
25450
25451 static const struct builtin_description bdesc_pcmpestr[] =
25452 {
25453 /* SSE4.2 */
25454 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
25455 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
25456 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
25457 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
25458 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
25459 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
25460 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
25461 };
25462
25463 static const struct builtin_description bdesc_pcmpistr[] =
25464 {
25465 /* SSE4.2 */
25466 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
25467 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
25468 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
25469 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
25470 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
25471 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
25472 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
25473 };
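/* Illustrative note (editorial addition, not part of the original file):
   in the pcmpestr/pcmpistr tables above the final field selects the CC
   mode whose flag is extracted (CCAmode for the "a" variants, CCCmode for
   "c", and so on), while the plain ...i128/...m128 entries return the
   index or mask directly.  A sketch of a call that lands on the
   IX86_BUILTIN_PCMPISTRI128 entry, assuming -msse4.2 and the vector
   typedef below (the wrapper name my_cmpistri is hypothetical); the last
   argument must be a compile-time constant:

     typedef char __v16qi __attribute__ ((__vector_size__ (16)));

     static inline int
     my_cmpistri (__v16qi a, __v16qi b)
     {
       return __builtin_ia32_pcmpistri128 (a, b, 0);
     }
*/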
25474
25475 /* Special builtins with variable number of arguments. */
25476 static const struct builtin_description bdesc_special_args[] =
25477 {
25478 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtsc, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
25479 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtscp, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
25480 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
25481
25482 /* MMX */
25483 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
25484
25485 /* 3DNow! */
25486 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
25487
25488 /* SSE */
25489 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
25490 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
25491 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
25492
25493 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
25494 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
25495 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
25496 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
25497
25498 /* SSE or 3DNow!A */
25499 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
25500 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntdi, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
25501
25502 /* SSE2 */
25503 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
25504 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
25505 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
25506 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
25507 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
25508 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
25509 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntsi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
25510 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
25511 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
25512
25513 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
25514 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
25515
25516 /* SSE3 */
25517 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
25518
25519 /* SSE4.1 */
25520 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
25521
25522 /* SSE4A */
25523 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
25524 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
25525
25526 /* AVX */
25527 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
25528 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
25529
25530 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
25531 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
25532 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
25533 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
25534 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
25535
25536 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
25537 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
25538 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
25539 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
25540 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
25541 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
25542 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
25543
25544 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
25545 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
25546 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
25547
25548 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
25549 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
25550 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
25551 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
25552 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
25553 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
25554 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
25555 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
25556
25557 /* AVX2 */
25558 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
25559 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
25560 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
25561 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
25562 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
25563 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
25564 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
25565 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
25566 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
25567
25568 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
25569 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
25570 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
25571 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
25572 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
25573 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
25574
25575 /* FSGSBASE */
25576 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
25577 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
25578 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
25579 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
25580 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
25581 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
25582 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
25583 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
25584 };
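/* Illustrative note (editorial addition, not part of the original file):
   bdesc_special_args collects builtins whose signatures involve memory
   operands or void, such as fences, non-temporal moves and unaligned
   loads/stores.  A sketch of how the IX86_BUILTIN_LOADUPS and
   IX86_BUILTIN_STOREUPS entries above are reached, assuming -msse and the
   usual vector typedef (the helper name my_copy_unaligned is hypothetical):

     typedef float __v4sf __attribute__ ((__vector_size__ (16)));

     static inline void
     my_copy_unaligned (float *dst, const float *src)
     {
       __v4sf tmp = __builtin_ia32_loadups (src);
       __builtin_ia32_storeups (dst, tmp);
     }
*/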
25585
25586 /* Builtins with variable number of arguments. */
25587 static const struct builtin_description bdesc_args[] =
25588 {
25589 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
25590 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
25591 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdpmc, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
25592 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
25593 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
25594 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
25595 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
25596
25597 /* MMX */
25598 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25599 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25600 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25601 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25602 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25603 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25604
25605 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25606 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25607 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25608 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25609 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25610 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25611 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25612 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25613
25614 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25615 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25616
25617 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25618 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25619 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25620 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25621
25622 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25623 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25624 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25625 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25626 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25627 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25628
25629 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25630 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25631 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25632 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25633 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25634 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25635
25636 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
25637 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
25638 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
25639
25640 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
25641
25642 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
25643 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
25644 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
25645 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
25646 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
25647 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
25648
25649 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
25650 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
25651 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
25652 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
25653 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
25654 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
25655
25656 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
25657 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
25658 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
25659 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
25660
25661 /* 3DNow! */
25662 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
25663 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
25664 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
25665 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
25666
25667 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25668 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25669 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25670 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
25671 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
25672 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
25673 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25674 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25675 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25676 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25677 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25678 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25679 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25680 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25681 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25682
25683 /* 3DNow!A */
25684 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
25685 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
25686 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
25687 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
25688 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25689 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25690
25691 /* SSE */
25692 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
25693 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
25694 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
25695 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
25696 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
25697 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
25698 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
25699 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
25700 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
25701 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
25702 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
25703 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
25704
25705 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
25706
25707 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25708 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25709 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25710 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25711 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25712 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25713 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25714 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25715
25716 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
25717 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
25718 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
25719 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
25720 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
25721 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
25722 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
25723 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
25724 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
25725 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
25726 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP},
25727 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
25728 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
25729 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
25730 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
25731 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
25732 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
25733 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
25734 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
25735 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
25736 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
25737 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
25738
25739 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25740 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25741 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25742 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25743
25744 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25745 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25746 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25747 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25748
25749 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25750
25751 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25752 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25753 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25754 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25755 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25756
25757 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
25758 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
25759 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
25760
25761 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
25762
25763 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
25764 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
25765 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
25766
25767 /* SSE MMX or 3DNow!A */
25768 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25769 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25770 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25771
25772 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25773 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25774 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25775 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25776
25777 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
25778 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
25779
25780 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
25781
25782 /* SSE2 */
25783 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
25784
25785 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
25786 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
25787 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
25788 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
25789 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2ps, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
25790 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtudq2ps, "__builtin_ia32_cvtudq2ps", IX86_BUILTIN_CVTUDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
25791
25792 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
25793 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
25794 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
25795 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
25796 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
25797
25798 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
25799
25800 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
25801 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
25802 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
25803 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
25804
25805 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
25806 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
25807 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttps2dq, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
25808
25809 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25810 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25811 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25812 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25813 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25814 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25815 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25816 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25817
25818 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
25819 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
25820 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
25821 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
25822 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP},
25823 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
25824 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
25825 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
25826 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
25827 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
25828 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
25829 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
25830 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
25831 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
25832 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
25833 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
25834 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
25835 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
25836 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
25837 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
25838
25839 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25840 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25841 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25842 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25843
25844 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25845 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25846 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25847 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25848
25849 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25850
25851 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25852 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25853 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25854
25855 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
25856
25857 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
25858 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25859 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
25860 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
25861 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
25862 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25863 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
25864 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
25865
25866 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
25867 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25868 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
25869 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25870 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
25871 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25872 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
25873 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25874
25875 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25876 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25877
25878 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
25879 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
25880 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
25881 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
25882
25883 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
25884 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25885
25886 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
25887 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25888 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
25889 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
25890 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25891 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
25892
25893 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
25894 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25895 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
25896 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25897
25898 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
25899 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25900 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
25901 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
25902 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
25903 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25904 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
25905 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
25906
25907 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
25908 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
25909 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
25910
25911 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25912 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
25913
25914 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
25915 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
25916
25917 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
25918
25919 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
25920 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
25921 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
25922 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
25923
25924 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
25925 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
25926 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
25927 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
25928 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
25929 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
25930 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
25931
25932 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
25933 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
25934 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
25935 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
25936 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
25937 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
25938 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
25939
25940 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
25941 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
25942 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
25943 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
25944
25945 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
25946 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
25947 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
25948
25949 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
25950
25951 { OPTION_MASK_ISA_SSE2, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
25952 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
25953
25954 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
25955
25956 /* SSE2 MMX */
25957 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
25958 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
25959
25960 /* SSE3 */
25961 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
25962 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
25963
25964 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25965 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25966 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25967 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25968 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25969 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25970
25971 /* SSSE3 */
25972 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
25973 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
25974 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
25975 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
25976 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
25977 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
25978
25979 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25980 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25981 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
25982 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25983 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25984 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25985 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25986 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25987 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
25988 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25989 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25990 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25991 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
25992 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
25993 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
25994 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25995 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
25996 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25997 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
25998 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25999 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26000 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26001 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26002 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26003
26004 /* SSSE3. */
26005 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
26006 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
26007
26008 /* SSE4.1 */
26009 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26010 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26011 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
26012 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
26013 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26014 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26015 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26016 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
26017 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
26018 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
26019
26020 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26021 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26022 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26023 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26024 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26025 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26026 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26027 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26028 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26029 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26030 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26031 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26032 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26033
26034 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26035 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26036 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26037 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26038 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26039 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26040 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26041 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26042 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26043 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26044 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26045 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26046
26047 /* SSE4.1 */
26048 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26049 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26050 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26051 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26052
26053 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
26054 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
26055 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
26056 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
26057
26058 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26059
26060 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
26061 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
26062 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
26063 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
26064
26065 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26066
26067 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26068 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26069 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26070
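  /* Note that the "comparison" slot is overloaded in the two groups
     above: the floor/ceil/trunc/rint entries stash the ROUND_* value
     there, which the expander re-emits as the immediate rounding-mode
     operand, while the ptest entries store the condition used to read
     the flags the instruction sets (EQ for ptestz, LTU for ptestc,
     GTU for ptestnzc).  */
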
26071 /* SSE4.2 */
26072 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26073 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
26074 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
26075 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26076 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26077
26078 /* SSE4A */
26079 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
26080 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
26081 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
26082 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26083
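  /* The AES and PCLMUL rows below (like the FABSQ/COPYSIGNQ rows above)
     carry a null name: those builtins are declared elsewhere in this
     file under their own ISA guards, and these entries only supply the
     insn code and signature used when the builtin is expanded.  */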
26084 /* AES */
26085 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
26086 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26087
26088 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26089 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26090 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26091 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26092
26093 /* PCLMUL */
26094 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
26095
26096 /* AVX */
26097 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26098 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26099 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26100 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26101 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26102 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26103 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26104 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26105 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26106 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26107 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26108 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26109 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26110 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26111 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26112 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26113 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26114 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26115 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26116 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26117 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26118 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26119 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26120 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26121 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26122 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26123
26124 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
26125 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
26126 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
26127 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
26128
26129 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26130 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26131 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
26132 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
26133 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26134 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26135 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26136 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26137 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26138 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26139 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26140 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26141 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26142 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
26143 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
26144 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
26145 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtdq2pd256, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
26146 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtdq2ps256, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
26147 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
26148 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26149 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
26150 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvttpd2dq256, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26151 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26152 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvttps2dq256, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26153 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26154 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26155 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
26156 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26157 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26158 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26159 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26160 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
26161 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
26162 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
26163
26164 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26165 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26166 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26167
26168 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26169 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26170 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26171 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26172 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26173
26174 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26175
26176 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26177 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26178
26179 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
26180 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
26181 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
26182 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
26183
26184 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26185
26186 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
26187 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
26188 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
26189 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
26190
26191 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26192
26193 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26194 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26195 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26196 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26197
26198 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
26199 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
26200 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
26201 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
26202 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
26203 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
26204
26205 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26206 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26207 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26208 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26209 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26210 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26211 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26212 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26213 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26214 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26215 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26216 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26217 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26218 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26219 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26220
26221 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
26222 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
26223
26224 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26225 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26226
26227 /* AVX2 */
26228 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
26229 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
26230 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
26231 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
26232 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26233 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26234 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26235 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26236 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26237 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26238 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26239 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26240 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26241 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26242 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26243 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26244 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
26245 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26246 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26247 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26248 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26249 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
26250 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
26251 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26252 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26253 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26254 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26255 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26256 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26257 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26258 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26259 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26260 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26261 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26262 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26263 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26264 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26265 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26266 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
26267 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26268 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26269 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26270 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26271 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26272 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26273 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26274 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26275 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26276 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26277 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26278 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26279 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
26280 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26281 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26282 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26283 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26284 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26285 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
26286 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26287 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26288 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26289 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26290 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26291 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
26292 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mulv4siv4di3 , "__builtin_ia32_pmuldq256" , IX86_BUILTIN_PMULDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
26293 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26294 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26295 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26296 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26297 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26298 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulv4siv4di3 , "__builtin_ia32_pmuludq256" , IX86_BUILTIN_PMULUDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
26299 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26300 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26301 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26302 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
26303 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
26304 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
26305 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26306 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26307 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26308 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
26309 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26310 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26311 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26312 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26313 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
26314 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
26315 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26316 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26317 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26318 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26319 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
26320 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26321 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26322 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26323 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26324 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
26325 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
26326 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26327 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26328 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26329 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26330 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26331 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26332 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26333 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26334 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26335 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26336 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26337 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26338 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26339 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26340 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26341 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26342 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26343 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26344 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
26345 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
26346 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
26347 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
26348 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
26349 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
26350 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
26351 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
26352 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
26353 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
26354 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26355 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
26356 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26357 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26358 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26359 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26360 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
26361 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
26362 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
26363 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
26364 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26365 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26366 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26367 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26368 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26369 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26370 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26371 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26372 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26373 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26374
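  /* LZCNT */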
26375 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
26376
26377 /* BMI */
26378 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26379 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26380 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
26381
26382 /* TBM */
26383 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26384 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26385
26386 /* F16C */
26387 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
26388 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
26389 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
26390 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
26391
26392 /* BMI2 */
26393 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26394 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26395 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26396 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26397 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26398 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26399 };
26400
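/* A minimal user-level sketch for one of the BMI2 entries above, assuming
   -mbmi2; the builtin name and the UINT_FTYPE_UINT_UINT signature come from
   the table entry, while the wrapper itself is only an illustration:

     unsigned int
     deposit_bits (unsigned int src, unsigned int mask)
     {
       return __builtin_ia32_pdep_si (src, mask);
     }
*/
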
26401 /* FMA4 and XOP. */
26402 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
26403 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
26404 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
26405 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
26406 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
26407 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
26408 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
26409 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
26410 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
26411 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
26412 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
26413 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
26414 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
26415 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
26416 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
26417 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
26418 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
26419 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
26420 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
26421 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
26422 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
26423 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
26424 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
26425 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
26426 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
26427 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
26428 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
26429 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
26430 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
26431 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
26432 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
26433 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
26434 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
26435 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
26436 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
26437 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
26438 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
26439 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
26440 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
26441 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
26442 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
26443 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
26444 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
26445 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
26446 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
26447 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
26448 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
26449 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
26450 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
26451 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
26452 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
26453 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
26454
26455 static const struct builtin_description bdesc_multi_arg[] =
26456 {
26457 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
26458 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
26459 UNKNOWN, (int)MULTI_ARG_3_SF },
26460 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
26461 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
26462 UNKNOWN, (int)MULTI_ARG_3_DF },
26463
26464 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
26465 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
26466 UNKNOWN, (int)MULTI_ARG_3_SF },
26467 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
26468 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
26469 UNKNOWN, (int)MULTI_ARG_3_DF },
26470
26471 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
26472 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
26473 UNKNOWN, (int)MULTI_ARG_3_SF },
26474 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
26475 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
26476 UNKNOWN, (int)MULTI_ARG_3_DF },
26477 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
26478 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
26479 UNKNOWN, (int)MULTI_ARG_3_SF2 },
26480 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
26481 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
26482 UNKNOWN, (int)MULTI_ARG_3_DF2 },
26483
26484 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
26485 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
26486 UNKNOWN, (int)MULTI_ARG_3_SF },
26487 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
26488 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
26489 UNKNOWN, (int)MULTI_ARG_3_DF },
26490 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
26491 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
26492 UNKNOWN, (int)MULTI_ARG_3_SF2 },
26493 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
26494 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
26495 UNKNOWN, (int)MULTI_ARG_3_DF2 },
26496
26497 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
26498 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
26499 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
26500 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
26501 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi",IX86_BUILTIN_VPCMOV_V16QI,UNKNOWN, (int)MULTI_ARG_3_QI },
26502 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
26503 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
26504
26505 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
26506 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
26507 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
26508 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
26509 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
26510 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
26511 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
26512
26513 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
26514
26515 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
26516 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
26517 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
26518 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
26519 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
26520 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
26521 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
26522 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
26523 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
26524 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
26525 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
26526 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
26527
26528 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
26529 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
26530 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
26531 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
26532 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
26533 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
26534 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
26535 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
26536 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_ashlv2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
26537 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_ashlv4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
26538 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_ashlv8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
26539 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_ashlv16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
26540 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_lshlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
26541 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_lshlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
26542 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_lshlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
26543 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_lshlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
26544
26545 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
26546 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
26547 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
26548 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
26549 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
26550 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
26551
26552 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
26553 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
26554 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
26555 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
26556 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
26557 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
26558 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
26559 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
26560 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
26561 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
26562 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
26563 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
26564 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
26565 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
26566 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
26567
26568 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
26569 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
26570 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
26571 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
26572 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
26573 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
26574 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
26575
26576 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
26577 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
26578 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
26579 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
26580 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
26581 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
26582 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
26583
26584 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
26585 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
26586 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
26587 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
26588 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
26589 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
26590 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
26591
26592 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
26593 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
26594 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
26595 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
26596 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
26597 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
26598 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
26599
26600 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
26601 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
26602 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
26603 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
26604 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
26605 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
26606 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
26607
26608 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
26609 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
26610 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
26611 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
26612 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
26613 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
26614 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
26615
26616 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
26617 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
26618 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
26619 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
26620 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
26621 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
26622 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
26623
26624 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
26625 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
26626 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
26627 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
26628 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
26629 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
26630 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
26631
26632 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
26633 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
26634 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
26635 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
26636 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
26637 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
26638 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
26639 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
26640
26641 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
26642 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
26643 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
26644 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
26645 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
26646 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
26647 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
26648 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
26649
26650 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
26651 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
26652 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
26653 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
26654
26655 };
26656
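/* A minimal user-level sketch for one of the three-operand XOP entries above,
   assuming -mxop; the builtin name and the V2DI_FTYPE_V2DI_V2DI_V2DI shape
   come from the MULTI_ARG_3_DI entry, while the vector typedef and wrapper
   are only an illustration:

     typedef long long v2di __attribute__ ((vector_size (16)));

     v2di
     bitwise_conditional_move (v2di a, v2di b, v2di selector)
     {
       return __builtin_ia32_vpcmov (a, b, selector);
     }
*/
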
26657 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
26658 not in the current target ISA, so that the user can compile particular
26659 modules with target-specific options that differ from the command-line
26660 options. */
26661 static void
26662 ix86_init_mmx_sse_builtins (void)
26663 {
26664 const struct builtin_description * d;
26665 enum ix86_builtin_func_type ftype;
26666 size_t i;
26667
26668 /* Add all special builtins with a variable number of operands. */
26669 for (i = 0, d = bdesc_special_args;
26670 i < ARRAY_SIZE (bdesc_special_args);
26671 i++, d++)
26672 {
26673 if (d->name == 0)
26674 continue;
26675
26676 ftype = (enum ix86_builtin_func_type) d->flag;
26677 def_builtin (d->mask, d->name, ftype, d->code);
26678 }
26679
26680 /* Add all builtins with a variable number of operands. */
26681 for (i = 0, d = bdesc_args;
26682 i < ARRAY_SIZE (bdesc_args);
26683 i++, d++)
26684 {
26685 if (d->name == 0)
26686 continue;
26687
26688 ftype = (enum ix86_builtin_func_type) d->flag;
26689 def_builtin_const (d->mask, d->name, ftype, d->code);
26690 }
26691
26692 /* pcmpestr[im] insns. */
26693 for (i = 0, d = bdesc_pcmpestr;
26694 i < ARRAY_SIZE (bdesc_pcmpestr);
26695 i++, d++)
26696 {
26697 if (d->code == IX86_BUILTIN_PCMPESTRM128)
26698 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
26699 else
26700 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
26701 def_builtin_const (d->mask, d->name, ftype, d->code);
26702 }
26703
26704 /* pcmpistr[im] insns. */
26705 for (i = 0, d = bdesc_pcmpistr;
26706 i < ARRAY_SIZE (bdesc_pcmpistr);
26707 i++, d++)
26708 {
26709 if (d->code == IX86_BUILTIN_PCMPISTRM128)
26710 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
26711 else
26712 ftype = INT_FTYPE_V16QI_V16QI_INT;
26713 def_builtin_const (d->mask, d->name, ftype, d->code);
26714 }
26715
26716 /* comi/ucomi insns. */
26717 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
26718 {
26719 if (d->mask == OPTION_MASK_ISA_SSE2)
26720 ftype = INT_FTYPE_V2DF_V2DF;
26721 else
26722 ftype = INT_FTYPE_V4SF_V4SF;
26723 def_builtin_const (d->mask, d->name, ftype, d->code);
26724 }
26725
26726 /* SSE */
26727 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
26728 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
26729 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
26730 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
26731
26732 /* SSE or 3DNow!A */
26733 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
26734 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
26735 IX86_BUILTIN_MASKMOVQ);
26736
26737 /* SSE2 */
26738 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
26739 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
26740
26741 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
26742 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
26743 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
26744 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
26745
26746 /* SSE3. */
26747 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
26748 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
26749 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
26750 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
26751
26752 /* AES */
26753 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
26754 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
26755 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
26756 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
26757 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
26758 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
26759 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
26760 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
26761 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
26762 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
26763 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
26764 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
26765
26766 /* PCLMUL */
26767 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
26768 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
26769
26770 /* RDRND */
26771 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
26772 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
26773 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
26774 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
26775 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
26776 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
26777 IX86_BUILTIN_RDRAND64_STEP);
26778
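/* A minimal sketch of the RDRND step builtins defined above, assuming
   -mrdrnd; per INT_FTYPE_PUNSIGNED the builtin stores the random value
   through the pointer and returns a nonzero status when a value was
   delivered.  The retry loop is only an illustration:

     unsigned int
     hardware_random (void)
     {
       unsigned int val;
       while (!__builtin_ia32_rdrand32_step (&val))
         ;
       return val;
     }
*/
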
26779 /* AVX2 */
26780 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
26781 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
26782 IX86_BUILTIN_GATHERSIV2DF);
26783
26784 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
26785 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
26786 IX86_BUILTIN_GATHERSIV4DF);
26787
26788 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
26789 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
26790 IX86_BUILTIN_GATHERDIV2DF);
26791
26792 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
26793 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
26794 IX86_BUILTIN_GATHERDIV4DF);
26795
26796 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
26797 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
26798 IX86_BUILTIN_GATHERSIV4SF);
26799
26800 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
26801 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
26802 IX86_BUILTIN_GATHERSIV8SF);
26803
26804 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
26805 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
26806 IX86_BUILTIN_GATHERDIV4SF);
26807
26808 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
26809 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
26810 IX86_BUILTIN_GATHERDIV8SF);
26811
26812 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
26813 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
26814 IX86_BUILTIN_GATHERSIV2DI);
26815
26816 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
26817 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
26818 IX86_BUILTIN_GATHERSIV4DI);
26819
26820 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
26821 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
26822 IX86_BUILTIN_GATHERDIV2DI);
26823
26824 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
26825 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
26826 IX86_BUILTIN_GATHERDIV4DI);
26827
26828 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
26829 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
26830 IX86_BUILTIN_GATHERSIV4SI);
26831
26832 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
26833 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
26834 IX86_BUILTIN_GATHERSIV8SI);
26835
26836 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
26837 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
26838 IX86_BUILTIN_GATHERDIV4SI);
26839
26840 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
26841 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
26842 IX86_BUILTIN_GATHERDIV8SI);
26843
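/* A minimal sketch of the operand order implied by the gather signatures
   above, assuming -mavx2; V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT reads as
   (merge source, base pointer, index vector, mask vector, scale), with the
   scale a small constant.  The typedefs and wrapper are only an
   illustration:

     typedef double v4df __attribute__ ((vector_size (32)));
     typedef int v4si __attribute__ ((vector_size (16)));

     v4df
     gather_doubles (v4df src, const double *base, v4si index, v4df mask)
     {
       return __builtin_ia32_gathersiv4df (src, base, index, mask, 8);
     }
*/
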
26844 /* MMX access to the vec_init patterns. */
26845 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
26846 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
26847
26848 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
26849 V4HI_FTYPE_HI_HI_HI_HI,
26850 IX86_BUILTIN_VEC_INIT_V4HI);
26851
26852 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
26853 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
26854 IX86_BUILTIN_VEC_INIT_V8QI);
26855
26856 /* Access to the vec_extract patterns. */
26857 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
26858 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
26859 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
26860 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
26861 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
26862 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
26863 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
26864 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
26865 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
26866 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
26867
26868 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
26869 "__builtin_ia32_vec_ext_v4hi",
26870 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
26871
26872 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
26873 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
26874
26875 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
26876 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
26877
26878 /* Access to the vec_set patterns. */
26879 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
26880 "__builtin_ia32_vec_set_v2di",
26881 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
26882
26883 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
26884 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
26885
26886 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
26887 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
26888
26889 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
26890 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
26891
26892 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
26893 "__builtin_ia32_vec_set_v4hi",
26894 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
26895
26896 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
26897 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
26898
26899 /* Add the multi-arg builtins (FMA4/FMA and XOP instructions). */
26900 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
26901 {
26902 if (d->name == 0)
26903 continue;
26904
26905 ftype = (enum ix86_builtin_func_type) d->flag;
26906 def_builtin_const (d->mask, d->name, ftype, d->code);
26907 }
26908 }
26909
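/* A minimal sketch of the per-function use case described above
   ix86_init_mmx_sse_builtins: because the builtins are registered even when
   their ISA is not enabled on the command line, a single module can opt one
   function into an ISA with the target attribute and use the corresponding
   builtins there.  The attribute string and function body are only an
   illustration:

     __attribute__ ((target ("avx2")))
     void
     avx2_only_code (void)
     {
       ... AVX2 builtins such as the gathers above are usable here ...
     }
*/
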
26910 /* Internal helper for ix86_init_builtins. */
26911
26912 static void
26913 ix86_init_builtins_va_builtins_abi (void)
26914 {
26915 tree ms_va_ref, sysv_va_ref;
26916 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
26917 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
26918 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
26919 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
26920
26921 if (!TARGET_64BIT)
26922 return;
26923 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
26924 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
26925 ms_va_ref = build_reference_type (ms_va_list_type_node);
26926 sysv_va_ref =
26927 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
26928
26929 fnvoid_va_end_ms =
26930 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
26931 fnvoid_va_start_ms =
26932 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
26933 fnvoid_va_end_sysv =
26934 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
26935 fnvoid_va_start_sysv =
26936 build_varargs_function_type_list (void_type_node, sysv_va_ref,
26937 NULL_TREE);
26938 fnvoid_va_copy_ms =
26939 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
26940 NULL_TREE);
26941 fnvoid_va_copy_sysv =
26942 build_function_type_list (void_type_node, sysv_va_ref,
26943 sysv_va_ref, NULL_TREE);
26944
26945 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
26946 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
26947 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
26948 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
26949 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
26950 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
26951 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
26952 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
26953 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
26954 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
26955 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
26956 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
26957 }
26958
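/* A minimal sketch of the ABI-specific va builtins registered above, assuming
   a 64-bit target; in a function marked with the ms_abi attribute the usual
   stdarg.h macros operate on the Microsoft-style argument list that
   __builtin_ms_va_start/__builtin_ms_va_end manage.  The function below is
   only an illustration:

     #include <stdarg.h>

     int __attribute__ ((ms_abi))
     sum_ints (int count, ...)
     {
       va_list ap;
       int i, total = 0;
       va_start (ap, count);
       for (i = 0; i < count; i++)
         total += va_arg (ap, int);
       va_end (ap);
       return total;
     }
*/
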
26959 static void
26960 ix86_init_builtin_types (void)
26961 {
26962 tree float128_type_node, float80_type_node;
26963
26964 /* The __float80 type. */
26965 float80_type_node = long_double_type_node;
26966 if (TYPE_MODE (float80_type_node) != XFmode)
26967 {
26968 /* long double does not use XFmode here, so create a distinct 80-bit type. */
26969 float80_type_node = make_node (REAL_TYPE);
26970
26971 TYPE_PRECISION (float80_type_node) = 80;
26972 layout_type (float80_type_node);
26973 }
26974 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
26975
26976 /* The __float128 type. */
26977 float128_type_node = make_node (REAL_TYPE);
26978 TYPE_PRECISION (float128_type_node) = 128;
26979 layout_type (float128_type_node);
26980 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
26981
26982 /* This macro is built by i386-builtin-types.awk. */
26983 DEFINE_BUILTIN_PRIMITIVE_TYPES;
26984 }
26985
26986 static void
26987 ix86_init_builtins (void)
26988 {
26989 tree t;
26990
26991 ix86_init_builtin_types ();
26992
26993 /* TFmode support builtins. */
26994 def_builtin_const (0, "__builtin_infq",
26995 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
26996 def_builtin_const (0, "__builtin_huge_valq",
26997 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
26998
26999 /* We will expand them to a normal call if SSE2 isn't available, since
27000 they are used by libgcc. */
27001 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
27002 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
27003 BUILT_IN_MD, "__fabstf2", NULL_TREE);
27004 TREE_READONLY (t) = 1;
27005 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
27006
27007 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
27008 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
27009 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
27010 TREE_READONLY (t) = 1;
27011 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
27012
27013 ix86_init_mmx_sse_builtins ();
27014
27015 if (TARGET_LP64)
27016 ix86_init_builtins_va_builtins_abi ();
27017
27018 #ifdef SUBTARGET_INIT_BUILTINS
27019 SUBTARGET_INIT_BUILTINS;
27020 #endif
27021 }
27022
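/* A minimal sketch of the TFmode builtins registered above; __float128 is the
   type registered in ix86_init_builtin_types, and the wrapper itself is only
   an illustration:

     __float128
     magnitude_or_inf (__float128 x, int want_inf)
     {
       if (want_inf)
         return __builtin_infq ();
       return __builtin_fabsq (x);
     }
*/
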
27023 /* Return the ix86 builtin for CODE. */
27024
27025 static tree
27026 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
27027 {
27028 if (code >= IX86_BUILTIN_MAX)
27029 return error_mark_node;
27030
27031 return ix86_builtins[code];
27032 }
27033
27034 /* Errors in the source file can cause expand_expr to return const0_rtx
27035 where we expect a vector. To avoid crashing, use one of the vector
27036 clear instructions. */
27037 static rtx
27038 safe_vector_operand (rtx x, enum machine_mode mode)
27039 {
27040 if (x == const0_rtx)
27041 x = CONST0_RTX (mode);
27042 return x;
27043 }
27044
27045 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
27046
27047 static rtx
27048 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
27049 {
27050 rtx pat;
27051 tree arg0 = CALL_EXPR_ARG (exp, 0);
27052 tree arg1 = CALL_EXPR_ARG (exp, 1);
27053 rtx op0 = expand_normal (arg0);
27054 rtx op1 = expand_normal (arg1);
27055 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27056 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
27057 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
27058
27059 if (VECTOR_MODE_P (mode0))
27060 op0 = safe_vector_operand (op0, mode0);
27061 if (VECTOR_MODE_P (mode1))
27062 op1 = safe_vector_operand (op1, mode1);
27063
27064 if (optimize || !target
27065 || GET_MODE (target) != tmode
27066 || !insn_data[icode].operand[0].predicate (target, tmode))
27067 target = gen_reg_rtx (tmode);
27068
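/* If the instruction wants this operand in TImode but the caller passed an
SImode value, load it into the low element of a V4SI register and use the
TImode lowpart of that register. */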
27069 if (GET_MODE (op1) == SImode && mode1 == TImode)
27070 {
27071 rtx x = gen_reg_rtx (V4SImode);
27072 emit_insn (gen_sse2_loadd (x, op1));
27073 op1 = gen_lowpart (TImode, x);
27074 }
27075
27076 if (!insn_data[icode].operand[1].predicate (op0, mode0))
27077 op0 = copy_to_mode_reg (mode0, op0);
27078 if (!insn_data[icode].operand[2].predicate (op1, mode1))
27079 op1 = copy_to_mode_reg (mode1, op1);
27080
27081 pat = GEN_FCN (icode) (target, op0, op1);
27082 if (! pat)
27083 return 0;
27084
27085 emit_insn (pat);
27086
27087 return target;
27088 }
27089
27090 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
27091
27092 static rtx
27093 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
27094 enum ix86_builtin_func_type m_type,
27095 enum rtx_code sub_code)
27096 {
27097 rtx pat;
27098 int i;
27099 int nargs;
27100 bool comparison_p = false;
27101 bool tf_p = false;
27102 bool last_arg_constant = false;
27103 int num_memory = 0;
27104 struct {
27105 rtx op;
27106 enum machine_mode mode;
27107 } args[4];
27108
27109 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27110
27111 switch (m_type)
27112 {
27113 case MULTI_ARG_4_DF2_DI_I:
27114 case MULTI_ARG_4_DF2_DI_I1:
27115 case MULTI_ARG_4_SF2_SI_I:
27116 case MULTI_ARG_4_SF2_SI_I1:
27117 nargs = 4;
27118 last_arg_constant = true;
27119 break;
27120
27121 case MULTI_ARG_3_SF:
27122 case MULTI_ARG_3_DF:
27123 case MULTI_ARG_3_SF2:
27124 case MULTI_ARG_3_DF2:
27125 case MULTI_ARG_3_DI:
27126 case MULTI_ARG_3_SI:
27127 case MULTI_ARG_3_SI_DI:
27128 case MULTI_ARG_3_HI:
27129 case MULTI_ARG_3_HI_SI:
27130 case MULTI_ARG_3_QI:
27131 case MULTI_ARG_3_DI2:
27132 case MULTI_ARG_3_SI2:
27133 case MULTI_ARG_3_HI2:
27134 case MULTI_ARG_3_QI2:
27135 nargs = 3;
27136 break;
27137
27138 case MULTI_ARG_2_SF:
27139 case MULTI_ARG_2_DF:
27140 case MULTI_ARG_2_DI:
27141 case MULTI_ARG_2_SI:
27142 case MULTI_ARG_2_HI:
27143 case MULTI_ARG_2_QI:
27144 nargs = 2;
27145 break;
27146
27147 case MULTI_ARG_2_DI_IMM:
27148 case MULTI_ARG_2_SI_IMM:
27149 case MULTI_ARG_2_HI_IMM:
27150 case MULTI_ARG_2_QI_IMM:
27151 nargs = 2;
27152 last_arg_constant = true;
27153 break;
27154
27155 case MULTI_ARG_1_SF:
27156 case MULTI_ARG_1_DF:
27157 case MULTI_ARG_1_SF2:
27158 case MULTI_ARG_1_DF2:
27159 case MULTI_ARG_1_DI:
27160 case MULTI_ARG_1_SI:
27161 case MULTI_ARG_1_HI:
27162 case MULTI_ARG_1_QI:
27163 case MULTI_ARG_1_SI_DI:
27164 case MULTI_ARG_1_HI_DI:
27165 case MULTI_ARG_1_HI_SI:
27166 case MULTI_ARG_1_QI_DI:
27167 case MULTI_ARG_1_QI_SI:
27168 case MULTI_ARG_1_QI_HI:
27169 nargs = 1;
27170 break;
27171
27172 case MULTI_ARG_2_DI_CMP:
27173 case MULTI_ARG_2_SI_CMP:
27174 case MULTI_ARG_2_HI_CMP:
27175 case MULTI_ARG_2_QI_CMP:
27176 nargs = 2;
27177 comparison_p = true;
27178 break;
27179
27180 case MULTI_ARG_2_SF_TF:
27181 case MULTI_ARG_2_DF_TF:
27182 case MULTI_ARG_2_DI_TF:
27183 case MULTI_ARG_2_SI_TF:
27184 case MULTI_ARG_2_HI_TF:
27185 case MULTI_ARG_2_QI_TF:
27186 nargs = 2;
27187 tf_p = true;
27188 break;
27189
27190 default:
27191 gcc_unreachable ();
27192 }
27193
27194 if (optimize || !target
27195 || GET_MODE (target) != tmode
27196 || !insn_data[icode].operand[0].predicate (target, tmode))
27197 target = gen_reg_rtx (tmode);
27198
27199 gcc_assert (nargs <= 4);
27200
27201 for (i = 0; i < nargs; i++)
27202 {
27203 tree arg = CALL_EXPR_ARG (exp, i);
27204 rtx op = expand_normal (arg);
27205 int adjust = (comparison_p) ? 1 : 0;
27206 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
27207
27208 if (last_arg_constant && i == nargs - 1)
27209 {
27210 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
27211 {
27212 enum insn_code new_icode = icode;
27213 switch (icode)
27214 {
27215 case CODE_FOR_xop_vpermil2v2df3:
27216 case CODE_FOR_xop_vpermil2v4sf3:
27217 case CODE_FOR_xop_vpermil2v4df3:
27218 case CODE_FOR_xop_vpermil2v8sf3:
27219 error ("the last argument must be a 2-bit immediate");
27220 return gen_reg_rtx (tmode);
27221 case CODE_FOR_xop_rotlv2di3:
27222 new_icode = CODE_FOR_rotlv2di3;
27223 goto xop_rotl;
27224 case CODE_FOR_xop_rotlv4si3:
27225 new_icode = CODE_FOR_rotlv4si3;
27226 goto xop_rotl;
27227 case CODE_FOR_xop_rotlv8hi3:
27228 new_icode = CODE_FOR_rotlv8hi3;
27229 goto xop_rotl;
27230 case CODE_FOR_xop_rotlv16qi3:
27231 new_icode = CODE_FOR_rotlv16qi3;
27232 xop_rotl:
27233 if (CONST_INT_P (op))
27234 {
27235 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
27236 op = GEN_INT (INTVAL (op) & mask);
27237 gcc_checking_assert
27238 (insn_data[icode].operand[i + 1].predicate (op, mode));
27239 }
27240 else
27241 {
27242 gcc_checking_assert
27243 (nargs == 2
27244 && insn_data[new_icode].operand[0].mode == tmode
27245 && insn_data[new_icode].operand[1].mode == tmode
27246 && insn_data[new_icode].operand[2].mode == mode
27247 && insn_data[new_icode].operand[0].predicate
27248 == insn_data[icode].operand[0].predicate
27249 && insn_data[new_icode].operand[1].predicate
27250 == insn_data[icode].operand[1].predicate);
27251 icode = new_icode;
27252 goto non_constant;
27253 }
27254 break;
27255 default:
27256 gcc_unreachable ();
27257 }
27258 }
27259 }
27260 else
27261 {
27262 non_constant:
27263 if (VECTOR_MODE_P (mode))
27264 op = safe_vector_operand (op, mode);
27265
27266 /* If we aren't optimizing, only allow one memory operand to be
27267 generated. */
27268 if (memory_operand (op, mode))
27269 num_memory++;
27270
27271 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
27272
27273 if (optimize
27274 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
27275 || num_memory > 1)
27276 op = force_reg (mode, op);
27277 }
27278
27279 args[i].op = op;
27280 args[i].mode = mode;
27281 }
27282
27283 switch (nargs)
27284 {
27285 case 1:
27286 pat = GEN_FCN (icode) (target, args[0].op);
27287 break;
27288
27289 case 2:
27290 if (tf_p)
27291 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
27292 GEN_INT ((int)sub_code));
27293 else if (! comparison_p)
27294 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
27295 else
27296 {
27297 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
27298 args[0].op,
27299 args[1].op);
27300
27301 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
27302 }
27303 break;
27304
27305 case 3:
27306 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
27307 break;
27308
27309 case 4:
27310 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
27311 break;
27312
27313 default:
27314 gcc_unreachable ();
27315 }
27316
27317 if (! pat)
27318 return 0;
27319
27320 emit_insn (pat);
27321 return target;
27322 }
27323
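/* A minimal user-level sketch of a builtin that takes the comparison path in
   ix86_expand_multi_arg_builtin above, assuming -mxop; the name and the
   MULTI_ARG_2_QI_CMP shape come from bdesc_multi_arg, while the typedef and
   wrapper are only an illustration:

     typedef char v16qi __attribute__ ((vector_size (16)));

     v16qi
     byte_less_than (v16qi a, v16qi b)
     {
       return __builtin_ia32_vpcomltb (a, b);
     }
*/
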
27324 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
27325 insns with vec_merge. */
27326
27327 static rtx
27328 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
27329 rtx target)
27330 {
27331 rtx pat;
27332 tree arg0 = CALL_EXPR_ARG (exp, 0);
27333 rtx op1, op0 = expand_normal (arg0);
27334 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27335 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
27336
27337 if (optimize || !target
27338 || GET_MODE (target) != tmode
27339 || !insn_data[icode].operand[0].predicate (target, tmode))
27340 target = gen_reg_rtx (tmode);
27341
27342 if (VECTOR_MODE_P (mode0))
27343 op0 = safe_vector_operand (op0, mode0);
27344
27345 if ((optimize && !register_operand (op0, mode0))
27346 || !insn_data[icode].operand[1].predicate (op0, mode0))
27347 op0 = copy_to_mode_reg (mode0, op0);
27348
27349 op1 = op0;
27350 if (!insn_data[icode].operand[2].predicate (op1, mode0))
27351 op1 = copy_to_mode_reg (mode0, op1);
27352
27353 pat = GEN_FCN (icode) (target, op0, op1);
27354 if (! pat)
27355 return 0;
27356 emit_insn (pat);
27357 return target;
27358 }
27359
27360 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
27361
27362 static rtx
27363 ix86_expand_sse_compare (const struct builtin_description *d,
27364 tree exp, rtx target, bool swap)
27365 {
27366 rtx pat;
27367 tree arg0 = CALL_EXPR_ARG (exp, 0);
27368 tree arg1 = CALL_EXPR_ARG (exp, 1);
27369 rtx op0 = expand_normal (arg0);
27370 rtx op1 = expand_normal (arg1);
27371 rtx op2;
27372 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
27373 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
27374 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
27375 enum rtx_code comparison = d->comparison;
27376
27377 if (VECTOR_MODE_P (mode0))
27378 op0 = safe_vector_operand (op0, mode0);
27379 if (VECTOR_MODE_P (mode1))
27380 op1 = safe_vector_operand (op1, mode1);
27381
27382 /* Swap operands if we have a comparison that isn't available in
27383 hardware. */
27384 if (swap)
27385 {
27386 rtx tmp = gen_reg_rtx (mode1);
27387 emit_move_insn (tmp, op1);
27388 op1 = op0;
27389 op0 = tmp;
27390 }
27391
27392 if (optimize || !target
27393 || GET_MODE (target) != tmode
27394 || !insn_data[d->icode].operand[0].predicate (target, tmode))
27395 target = gen_reg_rtx (tmode);
27396
27397 if ((optimize && !register_operand (op0, mode0))
27398 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
27399 op0 = copy_to_mode_reg (mode0, op0);
27400 if ((optimize && !register_operand (op1, mode1))
27401 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
27402 op1 = copy_to_mode_reg (mode1, op1);
27403
27404 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
27405 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
27406 if (! pat)
27407 return 0;
27408 emit_insn (pat);
27409 return target;
27410 }
27411
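/* A minimal user-level sketch of the kind of compare this expander deals
   with, assuming -msse (which descriptor table routes a particular compare
   builtin here depends on its entry); __builtin_ia32_cmpltps is the builtin
   behind _mm_cmplt_ps and yields a per-element all-ones/all-zeros mask.  The
   typedef and wrapper are only an illustration:

     typedef float v4sf __attribute__ ((vector_size (16)));

     v4sf
     less_mask (v4sf a, v4sf b)
     {
       return __builtin_ia32_cmpltps (a, b);
     }
*/
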
27412 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
27413
27414 static rtx
27415 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
27416 rtx target)
27417 {
27418 rtx pat;
27419 tree arg0 = CALL_EXPR_ARG (exp, 0);
27420 tree arg1 = CALL_EXPR_ARG (exp, 1);
27421 rtx op0 = expand_normal (arg0);
27422 rtx op1 = expand_normal (arg1);
27423 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
27424 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
27425 enum rtx_code comparison = d->comparison;
27426
27427 if (VECTOR_MODE_P (mode0))
27428 op0 = safe_vector_operand (op0, mode0);
27429 if (VECTOR_MODE_P (mode1))
27430 op1 = safe_vector_operand (op1, mode1);
27431
27432 /* Swap operands if we have a comparison that isn't available in
27433 hardware. */
27434 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
27435 {
27436 rtx tmp = op1;
27437 op1 = op0;
27438 op0 = tmp;
27439 }
27440
27441 target = gen_reg_rtx (SImode);
27442 emit_move_insn (target, const0_rtx);
27443 target = gen_rtx_SUBREG (QImode, target, 0);
27444
27445 if ((optimize && !register_operand (op0, mode0))
27446 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
27447 op0 = copy_to_mode_reg (mode0, op0);
27448 if ((optimize && !register_operand (op1, mode1))
27449 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
27450 op1 = copy_to_mode_reg (mode1, op1);
27451
27452 pat = GEN_FCN (d->icode) (op0, op1);
27453 if (! pat)
27454 return 0;
27455 emit_insn (pat);
27456 emit_insn (gen_rtx_SET (VOIDmode,
27457 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
27458 gen_rtx_fmt_ee (comparison, QImode,
27459 SET_DEST (pat),
27460 const0_rtx)));
27461
27462 return SUBREG_REG (target);
27463 }
27464
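/* A minimal user-level sketch of a comi-style builtin handled above, assuming
   -msse; __builtin_ia32_comilt compares the low elements and returns an int,
   as _mm_comilt_ss does.  The typedef and wrapper are only an illustration:

     typedef float v4sf __attribute__ ((vector_size (16)));

     int
     low_element_less (v4sf a, v4sf b)
     {
       return __builtin_ia32_comilt (a, b);
     }
*/
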
27465 /* Subroutine of ix86_expand_args_builtin to take care of round insns. */
27466
27467 static rtx
27468 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
27469 rtx target)
27470 {
27471 rtx pat;
27472 tree arg0 = CALL_EXPR_ARG (exp, 0);
27473 rtx op1, op0 = expand_normal (arg0);
27474 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
27475 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
27476
27477 if (optimize || target == 0
27478 || GET_MODE (target) != tmode
27479 || !insn_data[d->icode].operand[0].predicate (target, tmode))
27480 target = gen_reg_rtx (tmode);
27481
27482 if (VECTOR_MODE_P (mode0))
27483 op0 = safe_vector_operand (op0, mode0);
27484
27485 if ((optimize && !register_operand (op0, mode0))
27486 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
27487 op0 = copy_to_mode_reg (mode0, op0);
27488
27489 op1 = GEN_INT (d->comparison);
27490
27491 pat = GEN_FCN (d->icode) (target, op0, op1);
27492 if (! pat)
27493 return 0;
27494 emit_insn (pat);
27495 return target;
27496 }
27497
27498 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
27499
27500 static rtx
27501 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
27502 rtx target)
27503 {
27504 rtx pat;
27505 tree arg0 = CALL_EXPR_ARG (exp, 0);
27506 tree arg1 = CALL_EXPR_ARG (exp, 1);
27507 rtx op0 = expand_normal (arg0);
27508 rtx op1 = expand_normal (arg1);
27509 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
27510 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
27511 enum rtx_code comparison = d->comparison;
27512
27513 if (VECTOR_MODE_P (mode0))
27514 op0 = safe_vector_operand (op0, mode0);
27515 if (VECTOR_MODE_P (mode1))
27516 op1 = safe_vector_operand (op1, mode1);
27517
27518 target = gen_reg_rtx (SImode);
27519 emit_move_insn (target, const0_rtx);
27520 target = gen_rtx_SUBREG (QImode, target, 0);
27521
27522 if ((optimize && !register_operand (op0, mode0))
27523 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
27524 op0 = copy_to_mode_reg (mode0, op0);
27525 if ((optimize && !register_operand (op1, mode1))
27526 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
27527 op1 = copy_to_mode_reg (mode1, op1);
27528
27529 pat = GEN_FCN (d->icode) (op0, op1);
27530 if (! pat)
27531 return 0;
27532 emit_insn (pat);
27533 emit_insn (gen_rtx_SET (VOIDmode,
27534 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
27535 gen_rtx_fmt_ee (comparison, QImode,
27536 SET_DEST (pat),
27537 const0_rtx)));
27538
27539 return SUBREG_REG (target);
27540 }
27541
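/* A minimal user-level sketch of a ptest builtin handled above, assuming
   -msse4.1; __builtin_ia32_ptestz128 underlies _mm_testz_si128 and returns an
   int derived from the flags, as the expander above materializes.  The
   typedef and wrapper are only an illustration:

     typedef long long v2di __attribute__ ((vector_size (16)));

     int
     masked_bits_all_zero (v2di value, v2di mask)
     {
       return __builtin_ia32_ptestz128 (value, mask);
     }
*/
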
27542 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
27543
27544 static rtx
27545 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
27546 tree exp, rtx target)
27547 {
27548 rtx pat;
27549 tree arg0 = CALL_EXPR_ARG (exp, 0);
27550 tree arg1 = CALL_EXPR_ARG (exp, 1);
27551 tree arg2 = CALL_EXPR_ARG (exp, 2);
27552 tree arg3 = CALL_EXPR_ARG (exp, 3);
27553 tree arg4 = CALL_EXPR_ARG (exp, 4);
27554 rtx scratch0, scratch1;
27555 rtx op0 = expand_normal (arg0);
27556 rtx op1 = expand_normal (arg1);
27557 rtx op2 = expand_normal (arg2);
27558 rtx op3 = expand_normal (arg3);
27559 rtx op4 = expand_normal (arg4);
27560 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
27561
27562 tmode0 = insn_data[d->icode].operand[0].mode;
27563 tmode1 = insn_data[d->icode].operand[1].mode;
27564 modev2 = insn_data[d->icode].operand[2].mode;
27565 modei3 = insn_data[d->icode].operand[3].mode;
27566 modev4 = insn_data[d->icode].operand[4].mode;
27567 modei5 = insn_data[d->icode].operand[5].mode;
27568 modeimm = insn_data[d->icode].operand[6].mode;
27569
27570 if (VECTOR_MODE_P (modev2))
27571 op0 = safe_vector_operand (op0, modev2);
27572 if (VECTOR_MODE_P (modev4))
27573 op2 = safe_vector_operand (op2, modev4);
27574
27575 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
27576 op0 = copy_to_mode_reg (modev2, op0);
27577 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
27578 op1 = copy_to_mode_reg (modei3, op1);
27579 if ((optimize && !register_operand (op2, modev4))
27580 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
27581 op2 = copy_to_mode_reg (modev4, op2);
27582 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
27583 op3 = copy_to_mode_reg (modei5, op3);
27584
27585 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
27586 {
27587 error ("the fifth argument must be an 8-bit immediate");
27588 return const0_rtx;
27589 }
27590
27591 if (d->code == IX86_BUILTIN_PCMPESTRI128)
27592 {
27593 if (optimize || !target
27594 || GET_MODE (target) != tmode0
27595 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
27596 target = gen_reg_rtx (tmode0);
27597
27598 scratch1 = gen_reg_rtx (tmode1);
27599
27600 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
27601 }
27602 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
27603 {
27604 if (optimize || !target
27605 || GET_MODE (target) != tmode1
27606 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
27607 target = gen_reg_rtx (tmode1);
27608
27609 scratch0 = gen_reg_rtx (tmode0);
27610
27611 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
27612 }
27613 else
27614 {
27615 gcc_assert (d->flag);
27616
27617 scratch0 = gen_reg_rtx (tmode0);
27618 scratch1 = gen_reg_rtx (tmode1);
27619
27620 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
27621 }
27622
27623 if (! pat)
27624 return 0;
27625
27626 emit_insn (pat);
27627
27628 if (d->flag)
27629 {
27630 target = gen_reg_rtx (SImode);
27631 emit_move_insn (target, const0_rtx);
27632 target = gen_rtx_SUBREG (QImode, target, 0);
27633
27634 emit_insn
27635 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
27636 gen_rtx_fmt_ee (EQ, QImode,
27637 gen_rtx_REG ((enum machine_mode) d->flag,
27638 FLAGS_REG),
27639 const0_rtx)));
27640 return SUBREG_REG (target);
27641 }
27642 else
27643 return target;
27644 }
27645
27646
27647 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
27648
27649 static rtx
27650 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
27651 tree exp, rtx target)
27652 {
27653 rtx pat;
27654 tree arg0 = CALL_EXPR_ARG (exp, 0);
27655 tree arg1 = CALL_EXPR_ARG (exp, 1);
27656 tree arg2 = CALL_EXPR_ARG (exp, 2);
27657 rtx scratch0, scratch1;
27658 rtx op0 = expand_normal (arg0);
27659 rtx op1 = expand_normal (arg1);
27660 rtx op2 = expand_normal (arg2);
27661 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
27662
27663 tmode0 = insn_data[d->icode].operand[0].mode;
27664 tmode1 = insn_data[d->icode].operand[1].mode;
27665 modev2 = insn_data[d->icode].operand[2].mode;
27666 modev3 = insn_data[d->icode].operand[3].mode;
27667 modeimm = insn_data[d->icode].operand[4].mode;
27668
27669 if (VECTOR_MODE_P (modev2))
27670 op0 = safe_vector_operand (op0, modev2);
27671 if (VECTOR_MODE_P (modev3))
27672 op1 = safe_vector_operand (op1, modev3);
27673
27674 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
27675 op0 = copy_to_mode_reg (modev2, op0);
27676 if ((optimize && !register_operand (op1, modev3))
27677 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
27678 op1 = copy_to_mode_reg (modev3, op1);
27679
27680 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
27681 {
27682 error ("the third argument must be an 8-bit immediate");
27683 return const0_rtx;
27684 }
27685
27686 if (d->code == IX86_BUILTIN_PCMPISTRI128)
27687 {
27688 if (optimize || !target
27689 || GET_MODE (target) != tmode0
27690 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
27691 target = gen_reg_rtx (tmode0);
27692
27693 scratch1 = gen_reg_rtx (tmode1);
27694
27695 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
27696 }
27697 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
27698 {
27699 if (optimize || !target
27700 || GET_MODE (target) != tmode1
27701 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
27702 target = gen_reg_rtx (tmode1);
27703
27704 scratch0 = gen_reg_rtx (tmode0);
27705
27706 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
27707 }
27708 else
27709 {
27710 gcc_assert (d->flag);
27711
27712 scratch0 = gen_reg_rtx (tmode0);
27713 scratch1 = gen_reg_rtx (tmode1);
27714
27715 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
27716 }
27717
27718 if (! pat)
27719 return 0;
27720
27721 emit_insn (pat);
27722
27723 if (d->flag)
27724 {
27725 target = gen_reg_rtx (SImode);
27726 emit_move_insn (target, const0_rtx);
27727 target = gen_rtx_SUBREG (QImode, target, 0);
27728
27729 emit_insn
27730 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
27731 gen_rtx_fmt_ee (EQ, QImode,
27732 gen_rtx_REG ((enum machine_mode) d->flag,
27733 FLAGS_REG),
27734 const0_rtx)));
27735 return SUBREG_REG (target);
27736 }
27737 else
27738 return target;
27739 }
27740
27741 /* Subroutine of ix86_expand_builtin to take care of insns with
 27742 a variable number of operands. */
27743
27744 static rtx
27745 ix86_expand_args_builtin (const struct builtin_description *d,
27746 tree exp, rtx target)
27747 {
27748 rtx pat, real_target;
27749 unsigned int i, nargs;
27750 unsigned int nargs_constant = 0;
27751 int num_memory = 0;
27752 struct
27753 {
27754 rtx op;
27755 enum machine_mode mode;
27756 } args[4];
27757 bool last_arg_count = false;
27758 enum insn_code icode = d->icode;
27759 const struct insn_data_d *insn_p = &insn_data[icode];
27760 enum machine_mode tmode = insn_p->operand[0].mode;
27761 enum machine_mode rmode = VOIDmode;
27762 bool swap = false;
27763 enum rtx_code comparison = d->comparison;
27764
27765 switch ((enum ix86_builtin_func_type) d->flag)
27766 {
27767 case V2DF_FTYPE_V2DF_ROUND:
27768 case V4DF_FTYPE_V4DF_ROUND:
27769 case V4SF_FTYPE_V4SF_ROUND:
27770 case V8SF_FTYPE_V8SF_ROUND:
27771 return ix86_expand_sse_round (d, exp, target);
27772 case INT_FTYPE_V8SF_V8SF_PTEST:
27773 case INT_FTYPE_V4DI_V4DI_PTEST:
27774 case INT_FTYPE_V4DF_V4DF_PTEST:
27775 case INT_FTYPE_V4SF_V4SF_PTEST:
27776 case INT_FTYPE_V2DI_V2DI_PTEST:
27777 case INT_FTYPE_V2DF_V2DF_PTEST:
27778 return ix86_expand_sse_ptest (d, exp, target);
27779 case FLOAT128_FTYPE_FLOAT128:
27780 case FLOAT_FTYPE_FLOAT:
27781 case INT_FTYPE_INT:
27782 case UINT64_FTYPE_INT:
27783 case UINT16_FTYPE_UINT16:
27784 case INT64_FTYPE_INT64:
27785 case INT64_FTYPE_V4SF:
27786 case INT64_FTYPE_V2DF:
27787 case INT_FTYPE_V16QI:
27788 case INT_FTYPE_V8QI:
27789 case INT_FTYPE_V8SF:
27790 case INT_FTYPE_V4DF:
27791 case INT_FTYPE_V4SF:
27792 case INT_FTYPE_V2DF:
27793 case INT_FTYPE_V32QI:
27794 case V16QI_FTYPE_V16QI:
27795 case V8SI_FTYPE_V8SF:
27796 case V8SI_FTYPE_V4SI:
27797 case V8HI_FTYPE_V8HI:
27798 case V8HI_FTYPE_V16QI:
27799 case V8QI_FTYPE_V8QI:
27800 case V8SF_FTYPE_V8SF:
27801 case V8SF_FTYPE_V8SI:
27802 case V8SF_FTYPE_V4SF:
27803 case V8SF_FTYPE_V8HI:
27804 case V4SI_FTYPE_V4SI:
27805 case V4SI_FTYPE_V16QI:
27806 case V4SI_FTYPE_V4SF:
27807 case V4SI_FTYPE_V8SI:
27808 case V4SI_FTYPE_V8HI:
27809 case V4SI_FTYPE_V4DF:
27810 case V4SI_FTYPE_V2DF:
27811 case V4HI_FTYPE_V4HI:
27812 case V4DF_FTYPE_V4DF:
27813 case V4DF_FTYPE_V4SI:
27814 case V4DF_FTYPE_V4SF:
27815 case V4DF_FTYPE_V2DF:
27816 case V4SF_FTYPE_V4SF:
27817 case V4SF_FTYPE_V4SI:
27818 case V4SF_FTYPE_V8SF:
27819 case V4SF_FTYPE_V4DF:
27820 case V4SF_FTYPE_V8HI:
27821 case V4SF_FTYPE_V2DF:
27822 case V2DI_FTYPE_V2DI:
27823 case V2DI_FTYPE_V16QI:
27824 case V2DI_FTYPE_V8HI:
27825 case V2DI_FTYPE_V4SI:
27826 case V2DF_FTYPE_V2DF:
27827 case V2DF_FTYPE_V4SI:
27828 case V2DF_FTYPE_V4DF:
27829 case V2DF_FTYPE_V4SF:
27830 case V2DF_FTYPE_V2SI:
27831 case V2SI_FTYPE_V2SI:
27832 case V2SI_FTYPE_V4SF:
27833 case V2SI_FTYPE_V2SF:
27834 case V2SI_FTYPE_V2DF:
27835 case V2SF_FTYPE_V2SF:
27836 case V2SF_FTYPE_V2SI:
27837 case V32QI_FTYPE_V32QI:
27838 case V32QI_FTYPE_V16QI:
27839 case V16HI_FTYPE_V16HI:
27840 case V16HI_FTYPE_V8HI:
27841 case V8SI_FTYPE_V8SI:
27842 case V16HI_FTYPE_V16QI:
27843 case V8SI_FTYPE_V16QI:
27844 case V4DI_FTYPE_V16QI:
27845 case V8SI_FTYPE_V8HI:
27846 case V4DI_FTYPE_V8HI:
27847 case V4DI_FTYPE_V4SI:
27848 case V4DI_FTYPE_V2DI:
27849 nargs = 1;
27850 break;
27851 case V4SF_FTYPE_V4SF_VEC_MERGE:
27852 case V2DF_FTYPE_V2DF_VEC_MERGE:
27853 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
27854 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
27855 case V16QI_FTYPE_V16QI_V16QI:
27856 case V16QI_FTYPE_V8HI_V8HI:
27857 case V8QI_FTYPE_V8QI_V8QI:
27858 case V8QI_FTYPE_V4HI_V4HI:
27859 case V8HI_FTYPE_V8HI_V8HI:
27860 case V8HI_FTYPE_V16QI_V16QI:
27861 case V8HI_FTYPE_V4SI_V4SI:
27862 case V8SF_FTYPE_V8SF_V8SF:
27863 case V8SF_FTYPE_V8SF_V8SI:
27864 case V4SI_FTYPE_V4SI_V4SI:
27865 case V4SI_FTYPE_V8HI_V8HI:
27866 case V4SI_FTYPE_V4SF_V4SF:
27867 case V4SI_FTYPE_V2DF_V2DF:
27868 case V4HI_FTYPE_V4HI_V4HI:
27869 case V4HI_FTYPE_V8QI_V8QI:
27870 case V4HI_FTYPE_V2SI_V2SI:
27871 case V4DF_FTYPE_V4DF_V4DF:
27872 case V4DF_FTYPE_V4DF_V4DI:
27873 case V4SF_FTYPE_V4SF_V4SF:
27874 case V4SF_FTYPE_V4SF_V4SI:
27875 case V4SF_FTYPE_V4SF_V2SI:
27876 case V4SF_FTYPE_V4SF_V2DF:
27877 case V4SF_FTYPE_V4SF_DI:
27878 case V4SF_FTYPE_V4SF_SI:
27879 case V2DI_FTYPE_V2DI_V2DI:
27880 case V2DI_FTYPE_V16QI_V16QI:
27881 case V2DI_FTYPE_V4SI_V4SI:
27882 case V2DI_FTYPE_V2DI_V16QI:
27883 case V2DI_FTYPE_V2DF_V2DF:
27884 case V2SI_FTYPE_V2SI_V2SI:
27885 case V2SI_FTYPE_V4HI_V4HI:
27886 case V2SI_FTYPE_V2SF_V2SF:
27887 case V2DF_FTYPE_V2DF_V2DF:
27888 case V2DF_FTYPE_V2DF_V4SF:
27889 case V2DF_FTYPE_V2DF_V2DI:
27890 case V2DF_FTYPE_V2DF_DI:
27891 case V2DF_FTYPE_V2DF_SI:
27892 case V2SF_FTYPE_V2SF_V2SF:
27893 case V1DI_FTYPE_V1DI_V1DI:
27894 case V1DI_FTYPE_V8QI_V8QI:
27895 case V1DI_FTYPE_V2SI_V2SI:
27896 case V32QI_FTYPE_V16HI_V16HI:
27897 case V16HI_FTYPE_V8SI_V8SI:
27898 case V32QI_FTYPE_V32QI_V32QI:
27899 case V16HI_FTYPE_V32QI_V32QI:
27900 case V16HI_FTYPE_V16HI_V16HI:
27901 case V8SI_FTYPE_V8SI_V8SI:
27902 case V8SI_FTYPE_V16HI_V16HI:
27903 case V4DI_FTYPE_V4DI_V4DI:
27904 case V4DI_FTYPE_V8SI_V8SI:
27905 if (comparison == UNKNOWN)
27906 return ix86_expand_binop_builtin (icode, exp, target);
27907 nargs = 2;
27908 break;
27909 case V4SF_FTYPE_V4SF_V4SF_SWAP:
27910 case V2DF_FTYPE_V2DF_V2DF_SWAP:
27911 gcc_assert (comparison != UNKNOWN);
27912 nargs = 2;
27913 swap = true;
27914 break;
27915 case V16HI_FTYPE_V16HI_V8HI_COUNT:
27916 case V16HI_FTYPE_V16HI_SI_COUNT:
27917 case V8SI_FTYPE_V8SI_V4SI_COUNT:
27918 case V8SI_FTYPE_V8SI_SI_COUNT:
27919 case V4DI_FTYPE_V4DI_V2DI_COUNT:
27920 case V4DI_FTYPE_V4DI_INT_COUNT:
27921 case V8HI_FTYPE_V8HI_V8HI_COUNT:
27922 case V8HI_FTYPE_V8HI_SI_COUNT:
27923 case V4SI_FTYPE_V4SI_V4SI_COUNT:
27924 case V4SI_FTYPE_V4SI_SI_COUNT:
27925 case V4HI_FTYPE_V4HI_V4HI_COUNT:
27926 case V4HI_FTYPE_V4HI_SI_COUNT:
27927 case V2DI_FTYPE_V2DI_V2DI_COUNT:
27928 case V2DI_FTYPE_V2DI_SI_COUNT:
27929 case V2SI_FTYPE_V2SI_V2SI_COUNT:
27930 case V2SI_FTYPE_V2SI_SI_COUNT:
27931 case V1DI_FTYPE_V1DI_V1DI_COUNT:
27932 case V1DI_FTYPE_V1DI_SI_COUNT:
27933 nargs = 2;
27934 last_arg_count = true;
27935 break;
27936 case UINT64_FTYPE_UINT64_UINT64:
27937 case UINT_FTYPE_UINT_UINT:
27938 case UINT_FTYPE_UINT_USHORT:
27939 case UINT_FTYPE_UINT_UCHAR:
27940 case UINT16_FTYPE_UINT16_INT:
27941 case UINT8_FTYPE_UINT8_INT:
27942 nargs = 2;
27943 break;
27944 case V2DI_FTYPE_V2DI_INT_CONVERT:
27945 nargs = 2;
27946 rmode = V1TImode;
27947 nargs_constant = 1;
27948 break;
27949 case V4DI_FTYPE_V4DI_INT_CONVERT:
27950 nargs = 2;
27951 rmode = V2TImode;
27952 nargs_constant = 1;
27953 break;
27954 case V8HI_FTYPE_V8HI_INT:
27955 case V8HI_FTYPE_V8SF_INT:
27956 case V8HI_FTYPE_V4SF_INT:
27957 case V8SF_FTYPE_V8SF_INT:
27958 case V4SI_FTYPE_V4SI_INT:
27959 case V4SI_FTYPE_V8SI_INT:
27960 case V4HI_FTYPE_V4HI_INT:
27961 case V4DF_FTYPE_V4DF_INT:
27962 case V4SF_FTYPE_V4SF_INT:
27963 case V4SF_FTYPE_V8SF_INT:
27964 case V2DI_FTYPE_V2DI_INT:
27965 case V2DF_FTYPE_V2DF_INT:
27966 case V2DF_FTYPE_V4DF_INT:
27967 case V16HI_FTYPE_V16HI_INT:
27968 case V8SI_FTYPE_V8SI_INT:
27969 case V4DI_FTYPE_V4DI_INT:
27970 case V2DI_FTYPE_V4DI_INT:
27971 nargs = 2;
27972 nargs_constant = 1;
27973 break;
27974 case V16QI_FTYPE_V16QI_V16QI_V16QI:
27975 case V8SF_FTYPE_V8SF_V8SF_V8SF:
27976 case V4DF_FTYPE_V4DF_V4DF_V4DF:
27977 case V4SF_FTYPE_V4SF_V4SF_V4SF:
27978 case V2DF_FTYPE_V2DF_V2DF_V2DF:
27979 case V32QI_FTYPE_V32QI_V32QI_V32QI:
27980 nargs = 3;
27981 break;
27982 case V32QI_FTYPE_V32QI_V32QI_INT:
27983 case V16HI_FTYPE_V16HI_V16HI_INT:
27984 case V16QI_FTYPE_V16QI_V16QI_INT:
27985 case V4DI_FTYPE_V4DI_V4DI_INT:
27986 case V8HI_FTYPE_V8HI_V8HI_INT:
27987 case V8SI_FTYPE_V8SI_V8SI_INT:
27988 case V8SI_FTYPE_V8SI_V4SI_INT:
27989 case V8SF_FTYPE_V8SF_V8SF_INT:
27990 case V8SF_FTYPE_V8SF_V4SF_INT:
27991 case V4SI_FTYPE_V4SI_V4SI_INT:
27992 case V4DF_FTYPE_V4DF_V4DF_INT:
27993 case V4DF_FTYPE_V4DF_V2DF_INT:
27994 case V4SF_FTYPE_V4SF_V4SF_INT:
27995 case V2DI_FTYPE_V2DI_V2DI_INT:
27996 case V4DI_FTYPE_V4DI_V2DI_INT:
27997 case V2DF_FTYPE_V2DF_V2DF_INT:
27998 nargs = 3;
27999 nargs_constant = 1;
28000 break;
28001 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
28002 nargs = 3;
28003 rmode = V4DImode;
28004 nargs_constant = 1;
28005 break;
28006 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
28007 nargs = 3;
28008 rmode = V2DImode;
28009 nargs_constant = 1;
28010 break;
28011 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
28012 nargs = 3;
28013 rmode = DImode;
28014 nargs_constant = 1;
28015 break;
28016 case V2DI_FTYPE_V2DI_UINT_UINT:
28017 nargs = 3;
28018 nargs_constant = 2;
28019 break;
28020 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
28021 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
28022 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
28023 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
28024 nargs = 4;
28025 nargs_constant = 1;
28026 break;
28027 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
28028 nargs = 4;
28029 nargs_constant = 2;
28030 break;
28031 default:
28032 gcc_unreachable ();
28033 }
28034
28035 gcc_assert (nargs <= ARRAY_SIZE (args));
28036
28037 if (comparison != UNKNOWN)
28038 {
28039 gcc_assert (nargs == 2);
28040 return ix86_expand_sse_compare (d, exp, target, swap);
28041 }
28042
28043 if (rmode == VOIDmode || rmode == tmode)
28044 {
28045 if (optimize
28046 || target == 0
28047 || GET_MODE (target) != tmode
28048 || !insn_p->operand[0].predicate (target, tmode))
28049 target = gen_reg_rtx (tmode);
28050 real_target = target;
28051 }
28052 else
28053 {
28054 target = gen_reg_rtx (rmode);
28055 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
28056 }
28057
28058 for (i = 0; i < nargs; i++)
28059 {
28060 tree arg = CALL_EXPR_ARG (exp, i);
28061 rtx op = expand_normal (arg);
28062 enum machine_mode mode = insn_p->operand[i + 1].mode;
28063 bool match = insn_p->operand[i + 1].predicate (op, mode);
28064
28065 if (last_arg_count && (i + 1) == nargs)
28066 {
 28067 /* SIMD shift insns take either an 8-bit immediate or a
 28068 register as the count, but the builtin functions always take
 28069 an int as the count.  If the count doesn't match, put it in a register. */
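 /* For example, a non-constant count passed to a shift builtin such as
 __builtin_ia32_pslldi128 (assumed here to be declared with one of the
 *_COUNT types above) arrives as an int; it is viewed as an SImode
 subreg and, if the pattern still rejects it, copied into a register.  */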
28070 if (!match)
28071 {
28072 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
28073 if (!insn_p->operand[i + 1].predicate (op, mode))
28074 op = copy_to_reg (op);
28075 }
28076 }
28077 else if ((nargs - i) <= nargs_constant)
28078 {
28079 if (!match)
28080 switch (icode)
28081 {
28082 case CODE_FOR_avx2_inserti128:
28083 case CODE_FOR_avx2_extracti128:
28084 error ("the last argument must be an 1-bit immediate");
28085 return const0_rtx;
28086
28087 case CODE_FOR_sse4_1_roundpd:
28088 case CODE_FOR_sse4_1_roundps:
28089 case CODE_FOR_sse4_1_roundsd:
28090 case CODE_FOR_sse4_1_roundss:
28091 case CODE_FOR_sse4_1_blendps:
28092 case CODE_FOR_avx_blendpd256:
28093 case CODE_FOR_avx_vpermilv4df:
28094 case CODE_FOR_avx_roundpd256:
28095 case CODE_FOR_avx_roundps256:
28096 error ("the last argument must be a 4-bit immediate");
28097 return const0_rtx;
28098
28099 case CODE_FOR_sse4_1_blendpd:
28100 case CODE_FOR_avx_vpermilv2df:
28101 case CODE_FOR_xop_vpermil2v2df3:
28102 case CODE_FOR_xop_vpermil2v4sf3:
28103 case CODE_FOR_xop_vpermil2v4df3:
28104 case CODE_FOR_xop_vpermil2v8sf3:
28105 error ("the last argument must be a 2-bit immediate");
28106 return const0_rtx;
28107
28108 case CODE_FOR_avx_vextractf128v4df:
28109 case CODE_FOR_avx_vextractf128v8sf:
28110 case CODE_FOR_avx_vextractf128v8si:
28111 case CODE_FOR_avx_vinsertf128v4df:
28112 case CODE_FOR_avx_vinsertf128v8sf:
28113 case CODE_FOR_avx_vinsertf128v8si:
28114 error ("the last argument must be a 1-bit immediate");
28115 return const0_rtx;
28116
28117 case CODE_FOR_avx_vmcmpv2df3:
28118 case CODE_FOR_avx_vmcmpv4sf3:
28119 case CODE_FOR_avx_cmpv2df3:
28120 case CODE_FOR_avx_cmpv4sf3:
28121 case CODE_FOR_avx_cmpv4df3:
28122 case CODE_FOR_avx_cmpv8sf3:
28123 error ("the last argument must be a 5-bit immediate");
28124 return const0_rtx;
28125
28126 default:
28127 switch (nargs_constant)
28128 {
28129 case 2:
28130 if ((nargs - i) == nargs_constant)
28131 {
28132 error ("the next to last argument must be an 8-bit immediate");
28133 break;
28134 }
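 /* FALLTHRU */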
28135 case 1:
28136 error ("the last argument must be an 8-bit immediate");
28137 break;
28138 default:
28139 gcc_unreachable ();
28140 }
28141 return const0_rtx;
28142 }
28143 }
28144 else
28145 {
28146 if (VECTOR_MODE_P (mode))
28147 op = safe_vector_operand (op, mode);
28148
28149 /* If we aren't optimizing, only allow one memory operand to
28150 be generated. */
28151 if (memory_operand (op, mode))
28152 num_memory++;
28153
28154 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
28155 {
28156 if (optimize || !match || num_memory > 1)
28157 op = copy_to_mode_reg (mode, op);
28158 }
28159 else
28160 {
28161 op = copy_to_reg (op);
28162 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
28163 }
28164 }
28165
28166 args[i].op = op;
28167 args[i].mode = mode;
28168 }
28169
28170 switch (nargs)
28171 {
28172 case 1:
28173 pat = GEN_FCN (icode) (real_target, args[0].op);
28174 break;
28175 case 2:
28176 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
28177 break;
28178 case 3:
28179 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
28180 args[2].op);
28181 break;
28182 case 4:
28183 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
28184 args[2].op, args[3].op);
28185 break;
28186 default:
28187 gcc_unreachable ();
28188 }
28189
28190 if (! pat)
28191 return 0;
28192
28193 emit_insn (pat);
28194 return target;
28195 }
28196
28197 /* Subroutine of ix86_expand_builtin to take care of special insns
 28198 with a variable number of operands. */
28199
28200 static rtx
28201 ix86_expand_special_args_builtin (const struct builtin_description *d,
28202 tree exp, rtx target)
28203 {
28204 tree arg;
28205 rtx pat, op;
28206 unsigned int i, nargs, arg_adjust, memory;
28207 struct
28208 {
28209 rtx op;
28210 enum machine_mode mode;
28211 } args[3];
28212 enum insn_code icode = d->icode;
28213 bool last_arg_constant = false;
28214 const struct insn_data_d *insn_p = &insn_data[icode];
28215 enum machine_mode tmode = insn_p->operand[0].mode;
28216 enum { load, store } klass;
28217
28218 switch ((enum ix86_builtin_func_type) d->flag)
28219 {
28220 case VOID_FTYPE_VOID:
28221 if (icode == CODE_FOR_avx_vzeroupper)
28222 target = GEN_INT (vzeroupper_intrinsic);
28223 emit_insn (GEN_FCN (icode) (target));
28224 return 0;
28225 case VOID_FTYPE_UINT64:
28226 case VOID_FTYPE_UNSIGNED:
28227 nargs = 0;
28228 klass = store;
28229 memory = 0;
28230 break;
28231 case UINT64_FTYPE_VOID:
28232 case UNSIGNED_FTYPE_VOID:
28233 nargs = 0;
28234 klass = load;
28235 memory = 0;
28236 break;
28237 case UINT64_FTYPE_PUNSIGNED:
28238 case V2DI_FTYPE_PV2DI:
28239 case V4DI_FTYPE_PV4DI:
28240 case V32QI_FTYPE_PCCHAR:
28241 case V16QI_FTYPE_PCCHAR:
28242 case V8SF_FTYPE_PCV4SF:
28243 case V8SF_FTYPE_PCFLOAT:
28244 case V4SF_FTYPE_PCFLOAT:
28245 case V4DF_FTYPE_PCV2DF:
28246 case V4DF_FTYPE_PCDOUBLE:
28247 case V2DF_FTYPE_PCDOUBLE:
28248 case VOID_FTYPE_PVOID:
28249 nargs = 1;
28250 klass = load;
28251 memory = 0;
28252 break;
28253 case VOID_FTYPE_PV2SF_V4SF:
28254 case VOID_FTYPE_PV4DI_V4DI:
28255 case VOID_FTYPE_PV2DI_V2DI:
28256 case VOID_FTYPE_PCHAR_V32QI:
28257 case VOID_FTYPE_PCHAR_V16QI:
28258 case VOID_FTYPE_PFLOAT_V8SF:
28259 case VOID_FTYPE_PFLOAT_V4SF:
28260 case VOID_FTYPE_PDOUBLE_V4DF:
28261 case VOID_FTYPE_PDOUBLE_V2DF:
28262 case VOID_FTYPE_PULONGLONG_ULONGLONG:
28263 case VOID_FTYPE_PINT_INT:
28264 nargs = 1;
28265 klass = store;
28266 /* Reserve memory operand for target. */
28267 memory = ARRAY_SIZE (args);
28268 break;
28269 case V4SF_FTYPE_V4SF_PCV2SF:
28270 case V2DF_FTYPE_V2DF_PCDOUBLE:
28271 nargs = 2;
28272 klass = load;
28273 memory = 1;
28274 break;
28275 case V8SF_FTYPE_PCV8SF_V8SI:
28276 case V4DF_FTYPE_PCV4DF_V4DI:
28277 case V4SF_FTYPE_PCV4SF_V4SI:
28278 case V2DF_FTYPE_PCV2DF_V2DI:
28279 case V8SI_FTYPE_PCV8SI_V8SI:
28280 case V4DI_FTYPE_PCV4DI_V4DI:
28281 case V4SI_FTYPE_PCV4SI_V4SI:
28282 case V2DI_FTYPE_PCV2DI_V2DI:
28283 nargs = 2;
28284 klass = load;
28285 memory = 0;
28286 break;
28287 case VOID_FTYPE_PV8SF_V8SI_V8SF:
28288 case VOID_FTYPE_PV4DF_V4DI_V4DF:
28289 case VOID_FTYPE_PV4SF_V4SI_V4SF:
28290 case VOID_FTYPE_PV2DF_V2DI_V2DF:
28291 case VOID_FTYPE_PV8SI_V8SI_V8SI:
28292 case VOID_FTYPE_PV4DI_V4DI_V4DI:
28293 case VOID_FTYPE_PV4SI_V4SI_V4SI:
28294 case VOID_FTYPE_PV2DI_V2DI_V2DI:
28295 nargs = 2;
28296 klass = store;
28297 /* Reserve memory operand for target. */
28298 memory = ARRAY_SIZE (args);
28299 break;
28300 case VOID_FTYPE_UINT_UINT_UINT:
28301 case VOID_FTYPE_UINT64_UINT_UINT:
28302 case UCHAR_FTYPE_UINT_UINT_UINT:
28303 case UCHAR_FTYPE_UINT64_UINT_UINT:
28304 nargs = 3;
28305 klass = load;
28306 memory = ARRAY_SIZE (args);
28307 last_arg_constant = true;
28308 break;
28309 default:
28310 gcc_unreachable ();
28311 }
28312
28313 gcc_assert (nargs <= ARRAY_SIZE (args));
28314
28315 if (klass == store)
28316 {
28317 arg = CALL_EXPR_ARG (exp, 0);
28318 op = expand_normal (arg);
28319 gcc_assert (target == 0);
28320 if (memory)
28321 {
28322 if (GET_MODE (op) != Pmode)
28323 op = convert_to_mode (Pmode, op, 1);
28324 target = gen_rtx_MEM (tmode, force_reg (Pmode, op));
28325 }
28326 else
28327 target = force_reg (tmode, op);
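 /* The first call argument has been consumed as the destination above,
 so the remaining call arguments (starting at index 1) supply the
 insn's input operands.  */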
28328 arg_adjust = 1;
28329 }
28330 else
28331 {
28332 arg_adjust = 0;
28333 if (optimize
28334 || target == 0
28335 || GET_MODE (target) != tmode
28336 || !insn_p->operand[0].predicate (target, tmode))
28337 target = gen_reg_rtx (tmode);
28338 }
28339
28340 for (i = 0; i < nargs; i++)
28341 {
28342 enum machine_mode mode = insn_p->operand[i + 1].mode;
28343 bool match;
28344
28345 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
28346 op = expand_normal (arg);
28347 match = insn_p->operand[i + 1].predicate (op, mode);
28348
28349 if (last_arg_constant && (i + 1) == nargs)
28350 {
28351 if (!match)
28352 {
28353 if (icode == CODE_FOR_lwp_lwpvalsi3
28354 || icode == CODE_FOR_lwp_lwpinssi3
28355 || icode == CODE_FOR_lwp_lwpvaldi3
28356 || icode == CODE_FOR_lwp_lwpinsdi3)
28357 error ("the last argument must be a 32-bit immediate");
28358 else
28359 error ("the last argument must be an 8-bit immediate");
28360 return const0_rtx;
28361 }
28362 }
28363 else
28364 {
28365 if (i == memory)
28366 {
28367 /* This must be the memory operand. */
28368 if (GET_MODE (op) != Pmode)
28369 op = convert_to_mode (Pmode, op, 1);
28370 op = gen_rtx_MEM (mode, force_reg (Pmode, op));
28371 gcc_assert (GET_MODE (op) == mode
28372 || GET_MODE (op) == VOIDmode);
28373 }
28374 else
28375 {
 28376 /* This must be a register. */
28377 if (VECTOR_MODE_P (mode))
28378 op = safe_vector_operand (op, mode);
28379
28380 gcc_assert (GET_MODE (op) == mode
28381 || GET_MODE (op) == VOIDmode);
28382 op = copy_to_mode_reg (mode, op);
28383 }
28384 }
28385
28386 args[i].op = op;
28387 args[i].mode = mode;
28388 }
28389
28390 switch (nargs)
28391 {
28392 case 0:
28393 pat = GEN_FCN (icode) (target);
28394 break;
28395 case 1:
28396 pat = GEN_FCN (icode) (target, args[0].op);
28397 break;
28398 case 2:
28399 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
28400 break;
28401 case 3:
28402 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
28403 break;
28404 default:
28405 gcc_unreachable ();
28406 }
28407
28408 if (! pat)
28409 return 0;
28410 emit_insn (pat);
28411 return klass == store ? 0 : target;
28412 }
28413
28414 /* Return the integer constant in ARG. Constrain it to be in the range
28415 of the subparts of VEC_TYPE; issue an error if not. */
28416
28417 static int
28418 get_element_number (tree vec_type, tree arg)
28419 {
28420 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
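 /* E.g. for a 4-element vector type, MAX is 3 and the only valid
 selectors are 0 through 3.  */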
28421
28422 if (!host_integerp (arg, 1)
28423 || (elt = tree_low_cst (arg, 1), elt > max))
28424 {
28425 error ("selector must be an integer constant in the range 0..%wi", max);
28426 return 0;
28427 }
28428
28429 return elt;
28430 }
28431
28432 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
28433 ix86_expand_vector_init. We DO have language-level syntax for this, in
28434 the form of (type){ init-list }. Except that since we can't place emms
28435 instructions from inside the compiler, we can't allow the use of MMX
28436 registers unless the user explicitly asks for it. So we do *not* define
28437 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
 28438 we have builtins invoked by mmintrin.h that give us license to emit
28439 these sorts of instructions. */
28440
28441 static rtx
28442 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
28443 {
28444 enum machine_mode tmode = TYPE_MODE (type);
28445 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
28446 int i, n_elt = GET_MODE_NUNITS (tmode);
28447 rtvec v = rtvec_alloc (n_elt);
28448
28449 gcc_assert (VECTOR_MODE_P (tmode));
28450 gcc_assert (call_expr_nargs (exp) == n_elt);
28451
28452 for (i = 0; i < n_elt; ++i)
28453 {
28454 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
28455 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
28456 }
28457
28458 if (!target || !register_operand (target, tmode))
28459 target = gen_reg_rtx (tmode);
28460
28461 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
28462 return target;
28463 }
28464
28465 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
28466 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
28467 had a language-level syntax for referencing vector elements. */
28468
28469 static rtx
28470 ix86_expand_vec_ext_builtin (tree exp, rtx target)
28471 {
28472 enum machine_mode tmode, mode0;
28473 tree arg0, arg1;
28474 int elt;
28475 rtx op0;
28476
28477 arg0 = CALL_EXPR_ARG (exp, 0);
28478 arg1 = CALL_EXPR_ARG (exp, 1);
28479
28480 op0 = expand_normal (arg0);
28481 elt = get_element_number (TREE_TYPE (arg0), arg1);
28482
28483 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
28484 mode0 = TYPE_MODE (TREE_TYPE (arg0));
28485 gcc_assert (VECTOR_MODE_P (mode0));
28486
28487 op0 = force_reg (mode0, op0);
28488
28489 if (optimize || !target || !register_operand (target, tmode))
28490 target = gen_reg_rtx (tmode);
28491
28492 ix86_expand_vector_extract (true, target, op0, elt);
28493
28494 return target;
28495 }
28496
28497 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
28498 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
28499 a language-level syntax for referencing vector elements. */
28500
28501 static rtx
28502 ix86_expand_vec_set_builtin (tree exp)
28503 {
28504 enum machine_mode tmode, mode1;
28505 tree arg0, arg1, arg2;
28506 int elt;
28507 rtx op0, op1, target;
28508
28509 arg0 = CALL_EXPR_ARG (exp, 0);
28510 arg1 = CALL_EXPR_ARG (exp, 1);
28511 arg2 = CALL_EXPR_ARG (exp, 2);
28512
28513 tmode = TYPE_MODE (TREE_TYPE (arg0));
28514 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
28515 gcc_assert (VECTOR_MODE_P (tmode));
28516
28517 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
28518 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
28519 elt = get_element_number (TREE_TYPE (arg0), arg2);
28520
28521 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
28522 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
28523
28524 op0 = force_reg (tmode, op0);
28525 op1 = force_reg (mode1, op1);
28526
28527 /* OP0 is the source of these builtin functions and shouldn't be
28528 modified. Create a copy, use it and return it as target. */
28529 target = gen_reg_rtx (tmode);
28530 emit_move_insn (target, op0);
28531 ix86_expand_vector_set (true, target, op1, elt);
28532
28533 return target;
28534 }
28535
28536 /* Expand an expression EXP that calls a built-in function,
28537 with result going to TARGET if that's convenient
28538 (and in mode MODE if that's convenient).
28539 SUBTARGET may be used as the target for computing one of EXP's operands.
28540 IGNORE is nonzero if the value is to be ignored. */
28541
28542 static rtx
28543 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
28544 enum machine_mode mode ATTRIBUTE_UNUSED,
28545 int ignore ATTRIBUTE_UNUSED)
28546 {
28547 const struct builtin_description *d;
28548 size_t i;
28549 enum insn_code icode;
28550 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
28551 tree arg0, arg1, arg2, arg3, arg4;
28552 rtx op0, op1, op2, op3, op4, pat;
28553 enum machine_mode mode0, mode1, mode2, mode3, mode4;
28554 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
28555
28556 /* Determine whether the builtin function is available under the current ISA.
28557 Originally the builtin was not created if it wasn't applicable to the
28558 current ISA based on the command line switches. With function specific
28559 options, we need to check in the context of the function making the call
28560 whether it is supported. */
28561 if (ix86_builtins_isa[fcode].isa
28562 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
28563 {
28564 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
28565 NULL, (enum fpmath_unit) 0, false);
28566
28567 if (!opts)
28568 error ("%qE needs unknown isa option", fndecl);
28569 else
28570 {
28571 gcc_assert (opts != NULL);
28572 error ("%qE needs isa option %s", fndecl, opts);
28573 free (opts);
28574 }
28575 return const0_rtx;
28576 }
28577
28578 switch (fcode)
28579 {
28580 case IX86_BUILTIN_MASKMOVQ:
28581 case IX86_BUILTIN_MASKMOVDQU:
28582 icode = (fcode == IX86_BUILTIN_MASKMOVQ
28583 ? CODE_FOR_mmx_maskmovq
28584 : CODE_FOR_sse2_maskmovdqu);
28585 /* Note the arg order is different from the operand order. */
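 /* I.e. the third call argument is the address, which becomes operand 0
 (the memory destination), while the first two call arguments become
 operands 1 and 2.  */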
28586 arg1 = CALL_EXPR_ARG (exp, 0);
28587 arg2 = CALL_EXPR_ARG (exp, 1);
28588 arg0 = CALL_EXPR_ARG (exp, 2);
28589 op0 = expand_normal (arg0);
28590 op1 = expand_normal (arg1);
28591 op2 = expand_normal (arg2);
28592 mode0 = insn_data[icode].operand[0].mode;
28593 mode1 = insn_data[icode].operand[1].mode;
28594 mode2 = insn_data[icode].operand[2].mode;
28595
28596 if (GET_MODE (op0) != Pmode)
28597 op0 = convert_to_mode (Pmode, op0, 1);
28598 op0 = gen_rtx_MEM (mode1, force_reg (Pmode, op0));
28599
28600 if (!insn_data[icode].operand[0].predicate (op0, mode0))
28601 op0 = copy_to_mode_reg (mode0, op0);
28602 if (!insn_data[icode].operand[1].predicate (op1, mode1))
28603 op1 = copy_to_mode_reg (mode1, op1);
28604 if (!insn_data[icode].operand[2].predicate (op2, mode2))
28605 op2 = copy_to_mode_reg (mode2, op2);
28606 pat = GEN_FCN (icode) (op0, op1, op2);
28607 if (! pat)
28608 return 0;
28609 emit_insn (pat);
28610 return 0;
28611
28612 case IX86_BUILTIN_LDMXCSR:
28613 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
28614 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
28615 emit_move_insn (target, op0);
28616 emit_insn (gen_sse_ldmxcsr (target));
28617 return 0;
28618
28619 case IX86_BUILTIN_STMXCSR:
28620 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
28621 emit_insn (gen_sse_stmxcsr (target));
28622 return copy_to_mode_reg (SImode, target);
28623
28624 case IX86_BUILTIN_CLFLUSH:
28625 arg0 = CALL_EXPR_ARG (exp, 0);
28626 op0 = expand_normal (arg0);
28627 icode = CODE_FOR_sse2_clflush;
28628 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
28629 {
28630 if (GET_MODE (op0) != Pmode)
28631 op0 = convert_to_mode (Pmode, op0, 1);
28632 op0 = force_reg (Pmode, op0);
28633 }
28634
28635 emit_insn (gen_sse2_clflush (op0));
28636 return 0;
28637
28638 case IX86_BUILTIN_MONITOR:
28639 arg0 = CALL_EXPR_ARG (exp, 0);
28640 arg1 = CALL_EXPR_ARG (exp, 1);
28641 arg2 = CALL_EXPR_ARG (exp, 2);
28642 op0 = expand_normal (arg0);
28643 op1 = expand_normal (arg1);
28644 op2 = expand_normal (arg2);
28645 if (!REG_P (op0))
28646 {
28647 if (GET_MODE (op0) != Pmode)
28648 op0 = convert_to_mode (Pmode, op0, 1);
28649 op0 = force_reg (Pmode, op0);
28650 }
28651 if (!REG_P (op1))
28652 op1 = copy_to_mode_reg (SImode, op1);
28653 if (!REG_P (op2))
28654 op2 = copy_to_mode_reg (SImode, op2);
28655 emit_insn (ix86_gen_monitor (op0, op1, op2));
28656 return 0;
28657
28658 case IX86_BUILTIN_MWAIT:
28659 arg0 = CALL_EXPR_ARG (exp, 0);
28660 arg1 = CALL_EXPR_ARG (exp, 1);
28661 op0 = expand_normal (arg0);
28662 op1 = expand_normal (arg1);
28663 if (!REG_P (op0))
28664 op0 = copy_to_mode_reg (SImode, op0);
28665 if (!REG_P (op1))
28666 op1 = copy_to_mode_reg (SImode, op1);
28667 emit_insn (gen_sse3_mwait (op0, op1));
28668 return 0;
28669
28670 case IX86_BUILTIN_VEC_INIT_V2SI:
28671 case IX86_BUILTIN_VEC_INIT_V4HI:
28672 case IX86_BUILTIN_VEC_INIT_V8QI:
28673 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
28674
28675 case IX86_BUILTIN_VEC_EXT_V2DF:
28676 case IX86_BUILTIN_VEC_EXT_V2DI:
28677 case IX86_BUILTIN_VEC_EXT_V4SF:
28678 case IX86_BUILTIN_VEC_EXT_V4SI:
28679 case IX86_BUILTIN_VEC_EXT_V8HI:
28680 case IX86_BUILTIN_VEC_EXT_V2SI:
28681 case IX86_BUILTIN_VEC_EXT_V4HI:
28682 case IX86_BUILTIN_VEC_EXT_V16QI:
28683 return ix86_expand_vec_ext_builtin (exp, target);
28684
28685 case IX86_BUILTIN_VEC_SET_V2DI:
28686 case IX86_BUILTIN_VEC_SET_V4SF:
28687 case IX86_BUILTIN_VEC_SET_V4SI:
28688 case IX86_BUILTIN_VEC_SET_V8HI:
28689 case IX86_BUILTIN_VEC_SET_V4HI:
28690 case IX86_BUILTIN_VEC_SET_V16QI:
28691 return ix86_expand_vec_set_builtin (exp);
28692
28693 case IX86_BUILTIN_INFQ:
28694 case IX86_BUILTIN_HUGE_VALQ:
28695 {
28696 REAL_VALUE_TYPE inf;
28697 rtx tmp;
28698
28699 real_inf (&inf);
28700 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
28701
28702 tmp = validize_mem (force_const_mem (mode, tmp));
28703
28704 if (target == 0)
28705 target = gen_reg_rtx (mode);
28706
28707 emit_move_insn (target, tmp);
28708 return target;
28709 }
28710
28711 case IX86_BUILTIN_LLWPCB:
28712 arg0 = CALL_EXPR_ARG (exp, 0);
28713 op0 = expand_normal (arg0);
28714 icode = CODE_FOR_lwp_llwpcb;
28715 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
28716 {
28717 if (GET_MODE (op0) != Pmode)
28718 op0 = convert_to_mode (Pmode, op0, 1);
28719 op0 = force_reg (Pmode, op0);
28720 }
28721 emit_insn (gen_lwp_llwpcb (op0));
28722 return 0;
28723
28724 case IX86_BUILTIN_SLWPCB:
28725 icode = CODE_FOR_lwp_slwpcb;
28726 if (!target
28727 || !insn_data[icode].operand[0].predicate (target, Pmode))
28728 target = gen_reg_rtx (Pmode);
28729 emit_insn (gen_lwp_slwpcb (target));
28730 return target;
28731
28732 case IX86_BUILTIN_BEXTRI32:
28733 case IX86_BUILTIN_BEXTRI64:
28734 arg0 = CALL_EXPR_ARG (exp, 0);
28735 arg1 = CALL_EXPR_ARG (exp, 1);
28736 op0 = expand_normal (arg0);
28737 op1 = expand_normal (arg1);
28738 icode = (fcode == IX86_BUILTIN_BEXTRI32
28739 ? CODE_FOR_tbm_bextri_si
28740 : CODE_FOR_tbm_bextri_di);
28741 if (!CONST_INT_P (op1))
28742 {
28743 error ("last argument must be an immediate");
28744 return const0_rtx;
28745 }
28746 else
28747 {
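 /* The single immediate packs the bit-field length in bits 15:8 and
 the starting bit position in bits 7:0; e.g. a control of 0x0504
 extracts 5 bits starting at bit 4.  Split it into the two operands
 expected by the bextri pattern.  */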
28748 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
28749 unsigned char lsb_index = INTVAL (op1) & 0xFF;
28750 op1 = GEN_INT (length);
28751 op2 = GEN_INT (lsb_index);
28752 pat = GEN_FCN (icode) (target, op0, op1, op2);
28753 if (pat)
28754 emit_insn (pat);
28755 return target;
28756 }
28757
28758 case IX86_BUILTIN_RDRAND16_STEP:
28759 icode = CODE_FOR_rdrandhi_1;
28760 mode0 = HImode;
28761 goto rdrand_step;
28762
28763 case IX86_BUILTIN_RDRAND32_STEP:
28764 icode = CODE_FOR_rdrandsi_1;
28765 mode0 = SImode;
28766 goto rdrand_step;
28767
28768 case IX86_BUILTIN_RDRAND64_STEP:
28769 icode = CODE_FOR_rdranddi_1;
28770 mode0 = DImode;
28771
28772 rdrand_step:
28773 op0 = gen_reg_rtx (mode0);
28774 emit_insn (GEN_FCN (icode) (op0));
28775
28776 arg0 = CALL_EXPR_ARG (exp, 0);
28777 op1 = expand_normal (arg0);
28778 if (!address_operand (op1, VOIDmode))
28779 {
28780 op1 = convert_memory_address (Pmode, op1);
28781 op1 = copy_addr_to_reg (op1);
28782 }
28783 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
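 /* The random value has been stored through the pointer argument; now
 compute the builtin's int result from the carry flag (CCCmode
 FLAGS_REG below) set by the rdrand pattern.  */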
28784
28785 op1 = gen_reg_rtx (SImode);
28786 emit_move_insn (op1, CONST1_RTX (SImode));
28787
28788 /* Emit SImode conditional move. */
28789 if (mode0 == HImode)
28790 {
28791 op2 = gen_reg_rtx (SImode);
28792 emit_insn (gen_zero_extendhisi2 (op2, op0));
28793 }
28794 else if (mode0 == SImode)
28795 op2 = op0;
28796 else
28797 op2 = gen_rtx_SUBREG (SImode, op0, 0);
28798
28799 if (target == 0)
28800 target = gen_reg_rtx (SImode);
28801
28802 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
28803 const0_rtx);
28804 emit_insn (gen_rtx_SET (VOIDmode, target,
28805 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
28806 return target;
28807
28808 case IX86_BUILTIN_GATHERSIV2DF:
28809 icode = CODE_FOR_avx2_gathersiv2df;
28810 goto gather_gen;
28811 case IX86_BUILTIN_GATHERSIV4DF:
28812 icode = CODE_FOR_avx2_gathersiv4df;
28813 goto gather_gen;
28814 case IX86_BUILTIN_GATHERDIV2DF:
28815 icode = CODE_FOR_avx2_gatherdiv2df;
28816 goto gather_gen;
28817 case IX86_BUILTIN_GATHERDIV4DF:
28818 icode = CODE_FOR_avx2_gatherdiv4df;
28819 goto gather_gen;
28820 case IX86_BUILTIN_GATHERSIV4SF:
28821 icode = CODE_FOR_avx2_gathersiv4sf;
28822 goto gather_gen;
28823 case IX86_BUILTIN_GATHERSIV8SF:
28824 icode = CODE_FOR_avx2_gathersiv8sf;
28825 goto gather_gen;
28826 case IX86_BUILTIN_GATHERDIV4SF:
28827 icode = CODE_FOR_avx2_gatherdiv4sf;
28828 goto gather_gen;
28829 case IX86_BUILTIN_GATHERDIV8SF:
28830 icode = CODE_FOR_avx2_gatherdiv4sf256;
28831 goto gather_gen;
28832 case IX86_BUILTIN_GATHERSIV2DI:
28833 icode = CODE_FOR_avx2_gathersiv2di;
28834 goto gather_gen;
28835 case IX86_BUILTIN_GATHERSIV4DI:
28836 icode = CODE_FOR_avx2_gathersiv4di;
28837 goto gather_gen;
28838 case IX86_BUILTIN_GATHERDIV2DI:
28839 icode = CODE_FOR_avx2_gatherdiv2di;
28840 goto gather_gen;
28841 case IX86_BUILTIN_GATHERDIV4DI:
28842 icode = CODE_FOR_avx2_gatherdiv4di;
28843 goto gather_gen;
28844 case IX86_BUILTIN_GATHERSIV4SI:
28845 icode = CODE_FOR_avx2_gathersiv4si;
28846 goto gather_gen;
28847 case IX86_BUILTIN_GATHERSIV8SI:
28848 icode = CODE_FOR_avx2_gathersiv8si;
28849 goto gather_gen;
28850 case IX86_BUILTIN_GATHERDIV4SI:
28851 icode = CODE_FOR_avx2_gatherdiv4si;
28852 goto gather_gen;
28853 case IX86_BUILTIN_GATHERDIV8SI:
28854 icode = CODE_FOR_avx2_gatherdiv4si256;
28855
28856 gather_gen:
28857 arg0 = CALL_EXPR_ARG (exp, 0);
28858 arg1 = CALL_EXPR_ARG (exp, 1);
28859 arg2 = CALL_EXPR_ARG (exp, 2);
28860 arg3 = CALL_EXPR_ARG (exp, 3);
28861 arg4 = CALL_EXPR_ARG (exp, 4);
28862 op0 = expand_normal (arg0);
28863 op1 = expand_normal (arg1);
28864 op2 = expand_normal (arg2);
28865 op3 = expand_normal (arg3);
28866 op4 = expand_normal (arg4);
28867 /* Note the arg order is different from the operand order. */
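 /* The builtin's five arguments map to insn operands 1 through 5: the
 source vector, the base address (forced into a Pmode register below),
 the index vector, the mask, and the scale, which must be 1, 2, 4 or 8
 (role names assumed from the AVX2 gather intrinsics).  */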
28868 mode0 = insn_data[icode].operand[1].mode;
28869 mode2 = insn_data[icode].operand[3].mode;
28870 mode3 = insn_data[icode].operand[4].mode;
28871 mode4 = insn_data[icode].operand[5].mode;
28872
28873 if (target == NULL_RTX)
28874 target = gen_reg_rtx (insn_data[icode].operand[0].mode);
28875
 28876 /* Force the memory operand to use only a base register here; we
 28877 don't want to do this to the memory operands of other builtin
 28878 functions. */
28879 if (GET_MODE (op1) != Pmode)
28880 op1 = convert_to_mode (Pmode, op1, 1);
28881 op1 = force_reg (Pmode, op1);
28882
28883 if (!insn_data[icode].operand[1].predicate (op0, mode0))
28884 op0 = copy_to_mode_reg (mode0, op0);
28885 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
28886 op1 = copy_to_mode_reg (Pmode, op1);
28887 if (!insn_data[icode].operand[3].predicate (op2, mode2))
28888 op2 = copy_to_mode_reg (mode2, op2);
28889 if (!insn_data[icode].operand[4].predicate (op3, mode3))
28890 op3 = copy_to_mode_reg (mode3, op3);
28891 if (!insn_data[icode].operand[5].predicate (op4, mode4))
28892 {
28893 error ("last argument must be scale 1, 2, 4, 8");
28894 return const0_rtx;
28895 }
28896 pat = GEN_FCN (icode) (target, op0, op1, op2, op3, op4);
28897 if (! pat)
28898 return const0_rtx;
28899 emit_insn (pat);
28900 return target;
28901
28902 default:
28903 break;
28904 }
28905
28906 for (i = 0, d = bdesc_special_args;
28907 i < ARRAY_SIZE (bdesc_special_args);
28908 i++, d++)
28909 if (d->code == fcode)
28910 return ix86_expand_special_args_builtin (d, exp, target);
28911
28912 for (i = 0, d = bdesc_args;
28913 i < ARRAY_SIZE (bdesc_args);
28914 i++, d++)
28915 if (d->code == fcode)
28916 switch (fcode)
28917 {
28918 case IX86_BUILTIN_FABSQ:
28919 case IX86_BUILTIN_COPYSIGNQ:
28920 if (!TARGET_SSE2)
28921 /* Emit a normal call if SSE2 isn't available. */
28922 return expand_call (exp, target, ignore);
28923 default:
28924 return ix86_expand_args_builtin (d, exp, target);
28925 }
28926
28927 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
28928 if (d->code == fcode)
28929 return ix86_expand_sse_comi (d, exp, target);
28930
28931 for (i = 0, d = bdesc_pcmpestr;
28932 i < ARRAY_SIZE (bdesc_pcmpestr);
28933 i++, d++)
28934 if (d->code == fcode)
28935 return ix86_expand_sse_pcmpestr (d, exp, target);
28936
28937 for (i = 0, d = bdesc_pcmpistr;
28938 i < ARRAY_SIZE (bdesc_pcmpistr);
28939 i++, d++)
28940 if (d->code == fcode)
28941 return ix86_expand_sse_pcmpistr (d, exp, target);
28942
28943 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
28944 if (d->code == fcode)
28945 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
28946 (enum ix86_builtin_func_type)
28947 d->flag, d->comparison);
28948
28949 gcc_unreachable ();
28950 }
28951
 28952 /* Returns a function decl for a vectorized version of the builtin function
 28953 FNDECL, producing vectors of type TYPE_OUT from vectors of type TYPE_IN,
 28954 or NULL_TREE if such a version is not available. */
28955
28956 static tree
28957 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
28958 tree type_in)
28959 {
28960 enum machine_mode in_mode, out_mode;
28961 int in_n, out_n;
28962 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
28963
28964 if (TREE_CODE (type_out) != VECTOR_TYPE
28965 || TREE_CODE (type_in) != VECTOR_TYPE
28966 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
28967 return NULL_TREE;
28968
28969 out_mode = TYPE_MODE (TREE_TYPE (type_out));
28970 out_n = TYPE_VECTOR_SUBPARTS (type_out);
28971 in_mode = TYPE_MODE (TREE_TYPE (type_in));
28972 in_n = TYPE_VECTOR_SUBPARTS (type_in);
28973
28974 switch (fn)
28975 {
28976 case BUILT_IN_SQRT:
28977 if (out_mode == DFmode && in_mode == DFmode)
28978 {
28979 if (out_n == 2 && in_n == 2)
28980 return ix86_builtins[IX86_BUILTIN_SQRTPD];
28981 else if (out_n == 4 && in_n == 4)
28982 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
28983 }
28984 break;
28985
28986 case BUILT_IN_SQRTF:
28987 if (out_mode == SFmode && in_mode == SFmode)
28988 {
28989 if (out_n == 4 && in_n == 4)
28990 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
28991 else if (out_n == 8 && in_n == 8)
28992 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
28993 }
28994 break;
28995
28996 case BUILT_IN_LRINT:
28997 if (out_mode == SImode && out_n == 4
28998 && in_mode == DFmode && in_n == 2)
28999 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
29000 break;
29001
29002 case BUILT_IN_LRINTF:
29003 if (out_mode == SImode && in_mode == SFmode)
29004 {
29005 if (out_n == 4 && in_n == 4)
29006 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
29007 else if (out_n == 8 && in_n == 8)
29008 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
29009 }
29010 break;
29011
29012 case BUILT_IN_COPYSIGN:
29013 if (out_mode == DFmode && in_mode == DFmode)
29014 {
29015 if (out_n == 2 && in_n == 2)
29016 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
29017 else if (out_n == 4 && in_n == 4)
29018 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
29019 }
29020 break;
29021
29022 case BUILT_IN_COPYSIGNF:
29023 if (out_mode == SFmode && in_mode == SFmode)
29024 {
29025 if (out_n == 4 && in_n == 4)
29026 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
29027 else if (out_n == 8 && in_n == 8)
29028 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
29029 }
29030 break;
29031
29032 case BUILT_IN_FLOOR:
29033 /* The round insn does not trap on denormals. */
29034 if (flag_trapping_math || !TARGET_ROUND)
29035 break;
29036
29037 if (out_mode == DFmode && in_mode == DFmode)
29038 {
29039 if (out_n == 2 && in_n == 2)
29040 return ix86_builtins[IX86_BUILTIN_FLOORPD];
29041 else if (out_n == 4 && in_n == 4)
29042 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
29043 }
29044 break;
29045
29046 case BUILT_IN_FLOORF:
29047 /* The round insn does not trap on denormals. */
29048 if (flag_trapping_math || !TARGET_ROUND)
29049 break;
29050
29051 if (out_mode == SFmode && in_mode == SFmode)
29052 {
29053 if (out_n == 4 && in_n == 4)
29054 return ix86_builtins[IX86_BUILTIN_FLOORPS];
29055 else if (out_n == 8 && in_n == 8)
29056 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
29057 }
29058 break;
29059
29060 case BUILT_IN_CEIL:
29061 /* The round insn does not trap on denormals. */
29062 if (flag_trapping_math || !TARGET_ROUND)
29063 break;
29064
29065 if (out_mode == DFmode && in_mode == DFmode)
29066 {
29067 if (out_n == 2 && in_n == 2)
29068 return ix86_builtins[IX86_BUILTIN_CEILPD];
29069 else if (out_n == 4 && in_n == 4)
29070 return ix86_builtins[IX86_BUILTIN_CEILPD256];
29071 }
29072 break;
29073
29074 case BUILT_IN_CEILF:
29075 /* The round insn does not trap on denormals. */
29076 if (flag_trapping_math || !TARGET_ROUND)
29077 break;
29078
29079 if (out_mode == SFmode && in_mode == SFmode)
29080 {
29081 if (out_n == 4 && in_n == 4)
29082 return ix86_builtins[IX86_BUILTIN_CEILPS];
29083 else if (out_n == 8 && in_n == 8)
29084 return ix86_builtins[IX86_BUILTIN_CEILPS256];
29085 }
29086 break;
29087
29088 case BUILT_IN_TRUNC:
29089 /* The round insn does not trap on denormals. */
29090 if (flag_trapping_math || !TARGET_ROUND)
29091 break;
29092
29093 if (out_mode == DFmode && in_mode == DFmode)
29094 {
29095 if (out_n == 2 && in_n == 2)
29096 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
29097 else if (out_n == 4 && in_n == 4)
29098 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
29099 }
29100 break;
29101
29102 case BUILT_IN_TRUNCF:
29103 /* The round insn does not trap on denormals. */
29104 if (flag_trapping_math || !TARGET_ROUND)
29105 break;
29106
29107 if (out_mode == SFmode && in_mode == SFmode)
29108 {
29109 if (out_n == 4 && in_n == 4)
29110 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
29111 else if (out_n == 8 && in_n == 8)
29112 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
29113 }
29114 break;
29115
29116 case BUILT_IN_RINT:
29117 /* The round insn does not trap on denormals. */
29118 if (flag_trapping_math || !TARGET_ROUND)
29119 break;
29120
29121 if (out_mode == DFmode && in_mode == DFmode)
29122 {
29123 if (out_n == 2 && in_n == 2)
29124 return ix86_builtins[IX86_BUILTIN_RINTPD];
29125 else if (out_n == 4 && in_n == 4)
29126 return ix86_builtins[IX86_BUILTIN_RINTPD256];
29127 }
29128 break;
29129
29130 case BUILT_IN_RINTF:
29131 /* The round insn does not trap on denormals. */
29132 if (flag_trapping_math || !TARGET_ROUND)
29133 break;
29134
29135 if (out_mode == SFmode && in_mode == SFmode)
29136 {
29137 if (out_n == 4 && in_n == 4)
29138 return ix86_builtins[IX86_BUILTIN_RINTPS];
29139 else if (out_n == 8 && in_n == 8)
29140 return ix86_builtins[IX86_BUILTIN_RINTPS256];
29141 }
29142 break;
29143
29144 case BUILT_IN_ROUND:
29145 /* The round insn does not trap on denormals. */
29146 if (flag_trapping_math || !TARGET_ROUND)
29147 break;
29148
29149 if (out_mode == DFmode && in_mode == DFmode)
29150 {
29151 if (out_n == 2 && in_n == 2)
29152 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
29153 else if (out_n == 4 && in_n == 4)
29154 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
29155 }
29156 break;
29157
29158 case BUILT_IN_ROUNDF:
29159 /* The round insn does not trap on denormals. */
29160 if (flag_trapping_math || !TARGET_ROUND)
29161 break;
29162
29163 if (out_mode == SFmode && in_mode == SFmode)
29164 {
29165 if (out_n == 4 && in_n == 4)
29166 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
29167 else if (out_n == 8 && in_n == 8)
29168 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
29169 }
29170 break;
29171
29172 case BUILT_IN_FMA:
29173 if (out_mode == DFmode && in_mode == DFmode)
29174 {
29175 if (out_n == 2 && in_n == 2)
29176 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
29177 if (out_n == 4 && in_n == 4)
29178 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
29179 }
29180 break;
29181
29182 case BUILT_IN_FMAF:
29183 if (out_mode == SFmode && in_mode == SFmode)
29184 {
29185 if (out_n == 4 && in_n == 4)
29186 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
29187 if (out_n == 8 && in_n == 8)
29188 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
29189 }
29190 break;
29191
29192 default:
29193 break;
29194 }
29195
29196 /* Dispatch to a handler for a vectorization library. */
29197 if (ix86_veclib_handler)
29198 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
29199 type_in);
29200
29201 return NULL_TREE;
29202 }
29203
29204 /* Handler for an SVML-style interface to
29205 a library with vectorized intrinsics. */
29206
29207 static tree
29208 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
29209 {
29210 char name[20];
29211 tree fntype, new_fndecl, args;
29212 unsigned arity;
29213 const char *bname;
29214 enum machine_mode el_mode, in_mode;
29215 int n, in_n;
29216
29217 /* The SVML is suitable for unsafe math only. */
29218 if (!flag_unsafe_math_optimizations)
29219 return NULL_TREE;
29220
29221 el_mode = TYPE_MODE (TREE_TYPE (type_out));
29222 n = TYPE_VECTOR_SUBPARTS (type_out);
29223 in_mode = TYPE_MODE (TREE_TYPE (type_in));
29224 in_n = TYPE_VECTOR_SUBPARTS (type_in);
29225 if (el_mode != in_mode
29226 || n != in_n)
29227 return NULL_TREE;
29228
29229 switch (fn)
29230 {
29231 case BUILT_IN_EXP:
29232 case BUILT_IN_LOG:
29233 case BUILT_IN_LOG10:
29234 case BUILT_IN_POW:
29235 case BUILT_IN_TANH:
29236 case BUILT_IN_TAN:
29237 case BUILT_IN_ATAN:
29238 case BUILT_IN_ATAN2:
29239 case BUILT_IN_ATANH:
29240 case BUILT_IN_CBRT:
29241 case BUILT_IN_SINH:
29242 case BUILT_IN_SIN:
29243 case BUILT_IN_ASINH:
29244 case BUILT_IN_ASIN:
29245 case BUILT_IN_COSH:
29246 case BUILT_IN_COS:
29247 case BUILT_IN_ACOSH:
29248 case BUILT_IN_ACOS:
29249 if (el_mode != DFmode || n != 2)
29250 return NULL_TREE;
29251 break;
29252
29253 case BUILT_IN_EXPF:
29254 case BUILT_IN_LOGF:
29255 case BUILT_IN_LOG10F:
29256 case BUILT_IN_POWF:
29257 case BUILT_IN_TANHF:
29258 case BUILT_IN_TANF:
29259 case BUILT_IN_ATANF:
29260 case BUILT_IN_ATAN2F:
29261 case BUILT_IN_ATANHF:
29262 case BUILT_IN_CBRTF:
29263 case BUILT_IN_SINHF:
29264 case BUILT_IN_SINF:
29265 case BUILT_IN_ASINHF:
29266 case BUILT_IN_ASINF:
29267 case BUILT_IN_COSHF:
29268 case BUILT_IN_COSF:
29269 case BUILT_IN_ACOSHF:
29270 case BUILT_IN_ACOSF:
29271 if (el_mode != SFmode || n != 4)
29272 return NULL_TREE;
29273 break;
29274
29275 default:
29276 return NULL_TREE;
29277 }
29278
29279 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
29280
29281 if (fn == BUILT_IN_LOGF)
29282 strcpy (name, "vmlsLn4");
29283 else if (fn == BUILT_IN_LOG)
29284 strcpy (name, "vmldLn2");
29285 else if (n == 4)
29286 {
29287 sprintf (name, "vmls%s", bname+10);
29288 name[strlen (name)-1] = '4';
29289 }
29290 else
29291 sprintf (name, "vmld%s2", bname+10);
29292
29293 /* Convert to uppercase. */
29294 name[4] &= ~0x20;
29295
29296 arity = 0;
29297 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
29298 args;
29299 args = TREE_CHAIN (args))
29300 arity++;
29301
29302 if (arity == 1)
29303 fntype = build_function_type_list (type_out, type_in, NULL);
29304 else
29305 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
29306
29307 /* Build a function declaration for the vectorized function. */
29308 new_fndecl = build_decl (BUILTINS_LOCATION,
29309 FUNCTION_DECL, get_identifier (name), fntype);
29310 TREE_PUBLIC (new_fndecl) = 1;
29311 DECL_EXTERNAL (new_fndecl) = 1;
29312 DECL_IS_NOVOPS (new_fndecl) = 1;
29313 TREE_READONLY (new_fndecl) = 1;
29314
29315 return new_fndecl;
29316 }
29317
29318 /* Handler for an ACML-style interface to
29319 a library with vectorized intrinsics. */
29320
29321 static tree
29322 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
29323 {
29324 char name[20] = "__vr.._";
29325 tree fntype, new_fndecl, args;
29326 unsigned arity;
29327 const char *bname;
29328 enum machine_mode el_mode, in_mode;
29329 int n, in_n;
29330
 29331 /* ACML is 64-bit only and suitable for unsafe math only, as
 29332 it does not correctly support parts of IEEE arithmetic (such as
 29333 denormals) with the required precision. */
29334 if (!TARGET_64BIT
29335 || !flag_unsafe_math_optimizations)
29336 return NULL_TREE;
29337
29338 el_mode = TYPE_MODE (TREE_TYPE (type_out));
29339 n = TYPE_VECTOR_SUBPARTS (type_out);
29340 in_mode = TYPE_MODE (TREE_TYPE (type_in));
29341 in_n = TYPE_VECTOR_SUBPARTS (type_in);
29342 if (el_mode != in_mode
29343 || n != in_n)
29344 return NULL_TREE;
29345
29346 switch (fn)
29347 {
29348 case BUILT_IN_SIN:
29349 case BUILT_IN_COS:
29350 case BUILT_IN_EXP:
29351 case BUILT_IN_LOG:
29352 case BUILT_IN_LOG2:
29353 case BUILT_IN_LOG10:
29354 name[4] = 'd';
29355 name[5] = '2';
29356 if (el_mode != DFmode
29357 || n != 2)
29358 return NULL_TREE;
29359 break;
29360
29361 case BUILT_IN_SINF:
29362 case BUILT_IN_COSF:
29363 case BUILT_IN_EXPF:
29364 case BUILT_IN_POWF:
29365 case BUILT_IN_LOGF:
29366 case BUILT_IN_LOG2F:
29367 case BUILT_IN_LOG10F:
29368 name[4] = 's';
29369 name[5] = '4';
29370 if (el_mode != SFmode
29371 || n != 4)
29372 return NULL_TREE;
29373 break;
29374
29375 default:
29376 return NULL_TREE;
29377 }
29378
29379 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
29380 sprintf (name + 7, "%s", bname+10);
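 /* E.g. BUILT_IN_SIN becomes "__vrd2_sin" and BUILT_IN_SINF becomes
 "__vrs4_sinf".  */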
29381
29382 arity = 0;
29383 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
29384 args;
29385 args = TREE_CHAIN (args))
29386 arity++;
29387
29388 if (arity == 1)
29389 fntype = build_function_type_list (type_out, type_in, NULL);
29390 else
29391 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
29392
29393 /* Build a function declaration for the vectorized function. */
29394 new_fndecl = build_decl (BUILTINS_LOCATION,
29395 FUNCTION_DECL, get_identifier (name), fntype);
29396 TREE_PUBLIC (new_fndecl) = 1;
29397 DECL_EXTERNAL (new_fndecl) = 1;
29398 DECL_IS_NOVOPS (new_fndecl) = 1;
29399 TREE_READONLY (new_fndecl) = 1;
29400
29401 return new_fndecl;
29402 }
29403
29404
29405 /* Returns a decl of a function that implements conversion of an integer vector
29406 into a floating-point vector, or vice-versa. DEST_TYPE and SRC_TYPE
29407 are the types involved when converting according to CODE.
29408 Return NULL_TREE if it is not available. */
29409
29410 static tree
29411 ix86_vectorize_builtin_conversion (unsigned int code,
29412 tree dest_type, tree src_type)
29413 {
29414 if (! TARGET_SSE2)
29415 return NULL_TREE;
29416
29417 switch (code)
29418 {
29419 case FLOAT_EXPR:
29420 switch (TYPE_MODE (src_type))
29421 {
29422 case V4SImode:
29423 switch (TYPE_MODE (dest_type))
29424 {
29425 case V4SFmode:
29426 return (TYPE_UNSIGNED (src_type)
29427 ? ix86_builtins[IX86_BUILTIN_CVTUDQ2PS]
29428 : ix86_builtins[IX86_BUILTIN_CVTDQ2PS]);
29429 case V4DFmode:
29430 return (TYPE_UNSIGNED (src_type)
29431 ? NULL_TREE
29432 : ix86_builtins[IX86_BUILTIN_CVTDQ2PD256]);
29433 default:
29434 return NULL_TREE;
29435 }
29436 break;
29437 case V8SImode:
29438 switch (TYPE_MODE (dest_type))
29439 {
29440 case V8SFmode:
29441 return (TYPE_UNSIGNED (src_type)
29442 ? NULL_TREE
29443 : ix86_builtins[IX86_BUILTIN_CVTDQ2PS256]);
29444 default:
29445 return NULL_TREE;
29446 }
29447 break;
29448 default:
29449 return NULL_TREE;
29450 }
29451
29452 case FIX_TRUNC_EXPR:
29453 switch (TYPE_MODE (dest_type))
29454 {
29455 case V4SImode:
29456 switch (TYPE_MODE (src_type))
29457 {
29458 case V4SFmode:
29459 return (TYPE_UNSIGNED (dest_type)
29460 ? NULL_TREE
29461 : ix86_builtins[IX86_BUILTIN_CVTTPS2DQ]);
29462 case V4DFmode:
29463 return (TYPE_UNSIGNED (dest_type)
29464 ? NULL_TREE
29465 : ix86_builtins[IX86_BUILTIN_CVTTPD2DQ256]);
29466 default:
29467 return NULL_TREE;
29468 }
29469 break;
29470
29471 case V8SImode:
29472 switch (TYPE_MODE (src_type))
29473 {
29474 case V8SFmode:
29475 return (TYPE_UNSIGNED (dest_type)
29476 ? NULL_TREE
29477 : ix86_builtins[IX86_BUILTIN_CVTTPS2DQ256]);
29478 default:
29479 return NULL_TREE;
29480 }
29481 break;
29482
29483 default:
29484 return NULL_TREE;
29485 }
29486
29487 default:
29488 return NULL_TREE;
29489 }
29490
29491 return NULL_TREE;
29492 }
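/* For example, a FLOAT_EXPR from a signed V4SImode vector to V4SFmode
   returns the IX86_BUILTIN_CVTDQ2PS decl, so a loop like

     for (i = 0; i < n; i++) f[i] = (float) s32[i];

   can be vectorized using cvtdq2ps; unsigned sources use
   IX86_BUILTIN_CVTUDQ2PS instead, and unsupported combinations
   get NULL_TREE.  */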
29493
29494 /* Returns a decl of a target-specific builtin that implements the
29495 reciprocal of the function, or NULL_TREE if not available. */
29496
29497 static tree
29498 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
29499 bool sqrt ATTRIBUTE_UNUSED)
29500 {
29501 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
29502 && flag_finite_math_only && !flag_trapping_math
29503 && flag_unsafe_math_optimizations))
29504 return NULL_TREE;
29505
29506 if (md_fn)
29507 /* Machine dependent builtins. */
29508 switch (fn)
29509 {
29510 /* Vectorized version of sqrt to rsqrt conversion. */
29511 case IX86_BUILTIN_SQRTPS_NR:
29512 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
29513
29514 case IX86_BUILTIN_SQRTPS_NR256:
29515 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
29516
29517 default:
29518 return NULL_TREE;
29519 }
29520 else
29521 /* Normal builtins. */
29522 switch (fn)
29523 {
29524 /* Sqrt to rsqrt conversion. */
29525 case BUILT_IN_SQRTF:
29526 return ix86_builtins[IX86_BUILTIN_RSQRTF];
29527
29528 default:
29529 return NULL_TREE;
29530 }
29531 }
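/* Illustrative effect: under -ffast-math (finite math only, no trapping
   math, unsafe optimizations) and when optimizing for speed, a reciprocal
   use of __builtin_sqrtf can be rewritten in terms of IX86_BUILTIN_RSQRTF,
   and the vectorized SQRTPS_NR forms map to their RSQRTPS_NR counterparts;
   the _NR variants are the forms intended for the Newton-Raphson-refined
   expansion, which keeps the rsqrt approximation usable.  */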
29532 \f
29533 /* Helper for avx_vpermilps256_operand et al. This is also used by
29534 the expansion functions to turn the parallel back into a mask.
29535 The return value is 0 for no match and the imm8+1 for a match. */
29536
29537 int
29538 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
29539 {
29540 unsigned i, nelt = GET_MODE_NUNITS (mode);
29541 unsigned mask = 0;
29542 unsigned char ipar[8];
29543
29544 if (XVECLEN (par, 0) != (int) nelt)
29545 return 0;
29546
29547 /* Validate that all of the elements are constants, and not totally
29548 out of range. Copy the data into an integral array to make the
29549 subsequent checks easier. */
29550 for (i = 0; i < nelt; ++i)
29551 {
29552 rtx er = XVECEXP (par, 0, i);
29553 unsigned HOST_WIDE_INT ei;
29554
29555 if (!CONST_INT_P (er))
29556 return 0;
29557 ei = INTVAL (er);
29558 if (ei >= nelt)
29559 return 0;
29560 ipar[i] = ei;
29561 }
29562
29563 switch (mode)
29564 {
29565 case V4DFmode:
29566 /* In the 256-bit DFmode case, we can only move elements within
29567 a 128-bit lane. */
29568 for (i = 0; i < 2; ++i)
29569 {
29570 if (ipar[i] >= 2)
29571 return 0;
29572 mask |= ipar[i] << i;
29573 }
29574 for (i = 2; i < 4; ++i)
29575 {
29576 if (ipar[i] < 2)
29577 return 0;
29578 mask |= (ipar[i] - 2) << i;
29579 }
29580 break;
29581
29582 case V8SFmode:
29583 /* In the 256-bit SFmode case, we have full freedom of movement
29584 within the low 128-bit lane, but the high 128-bit lane must
29585 mirror the exact same pattern. */
29586 for (i = 0; i < 4; ++i)
29587 if (ipar[i] + 4 != ipar[i + 4])
29588 return 0;
29589 nelt = 4;
29590 /* FALLTHRU */
29591
29592 case V2DFmode:
29593 case V4SFmode:
29594 /* In the 128-bit case, we've full freedom in the placement of
29595 the elements from the source operand. */
29596 for (i = 0; i < nelt; ++i)
29597 mask |= ipar[i] << (i * (nelt / 2));
29598 break;
29599
29600 default:
29601 gcc_unreachable ();
29602 }
29603
29604 /* Make sure success has a non-zero value by adding one. */
29605 return mask + 1;
29606 }
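/* Worked example for the V4SFmode case above: the parallel
   [1 0 3 2] (swap within each pair of elements) gives
     mask = 1<<0 | 0<<2 | 3<<4 | 2<<6 = 0xb1
   and the function returns 0xb2, i.e. imm8 0xb1 plus one.  */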
29607
29608 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
29609 the expansion functions to turn the parallel back into a mask.
29610 The return value is 0 for no match and the imm8+1 for a match. */
29611
29612 int
29613 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
29614 {
29615 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
29616 unsigned mask = 0;
29617 unsigned char ipar[8];
29618
29619 if (XVECLEN (par, 0) != (int) nelt)
29620 return 0;
29621
29622 /* Validate that all of the elements are constants, and not totally
29623 out of range. Copy the data into an integral array to make the
29624 subsequent checks easier. */
29625 for (i = 0; i < nelt; ++i)
29626 {
29627 rtx er = XVECEXP (par, 0, i);
29628 unsigned HOST_WIDE_INT ei;
29629
29630 if (!CONST_INT_P (er))
29631 return 0;
29632 ei = INTVAL (er);
29633 if (ei >= 2 * nelt)
29634 return 0;
29635 ipar[i] = ei;
29636 }
29637
29638 /* Validate that each half of the permute selects consecutive elements. */
29639 for (i = 0; i < nelt2 - 1; ++i)
29640 if (ipar[i] + 1 != ipar[i + 1])
29641 return 0;
29642 for (i = nelt2; i < nelt - 1; ++i)
29643 if (ipar[i] + 1 != ipar[i + 1])
29644 return 0;
29645
29646 /* Reconstruct the mask. */
29647 for (i = 0; i < 2; ++i)
29648 {
29649 unsigned e = ipar[i * nelt2];
29650 if (e % nelt2)
29651 return 0;
29652 e /= nelt2;
29653 mask |= e << (i * 4);
29654 }
29655
29656 /* Make sure success has a non-zero value by adding one. */
29657 return mask + 1;
29658 }
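/* Worked example for V4DFmode: the parallel [2 3 4 5] selects the high
   half of operand 0 and the low half of operand 1.  Both halves are
   consecutive, the two lane selectors are 2/2 = 1 and 4/2 = 2, so
     mask = 1<<0 | 2<<4 = 0x21
   and the function returns 0x22 (imm8 0x21 plus one).  */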
29659 \f
29660
29661 /* Store OPERAND to memory after reload is completed. This means
29662 that we can't easily use assign_stack_local. */
29663 rtx
29664 ix86_force_to_memory (enum machine_mode mode, rtx operand)
29665 {
29666 rtx result;
29667
29668 gcc_assert (reload_completed);
29669 if (ix86_using_red_zone ())
29670 {
29671 result = gen_rtx_MEM (mode,
29672 gen_rtx_PLUS (Pmode,
29673 stack_pointer_rtx,
29674 GEN_INT (-RED_ZONE_SIZE)));
29675 emit_move_insn (result, operand);
29676 }
29677 else if (TARGET_64BIT)
29678 {
29679 switch (mode)
29680 {
29681 case HImode:
29682 case SImode:
29683 operand = gen_lowpart (DImode, operand);
29684 /* FALLTHRU */
29685 case DImode:
29686 emit_insn (
29687 gen_rtx_SET (VOIDmode,
29688 gen_rtx_MEM (DImode,
29689 gen_rtx_PRE_DEC (DImode,
29690 stack_pointer_rtx)),
29691 operand));
29692 break;
29693 default:
29694 gcc_unreachable ();
29695 }
29696 result = gen_rtx_MEM (mode, stack_pointer_rtx);
29697 }
29698 else
29699 {
29700 switch (mode)
29701 {
29702 case DImode:
29703 {
29704 rtx operands[2];
29705 split_double_mode (mode, &operand, 1, operands, operands + 1);
29706 emit_insn (
29707 gen_rtx_SET (VOIDmode,
29708 gen_rtx_MEM (SImode,
29709 gen_rtx_PRE_DEC (Pmode,
29710 stack_pointer_rtx)),
29711 operands[1]));
29712 emit_insn (
29713 gen_rtx_SET (VOIDmode,
29714 gen_rtx_MEM (SImode,
29715 gen_rtx_PRE_DEC (Pmode,
29716 stack_pointer_rtx)),
29717 operands[0]));
29718 }
29719 break;
29720 case HImode:
29721 /* Store HImodes as SImodes. */
29722 operand = gen_lowpart (SImode, operand);
29723 /* FALLTHRU */
29724 case SImode:
29725 emit_insn (
29726 gen_rtx_SET (VOIDmode,
29727 gen_rtx_MEM (GET_MODE (operand),
29728 gen_rtx_PRE_DEC (SImode,
29729 stack_pointer_rtx)),
29730 operand));
29731 break;
29732 default:
29733 gcc_unreachable ();
29734 }
29735 result = gen_rtx_MEM (mode, stack_pointer_rtx);
29736 }
29737 return result;
29738 }
29739
29740 /* Free the stack slot allocated by ix86_force_to_memory. */
29741 void
29742 ix86_free_from_memory (enum machine_mode mode)
29743 {
29744 if (!ix86_using_red_zone ())
29745 {
29746 int size;
29747
29748 if (mode == DImode || TARGET_64BIT)
29749 size = 8;
29750 else
29751 size = 4;
29752 /* Use LEA to deallocate stack space. In peephole2 it will be converted
29753 to a pop or add instruction if registers are available. */
29754 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
29755 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
29756 GEN_INT (size))));
29757 }
29758 }
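/* For illustration, on a 32-bit target without a red zone this emits
     (set (reg:SI sp) (plus:SI (reg:SI sp) (const_int 4)))
   which the peephole2 pass may later turn into "addl $4, %esp" or a
   dummy pop into a free register when that is cheaper.  */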
29759
29760 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
29761
29762 Put float CONST_DOUBLE in the constant pool instead of fp regs.
29763 QImode must go into class Q_REGS.
29764 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
29765 movdf to do mem-to-mem moves through integer regs. */
29766
29767 static reg_class_t
29768 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
29769 {
29770 enum machine_mode mode = GET_MODE (x);
29771
29772 /* We're only allowed to return a subclass of CLASS. Many of the
29773 following checks fail for NO_REGS, so eliminate that early. */
29774 if (regclass == NO_REGS)
29775 return NO_REGS;
29776
29777 /* All classes can load zeros. */
29778 if (x == CONST0_RTX (mode))
29779 return regclass;
29780
29781 /* Force constants into memory if we are loading a (nonzero) constant into
29782 an MMX or SSE register. This is because there are no MMX/SSE instructions
29783 to load from a constant. */
29784 if (CONSTANT_P (x)
29785 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
29786 return NO_REGS;
29787
29788 /* Prefer SSE regs only, if we can use them for math. */
29789 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
29790 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
29791
29792 /* Floating-point constants need more complex checks. */
29793 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
29794 {
29795 /* General regs can load everything. */
29796 if (reg_class_subset_p (regclass, GENERAL_REGS))
29797 return regclass;
29798
29799 /* Floats can load 0 and 1 plus some others. Note that we eliminated
29800 zero above. We only want to wind up preferring 80387 registers if
29801 we plan on doing computation with them. */
29802 if (TARGET_80387
29803 && standard_80387_constant_p (x) > 0)
29804 {
29805 /* Limit class to non-sse. */
29806 if (regclass == FLOAT_SSE_REGS)
29807 return FLOAT_REGS;
29808 if (regclass == FP_TOP_SSE_REGS)
29809 return FP_TOP_REG;
29810 if (regclass == FP_SECOND_SSE_REGS)
29811 return FP_SECOND_REG;
29812 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
29813 return regclass;
29814 }
29815
29816 return NO_REGS;
29817 }
29818
29819 /* Generally when we see PLUS here, it's the function invariant
29820 (plus soft-fp const_int), which can only be computed into general
29821 regs. */
29822 if (GET_CODE (x) == PLUS)
29823 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
29824
29825 /* QImode constants are easy to load, but non-constant QImode data
29826 must go into Q_REGS. */
29827 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
29828 {
29829 if (reg_class_subset_p (regclass, Q_REGS))
29830 return regclass;
29831 if (reg_class_subset_p (Q_REGS, regclass))
29832 return Q_REGS;
29833 return NO_REGS;
29834 }
29835
29836 return regclass;
29837 }
29838
29839 /* Discourage putting floating-point values in SSE registers unless
29840 SSE math is being used, and likewise for the 387 registers. */
29841 static reg_class_t
29842 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
29843 {
29844 enum machine_mode mode = GET_MODE (x);
29845
29846 /* Restrict the output reload class to the register bank that we are doing
29847 math on. If we would like not to return a subset of CLASS, reject this
29848 alternative: if reload cannot do this, it will still use its choice. */
29849 mode = GET_MODE (x);
29850 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
29851 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
29852
29853 if (X87_FLOAT_MODE_P (mode))
29854 {
29855 if (regclass == FP_TOP_SSE_REGS)
29856 return FP_TOP_REG;
29857 else if (regclass == FP_SECOND_SSE_REGS)
29858 return FP_SECOND_REG;
29859 else
29860 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
29861 }
29862
29863 return regclass;
29864 }
29865
29866 static reg_class_t
29867 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
29868 enum machine_mode mode, secondary_reload_info *sri)
29869 {
29870 /* Double-word spills from general registers to non-offsettable memory
29871 references (zero-extended addresses) require special handling. */
29872 if (TARGET_64BIT
29873 && MEM_P (x)
29874 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
29875 && rclass == GENERAL_REGS
29876 && !offsettable_memref_p (x))
29877 {
29878 sri->icode = (in_p
29879 ? CODE_FOR_reload_noff_load
29880 : CODE_FOR_reload_noff_store);
29881 /* Add the cost of moving address to a temporary. */
29882 sri->extra_cost = 1;
29883
29884 return NO_REGS;
29885 }
29886
29887 /* QImode spills from non-QI registers require
29888 intermediate register on 32bit targets. */
29889 if (!TARGET_64BIT
29890 && !in_p && mode == QImode
29891 && (rclass == GENERAL_REGS
29892 || rclass == LEGACY_REGS
29893 || rclass == INDEX_REGS))
29894 {
29895 int regno;
29896
29897 if (REG_P (x))
29898 regno = REGNO (x);
29899 else
29900 regno = -1;
29901
29902 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
29903 regno = true_regnum (x);
29904
29905 /* Return Q_REGS if the operand is in memory. */
29906 if (regno == -1)
29907 return Q_REGS;
29908 }
29909
29910 /* This condition handles the corner case where an expression involving
29911 pointers gets vectorized. We're trying to use the address of a
29912 stack slot as a vector initializer.
29913
29914 (set (reg:V2DI 74 [ vect_cst_.2 ])
29915 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
29916
29917 Eventually frame gets turned into sp+offset like this:
29918
29919 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
29920 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
29921 (const_int 392 [0x188]))))
29922
29923 That later gets turned into:
29924
29925 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
29926 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
29927 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
29928
29929 We'll have the following reload recorded:
29930
29931 Reload 0: reload_in (DI) =
29932 (plus:DI (reg/f:DI 7 sp)
29933 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
29934 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
29935 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
29936 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
29937 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
29938 reload_reg_rtx: (reg:V2DI 22 xmm1)
29939
29940 Which isn't going to work since SSE instructions can't handle scalar
29941 additions. Returning GENERAL_REGS forces the addition into integer
29942 register and reload can handle subsequent reloads without problems. */
29943
29944 if (in_p && GET_CODE (x) == PLUS
29945 && SSE_CLASS_P (rclass)
29946 && SCALAR_INT_MODE_P (mode))
29947 return GENERAL_REGS;
29948
29949 return NO_REGS;
29950 }
29951
29952 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
29953
29954 static bool
29955 ix86_class_likely_spilled_p (reg_class_t rclass)
29956 {
29957 switch (rclass)
29958 {
29959 case AREG:
29960 case DREG:
29961 case CREG:
29962 case BREG:
29963 case AD_REGS:
29964 case SIREG:
29965 case DIREG:
29966 case SSE_FIRST_REG:
29967 case FP_TOP_REG:
29968 case FP_SECOND_REG:
29969 return true;
29970
29971 default:
29972 break;
29973 }
29974
29975 return false;
29976 }
29977
29978 /* If we are copying between general and FP registers, we need a memory
29979 location. The same is true for SSE and MMX registers.
29980
29981 To optimize register_move_cost performance, allow inline variant.
29982
29983 The macro can't work reliably when one of the CLASSES is a class containing
29984 registers from multiple units (SSE, MMX, integer). We avoid this by never
29985 combining those units in a single alternative in the machine description.
29986 Ensure that this constraint holds to avoid unexpected surprises.
29987
29988 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
29989 enforce these sanity checks. */
29990
29991 static inline bool
29992 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
29993 enum machine_mode mode, int strict)
29994 {
29995 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
29996 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
29997 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
29998 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
29999 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
30000 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
30001 {
30002 gcc_assert (!strict);
30003 return true;
30004 }
30005
30006 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
30007 return true;
30008
30009 /* ??? This is a lie. We do have moves between mmx/general, and for
30010 mmx/sse2. But by saying we need secondary memory we discourage the
30011 register allocator from using the mmx registers unless needed. */
30012 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
30013 return true;
30014
30015 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
30016 {
30017 /* SSE1 doesn't have any direct moves from other classes. */
30018 if (!TARGET_SSE2)
30019 return true;
30020
30021 /* If the target says that inter-unit moves are more expensive
30022 than moving through memory, then don't generate them. */
30023 if (!TARGET_INTER_UNIT_MOVES)
30024 return true;
30025
30026 /* Between SSE and general, we have moves no larger than word size. */
30027 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
30028 return true;
30029 }
30030
30031 return false;
30032 }
30033
30034 bool
30035 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
30036 enum machine_mode mode, int strict)
30037 {
30038 return inline_secondary_memory_needed (class1, class2, mode, strict);
30039 }
30040
30041 /* Implement the TARGET_CLASS_MAX_NREGS hook.
30042
30043 On the 80386, this is the size of MODE in words,
30044 except in the FP regs, where a single reg is always enough. */
30045
30046 static unsigned char
30047 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
30048 {
30049 if (MAYBE_INTEGER_CLASS_P (rclass))
30050 {
30051 if (mode == XFmode)
30052 return (TARGET_64BIT ? 2 : 3);
30053 else if (mode == XCmode)
30054 return (TARGET_64BIT ? 4 : 6);
30055 else
30056 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
30057 }
30058 else
30059 {
30060 if (COMPLEX_MODE_P (mode))
30061 return 2;
30062 else
30063 return 1;
30064 }
30065 }
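/* Examples of the above: XFmode in an integer class takes 3 registers on
   ia32 (2 in 64-bit mode) and XCmode twice that; any mode held in an
   SSE/FP/MMX class takes a single register, except complex modes, which
   take two.  */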
30066
30067 /* Return true if the registers in CLASS cannot represent the change from
30068 modes FROM to TO. */
30069
30070 bool
30071 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
30072 enum reg_class regclass)
30073 {
30074 if (from == to)
30075 return false;
30076
30077 /* x87 registers can't do subreg at all, as all values are reformatted
30078 to extended precision. */
30079 if (MAYBE_FLOAT_CLASS_P (regclass))
30080 return true;
30081
30082 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
30083 {
30084 /* Vector registers do not support QI or HImode loads. If we don't
30085 disallow a change to these modes, reload will assume it's ok to
30086 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
30087 the vec_dupv4hi pattern. */
30088 if (GET_MODE_SIZE (from) < 4)
30089 return true;
30090
30091 /* Vector registers do not support subreg with nonzero offsets, which
30092 are otherwise valid for integer registers. Since we can't see
30093 whether we have a nonzero offset from here, prohibit all
30094 nonparadoxical subregs changing size. */
30095 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
30096 return true;
30097 }
30098
30099 return false;
30100 }
30101
30102 /* Return the cost of moving data of mode M between a
30103 register and memory. A value of 2 is the default; this cost is
30104 relative to those in `REGISTER_MOVE_COST'.
30105
30106 This function is used extensively by register_move_cost, which is used to
30107 build tables at startup. Make it inline in this case.
30108 When IN is 2, return the maximum of the in and out move cost.
30109
30110 If moving between registers and memory is more expensive than
30111 between two registers, you should define this macro to express the
30112 relative cost.
30113
30114 Also model the increased cost of moving QImode registers in
30115 non-Q_REGS classes.
30116 */
30117 static inline int
30118 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
30119 int in)
30120 {
30121 int cost;
30122 if (FLOAT_CLASS_P (regclass))
30123 {
30124 int index;
30125 switch (mode)
30126 {
30127 case SFmode:
30128 index = 0;
30129 break;
30130 case DFmode:
30131 index = 1;
30132 break;
30133 case XFmode:
30134 index = 2;
30135 break;
30136 default:
30137 return 100;
30138 }
30139 if (in == 2)
30140 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
30141 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
30142 }
30143 if (SSE_CLASS_P (regclass))
30144 {
30145 int index;
30146 switch (GET_MODE_SIZE (mode))
30147 {
30148 case 4:
30149 index = 0;
30150 break;
30151 case 8:
30152 index = 1;
30153 break;
30154 case 16:
30155 index = 2;
30156 break;
30157 default:
30158 return 100;
30159 }
30160 if (in == 2)
30161 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
30162 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
30163 }
30164 if (MMX_CLASS_P (regclass))
30165 {
30166 int index;
30167 switch (GET_MODE_SIZE (mode))
30168 {
30169 case 4:
30170 index = 0;
30171 break;
30172 case 8:
30173 index = 1;
30174 break;
30175 default:
30176 return 100;
30177 }
30178 if (in == 2)
30179 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
30180 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
30181 }
30182 switch (GET_MODE_SIZE (mode))
30183 {
30184 case 1:
30185 if (Q_CLASS_P (regclass) || TARGET_64BIT)
30186 {
30187 if (!in)
30188 return ix86_cost->int_store[0];
30189 if (TARGET_PARTIAL_REG_DEPENDENCY
30190 && optimize_function_for_speed_p (cfun))
30191 cost = ix86_cost->movzbl_load;
30192 else
30193 cost = ix86_cost->int_load[0];
30194 if (in == 2)
30195 return MAX (cost, ix86_cost->int_store[0]);
30196 return cost;
30197 }
30198 else
30199 {
30200 if (in == 2)
30201 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
30202 if (in)
30203 return ix86_cost->movzbl_load;
30204 else
30205 return ix86_cost->int_store[0] + 4;
30206 }
30207 break;
30208 case 2:
30209 if (in == 2)
30210 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
30211 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
30212 default:
30213 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
30214 if (mode == TFmode)
30215 mode = XFmode;
30216 if (in == 2)
30217 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
30218 else if (in)
30219 cost = ix86_cost->int_load[2];
30220 else
30221 cost = ix86_cost->int_store[2];
30222 return (cost * (((int) GET_MODE_SIZE (mode)
30223 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
30224 }
30225 }
30226
30227 static int
30228 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
30229 bool in)
30230 {
30231 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
30232 }
30233
30234
30235 /* Return the cost of moving data from a register in class CLASS1 to
30236 one in class CLASS2.
30237
30238 It is not required that the cost always equal 2 when FROM is the same as TO;
30239 on some machines it is expensive to move between registers if they are not
30240 general registers. */
30241
30242 static int
30243 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
30244 reg_class_t class2_i)
30245 {
30246 enum reg_class class1 = (enum reg_class) class1_i;
30247 enum reg_class class2 = (enum reg_class) class2_i;
30248
30249 /* In case we require secondary memory, compute cost of the store followed
30250 by load. In order to avoid bad register allocation choices, we need
30251 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
30252
30253 if (inline_secondary_memory_needed (class1, class2, mode, 0))
30254 {
30255 int cost = 1;
30256
30257 cost += inline_memory_move_cost (mode, class1, 2);
30258 cost += inline_memory_move_cost (mode, class2, 2);
30259
30260 /* In case of copying from general_purpose_register we may emit multiple
30261 stores followed by single load causing memory size mismatch stall.
30262 Count this as arbitrarily high cost of 20. */
30263 if (targetm.class_max_nregs (class1, mode)
30264 > targetm.class_max_nregs (class2, mode))
30265 cost += 20;
30266
30267 /* In the case of FP/MMX moves, the registers actually overlap, and we
30268 have to switch modes in order to treat them differently. */
30269 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
30270 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
30271 cost += 20;
30272
30273 return cost;
30274 }
30275
30276 /* Moves between SSE/MMX and integer unit are expensive. */
30277 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
30278 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
30279
30280 /* ??? By keeping the returned value relatively high, we limit the number
30281 of moves between integer and MMX/SSE registers for all targets.
30282 Additionally, a high value prevents problems with x86_modes_tieable_p(),
30283 where integer modes in MMX/SSE registers are not tieable
30284 because of missing QImode and HImode moves to, from or between
30285 MMX/SSE registers. */
30286 return MAX (8, ix86_cost->mmxsse_to_integer);
30287
30288 if (MAYBE_FLOAT_CLASS_P (class1))
30289 return ix86_cost->fp_move;
30290 if (MAYBE_SSE_CLASS_P (class1))
30291 return ix86_cost->sse_move;
30292 if (MAYBE_MMX_CLASS_P (class1))
30293 return ix86_cost->mmx_move;
30294 return 2;
30295 }
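/* For example, copying a DFmode value between FLOAT_REGS and SSE_REGS
   needs secondary memory, so its cost is roughly
     1 + MAX (fp_load[1], fp_store[1]) + MAX (sse_load[1], sse_store[1])
   which makes the allocator strongly prefer keeping such values within
   a single register bank.  */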
30296
30297 /* Return TRUE if hard register REGNO can hold a value of machine-mode
30298 MODE. */
30299
30300 bool
30301 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
30302 {
30303 /* Flags, and only flags, can hold CCmode values. */
30304 if (CC_REGNO_P (regno))
30305 return GET_MODE_CLASS (mode) == MODE_CC;
30306 if (GET_MODE_CLASS (mode) == MODE_CC
30307 || GET_MODE_CLASS (mode) == MODE_RANDOM
30308 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
30309 return false;
30310 if (FP_REGNO_P (regno))
30311 return VALID_FP_MODE_P (mode);
30312 if (SSE_REGNO_P (regno))
30313 {
30314 /* We implement the move patterns for all vector modes into and
30315 out of SSE registers, even when no operation instructions
30316 are available. OImode move is available only when AVX is
30317 enabled. */
30318 return ((TARGET_AVX && mode == OImode)
30319 || VALID_AVX256_REG_MODE (mode)
30320 || VALID_SSE_REG_MODE (mode)
30321 || VALID_SSE2_REG_MODE (mode)
30322 || VALID_MMX_REG_MODE (mode)
30323 || VALID_MMX_REG_MODE_3DNOW (mode));
30324 }
30325 if (MMX_REGNO_P (regno))
30326 {
30327 /* We implement the move patterns for 3DNOW modes even in MMX mode,
30328 so if the register is available at all, then we can move data of
30329 the given mode into or out of it. */
30330 return (VALID_MMX_REG_MODE (mode)
30331 || VALID_MMX_REG_MODE_3DNOW (mode));
30332 }
30333
30334 if (mode == QImode)
30335 {
30336 /* Take care for QImode values - they can be in non-QI regs,
30337 but then they do cause partial register stalls. */
30338 if (regno <= BX_REG || TARGET_64BIT)
30339 return true;
30340 if (!TARGET_PARTIAL_REG_STALL)
30341 return true;
30342 return !can_create_pseudo_p ();
30343 }
30344 /* We handle both integers and floats in the general purpose registers. */
30345 else if (VALID_INT_MODE_P (mode))
30346 return true;
30347 else if (VALID_FP_MODE_P (mode))
30348 return true;
30349 else if (VALID_DFP_MODE_P (mode))
30350 return true;
30351 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
30352 on to use that value in smaller contexts, this can easily force a
30353 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
30354 supporting DImode, allow it. */
30355 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
30356 return true;
30357
30358 return false;
30359 }
30360
30361 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
30362 tieable integer mode. */
30363
30364 static bool
30365 ix86_tieable_integer_mode_p (enum machine_mode mode)
30366 {
30367 switch (mode)
30368 {
30369 case HImode:
30370 case SImode:
30371 return true;
30372
30373 case QImode:
30374 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
30375
30376 case DImode:
30377 return TARGET_64BIT;
30378
30379 default:
30380 return false;
30381 }
30382 }
30383
30384 /* Return true if MODE1 is accessible in a register that can hold MODE2
30385 without copying. That is, all register classes that can hold MODE2
30386 can also hold MODE1. */
30387
30388 bool
30389 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
30390 {
30391 if (mode1 == mode2)
30392 return true;
30393
30394 if (ix86_tieable_integer_mode_p (mode1)
30395 && ix86_tieable_integer_mode_p (mode2))
30396 return true;
30397
30398 /* MODE2 being XFmode implies fp stack or general regs, which means we
30399 can tie any smaller floating point modes to it. Note that we do not
30400 tie this with TFmode. */
30401 if (mode2 == XFmode)
30402 return mode1 == SFmode || mode1 == DFmode;
30403
30404 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
30405 that we can tie it with SFmode. */
30406 if (mode2 == DFmode)
30407 return mode1 == SFmode;
30408
30409 /* If MODE2 is only appropriate for an SSE register, then tie with
30410 any other mode acceptable to SSE registers. */
30411 if (GET_MODE_SIZE (mode2) == 16
30412 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
30413 return (GET_MODE_SIZE (mode1) == 16
30414 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
30415
30416 /* If MODE2 is appropriate for an MMX register, then tie
30417 with any other mode acceptable to MMX registers. */
30418 if (GET_MODE_SIZE (mode2) == 8
30419 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
30420 return (GET_MODE_SIZE (mode1) == 8
30421 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
30422
30423 return false;
30424 }
30425
30426 /* Compute a (partial) cost for rtx X. Return true if the complete
30427 cost has been computed, and false if subexpressions should be
30428 scanned. In either case, *TOTAL contains the cost result. */
30429
30430 static bool
30431 ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
30432 bool speed)
30433 {
30434 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
30435 enum machine_mode mode = GET_MODE (x);
30436 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
30437
30438 switch (code)
30439 {
30440 case CONST_INT:
30441 case CONST:
30442 case LABEL_REF:
30443 case SYMBOL_REF:
30444 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
30445 *total = 3;
30446 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
30447 *total = 2;
30448 else if (flag_pic && SYMBOLIC_CONST (x)
30449 && (!TARGET_64BIT
30450 || (GET_CODE (x) != LABEL_REF
30451 && (GET_CODE (x) != SYMBOL_REF
30452 || !SYMBOL_REF_LOCAL_P (x)))))
30453 *total = 1;
30454 else
30455 *total = 0;
30456 return true;
30457
30458 case CONST_DOUBLE:
30459 if (mode == VOIDmode)
30460 *total = 0;
30461 else
30462 switch (standard_80387_constant_p (x))
30463 {
30464 case 1: /* 0.0 */
30465 *total = 1;
30466 break;
30467 default: /* Other constants */
30468 *total = 2;
30469 break;
30470 case 0:
30471 case -1:
30472 /* Start with (MEM (SYMBOL_REF)), since that's where
30473 it'll probably end up. Add a penalty for size. */
30474 *total = (COSTS_N_INSNS (1)
30475 + (flag_pic != 0 && !TARGET_64BIT)
30476 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
30477 break;
30478 }
30479 return true;
30480
30481 case ZERO_EXTEND:
30482 /* Zero extension is often completely free on x86_64, so make
30483 it as cheap as possible. */
30484 if (TARGET_64BIT && mode == DImode
30485 && GET_MODE (XEXP (x, 0)) == SImode)
30486 *total = 1;
30487 else if (TARGET_ZERO_EXTEND_WITH_AND)
30488 *total = cost->add;
30489 else
30490 *total = cost->movzx;
30491 return false;
30492
30493 case SIGN_EXTEND:
30494 *total = cost->movsx;
30495 return false;
30496
30497 case ASHIFT:
30498 if (CONST_INT_P (XEXP (x, 1))
30499 && (GET_MODE (XEXP (x, 0)) != DImode || TARGET_64BIT))
30500 {
30501 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
30502 if (value == 1)
30503 {
30504 *total = cost->add;
30505 return false;
30506 }
30507 if ((value == 2 || value == 3)
30508 && cost->lea <= cost->shift_const)
30509 {
30510 *total = cost->lea;
30511 return false;
30512 }
30513 }
30514 /* FALLTHRU */
30515
30516 case ROTATE:
30517 case ASHIFTRT:
30518 case LSHIFTRT:
30519 case ROTATERT:
30520 if (!TARGET_64BIT && GET_MODE (XEXP (x, 0)) == DImode)
30521 {
30522 if (CONST_INT_P (XEXP (x, 1)))
30523 {
30524 if (INTVAL (XEXP (x, 1)) > 32)
30525 *total = cost->shift_const + COSTS_N_INSNS (2);
30526 else
30527 *total = cost->shift_const * 2;
30528 }
30529 else
30530 {
30531 if (GET_CODE (XEXP (x, 1)) == AND)
30532 *total = cost->shift_var * 2;
30533 else
30534 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
30535 }
30536 }
30537 else
30538 {
30539 if (CONST_INT_P (XEXP (x, 1)))
30540 *total = cost->shift_const;
30541 else
30542 *total = cost->shift_var;
30543 }
30544 return false;
30545
30546 case FMA:
30547 {
30548 rtx sub;
30549
30550 gcc_assert (FLOAT_MODE_P (mode));
30551 gcc_assert (TARGET_FMA || TARGET_FMA4);
30552
30553 /* ??? SSE scalar/vector cost should be used here. */
30554 /* ??? Bald assumption that fma has the same cost as fmul. */
30555 *total = cost->fmul;
30556 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
30557
30558 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
30559 sub = XEXP (x, 0);
30560 if (GET_CODE (sub) == NEG)
30561 sub = XEXP (sub, 0);
30562 *total += rtx_cost (sub, FMA, 0, speed);
30563
30564 sub = XEXP (x, 2);
30565 if (GET_CODE (sub) == NEG)
30566 sub = XEXP (sub, 0);
30567 *total += rtx_cost (sub, FMA, 2, speed);
30568 return true;
30569 }
30570
30571 case MULT:
30572 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
30573 {
30574 /* ??? SSE scalar cost should be used here. */
30575 *total = cost->fmul;
30576 return false;
30577 }
30578 else if (X87_FLOAT_MODE_P (mode))
30579 {
30580 *total = cost->fmul;
30581 return false;
30582 }
30583 else if (FLOAT_MODE_P (mode))
30584 {
30585 /* ??? SSE vector cost should be used here. */
30586 *total = cost->fmul;
30587 return false;
30588 }
30589 else
30590 {
30591 rtx op0 = XEXP (x, 0);
30592 rtx op1 = XEXP (x, 1);
30593 int nbits;
30594 if (CONST_INT_P (XEXP (x, 1)))
30595 {
30596 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
30597 for (nbits = 0; value != 0; value &= value - 1)
30598 nbits++;
30599 }
30600 else
30601 /* This is arbitrary. */
30602 nbits = 7;
30603
30604 /* Compute costs correctly for widening multiplication. */
30605 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
30606 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
30607 == GET_MODE_SIZE (mode))
30608 {
30609 int is_mulwiden = 0;
30610 enum machine_mode inner_mode = GET_MODE (op0);
30611
30612 if (GET_CODE (op0) == GET_CODE (op1))
30613 is_mulwiden = 1, op1 = XEXP (op1, 0);
30614 else if (CONST_INT_P (op1))
30615 {
30616 if (GET_CODE (op0) == SIGN_EXTEND)
30617 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
30618 == INTVAL (op1);
30619 else
30620 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
30621 }
30622
30623 if (is_mulwiden)
30624 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
30625 }
30626
30627 *total = (cost->mult_init[MODE_INDEX (mode)]
30628 + nbits * cost->mult_bit
30629 + rtx_cost (op0, outer_code, opno, speed)
30630 + rtx_cost (op1, outer_code, opno, speed));
30631
30632 return true;
30633 }
30634
30635 case DIV:
30636 case UDIV:
30637 case MOD:
30638 case UMOD:
30639 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
30640 /* ??? SSE cost should be used here. */
30641 *total = cost->fdiv;
30642 else if (X87_FLOAT_MODE_P (mode))
30643 *total = cost->fdiv;
30644 else if (FLOAT_MODE_P (mode))
30645 /* ??? SSE vector cost should be used here. */
30646 *total = cost->fdiv;
30647 else
30648 *total = cost->divide[MODE_INDEX (mode)];
30649 return false;
30650
30651 case PLUS:
30652 if (GET_MODE_CLASS (mode) == MODE_INT
30653 && GET_MODE_BITSIZE (mode) <= GET_MODE_BITSIZE (Pmode))
30654 {
30655 if (GET_CODE (XEXP (x, 0)) == PLUS
30656 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
30657 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
30658 && CONSTANT_P (XEXP (x, 1)))
30659 {
30660 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
30661 if (val == 2 || val == 4 || val == 8)
30662 {
30663 *total = cost->lea;
30664 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
30665 outer_code, opno, speed);
30666 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
30667 outer_code, opno, speed);
30668 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
30669 return true;
30670 }
30671 }
30672 else if (GET_CODE (XEXP (x, 0)) == MULT
30673 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
30674 {
30675 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
30676 if (val == 2 || val == 4 || val == 8)
30677 {
30678 *total = cost->lea;
30679 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
30680 outer_code, opno, speed);
30681 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
30682 return true;
30683 }
30684 }
30685 else if (GET_CODE (XEXP (x, 0)) == PLUS)
30686 {
30687 *total = cost->lea;
30688 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
30689 outer_code, opno, speed);
30690 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
30691 outer_code, opno, speed);
30692 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
30693 return true;
30694 }
30695 }
30696 /* FALLTHRU */
30697
30698 case MINUS:
30699 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
30700 {
30701 /* ??? SSE cost should be used here. */
30702 *total = cost->fadd;
30703 return false;
30704 }
30705 else if (X87_FLOAT_MODE_P (mode))
30706 {
30707 *total = cost->fadd;
30708 return false;
30709 }
30710 else if (FLOAT_MODE_P (mode))
30711 {
30712 /* ??? SSE vector cost should be used here. */
30713 *total = cost->fadd;
30714 return false;
30715 }
30716 /* FALLTHRU */
30717
30718 case AND:
30719 case IOR:
30720 case XOR:
30721 if (!TARGET_64BIT && mode == DImode)
30722 {
30723 *total = (cost->add * 2
30724 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
30725 << (GET_MODE (XEXP (x, 0)) != DImode))
30726 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
30727 << (GET_MODE (XEXP (x, 1)) != DImode)));
30728 return true;
30729 }
30730 /* FALLTHRU */
30731
30732 case NEG:
30733 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
30734 {
30735 /* ??? SSE cost should be used here. */
30736 *total = cost->fchs;
30737 return false;
30738 }
30739 else if (X87_FLOAT_MODE_P (mode))
30740 {
30741 *total = cost->fchs;
30742 return false;
30743 }
30744 else if (FLOAT_MODE_P (mode))
30745 {
30746 /* ??? SSE vector cost should be used here. */
30747 *total = cost->fchs;
30748 return false;
30749 }
30750 /* FALLTHRU */
30751
30752 case NOT:
30753 if (!TARGET_64BIT && mode == DImode)
30754 *total = cost->add * 2;
30755 else
30756 *total = cost->add;
30757 return false;
30758
30759 case COMPARE:
30760 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
30761 && XEXP (XEXP (x, 0), 1) == const1_rtx
30762 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
30763 && XEXP (x, 1) == const0_rtx)
30764 {
30765 /* This kind of construct is implemented using test[bwl].
30766 Treat it as if we had an AND. */
30767 *total = (cost->add
30768 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
30769 + rtx_cost (const1_rtx, outer_code, opno, speed));
30770 return true;
30771 }
30772 return false;
30773
30774 case FLOAT_EXTEND:
30775 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
30776 *total = 0;
30777 return false;
30778
30779 case ABS:
30780 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
30781 /* ??? SSE cost should be used here. */
30782 *total = cost->fabs;
30783 else if (X87_FLOAT_MODE_P (mode))
30784 *total = cost->fabs;
30785 else if (FLOAT_MODE_P (mode))
30786 /* ??? SSE vector cost should be used here. */
30787 *total = cost->fabs;
30788 return false;
30789
30790 case SQRT:
30791 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
30792 /* ??? SSE cost should be used here. */
30793 *total = cost->fsqrt;
30794 else if (X87_FLOAT_MODE_P (mode))
30795 *total = cost->fsqrt;
30796 else if (FLOAT_MODE_P (mode))
30797 /* ??? SSE vector cost should be used here. */
30798 *total = cost->fsqrt;
30799 return false;
30800
30801 case UNSPEC:
30802 if (XINT (x, 1) == UNSPEC_TP)
30803 *total = 0;
30804 return false;
30805
30806 case VEC_SELECT:
30807 case VEC_CONCAT:
30808 case VEC_MERGE:
30809 case VEC_DUPLICATE:
30810 /* ??? Assume all of these vector manipulation patterns are
30811 recognizable. In which case they all pretty much have the
30812 same cost. */
30813 *total = COSTS_N_INSNS (1);
30814 return true;
30815
30816 default:
30817 return false;
30818 }
30819 }
30820
30821 #if TARGET_MACHO
30822
30823 static int current_machopic_label_num;
30824
30825 /* Given a symbol name and its associated stub, write out the
30826 definition of the stub. */
30827
30828 void
30829 machopic_output_stub (FILE *file, const char *symb, const char *stub)
30830 {
30831 unsigned int length;
30832 char *binder_name, *symbol_name, lazy_ptr_name[32];
30833 int label = ++current_machopic_label_num;
30834
30835 /* For 64-bit we shouldn't get here. */
30836 gcc_assert (!TARGET_64BIT);
30837
30838 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
30839 symb = targetm.strip_name_encoding (symb);
30840
30841 length = strlen (stub);
30842 binder_name = XALLOCAVEC (char, length + 32);
30843 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
30844
30845 length = strlen (symb);
30846 symbol_name = XALLOCAVEC (char, length + 32);
30847 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
30848
30849 sprintf (lazy_ptr_name, "L%d$lz", label);
30850
30851 if (MACHOPIC_ATT_STUB)
30852 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
30853 else if (MACHOPIC_PURE)
30854 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
30855 else
30856 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
30857
30858 fprintf (file, "%s:\n", stub);
30859 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
30860
30861 if (MACHOPIC_ATT_STUB)
30862 {
30863 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
30864 }
30865 else if (MACHOPIC_PURE)
30866 {
30867 /* PIC stub. */
30868 /* 25-byte PIC stub using "CALL get_pc_thunk". */
30869 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
30870 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
30871 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
30872 label, lazy_ptr_name, label);
30873 fprintf (file, "\tjmp\t*%%ecx\n");
30874 }
30875 else
30876 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
30877
30878 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
30879 it needs no stub-binding-helper. */
30880 if (MACHOPIC_ATT_STUB)
30881 return;
30882
30883 fprintf (file, "%s:\n", binder_name);
30884
30885 if (MACHOPIC_PURE)
30886 {
30887 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
30888 fprintf (file, "\tpushl\t%%ecx\n");
30889 }
30890 else
30891 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
30892
30893 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
30894
30895 /* N.B. Keep the correspondence of these
30896 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
30897 old-pic/new-pic/non-pic stubs; altering this will break
30898 compatibility with existing dylibs. */
30899 if (MACHOPIC_PURE)
30900 {
30901 /* 25-byte PIC stub using "CALL get_pc_thunk". */
30902 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
30903 }
30904 else
30905 /* 16-byte -mdynamic-no-pic stub. */
30906 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
30907
30908 fprintf (file, "%s:\n", lazy_ptr_name);
30909 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
30910 fprintf (file, ASM_LONG "%s\n", binder_name);
30911 }
30912 #endif /* TARGET_MACHO */
30913
30914 /* Order the registers for register allocator. */
30915
30916 void
30917 x86_order_regs_for_local_alloc (void)
30918 {
30919 int pos = 0;
30920 int i;
30921
30922 /* First allocate the local general purpose registers. */
30923 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
30924 if (GENERAL_REGNO_P (i) && call_used_regs[i])
30925 reg_alloc_order [pos++] = i;
30926
30927 /* Global general purpose registers. */
30928 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
30929 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
30930 reg_alloc_order [pos++] = i;
30931
30932 /* x87 registers come first in case we are doing FP math
30933 using them. */
30934 if (!TARGET_SSE_MATH)
30935 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
30936 reg_alloc_order [pos++] = i;
30937
30938 /* SSE registers. */
30939 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
30940 reg_alloc_order [pos++] = i;
30941 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
30942 reg_alloc_order [pos++] = i;
30943
30944 /* x87 registers. */
30945 if (TARGET_SSE_MATH)
30946 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
30947 reg_alloc_order [pos++] = i;
30948
30949 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
30950 reg_alloc_order [pos++] = i;
30951
30952 /* Initialize the rest of the array, as we do not allocate some registers
30953 at all. */
30954 while (pos < FIRST_PSEUDO_REGISTER)
30955 reg_alloc_order [pos++] = 0;
30956 }
30957
30958 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
30959 in struct attribute_spec.handler. */
30960 static tree
30961 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
30962 tree args,
30963 int flags ATTRIBUTE_UNUSED,
30964 bool *no_add_attrs)
30965 {
30966 if (TREE_CODE (*node) != FUNCTION_TYPE
30967 && TREE_CODE (*node) != METHOD_TYPE
30968 && TREE_CODE (*node) != FIELD_DECL
30969 && TREE_CODE (*node) != TYPE_DECL)
30970 {
30971 warning (OPT_Wattributes, "%qE attribute only applies to functions",
30972 name);
30973 *no_add_attrs = true;
30974 return NULL_TREE;
30975 }
30976 if (TARGET_64BIT)
30977 {
30978 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
30979 name);
30980 *no_add_attrs = true;
30981 return NULL_TREE;
30982 }
30983 if (is_attribute_p ("callee_pop_aggregate_return", name))
30984 {
30985 tree cst;
30986
30987 cst = TREE_VALUE (args);
30988 if (TREE_CODE (cst) != INTEGER_CST)
30989 {
30990 warning (OPT_Wattributes,
30991 "%qE attribute requires an integer constant argument",
30992 name);
30993 *no_add_attrs = true;
30994 }
30995 else if (compare_tree_int (cst, 0) != 0
30996 && compare_tree_int (cst, 1) != 0)
30997 {
30998 warning (OPT_Wattributes,
30999 "argument to %qE attribute is neither zero, nor one",
31000 name);
31001 *no_add_attrs = true;
31002 }
31003
31004 return NULL_TREE;
31005 }
31006
31007 return NULL_TREE;
31008 }
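/* Illustrative (hypothetical) use of the attribute validated here, in
   user code targeting 32-bit:

     struct big ret_big (void)
       __attribute__ ((callee_pop_aggregate_return (1)));

   where the argument of one or zero states whether the callee pops the
   hidden pointer used for returning the aggregate.  */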
31009
31010 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
31011 struct attribute_spec.handler. */
31012 static tree
31013 ix86_handle_abi_attribute (tree *node, tree name,
31014 tree args ATTRIBUTE_UNUSED,
31015 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
31016 {
31017 if (TREE_CODE (*node) != FUNCTION_TYPE
31018 && TREE_CODE (*node) != METHOD_TYPE
31019 && TREE_CODE (*node) != FIELD_DECL
31020 && TREE_CODE (*node) != TYPE_DECL)
31021 {
31022 warning (OPT_Wattributes, "%qE attribute only applies to functions",
31023 name);
31024 *no_add_attrs = true;
31025 return NULL_TREE;
31026 }
31027
31028 /* Can combine regparm with all attributes but fastcall. */
31029 if (is_attribute_p ("ms_abi", name))
31030 {
31031 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
31032 {
31033 error ("ms_abi and sysv_abi attributes are not compatible");
31034 }
31035
31036 return NULL_TREE;
31037 }
31038 else if (is_attribute_p ("sysv_abi", name))
31039 {
31040 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
31041 {
31042 error ("ms_abi and sysv_abi attributes are not compatible");
31043 }
31044
31045 return NULL_TREE;
31046 }
31047
31048 return NULL_TREE;
31049 }
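/* Illustrative (hypothetical) uses of the attributes validated here:

     int f (int, int) __attribute__ ((ms_abi));
     int g (int, int) __attribute__ ((sysv_abi));

   select the Microsoft or the System V x86-64 calling convention for a
   single function; as checked above, the two attributes cannot be
   combined on one type.  */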
31050
31051 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
31052 struct attribute_spec.handler. */
31053 static tree
31054 ix86_handle_struct_attribute (tree *node, tree name,
31055 tree args ATTRIBUTE_UNUSED,
31056 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
31057 {
31058 tree *type = NULL;
31059 if (DECL_P (*node))
31060 {
31061 if (TREE_CODE (*node) == TYPE_DECL)
31062 type = &TREE_TYPE (*node);
31063 }
31064 else
31065 type = node;
31066
31067 if (!(type && (TREE_CODE (*type) == RECORD_TYPE
31068 || TREE_CODE (*type) == UNION_TYPE)))
31069 {
31070 warning (OPT_Wattributes, "%qE attribute ignored",
31071 name);
31072 *no_add_attrs = true;
31073 }
31074
31075 else if ((is_attribute_p ("ms_struct", name)
31076 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
31077 || ((is_attribute_p ("gcc_struct", name)
31078 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
31079 {
31080 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
31081 name);
31082 *no_add_attrs = true;
31083 }
31084
31085 return NULL_TREE;
31086 }
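/* Illustrative (hypothetical) use of the attributes validated here:

     struct s { char c; double d; } __attribute__ ((ms_struct));

   requests the Microsoft struct/bitfield layout rules for the type,
   while gcc_struct requests the native GCC layout; combining the two on
   the same type is warned about and ignored above.  */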
31087
31088 static tree
31089 ix86_handle_fndecl_attribute (tree *node, tree name,
31090 tree args ATTRIBUTE_UNUSED,
31091 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
31092 {
31093 if (TREE_CODE (*node) != FUNCTION_DECL)
31094 {
31095 warning (OPT_Wattributes, "%qE attribute only applies to functions",
31096 name);
31097 *no_add_attrs = true;
31098 }
31099 return NULL_TREE;
31100 }
31101
31102 static bool
31103 ix86_ms_bitfield_layout_p (const_tree record_type)
31104 {
31105 return ((TARGET_MS_BITFIELD_LAYOUT
31106 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
31107 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
31108 }
31109
31110 /* Returns an expression indicating where the this parameter is
31111 located on entry to the FUNCTION. */
31112
31113 static rtx
31114 x86_this_parameter (tree function)
31115 {
31116 tree type = TREE_TYPE (function);
31117 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
31118 int nregs;
31119
31120 if (TARGET_64BIT)
31121 {
31122 const int *parm_regs;
31123
31124 if (ix86_function_type_abi (type) == MS_ABI)
31125 parm_regs = x86_64_ms_abi_int_parameter_registers;
31126 else
31127 parm_regs = x86_64_int_parameter_registers;
31128 return gen_rtx_REG (DImode, parm_regs[aggr]);
31129 }
31130
31131 nregs = ix86_function_regparm (type, function);
31132
31133 if (nregs > 0 && !stdarg_p (type))
31134 {
31135 int regno;
31136 unsigned int ccvt = ix86_get_callcvt (type);
31137
31138 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
31139 regno = aggr ? DX_REG : CX_REG;
31140 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
31141 {
31142 regno = CX_REG;
31143 if (aggr)
31144 return gen_rtx_MEM (SImode,
31145 plus_constant (stack_pointer_rtx, 4));
31146 }
31147 else
31148 {
31149 regno = AX_REG;
31150 if (aggr)
31151 {
31152 regno = DX_REG;
31153 if (nregs == 1)
31154 return gen_rtx_MEM (SImode,
31155 plus_constant (stack_pointer_rtx, 4));
31156 }
31157 }
31158 return gen_rtx_REG (SImode, regno);
31159 }
31160
31161 return gen_rtx_MEM (SImode, plus_constant (stack_pointer_rtx, aggr ? 8 : 4));
31162 }
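/* Summarizing the 32-bit cases above: with fastcall THIS lands in %ecx,
   or in %edx when there is a hidden aggregate-return pointer; with
   thiscall it lands in %ecx, or at 4(%esp) when there is an aggregate
   return; and when no register is used for it, it sits at 4(%esp), or
   at 8(%esp) when the aggregate-return pointer is pushed first.  */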
31163
31164 /* Determine whether x86_output_mi_thunk can succeed. */
31165
31166 static bool
31167 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
31168 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
31169 HOST_WIDE_INT vcall_offset, const_tree function)
31170 {
31171 /* 64-bit can handle anything. */
31172 if (TARGET_64BIT)
31173 return true;
31174
31175 /* For 32-bit, everything's fine if we have one free register. */
31176 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
31177 return true;
31178
31179 /* Need a free register for vcall_offset. */
31180 if (vcall_offset)
31181 return false;
31182
31183 /* Need a free register for GOT references. */
31184 if (flag_pic && !targetm.binds_local_p (function))
31185 return false;
31186
31187 /* Otherwise ok. */
31188 return true;
31189 }
31190
31191 /* Output the assembler code for a thunk function. THUNK_DECL is the
31192 declaration for the thunk function itself, FUNCTION is the decl for
31193 the target function. DELTA is an immediate constant offset to be
31194 added to THIS. If VCALL_OFFSET is nonzero, the word at
31195 *(*this + vcall_offset) should be added to THIS. */
31196
31197 static void
31198 x86_output_mi_thunk (FILE *file,
31199 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
31200 HOST_WIDE_INT vcall_offset, tree function)
31201 {
31202 rtx this_param = x86_this_parameter (function);
31203 rtx this_reg, tmp, fnaddr;
31204
31205 emit_note (NOTE_INSN_PROLOGUE_END);
31206
31207 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
31208 pull it in now and let DELTA benefit. */
31209 if (REG_P (this_param))
31210 this_reg = this_param;
31211 else if (vcall_offset)
31212 {
31213 /* Put the this parameter into %eax. */
31214 this_reg = gen_rtx_REG (Pmode, AX_REG);
31215 emit_move_insn (this_reg, this_param);
31216 }
31217 else
31218 this_reg = NULL_RTX;
31219
31220 /* Adjust the this parameter by a fixed constant. */
31221 if (delta)
31222 {
31223 rtx delta_rtx = GEN_INT (delta);
31224 rtx delta_dst = this_reg ? this_reg : this_param;
31225
31226 if (TARGET_64BIT)
31227 {
31228 if (!x86_64_general_operand (delta_rtx, Pmode))
31229 {
31230 tmp = gen_rtx_REG (Pmode, R10_REG);
31231 emit_move_insn (tmp, delta_rtx);
31232 delta_rtx = tmp;
31233 }
31234 }
31235
31236 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
31237 }
31238
31239 /* Adjust the this parameter by a value stored in the vtable. */
31240 if (vcall_offset)
31241 {
31242 rtx vcall_addr, vcall_mem, this_mem;
31243 unsigned int tmp_regno;
31244
31245 if (TARGET_64BIT)
31246 tmp_regno = R10_REG;
31247 else
31248 {
31249 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
31250 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
31251 tmp_regno = AX_REG;
31252 else
31253 tmp_regno = CX_REG;
31254 }
31255 tmp = gen_rtx_REG (Pmode, tmp_regno);
31256
31257 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
31258 if (Pmode != ptr_mode)
31259 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
31260 emit_move_insn (tmp, this_mem);
31261
31262 /* Adjust the this parameter. */
31263 vcall_addr = plus_constant (tmp, vcall_offset);
31264 if (TARGET_64BIT
31265 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
31266 {
31267 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
31268 emit_move_insn (tmp2, GEN_INT (vcall_offset));
31269 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
31270 }
31271
31272 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
31273 if (Pmode != ptr_mode)
31274 emit_insn (gen_addsi_1_zext (this_reg,
31275 gen_rtx_REG (ptr_mode,
31276 REGNO (this_reg)),
31277 vcall_mem));
31278 else
31279 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
31280 }
31281
31282 /* If necessary, drop THIS back to its stack slot. */
31283 if (this_reg && this_reg != this_param)
31284 emit_move_insn (this_param, this_reg);
31285
31286 fnaddr = XEXP (DECL_RTL (function), 0);
31287 if (TARGET_64BIT)
31288 {
31289 if (!flag_pic || targetm.binds_local_p (function)
31290 || cfun->machine->call_abi == MS_ABI)
31291 ;
31292 else
31293 {
31294 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
31295 tmp = gen_rtx_CONST (Pmode, tmp);
31296 fnaddr = gen_rtx_MEM (Pmode, tmp);
31297 }
31298 }
31299 else
31300 {
31301 if (!flag_pic || targetm.binds_local_p (function))
31302 ;
31303 #if TARGET_MACHO
31304 else if (TARGET_MACHO)
31305 {
31306 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
31307 fnaddr = XEXP (fnaddr, 0);
31308 }
31309 #endif /* TARGET_MACHO */
31310 else
31311 {
31312 tmp = gen_rtx_REG (Pmode, CX_REG);
31313 output_set_got (tmp, NULL_RTX);
31314
31315 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
31316 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
31317 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
31318 }
31319 }
31320
31321 /* Our sibling call patterns do not allow memories, because we have no
31322 predicate that can distinguish between frame and non-frame memory.
31323 For our purposes here, we can get away with (ab)using a jump pattern,
31324 because we're going to do no optimization. */
31325 if (MEM_P (fnaddr))
31326 emit_jump_insn (gen_indirect_jump (fnaddr));
31327 else
31328 {
31329 tmp = gen_rtx_MEM (QImode, fnaddr);
31330 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
31331 tmp = emit_call_insn (tmp);
31332 SIBLING_CALL_P (tmp) = 1;
31333 }
31334 emit_barrier ();
31335
31336 /* Emit just enough of rest_of_compilation to get the insns emitted.
31337 Note that use_thunk calls assemble_start_function et al. */
31338 tmp = get_insns ();
31339 insn_locators_alloc ();
31340 shorten_branches (tmp);
31341 final_start_function (tmp, file, 1);
31342 final (tmp, file, 1);
31343 final_end_function ();
31344 }
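/* As a rough illustration, a 32-bit thunk with a nonzero DELTA, no
   VCALL_OFFSET, THIS on the stack and a locally-bound target boils down
   to

     addl $DELTA, 4(%esp)
     jmp  target

   with the more involved paths above handling register THIS, vtable
   offsets and PIC references.  */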
31345
31346 static void
31347 x86_file_start (void)
31348 {
31349 default_file_start ();
31350 #if TARGET_MACHO
31351 darwin_file_start ();
31352 #endif
31353 if (X86_FILE_START_VERSION_DIRECTIVE)
31354 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
31355 if (X86_FILE_START_FLTUSED)
31356 fputs ("\t.global\t__fltused\n", asm_out_file);
31357 if (ix86_asm_dialect == ASM_INTEL)
31358 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
31359 }
31360
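/* Worker computing the alignment for FIELD given COMPUTED so far: on
   32-bit targets without -malign-double, cap double, long long and
   complex fields at 32-bit alignment.  */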
31361 int
31362 x86_field_alignment (tree field, int computed)
31363 {
31364 enum machine_mode mode;
31365 tree type = TREE_TYPE (field);
31366
31367 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
31368 return computed;
31369 mode = TYPE_MODE (strip_array_types (type));
31370 if (mode == DFmode || mode == DCmode
31371 || GET_MODE_CLASS (mode) == MODE_INT
31372 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
31373 return MIN (32, computed);
31374 return computed;
31375 }
31376
31377 /* Output assembler code to FILE to increment profiler label # LABELNO
31378 for profiling a function entry. */
31379 void
31380 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
31381 {
31382 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
31383 : MCOUNT_NAME);
31384
31385 if (TARGET_64BIT)
31386 {
31387 #ifndef NO_PROFILE_COUNTERS
31388 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
31389 #endif
31390
31391 if (DEFAULT_ABI == SYSV_ABI && flag_pic)
31392 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
31393 else
31394 fprintf (file, "\tcall\t%s\n", mcount_name);
31395 }
31396 else if (flag_pic)
31397 {
31398 #ifndef NO_PROFILE_COUNTERS
31399 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
31400 LPREFIX, labelno);
31401 #endif
31402 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
31403 }
31404 else
31405 {
31406 #ifndef NO_PROFILE_COUNTERS
31407 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
31408 LPREFIX, labelno);
31409 #endif
31410 fprintf (file, "\tcall\t%s\n", mcount_name);
31411 }
31412 }
31413
31414 /* We don't have exact information about the insn sizes, but we may assume
31415 quite safely that we are informed about all 1 byte insns and memory
31416 address sizes. This is enough to eliminate unnecessary padding in
31417 99% of cases. */
31418
31419 static int
31420 min_insn_size (rtx insn)
31421 {
31422 int l = 0, len;
31423
31424 if (!INSN_P (insn) || !active_insn_p (insn))
31425 return 0;
31426
31427 /* Discard alignments we've emitted and jump table data. */
31428 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
31429 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
31430 return 0;
31431 if (JUMP_TABLE_DATA_P (insn))
31432 return 0;
31433
31434 /* Important case - calls are always 5 bytes.
31435 It is common to have many calls in a row. */
31436 if (CALL_P (insn)
31437 && symbolic_reference_mentioned_p (PATTERN (insn))
31438 && !SIBLING_CALL_P (insn))
31439 return 5;
31440 len = get_attr_length (insn);
31441 if (len <= 1)
31442 return 1;
31443
31444 /* For normal instructions we rely on get_attr_length being exact,
31445 with a few exceptions. */
31446 if (!JUMP_P (insn))
31447 {
31448 enum attr_type type = get_attr_type (insn);
31449
31450 switch (type)
31451 {
31452 case TYPE_MULTI:
31453 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
31454 || asm_noperands (PATTERN (insn)) >= 0)
31455 return 0;
31456 break;
31457 case TYPE_OTHER:
31458 case TYPE_FCMP:
31459 break;
31460 default:
31461 /* Otherwise trust get_attr_length. */
31462 return len;
31463 }
31464
31465 l = get_attr_length_address (insn);
31466 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
31467 l = 4;
31468 }
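  /* Assume one opcode byte on top of the address bytes we found;
     without an address-length estimate, assume at least 2 bytes.  */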
31469 if (l)
31470 return 1+l;
31471 else
31472 return 2;
31473 }
31474
31475 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
31476
31477 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
31478 window. */
31479
31480 static void
31481 ix86_avoid_jump_mispredicts (void)
31482 {
31483 rtx insn, start = get_insns ();
31484 int nbytes = 0, njumps = 0;
31485 int isjump = 0;
31486
31487 /* Look for all minimal intervals of instructions containing 4 jumps.
31488 The intervals are bounded by START and INSN.  NBYTES is the total
31489 size of the instructions in the interval, including INSN and not
31490 including START.  When NBYTES is smaller than 16 bytes, it is possible
31491 that the end of START and INSN end up in the same 16 byte window.
31492
31493 The smallest offset in the window at which INSN can start is the case
31494 where START ends at offset 0.  The offset of INSN is then
31495 NBYTES - sizeof (INSN).  We add a p2align to the 16 byte window with
31496 max skip 15 - NBYTES + sizeof (INSN).  */
31497 for (insn = start; insn; insn = NEXT_INSN (insn))
31498 {
31499 int min_size;
31500
31501 if (LABEL_P (insn))
31502 {
31503 int align = label_to_alignment (insn);
31504 int max_skip = label_to_max_skip (insn);
31505
31506 if (max_skip > 15)
31507 max_skip = 15;
31508 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
31509 already in the current 16 byte page, because otherwise
31510 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
31511 bytes to reach 16 byte boundary. */
31512 if (align <= 0
31513 || (align <= 3 && max_skip != (1 << align) - 1))
31514 max_skip = 0;
31515 if (dump_file)
31516 fprintf (dump_file, "Label %i with max_skip %i\n",
31517 INSN_UID (insn), max_skip);
31518 if (max_skip)
31519 {
31520 while (nbytes + max_skip >= 16)
31521 {
31522 start = NEXT_INSN (start);
31523 if ((JUMP_P (start)
31524 && GET_CODE (PATTERN (start)) != ADDR_VEC
31525 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
31526 || CALL_P (start))
31527 njumps--, isjump = 1;
31528 else
31529 isjump = 0;
31530 nbytes -= min_insn_size (start);
31531 }
31532 }
31533 continue;
31534 }
31535
31536 min_size = min_insn_size (insn);
31537 nbytes += min_size;
31538 if (dump_file)
31539 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
31540 INSN_UID (insn), min_size);
31541 if ((JUMP_P (insn)
31542 && GET_CODE (PATTERN (insn)) != ADDR_VEC
31543 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
31544 || CALL_P (insn))
31545 njumps++;
31546 else
31547 continue;
31548
31549 while (njumps > 3)
31550 {
31551 start = NEXT_INSN (start);
31552 if ((JUMP_P (start)
31553 && GET_CODE (PATTERN (start)) != ADDR_VEC
31554 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
31555 || CALL_P (start))
31556 njumps--, isjump = 1;
31557 else
31558 isjump = 0;
31559 nbytes -= min_insn_size (start);
31560 }
31561 gcc_assert (njumps >= 0);
31562 if (dump_file)
31563 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
31564 INSN_UID (start), INSN_UID (insn), nbytes);
31565
31566 if (njumps == 3 && isjump && nbytes < 16)
31567 {
31568 int padsize = 15 - nbytes + min_insn_size (insn);
31569
31570 if (dump_file)
31571 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
31572 INSN_UID (insn), padsize);
31573 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
31574 }
31575 }
31576 }
31577 #endif
31578
31579 /* AMD Athlon works faster
31580 when RET is not the destination of a conditional jump or directly preceded
31581 by another jump instruction.  We avoid the penalty by inserting a NOP just
31582 before the RET instruction in such cases.  */
31583 static void
31584 ix86_pad_returns (void)
31585 {
31586 edge e;
31587 edge_iterator ei;
31588
31589 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
31590 {
31591 basic_block bb = e->src;
31592 rtx ret = BB_END (bb);
31593 rtx prev;
31594 bool replace = false;
31595
31596 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
31597 || optimize_bb_for_size_p (bb))
31598 continue;
31599 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
31600 if (active_insn_p (prev) || LABEL_P (prev))
31601 break;
31602 if (prev && LABEL_P (prev))
31603 {
31604 edge e;
31605 edge_iterator ei;
31606
31607 FOR_EACH_EDGE (e, ei, bb->preds)
31608 if (EDGE_FREQUENCY (e) && e->src->index >= 0
31609 && !(e->flags & EDGE_FALLTHRU))
31610 replace = true;
31611 }
31612 if (!replace)
31613 {
31614 prev = prev_active_insn (ret);
31615 if (prev
31616 && ((JUMP_P (prev) && any_condjump_p (prev))
31617 || CALL_P (prev)))
31618 replace = true;
31619 /* Empty functions get a branch mispredict even when
31620 the jump destination is not visible to us. */
31621 if (!prev && !optimize_function_for_size_p (cfun))
31622 replace = true;
31623 }
31624 if (replace)
31625 {
31626 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
31627 delete_insn (ret);
31628 }
31629 }
31630 }
31631
31632 /* Count the minimum number of instructions in BB. Return 4 if the
31633 number of instructions >= 4. */
31634
31635 static int
31636 ix86_count_insn_bb (basic_block bb)
31637 {
31638 rtx insn;
31639 int insn_count = 0;
31640
31641 /* Count number of instructions in this block. Return 4 if the number
31642 of instructions >= 4. */
31643 FOR_BB_INSNS (bb, insn)
31644 {
31645 /* This only happens in exit blocks. */
31646 if (JUMP_P (insn)
31647 && ANY_RETURN_P (PATTERN (insn)))
31648 break;
31649
31650 if (NONDEBUG_INSN_P (insn)
31651 && GET_CODE (PATTERN (insn)) != USE
31652 && GET_CODE (PATTERN (insn)) != CLOBBER)
31653 {
31654 insn_count++;
31655 if (insn_count >= 4)
31656 return insn_count;
31657 }
31658 }
31659
31660 return insn_count;
31661 }
31662
31663
31664 /* Count the minimum number of instructions in the code path through BB.
31665 Return 4 if the number of instructions >= 4. */
31666
31667 static int
31668 ix86_count_insn (basic_block bb)
31669 {
31670 edge e;
31671 edge_iterator ei;
31672 int min_prev_count;
31673
31674 /* Only bother counting instructions along paths with no
31675 more than 2 basic blocks between entry and exit. Given
31676 that BB has an edge to exit, determine if a predecessor
31677 of BB has an edge from entry. If so, compute the number
31678 of instructions in the predecessor block. If there
31679 happen to be multiple such blocks, compute the minimum. */
31680 min_prev_count = 4;
31681 FOR_EACH_EDGE (e, ei, bb->preds)
31682 {
31683 edge prev_e;
31684 edge_iterator prev_ei;
31685
31686 if (e->src == ENTRY_BLOCK_PTR)
31687 {
31688 min_prev_count = 0;
31689 break;
31690 }
31691 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
31692 {
31693 if (prev_e->src == ENTRY_BLOCK_PTR)
31694 {
31695 int count = ix86_count_insn_bb (e->src);
31696 if (count < min_prev_count)
31697 min_prev_count = count;
31698 break;
31699 }
31700 }
31701 }
31702
31703 if (min_prev_count < 4)
31704 min_prev_count += ix86_count_insn_bb (bb);
31705
31706 return min_prev_count;
31707 }
31708
31709 /* Pad short functions to 4 instructions. */
31710
31711 static void
31712 ix86_pad_short_function (void)
31713 {
31714 edge e;
31715 edge_iterator ei;
31716
31717 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
31718 {
31719 rtx ret = BB_END (e->src);
31720 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
31721 {
31722 int insn_count = ix86_count_insn (e->src);
31723
31724 /* Pad short function. */
31725 if (insn_count < 4)
31726 {
31727 rtx insn = ret;
31728
31729 /* Find epilogue. */
31730 while (insn
31731 && (!NOTE_P (insn)
31732 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
31733 insn = PREV_INSN (insn);
31734
31735 if (!insn)
31736 insn = ret;
31737
31738 /* Two NOPs count as one instruction. */
31739 insn_count = 2 * (4 - insn_count);
31740 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
31741 }
31742 }
31743 }
31744 }
31745
31746 /* Implement machine specific optimizations. We implement padding of returns
31747 for K8 CPUs and a pass to avoid 4 jumps in a single 16 byte window. */
31748 static void
31749 ix86_reorg (void)
31750 {
31751 /* We are freeing block_for_insn in the toplev to keep compatibility
31752 with old MDEP_REORGS that are not CFG based. Recompute it now. */
31753 compute_bb_for_insn ();
31754
31755 /* Run the vzeroupper optimization if needed. */
31756 if (TARGET_VZEROUPPER)
31757 move_or_delete_vzeroupper ();
31758
31759 if (optimize && optimize_function_for_speed_p (cfun))
31760 {
31761 if (TARGET_PAD_SHORT_FUNCTION)
31762 ix86_pad_short_function ();
31763 else if (TARGET_PAD_RETURNS)
31764 ix86_pad_returns ();
31765 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
31766 if (TARGET_FOUR_JUMP_LIMIT)
31767 ix86_avoid_jump_mispredicts ();
31768 #endif
31769 }
31770 }
31771
31772 /* Return nonzero when a QImode register that must be represented via a REX prefix
31773 is used. */
31774 bool
31775 x86_extended_QIreg_mentioned_p (rtx insn)
31776 {
31777 int i;
31778 extract_insn_cached (insn);
31779 for (i = 0; i < recog_data.n_operands; i++)
31780 if (REG_P (recog_data.operand[i])
31781 && REGNO (recog_data.operand[i]) > BX_REG)
31782 return true;
31783 return false;
31784 }
31785
31786 /* Return nonzero when P points to a register encoded via a REX prefix.
31787 Called via for_each_rtx. */
31788 static int
31789 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
31790 {
31791 unsigned int regno;
31792 if (!REG_P (*p))
31793 return 0;
31794 regno = REGNO (*p);
31795 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
31796 }
31797
31798 /* Return true when INSN mentions a register that must be encoded using a REX
31799 prefix. */
31800 bool
31801 x86_extended_reg_mentioned_p (rtx insn)
31802 {
31803 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
31804 extended_reg_mentioned_1, NULL);
31805 }
31806
31807 /* If profitable, negate (without causing overflow) integer constant
31808 of mode MODE at location LOC. Return true in this case. */
31809 bool
31810 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
31811 {
31812 HOST_WIDE_INT val;
31813
31814 if (!CONST_INT_P (*loc))
31815 return false;
31816
31817 switch (mode)
31818 {
31819 case DImode:
31820 /* DImode x86_64 constants must fit in 32 bits. */
31821 gcc_assert (x86_64_immediate_operand (*loc, mode));
31822
31823 mode = SImode;
31824 break;
31825
31826 case SImode:
31827 case HImode:
31828 case QImode:
31829 break;
31830
31831 default:
31832 gcc_unreachable ();
31833 }
31834
31835 /* Avoid overflows. */
31836 if (mode_signbit_p (mode, *loc))
31837 return false;
31838
31839 val = INTVAL (*loc);
31840
31841 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
31842 Exception: -128 encodes smaller than 128, so swap the sign and the operation. */
31843 if ((val < 0 && val != -128)
31844 || val == 128)
31845 {
31846 *loc = GEN_INT (-val);
31847 return true;
31848 }
31849
31850 return false;
31851 }
31852
31853 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
31854 optabs would emit if we didn't have TFmode patterns. */
31855
31856 void
31857 x86_emit_floatuns (rtx operands[2])
31858 {
31859 rtx neglab, donelab, i0, i1, f0, in, out;
31860 enum machine_mode mode, inmode;
31861
31862 inmode = GET_MODE (operands[1]);
31863 gcc_assert (inmode == SImode || inmode == DImode);
31864
31865 out = operands[0];
31866 in = force_reg (inmode, operands[1]);
31867 mode = GET_MODE (out);
31868 neglab = gen_label_rtx ();
31869 donelab = gen_label_rtx ();
31870 f0 = gen_reg_rtx (mode);
31871
31872 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
31873
31874 expand_float (out, in, 0);
31875
31876 emit_jump_insn (gen_jump (donelab));
31877 emit_barrier ();
31878
31879 emit_label (neglab);
31880
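  /* The input has its sign bit set: halve it, folding the dropped low
     bit back in with an OR so the final doubling rounds correctly,
     convert the halved value as signed, then double the result.  */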
31881 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
31882 1, OPTAB_DIRECT);
31883 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
31884 1, OPTAB_DIRECT);
31885 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
31886
31887 expand_float (f0, i0, 0);
31888
31889 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
31890
31891 emit_label (donelab);
31892 }
31893 \f
31894 /* AVX2 does support 32-byte integer vector operations,
31895 thus the longest vector we are faced with is V32QImode. */
31896 #define MAX_VECT_LEN 32
31897
31898 struct expand_vec_perm_d
31899 {
31900 rtx target, op0, op1;
31901 unsigned char perm[MAX_VECT_LEN];
31902 enum machine_mode vmode;
31903 unsigned char nelt;
31904 bool testing_p;
31905 };
31906
31907 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
31908 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
31909
31910 /* Get a vector mode of the same size as the original but with elements
31911 twice as wide. This is only guaranteed to apply to integral vectors. */
31912
31913 static inline enum machine_mode
31914 get_mode_wider_vector (enum machine_mode o)
31915 {
31916 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
31917 enum machine_mode n = GET_MODE_WIDER_MODE (o);
31918 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
31919 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
31920 return n;
31921 }
31922
31923 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
31924 with all elements equal to VAR. Return true if successful. */
31925
31926 static bool
31927 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
31928 rtx target, rtx val)
31929 {
31930 bool ok;
31931
31932 switch (mode)
31933 {
31934 case V2SImode:
31935 case V2SFmode:
31936 if (!mmx_ok)
31937 return false;
31938 /* FALLTHRU */
31939
31940 case V4DFmode:
31941 case V4DImode:
31942 case V8SFmode:
31943 case V8SImode:
31944 case V2DFmode:
31945 case V2DImode:
31946 case V4SFmode:
31947 case V4SImode:
31948 {
31949 rtx insn, dup;
31950
31951 /* First attempt to recognize VAL as-is. */
31952 dup = gen_rtx_VEC_DUPLICATE (mode, val);
31953 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
31954 if (recog_memoized (insn) < 0)
31955 {
31956 rtx seq;
31957 /* If that fails, force VAL into a register. */
31958
31959 start_sequence ();
31960 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
31961 seq = get_insns ();
31962 end_sequence ();
31963 if (seq)
31964 emit_insn_before (seq, insn);
31965
31966 ok = recog_memoized (insn) >= 0;
31967 gcc_assert (ok);
31968 }
31969 }
31970 return true;
31971
31972 case V4HImode:
31973 if (!mmx_ok)
31974 return false;
31975 if (TARGET_SSE || TARGET_3DNOW_A)
31976 {
31977 rtx x;
31978
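	  /* Broadcast the low 16 bits of VAL to every element via a
	     (vec_duplicate (truncate:HI ...)) pattern.  */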
31979 val = gen_lowpart (SImode, val);
31980 x = gen_rtx_TRUNCATE (HImode, val);
31981 x = gen_rtx_VEC_DUPLICATE (mode, x);
31982 emit_insn (gen_rtx_SET (VOIDmode, target, x));
31983 return true;
31984 }
31985 goto widen;
31986
31987 case V8QImode:
31988 if (!mmx_ok)
31989 return false;
31990 goto widen;
31991
31992 case V8HImode:
31993 if (TARGET_SSE2)
31994 {
31995 struct expand_vec_perm_d dperm;
31996 rtx tmp1, tmp2;
31997
31998 permute:
31999 memset (&dperm, 0, sizeof (dperm));
32000 dperm.target = target;
32001 dperm.vmode = mode;
32002 dperm.nelt = GET_MODE_NUNITS (mode);
32003 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
32004
32005 /* Extend to SImode using a paradoxical SUBREG. */
32006 tmp1 = gen_reg_rtx (SImode);
32007 emit_move_insn (tmp1, gen_lowpart (SImode, val));
32008
32009 /* Insert the SImode value as low element of a V4SImode vector. */
32010 tmp2 = gen_lowpart (V4SImode, dperm.op0);
32011 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
32012
32013 ok = (expand_vec_perm_1 (&dperm)
32014 || expand_vec_perm_broadcast_1 (&dperm));
32015 gcc_assert (ok);
32016 return ok;
32017 }
32018 goto widen;
32019
32020 case V16QImode:
32021 if (TARGET_SSE2)
32022 goto permute;
32023 goto widen;
32024
32025 widen:
32026 /* Replicate the value once into the next wider mode and recurse. */
32027 {
32028 enum machine_mode smode, wsmode, wvmode;
32029 rtx x;
32030
32031 smode = GET_MODE_INNER (mode);
32032 wvmode = get_mode_wider_vector (mode);
32033 wsmode = GET_MODE_INNER (wvmode);
32034
32035 val = convert_modes (wsmode, smode, val, true);
32036 x = expand_simple_binop (wsmode, ASHIFT, val,
32037 GEN_INT (GET_MODE_BITSIZE (smode)),
32038 NULL_RTX, 1, OPTAB_LIB_WIDEN);
32039 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
32040
32041 x = gen_lowpart (wvmode, target);
32042 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
32043 gcc_assert (ok);
32044 return ok;
32045 }
32046
32047 case V16HImode:
32048 case V32QImode:
32049 {
32050 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
32051 rtx x = gen_reg_rtx (hvmode);
32052
32053 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
32054 gcc_assert (ok);
32055
32056 x = gen_rtx_VEC_CONCAT (mode, x, x);
32057 emit_insn (gen_rtx_SET (VOIDmode, target, x));
32058 }
32059 return true;
32060
32061 default:
32062 return false;
32063 }
32064 }
32065
32066 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
32067 whose ONE_VAR element is VAR, and other elements are zero. Return true
32068 if successful. */
32069
32070 static bool
32071 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
32072 rtx target, rtx var, int one_var)
32073 {
32074 enum machine_mode vsimode;
32075 rtx new_target;
32076 rtx x, tmp;
32077 bool use_vector_set = false;
32078
32079 switch (mode)
32080 {
32081 case V2DImode:
32082 /* For SSE4.1, we normally use vector set. But if the second
32083 element is zero and inter-unit moves are OK, we use movq
32084 instead. */
32085 use_vector_set = (TARGET_64BIT
32086 && TARGET_SSE4_1
32087 && !(TARGET_INTER_UNIT_MOVES
32088 && one_var == 0));
32089 break;
32090 case V16QImode:
32091 case V4SImode:
32092 case V4SFmode:
32093 use_vector_set = TARGET_SSE4_1;
32094 break;
32095 case V8HImode:
32096 use_vector_set = TARGET_SSE2;
32097 break;
32098 case V4HImode:
32099 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
32100 break;
32101 case V32QImode:
32102 case V16HImode:
32103 case V8SImode:
32104 case V8SFmode:
32105 case V4DFmode:
32106 use_vector_set = TARGET_AVX;
32107 break;
32108 case V4DImode:
32109 /* Use ix86_expand_vector_set in 64bit mode only. */
32110 use_vector_set = TARGET_AVX && TARGET_64BIT;
32111 break;
32112 default:
32113 break;
32114 }
32115
32116 if (use_vector_set)
32117 {
32118 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
32119 var = force_reg (GET_MODE_INNER (mode), var);
32120 ix86_expand_vector_set (mmx_ok, target, var, one_var);
32121 return true;
32122 }
32123
32124 switch (mode)
32125 {
32126 case V2SFmode:
32127 case V2SImode:
32128 if (!mmx_ok)
32129 return false;
32130 /* FALLTHRU */
32131
32132 case V2DFmode:
32133 case V2DImode:
32134 if (one_var != 0)
32135 return false;
32136 var = force_reg (GET_MODE_INNER (mode), var);
32137 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
32138 emit_insn (gen_rtx_SET (VOIDmode, target, x));
32139 return true;
32140
32141 case V4SFmode:
32142 case V4SImode:
32143 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
32144 new_target = gen_reg_rtx (mode);
32145 else
32146 new_target = target;
32147 var = force_reg (GET_MODE_INNER (mode), var);
32148 x = gen_rtx_VEC_DUPLICATE (mode, var);
32149 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
32150 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
32151 if (one_var != 0)
32152 {
32153 /* We need to shuffle the value to the correct position, so
32154 create a new pseudo to store the intermediate result. */
32155
32156 /* With SSE2, we can use the integer shuffle insns. */
32157 if (mode != V4SFmode && TARGET_SSE2)
32158 {
32159 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
32160 const1_rtx,
32161 GEN_INT (one_var == 1 ? 0 : 1),
32162 GEN_INT (one_var == 2 ? 0 : 1),
32163 GEN_INT (one_var == 3 ? 0 : 1)));
32164 if (target != new_target)
32165 emit_move_insn (target, new_target);
32166 return true;
32167 }
32168
32169 /* Otherwise convert the intermediate result to V4SFmode and
32170 use the SSE1 shuffle instructions. */
32171 if (mode != V4SFmode)
32172 {
32173 tmp = gen_reg_rtx (V4SFmode);
32174 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
32175 }
32176 else
32177 tmp = new_target;
32178
32179 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
32180 const1_rtx,
32181 GEN_INT (one_var == 1 ? 0 : 1),
32182 GEN_INT (one_var == 2 ? 0+4 : 1+4),
32183 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
32184
32185 if (mode != V4SFmode)
32186 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
32187 else if (tmp != target)
32188 emit_move_insn (target, tmp);
32189 }
32190 else if (target != new_target)
32191 emit_move_insn (target, new_target);
32192 return true;
32193
32194 case V8HImode:
32195 case V16QImode:
32196 vsimode = V4SImode;
32197 goto widen;
32198 case V4HImode:
32199 case V8QImode:
32200 if (!mmx_ok)
32201 return false;
32202 vsimode = V2SImode;
32203 goto widen;
32204 widen:
32205 if (one_var != 0)
32206 return false;
32207
32208 /* Zero extend the variable element to SImode and recurse. */
32209 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
32210
32211 x = gen_reg_rtx (vsimode);
32212 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
32213 var, one_var))
32214 gcc_unreachable ();
32215
32216 emit_move_insn (target, gen_lowpart (mode, x));
32217 return true;
32218
32219 default:
32220 return false;
32221 }
32222 }
32223
32224 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
32225 consisting of the values in VALS. It is known that all elements
32226 except ONE_VAR are constants. Return true if successful. */
32227
32228 static bool
32229 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
32230 rtx target, rtx vals, int one_var)
32231 {
32232 rtx var = XVECEXP (vals, 0, one_var);
32233 enum machine_mode wmode;
32234 rtx const_vec, x;
32235
32236 const_vec = copy_rtx (vals);
32237 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
32238 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
32239
32240 switch (mode)
32241 {
32242 case V2DFmode:
32243 case V2DImode:
32244 case V2SFmode:
32245 case V2SImode:
32246 /* For the two element vectors, it's just as easy to use
32247 the general case. */
32248 return false;
32249
32250 case V4DImode:
32251 /* Use ix86_expand_vector_set in 64bit mode only. */
32252 if (!TARGET_64BIT)
32253 return false;
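	/* FALLTHRU */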
32254 case V4DFmode:
32255 case V8SFmode:
32256 case V8SImode:
32257 case V16HImode:
32258 case V32QImode:
32259 case V4SFmode:
32260 case V4SImode:
32261 case V8HImode:
32262 case V4HImode:
32263 break;
32264
32265 case V16QImode:
32266 if (TARGET_SSE4_1)
32267 break;
32268 wmode = V8HImode;
32269 goto widen;
32270 case V8QImode:
32271 wmode = V4HImode;
32272 goto widen;
32273 widen:
32274 /* There's no way to set one QImode entry easily. Combine
32275 the variable value with its adjacent constant value, and
32276 promote to an HImode set. */
32277 x = XVECEXP (vals, 0, one_var ^ 1);
32278 if (one_var & 1)
32279 {
32280 var = convert_modes (HImode, QImode, var, true);
32281 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
32282 NULL_RTX, 1, OPTAB_LIB_WIDEN);
32283 x = GEN_INT (INTVAL (x) & 0xff);
32284 }
32285 else
32286 {
32287 var = convert_modes (HImode, QImode, var, true);
32288 x = gen_int_mode (INTVAL (x) << 8, HImode);
32289 }
32290 if (x != const0_rtx)
32291 var = expand_simple_binop (HImode, IOR, var, x, var,
32292 1, OPTAB_LIB_WIDEN);
32293
32294 x = gen_reg_rtx (wmode);
32295 emit_move_insn (x, gen_lowpart (wmode, const_vec));
32296 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
32297
32298 emit_move_insn (target, gen_lowpart (mode, x));
32299 return true;
32300
32301 default:
32302 return false;
32303 }
32304
32305 emit_move_insn (target, const_vec);
32306 ix86_expand_vector_set (mmx_ok, target, var, one_var);
32307 return true;
32308 }
32309
32310 /* A subroutine of ix86_expand_vector_init_general. Use vector
32311 concatenate to handle the most general case: all values variable,
32312 and none identical. */
32313
32314 static void
32315 ix86_expand_vector_init_concat (enum machine_mode mode,
32316 rtx target, rtx *ops, int n)
32317 {
32318 enum machine_mode cmode, hmode = VOIDmode;
32319 rtx first[8], second[4];
32320 rtvec v;
32321 int i, j;
32322
32323 switch (n)
32324 {
32325 case 2:
32326 switch (mode)
32327 {
32328 case V8SImode:
32329 cmode = V4SImode;
32330 break;
32331 case V8SFmode:
32332 cmode = V4SFmode;
32333 break;
32334 case V4DImode:
32335 cmode = V2DImode;
32336 break;
32337 case V4DFmode:
32338 cmode = V2DFmode;
32339 break;
32340 case V4SImode:
32341 cmode = V2SImode;
32342 break;
32343 case V4SFmode:
32344 cmode = V2SFmode;
32345 break;
32346 case V2DImode:
32347 cmode = DImode;
32348 break;
32349 case V2SImode:
32350 cmode = SImode;
32351 break;
32352 case V2DFmode:
32353 cmode = DFmode;
32354 break;
32355 case V2SFmode:
32356 cmode = SFmode;
32357 break;
32358 default:
32359 gcc_unreachable ();
32360 }
32361
32362 if (!register_operand (ops[1], cmode))
32363 ops[1] = force_reg (cmode, ops[1]);
32364 if (!register_operand (ops[0], cmode))
32365 ops[0] = force_reg (cmode, ops[0]);
32366 emit_insn (gen_rtx_SET (VOIDmode, target,
32367 gen_rtx_VEC_CONCAT (mode, ops[0],
32368 ops[1])));
32369 break;
32370
32371 case 4:
32372 switch (mode)
32373 {
32374 case V4DImode:
32375 cmode = V2DImode;
32376 break;
32377 case V4DFmode:
32378 cmode = V2DFmode;
32379 break;
32380 case V4SImode:
32381 cmode = V2SImode;
32382 break;
32383 case V4SFmode:
32384 cmode = V2SFmode;
32385 break;
32386 default:
32387 gcc_unreachable ();
32388 }
32389 goto half;
32390
32391 case 8:
32392 switch (mode)
32393 {
32394 case V8SImode:
32395 cmode = V2SImode;
32396 hmode = V4SImode;
32397 break;
32398 case V8SFmode:
32399 cmode = V2SFmode;
32400 hmode = V4SFmode;
32401 break;
32402 default:
32403 gcc_unreachable ();
32404 }
32405 goto half;
32406
32407 half:
32408 /* FIXME: We process inputs backward to help RA. PR 36222. */
32409 i = n - 1;
32410 j = (n >> 1) - 1;
32411 for (; i > 0; i -= 2, j--)
32412 {
32413 first[j] = gen_reg_rtx (cmode);
32414 v = gen_rtvec (2, ops[i - 1], ops[i]);
32415 ix86_expand_vector_init (false, first[j],
32416 gen_rtx_PARALLEL (cmode, v));
32417 }
32418
32419 n >>= 1;
32420 if (n > 2)
32421 {
32422 gcc_assert (hmode != VOIDmode);
32423 for (i = j = 0; i < n; i += 2, j++)
32424 {
32425 second[j] = gen_reg_rtx (hmode);
32426 ix86_expand_vector_init_concat (hmode, second [j],
32427 &first [i], 2);
32428 }
32429 n >>= 1;
32430 ix86_expand_vector_init_concat (mode, target, second, n);
32431 }
32432 else
32433 ix86_expand_vector_init_concat (mode, target, first, n);
32434 break;
32435
32436 default:
32437 gcc_unreachable ();
32438 }
32439 }
32440
32441 /* A subroutine of ix86_expand_vector_init_general. Use vector
32442 interleave to handle the most general case: all values variable,
32443 and none identical. */
32444
32445 static void
32446 ix86_expand_vector_init_interleave (enum machine_mode mode,
32447 rtx target, rtx *ops, int n)
32448 {
32449 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
32450 int i, j;
32451 rtx op0, op1;
32452 rtx (*gen_load_even) (rtx, rtx, rtx);
32453 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
32454 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
32455
32456 switch (mode)
32457 {
32458 case V8HImode:
32459 gen_load_even = gen_vec_setv8hi;
32460 gen_interleave_first_low = gen_vec_interleave_lowv4si;
32461 gen_interleave_second_low = gen_vec_interleave_lowv2di;
32462 inner_mode = HImode;
32463 first_imode = V4SImode;
32464 second_imode = V2DImode;
32465 third_imode = VOIDmode;
32466 break;
32467 case V16QImode:
32468 gen_load_even = gen_vec_setv16qi;
32469 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
32470 gen_interleave_second_low = gen_vec_interleave_lowv4si;
32471 inner_mode = QImode;
32472 first_imode = V8HImode;
32473 second_imode = V4SImode;
32474 third_imode = V2DImode;
32475 break;
32476 default:
32477 gcc_unreachable ();
32478 }
32479
32480 for (i = 0; i < n; i++)
32481 {
32482 /* Extend the odd element to SImode using a paradoxical SUBREG. */
32483 op0 = gen_reg_rtx (SImode);
32484 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
32485
32486 /* Insert the SImode value as the low element of a V4SImode vector. */
32487 op1 = gen_reg_rtx (V4SImode);
32488 op0 = gen_rtx_VEC_MERGE (V4SImode,
32489 gen_rtx_VEC_DUPLICATE (V4SImode,
32490 op0),
32491 CONST0_RTX (V4SImode),
32492 const1_rtx);
32493 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
32494
32495 /* Cast the V4SImode vector back to a vector in the original mode. */
32496 op0 = gen_reg_rtx (mode);
32497 emit_move_insn (op0, gen_lowpart (mode, op1));
32498
32499 /* Load even elements into the second position. */
32500 emit_insn (gen_load_even (op0,
32501 force_reg (inner_mode,
32502 ops [i + i + 1]),
32503 const1_rtx));
32504
32505 /* Cast vector to FIRST_IMODE vector. */
32506 ops[i] = gen_reg_rtx (first_imode);
32507 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
32508 }
32509
32510 /* Interleave low FIRST_IMODE vectors. */
32511 for (i = j = 0; i < n; i += 2, j++)
32512 {
32513 op0 = gen_reg_rtx (first_imode);
32514 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
32515
32516 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
32517 ops[j] = gen_reg_rtx (second_imode);
32518 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
32519 }
32520
32521 /* Interleave low SECOND_IMODE vectors. */
32522 switch (second_imode)
32523 {
32524 case V4SImode:
32525 for (i = j = 0; i < n / 2; i += 2, j++)
32526 {
32527 op0 = gen_reg_rtx (second_imode);
32528 emit_insn (gen_interleave_second_low (op0, ops[i],
32529 ops[i + 1]));
32530
32531 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
32532 vector. */
32533 ops[j] = gen_reg_rtx (third_imode);
32534 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
32535 }
32536 second_imode = V2DImode;
32537 gen_interleave_second_low = gen_vec_interleave_lowv2di;
32538 /* FALLTHRU */
32539
32540 case V2DImode:
32541 op0 = gen_reg_rtx (second_imode);
32542 emit_insn (gen_interleave_second_low (op0, ops[0],
32543 ops[1]));
32544
32545 /* Cast the SECOND_IMODE vector back to a vector in the original
32546 mode. */
32547 emit_insn (gen_rtx_SET (VOIDmode, target,
32548 gen_lowpart (mode, op0)));
32549 break;
32550
32551 default:
32552 gcc_unreachable ();
32553 }
32554 }
32555
32556 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
32557 all values variable, and none identical. */
32558
32559 static void
32560 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
32561 rtx target, rtx vals)
32562 {
32563 rtx ops[32], op0, op1;
32564 enum machine_mode half_mode = VOIDmode;
32565 int n, i;
32566
32567 switch (mode)
32568 {
32569 case V2SFmode:
32570 case V2SImode:
32571 if (!mmx_ok && !TARGET_SSE)
32572 break;
32573 /* FALLTHRU */
32574
32575 case V8SFmode:
32576 case V8SImode:
32577 case V4DFmode:
32578 case V4DImode:
32579 case V4SFmode:
32580 case V4SImode:
32581 case V2DFmode:
32582 case V2DImode:
32583 n = GET_MODE_NUNITS (mode);
32584 for (i = 0; i < n; i++)
32585 ops[i] = XVECEXP (vals, 0, i);
32586 ix86_expand_vector_init_concat (mode, target, ops, n);
32587 return;
32588
32589 case V32QImode:
32590 half_mode = V16QImode;
32591 goto half;
32592
32593 case V16HImode:
32594 half_mode = V8HImode;
32595 goto half;
32596
32597 half:
32598 n = GET_MODE_NUNITS (mode);
32599 for (i = 0; i < n; i++)
32600 ops[i] = XVECEXP (vals, 0, i);
32601 op0 = gen_reg_rtx (half_mode);
32602 op1 = gen_reg_rtx (half_mode);
32603 ix86_expand_vector_init_interleave (half_mode, op0, ops,
32604 n >> 2);
32605 ix86_expand_vector_init_interleave (half_mode, op1,
32606 &ops [n >> 1], n >> 2);
32607 emit_insn (gen_rtx_SET (VOIDmode, target,
32608 gen_rtx_VEC_CONCAT (mode, op0, op1)));
32609 return;
32610
32611 case V16QImode:
32612 if (!TARGET_SSE4_1)
32613 break;
32614 /* FALLTHRU */
32615
32616 case V8HImode:
32617 if (!TARGET_SSE2)
32618 break;
32619
32620 /* Don't use ix86_expand_vector_init_interleave if we can't
32621 move from GPR to SSE register directly. */
32622 if (!TARGET_INTER_UNIT_MOVES)
32623 break;
32624
32625 n = GET_MODE_NUNITS (mode);
32626 for (i = 0; i < n; i++)
32627 ops[i] = XVECEXP (vals, 0, i);
32628 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
32629 return;
32630
32631 case V4HImode:
32632 case V8QImode:
32633 break;
32634
32635 default:
32636 gcc_unreachable ();
32637 }
32638
32639 {
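	/* Fall back to building the vector in integer registers: pack the
	   elements into word_mode chunks with shifts and ORs, then move the
	   words into the vector register.  */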
32640 int i, j, n_elts, n_words, n_elt_per_word;
32641 enum machine_mode inner_mode;
32642 rtx words[4], shift;
32643
32644 inner_mode = GET_MODE_INNER (mode);
32645 n_elts = GET_MODE_NUNITS (mode);
32646 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
32647 n_elt_per_word = n_elts / n_words;
32648 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
32649
32650 for (i = 0; i < n_words; ++i)
32651 {
32652 rtx word = NULL_RTX;
32653
32654 for (j = 0; j < n_elt_per_word; ++j)
32655 {
32656 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
32657 elt = convert_modes (word_mode, inner_mode, elt, true);
32658
32659 if (j == 0)
32660 word = elt;
32661 else
32662 {
32663 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
32664 word, 1, OPTAB_LIB_WIDEN);
32665 word = expand_simple_binop (word_mode, IOR, word, elt,
32666 word, 1, OPTAB_LIB_WIDEN);
32667 }
32668 }
32669
32670 words[i] = word;
32671 }
32672
32673 if (n_words == 1)
32674 emit_move_insn (target, gen_lowpart (mode, words[0]));
32675 else if (n_words == 2)
32676 {
32677 rtx tmp = gen_reg_rtx (mode);
32678 emit_clobber (tmp);
32679 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
32680 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
32681 emit_move_insn (target, tmp);
32682 }
32683 else if (n_words == 4)
32684 {
32685 rtx tmp = gen_reg_rtx (V4SImode);
32686 gcc_assert (word_mode == SImode);
32687 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
32688 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
32689 emit_move_insn (target, gen_lowpart (mode, tmp));
32690 }
32691 else
32692 gcc_unreachable ();
32693 }
32694 }
32695
32696 /* Initialize vector TARGET via VALS. Suppress the use of MMX
32697 instructions unless MMX_OK is true. */
32698
32699 void
32700 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
32701 {
32702 enum machine_mode mode = GET_MODE (target);
32703 enum machine_mode inner_mode = GET_MODE_INNER (mode);
32704 int n_elts = GET_MODE_NUNITS (mode);
32705 int n_var = 0, one_var = -1;
32706 bool all_same = true, all_const_zero = true;
32707 int i;
32708 rtx x;
32709
32710 for (i = 0; i < n_elts; ++i)
32711 {
32712 x = XVECEXP (vals, 0, i);
32713 if (!(CONST_INT_P (x)
32714 || GET_CODE (x) == CONST_DOUBLE
32715 || GET_CODE (x) == CONST_FIXED))
32716 n_var++, one_var = i;
32717 else if (x != CONST0_RTX (inner_mode))
32718 all_const_zero = false;
32719 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
32720 all_same = false;
32721 }
32722
32723 /* Constants are best loaded from the constant pool. */
32724 if (n_var == 0)
32725 {
32726 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
32727 return;
32728 }
32729
32730 /* If all values are identical, broadcast the value. */
32731 if (all_same
32732 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
32733 XVECEXP (vals, 0, 0)))
32734 return;
32735
32736 /* Values where only one field is non-constant are best loaded from
32737 the pool and overwritten via move later. */
32738 if (n_var == 1)
32739 {
32740 if (all_const_zero
32741 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
32742 XVECEXP (vals, 0, one_var),
32743 one_var))
32744 return;
32745
32746 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
32747 return;
32748 }
32749
32750 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
32751 }
32752
32753 void
32754 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
32755 {
32756 enum machine_mode mode = GET_MODE (target);
32757 enum machine_mode inner_mode = GET_MODE_INNER (mode);
32758 enum machine_mode half_mode;
32759 bool use_vec_merge = false;
32760 rtx tmp;
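  /* Tables of the lo/hi 128-bit half extract and insert patterns for the
     256-bit vector modes, indexed first by mode (see the `half' cases
     below) and then by which half is addressed.  */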
32761 static rtx (*gen_extract[6][2]) (rtx, rtx)
32762 = {
32763 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
32764 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
32765 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
32766 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
32767 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
32768 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
32769 };
32770 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
32771 = {
32772 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
32773 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
32774 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
32775 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
32776 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
32777 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
32778 };
32779 int i, j, n;
32780
32781 switch (mode)
32782 {
32783 case V2SFmode:
32784 case V2SImode:
32785 if (mmx_ok)
32786 {
32787 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
32788 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
32789 if (elt == 0)
32790 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
32791 else
32792 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
32793 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
32794 return;
32795 }
32796 break;
32797
32798 case V2DImode:
32799 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
32800 if (use_vec_merge)
32801 break;
32802
32803 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
32804 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
32805 if (elt == 0)
32806 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
32807 else
32808 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
32809 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
32810 return;
32811
32812 case V2DFmode:
32813 {
32814 rtx op0, op1;
32815
32816 /* For the two element vectors, we implement a VEC_CONCAT with
32817 the extraction of the other element. */
32818
32819 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
32820 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
32821
32822 if (elt == 0)
32823 op0 = val, op1 = tmp;
32824 else
32825 op0 = tmp, op1 = val;
32826
32827 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
32828 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
32829 }
32830 return;
32831
32832 case V4SFmode:
32833 use_vec_merge = TARGET_SSE4_1;
32834 if (use_vec_merge)
32835 break;
32836
32837 switch (elt)
32838 {
32839 case 0:
32840 use_vec_merge = true;
32841 break;
32842
32843 case 1:
32844 /* tmp = target = A B C D */
32845 tmp = copy_to_reg (target);
32846 /* target = A A B B */
32847 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
32848 /* target = X A B B */
32849 ix86_expand_vector_set (false, target, val, 0);
32850 /* target = A X C D */
32851 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
32852 const1_rtx, const0_rtx,
32853 GEN_INT (2+4), GEN_INT (3+4)));
32854 return;
32855
32856 case 2:
32857 /* tmp = target = A B C D */
32858 tmp = copy_to_reg (target);
32859 /* tmp = X B C D */
32860 ix86_expand_vector_set (false, tmp, val, 0);
32861 /* target = A B X D */
32862 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
32863 const0_rtx, const1_rtx,
32864 GEN_INT (0+4), GEN_INT (3+4)));
32865 return;
32866
32867 case 3:
32868 /* tmp = target = A B C D */
32869 tmp = copy_to_reg (target);
32870 /* tmp = X B C D */
32871 ix86_expand_vector_set (false, tmp, val, 0);
32872 /* target = A B C X */
32873 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
32874 const0_rtx, const1_rtx,
32875 GEN_INT (2+4), GEN_INT (0+4)));
32876 return;
32877
32878 default:
32879 gcc_unreachable ();
32880 }
32881 break;
32882
32883 case V4SImode:
32884 use_vec_merge = TARGET_SSE4_1;
32885 if (use_vec_merge)
32886 break;
32887
32888 /* Element 0 handled by vec_merge below. */
32889 if (elt == 0)
32890 {
32891 use_vec_merge = true;
32892 break;
32893 }
32894
32895 if (TARGET_SSE2)
32896 {
32897 /* With SSE2, use integer shuffles to swap element 0 and ELT,
32898 store into element 0, then shuffle them back. */
32899
32900 rtx order[4];
32901
32902 order[0] = GEN_INT (elt);
32903 order[1] = const1_rtx;
32904 order[2] = const2_rtx;
32905 order[3] = GEN_INT (3);
32906 order[elt] = const0_rtx;
32907
32908 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
32909 order[1], order[2], order[3]));
32910
32911 ix86_expand_vector_set (false, target, val, 0);
32912
32913 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
32914 order[1], order[2], order[3]));
32915 }
32916 else
32917 {
32918 /* For SSE1, we have to reuse the V4SF code. */
32919 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
32920 gen_lowpart (SFmode, val), elt);
32921 }
32922 return;
32923
32924 case V8HImode:
32925 use_vec_merge = TARGET_SSE2;
32926 break;
32927 case V4HImode:
32928 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
32929 break;
32930
32931 case V16QImode:
32932 use_vec_merge = TARGET_SSE4_1;
32933 break;
32934
32935 case V8QImode:
32936 break;
32937
32938 case V32QImode:
32939 half_mode = V16QImode;
32940 j = 0;
32941 n = 16;
32942 goto half;
32943
32944 case V16HImode:
32945 half_mode = V8HImode;
32946 j = 1;
32947 n = 8;
32948 goto half;
32949
32950 case V8SImode:
32951 half_mode = V4SImode;
32952 j = 2;
32953 n = 4;
32954 goto half;
32955
32956 case V4DImode:
32957 half_mode = V2DImode;
32958 j = 3;
32959 n = 2;
32960 goto half;
32961
32962 case V8SFmode:
32963 half_mode = V4SFmode;
32964 j = 4;
32965 n = 4;
32966 goto half;
32967
32968 case V4DFmode:
32969 half_mode = V2DFmode;
32970 j = 5;
32971 n = 2;
32972 goto half;
32973
32974 half:
32975 /* Compute offset. */
32976 i = elt / n;
32977 elt %= n;
32978
32979 gcc_assert (i <= 1);
32980
32981 /* Extract the half. */
32982 tmp = gen_reg_rtx (half_mode);
32983 emit_insn (gen_extract[j][i] (tmp, target));
32984
32985 /* Put val in tmp at elt. */
32986 ix86_expand_vector_set (false, tmp, val, elt);
32987
32988 /* Put it back. */
32989 emit_insn (gen_insert[j][i] (target, target, tmp));
32990 return;
32991
32992 default:
32993 break;
32994 }
32995
32996 if (use_vec_merge)
32997 {
32998 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
32999 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
33000 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33001 }
33002 else
33003 {
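      /* No suitable insn pattern: go through memory.  Spill the vector to a
	 stack slot, store VAL into the selected element and reload the
	 whole vector.  */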
33004 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
33005
33006 emit_move_insn (mem, target);
33007
33008 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
33009 emit_move_insn (tmp, val);
33010
33011 emit_move_insn (target, mem);
33012 }
33013 }
33014
33015 void
33016 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
33017 {
33018 enum machine_mode mode = GET_MODE (vec);
33019 enum machine_mode inner_mode = GET_MODE_INNER (mode);
33020 bool use_vec_extr = false;
33021 rtx tmp;
33022
33023 switch (mode)
33024 {
33025 case V2SImode:
33026 case V2SFmode:
33027 if (!mmx_ok)
33028 break;
33029 /* FALLTHRU */
33030
33031 case V2DFmode:
33032 case V2DImode:
33033 use_vec_extr = true;
33034 break;
33035
33036 case V4SFmode:
33037 use_vec_extr = TARGET_SSE4_1;
33038 if (use_vec_extr)
33039 break;
33040
33041 switch (elt)
33042 {
33043 case 0:
33044 tmp = vec;
33045 break;
33046
33047 case 1:
33048 case 3:
33049 tmp = gen_reg_rtx (mode);
33050 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
33051 GEN_INT (elt), GEN_INT (elt),
33052 GEN_INT (elt+4), GEN_INT (elt+4)));
33053 break;
33054
33055 case 2:
33056 tmp = gen_reg_rtx (mode);
33057 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
33058 break;
33059
33060 default:
33061 gcc_unreachable ();
33062 }
33063 vec = tmp;
33064 use_vec_extr = true;
33065 elt = 0;
33066 break;
33067
33068 case V4SImode:
33069 use_vec_extr = TARGET_SSE4_1;
33070 if (use_vec_extr)
33071 break;
33072
33073 if (TARGET_SSE2)
33074 {
33075 switch (elt)
33076 {
33077 case 0:
33078 tmp = vec;
33079 break;
33080
33081 case 1:
33082 case 3:
33083 tmp = gen_reg_rtx (mode);
33084 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
33085 GEN_INT (elt), GEN_INT (elt),
33086 GEN_INT (elt), GEN_INT (elt)));
33087 break;
33088
33089 case 2:
33090 tmp = gen_reg_rtx (mode);
33091 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
33092 break;
33093
33094 default:
33095 gcc_unreachable ();
33096 }
33097 vec = tmp;
33098 use_vec_extr = true;
33099 elt = 0;
33100 }
33101 else
33102 {
33103 /* For SSE1, we have to reuse the V4SF code. */
33104 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
33105 gen_lowpart (V4SFmode, vec), elt);
33106 return;
33107 }
33108 break;
33109
33110 case V8HImode:
33111 use_vec_extr = TARGET_SSE2;
33112 break;
33113 case V4HImode:
33114 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
33115 break;
33116
33117 case V16QImode:
33118 use_vec_extr = TARGET_SSE4_1;
33119 break;
33120
33121 case V8SFmode:
33122 if (TARGET_AVX)
33123 {
33124 tmp = gen_reg_rtx (V4SFmode);
33125 if (elt < 4)
33126 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
33127 else
33128 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
33129 ix86_expand_vector_extract (false, target, tmp, elt & 3);
33130 return;
33131 }
33132 break;
33133
33134 case V4DFmode:
33135 if (TARGET_AVX)
33136 {
33137 tmp = gen_reg_rtx (V2DFmode);
33138 if (elt < 2)
33139 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
33140 else
33141 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
33142 ix86_expand_vector_extract (false, target, tmp, elt & 1);
33143 return;
33144 }
33145 break;
33146
33147 case V32QImode:
33148 if (TARGET_AVX)
33149 {
33150 tmp = gen_reg_rtx (V16QImode);
33151 if (elt < 16)
33152 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
33153 else
33154 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
33155 ix86_expand_vector_extract (false, target, tmp, elt & 15);
33156 return;
33157 }
33158 break;
33159
33160 case V16HImode:
33161 if (TARGET_AVX)
33162 {
33163 tmp = gen_reg_rtx (V8HImode);
33164 if (elt < 8)
33165 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
33166 else
33167 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
33168 ix86_expand_vector_extract (false, target, tmp, elt & 7);
33169 return;
33170 }
33171 break;
33172
33173 case V8SImode:
33174 if (TARGET_AVX)
33175 {
33176 tmp = gen_reg_rtx (V4SImode);
33177 if (elt < 4)
33178 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
33179 else
33180 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
33181 ix86_expand_vector_extract (false, target, tmp, elt & 3);
33182 return;
33183 }
33184 break;
33185
33186 case V4DImode:
33187 if (TARGET_AVX)
33188 {
33189 tmp = gen_reg_rtx (V2DImode);
33190 if (elt < 2)
33191 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
33192 else
33193 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
33194 ix86_expand_vector_extract (false, target, tmp, elt & 1);
33195 return;
33196 }
33197 break;
33198
33199 case V8QImode:
33200 /* ??? Could extract the appropriate HImode element and shift. */
33201 default:
33202 break;
33203 }
33204
33205 if (use_vec_extr)
33206 {
33207 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
33208 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
33209
33210 /* Let the rtl optimizers know about the zero extension performed. */
33211 if (inner_mode == QImode || inner_mode == HImode)
33212 {
33213 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
33214 target = gen_lowpart (SImode, target);
33215 }
33216
33217 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33218 }
33219 else
33220 {
33221 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
33222
33223 emit_move_insn (mem, vec);
33224
33225 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
33226 emit_move_insn (target, tmp);
33227 }
33228 }
33229
33230 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
33231 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
33232 The upper bits of DEST are undefined, though they shouldn't cause
33233 exceptions (some bits from src or all zeros are ok). */
33234
33235 static void
33236 emit_reduc_half (rtx dest, rtx src, int i)
33237 {
33238 rtx tem;
33239 switch (GET_MODE (src))
33240 {
33241 case V4SFmode:
33242 if (i == 128)
33243 tem = gen_sse_movhlps (dest, src, src);
33244 else
33245 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
33246 GEN_INT (1 + 4), GEN_INT (1 + 4));
33247 break;
33248 case V2DFmode:
33249 tem = gen_vec_interleave_highv2df (dest, src, src);
33250 break;
33251 case V16QImode:
33252 case V8HImode:
33253 case V4SImode:
33254 case V2DImode:
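      /* Shift the whole 128-bit vector right by I/2 bits so the upper half
	 of the live data moves into the lower half.  */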
33255 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
33256 gen_lowpart (V1TImode, src),
33257 GEN_INT (i / 2));
33258 break;
33259 case V8SFmode:
33260 if (i == 256)
33261 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
33262 else
33263 tem = gen_avx_shufps256 (dest, src, src,
33264 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
33265 break;
33266 case V4DFmode:
33267 if (i == 256)
33268 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
33269 else
33270 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
33271 break;
33272 case V32QImode:
33273 case V16HImode:
33274 case V8SImode:
33275 case V4DImode:
33276 if (i == 256)
33277 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
33278 gen_lowpart (V4DImode, src),
33279 gen_lowpart (V4DImode, src),
33280 const1_rtx);
33281 else
33282 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
33283 gen_lowpart (V2TImode, src),
33284 GEN_INT (i / 2));
33285 break;
33286 default:
33287 gcc_unreachable ();
33288 }
33289 emit_insn (tem);
33290 }
33291
33292 /* Expand a vector reduction. FN is the binary pattern to reduce;
33293 DEST is the destination; IN is the input vector. */
33294
33295 void
33296 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
33297 {
33298 rtx half, dst, vec = in;
33299 enum machine_mode mode = GET_MODE (in);
33300 int i;
33301
33302 /* SSE4.1 has a special instruction for V8HImode UMIN reduction. */
33303 if (TARGET_SSE4_1
33304 && mode == V8HImode
33305 && fn == gen_uminv8hi3)
33306 {
33307 emit_insn (gen_sse4_1_phminposuw (dest, in));
33308 return;
33309 }
33310
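  /* Repeatedly fold the upper half of the remaining elements onto the
     lower half, halving the width each step; the last application of FN
     leaves the reduced value in element 0 of DEST.  */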
33311 for (i = GET_MODE_BITSIZE (mode);
33312 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
33313 i >>= 1)
33314 {
33315 half = gen_reg_rtx (mode);
33316 emit_reduc_half (half, vec, i);
33317 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
33318 dst = dest;
33319 else
33320 dst = gen_reg_rtx (mode);
33321 emit_insn (fn (dst, half, vec));
33322 vec = dst;
33323 }
33324 }
33325 \f
33326 /* Target hook for scalar_mode_supported_p. */
33327 static bool
33328 ix86_scalar_mode_supported_p (enum machine_mode mode)
33329 {
33330 if (DECIMAL_FLOAT_MODE_P (mode))
33331 return default_decimal_float_supported_p ();
33332 else if (mode == TFmode)
33333 return true;
33334 else
33335 return default_scalar_mode_supported_p (mode);
33336 }
33337
33338 /* Implements target hook vector_mode_supported_p. */
33339 static bool
33340 ix86_vector_mode_supported_p (enum machine_mode mode)
33341 {
33342 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
33343 return true;
33344 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
33345 return true;
33346 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
33347 return true;
33348 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
33349 return true;
33350 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
33351 return true;
33352 return false;
33353 }
33354
33355 /* Target hook for c_mode_for_suffix. */
33356 static enum machine_mode
33357 ix86_c_mode_for_suffix (char suffix)
33358 {
33359 if (suffix == 'q')
33360 return TFmode;
33361 if (suffix == 'w')
33362 return XFmode;
33363
33364 return VOIDmode;
33365 }
33366
33367 /* Worker function for TARGET_MD_ASM_CLOBBERS.
33368
33369 We do this in the new i386 backend to maintain source compatibility
33370 with the old cc0-based compiler. */
33371
33372 static tree
33373 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
33374 tree inputs ATTRIBUTE_UNUSED,
33375 tree clobbers)
33376 {
33377 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
33378 clobbers);
33379 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
33380 clobbers);
33381 return clobbers;
33382 }
33383
33384 /* Implements target vector targetm.asm.encode_section_info. */
33385
33386 static void ATTRIBUTE_UNUSED
33387 ix86_encode_section_info (tree decl, rtx rtl, int first)
33388 {
33389 default_encode_section_info (decl, rtl, first);
33390
33391 if (TREE_CODE (decl) == VAR_DECL
33392 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
33393 && ix86_in_large_data_p (decl))
33394 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
33395 }
33396
33397 /* Worker function for REVERSE_CONDITION. */
33398
33399 enum rtx_code
33400 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
33401 {
33402 return (mode != CCFPmode && mode != CCFPUmode
33403 ? reverse_condition (code)
33404 : reverse_condition_maybe_unordered (code));
33405 }
33406
33407 /* Output code to perform an x87 FP register move, from OPERANDS[1]
33408 to OPERANDS[0]. */
33409
33410 const char *
33411 output_387_reg_move (rtx insn, rtx *operands)
33412 {
33413 if (REG_P (operands[0]))
33414 {
33415 if (REG_P (operands[1])
33416 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
33417 {
33418 if (REGNO (operands[0]) == FIRST_STACK_REG)
33419 return output_387_ffreep (operands, 0);
33420 return "fstp\t%y0";
33421 }
33422 if (STACK_TOP_P (operands[0]))
33423 return "fld%Z1\t%y1";
33424 return "fst\t%y0";
33425 }
33426 else if (MEM_P (operands[0]))
33427 {
33428 gcc_assert (REG_P (operands[1]));
33429 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
33430 return "fstp%Z0\t%y0";
33431 else
33432 {
33433 /* There is no non-popping store to memory for XFmode.
33434 So if we need one, follow the store with a load. */
33435 if (GET_MODE (operands[0]) == XFmode)
33436 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
33437 else
33438 return "fst%Z0\t%y0";
33439 }
33440 }
33441 else
33442 gcc_unreachable();
33443 }
33444
33445 /* Output code to perform a conditional jump to LABEL, if C2 flag in
33446 FP status register is set. */
33447
33448 void
33449 ix86_emit_fp_unordered_jump (rtx label)
33450 {
33451 rtx reg = gen_reg_rtx (HImode);
33452 rtx temp;
33453
33454 emit_insn (gen_x86_fnstsw_1 (reg));
33455
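/* Two ways to test the C2 bit of the saved FP status word: if SAHF is
available and considered profitable, load the status byte into EFLAGS
and branch on the UNORDERED condition; otherwise TEST bit 0x04 (C2)
of the high status byte directly. */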
33456 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
33457 {
33458 emit_insn (gen_x86_sahf_1 (reg));
33459
33460 temp = gen_rtx_REG (CCmode, FLAGS_REG);
33461 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
33462 }
33463 else
33464 {
33465 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
33466
33467 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
33468 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
33469 }
33470
33471 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
33472 gen_rtx_LABEL_REF (VOIDmode, label),
33473 pc_rtx);
33474 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
33475
33476 emit_jump_insn (temp);
33477 predict_jump (REG_BR_PROB_BASE * 10 / 100);
33478 }
33479
33480 /* Output code to perform a log1p XFmode calculation. */
33481
33482 void ix86_emit_i387_log1p (rtx op0, rtx op1)
33483 {
33484 rtx label1 = gen_label_rtx ();
33485 rtx label2 = gen_label_rtx ();
33486
33487 rtx tmp = gen_reg_rtx (XFmode);
33488 rtx tmp2 = gen_reg_rtx (XFmode);
33489 rtx test;
33490
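/* fyl2xp1 is only specified for |x| < 1 - sqrt(2)/2 (~0.2928932...),
so |op1| is compared against that threshold: small values use fyl2xp1
directly, larger ones branch to label1 and apply fyl2x to the explicit
sum 1 + op1. In both cases y = ln(2) (fldln2) turns the log2-based
x87 result into a natural logarithm. */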
33491 emit_insn (gen_absxf2 (tmp, op1));
33492 test = gen_rtx_GE (VOIDmode, tmp,
33493 CONST_DOUBLE_FROM_REAL_VALUE (
33494 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
33495 XFmode));
33496 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
33497
33498 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
33499 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
33500 emit_jump (label2);
33501
33502 emit_label (label1);
33503 emit_move_insn (tmp, CONST1_RTX (XFmode));
33504 emit_insn (gen_addxf3 (tmp, op1, tmp));
33505 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
33506 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
33507
33508 emit_label (label2);
33509 }
33510
33511 /* Emit i387 code to round OP1 to the nearest integer, with halfway cases rounded away from zero, storing the result in OP0. */
33512 void ix86_emit_i387_round (rtx op0, rtx op1)
33513 {
33514 enum machine_mode inmode = GET_MODE (op1);
33515 enum machine_mode outmode = GET_MODE (op0);
33516 rtx e1, e2, res, tmp, tmp1, half;
33517 rtx scratch = gen_reg_rtx (HImode);
33518 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
33519 rtx jump_label = gen_label_rtx ();
33520 rtx insn;
33521 rtx (*gen_abs) (rtx, rtx);
33522 rtx (*gen_neg) (rtx, rtx);
33523
33524 switch (inmode)
33525 {
33526 case SFmode:
33527 gen_abs = gen_abssf2;
33528 break;
33529 case DFmode:
33530 gen_abs = gen_absdf2;
33531 break;
33532 case XFmode:
33533 gen_abs = gen_absxf2;
33534 break;
33535 default:
33536 gcc_unreachable ();
33537 }
33538
33539 switch (outmode)
33540 {
33541 case SFmode:
33542 gen_neg = gen_negsf2;
33543 break;
33544 case DFmode:
33545 gen_neg = gen_negdf2;
33546 break;
33547 case XFmode:
33548 gen_neg = gen_negxf2;
33549 break;
33550 case HImode:
33551 gen_neg = gen_neghi2;
33552 break;
33553 case SImode:
33554 gen_neg = gen_negsi2;
33555 break;
33556 case DImode:
33557 gen_neg = gen_negdi2;
33558 break;
33559 default:
33560 gcc_unreachable ();
33561 }
33562
33563 e1 = gen_reg_rtx (inmode);
33564 e2 = gen_reg_rtx (inmode);
33565 res = gen_reg_rtx (outmode);
33566
33567 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
33568
33569 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
33570
33571 /* scratch = fxam(op1) */
33572 emit_insn (gen_rtx_SET (VOIDmode, scratch,
33573 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
33574 UNSPEC_FXAM)));
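/* The fxam pattern also stores the FPU status word into SCRATCH; fxam
sets C1 (bit 0x02 of the high status byte) to the sign of op1, which
is tested near the end to decide whether to negate RES. */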
33575 /* e1 = fabs(op1) */
33576 emit_insn (gen_abs (e1, op1));
33577
33578 /* e2 = e1 + 0.5 */
33579 half = force_reg (inmode, half);
33580 emit_insn (gen_rtx_SET (VOIDmode, e2,
33581 gen_rtx_PLUS (inmode, e1, half)));
33582
33583 /* res = floor(e2) */
33584 if (inmode != XFmode)
33585 {
33586 tmp1 = gen_reg_rtx (XFmode);
33587
33588 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
33589 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
33590 }
33591 else
33592 tmp1 = e2;
33593
33594 switch (outmode)
33595 {
33596 case SFmode:
33597 case DFmode:
33598 {
33599 rtx tmp0 = gen_reg_rtx (XFmode);
33600
33601 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
33602
33603 emit_insn (gen_rtx_SET (VOIDmode, res,
33604 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
33605 UNSPEC_TRUNC_NOOP)));
33606 }
33607 break;
33608 case XFmode:
33609 emit_insn (gen_frndintxf2_floor (res, tmp1));
33610 break;
33611 case HImode:
33612 emit_insn (gen_lfloorxfhi2 (res, tmp1));
33613 break;
33614 case SImode:
33615 emit_insn (gen_lfloorxfsi2 (res, tmp1));
33616 break;
33617 case DImode:
33618 emit_insn (gen_lfloorxfdi2 (res, tmp1));
33619 break;
33620 default:
33621 gcc_unreachable ();
33622 }
33623
33624 /* flags = signbit(a) */
33625 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
33626
33627 /* if (flags) then res = -res */
33628 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
33629 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
33630 gen_rtx_LABEL_REF (VOIDmode, jump_label),
33631 pc_rtx);
33632 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
33633 predict_jump (REG_BR_PROB_BASE * 50 / 100);
33634 JUMP_LABEL (insn) = jump_label;
33635
33636 emit_insn (gen_neg (res, res));
33637
33638 emit_label (jump_label);
33639 LABEL_NUSES (jump_label) = 1;
33640
33641 emit_move_insn (op0, res);
33642 }
33643
33644 /* Output code to perform a Newton-Raphson approximation of a single precision
33645 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
33646
33647 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
33648 {
33649 rtx x0, x1, e0, e1;
33650
33651 x0 = gen_reg_rtx (mode);
33652 e0 = gen_reg_rtx (mode);
33653 e1 = gen_reg_rtx (mode);
33654 x1 = gen_reg_rtx (mode);
33655
33656 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
33657
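/* This is one Newton-Raphson step for the reciprocal,
x1 = x0 * (2 - b * x0), applied to the hardware estimate x0: a single
step roughly squares the relative error, taking the approximately
12-bit rcp estimate close to full single precision before the final
multiply by a. */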
33658 b = force_reg (mode, b);
33659
33660 /* x0 = rcp(b) estimate */
33661 emit_insn (gen_rtx_SET (VOIDmode, x0,
33662 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
33663 UNSPEC_RCP)));
33664 /* e0 = x0 * b */
33665 emit_insn (gen_rtx_SET (VOIDmode, e0,
33666 gen_rtx_MULT (mode, x0, b)));
33667
33668 /* e0 = x0 * e0 */
33669 emit_insn (gen_rtx_SET (VOIDmode, e0,
33670 gen_rtx_MULT (mode, x0, e0)));
33671
33672 /* e1 = x0 + x0 */
33673 emit_insn (gen_rtx_SET (VOIDmode, e1,
33674 gen_rtx_PLUS (mode, x0, x0)));
33675
33676 /* x1 = e1 - e0 */
33677 emit_insn (gen_rtx_SET (VOIDmode, x1,
33678 gen_rtx_MINUS (mode, e1, e0)));
33679
33680 /* res = a * x1 */
33681 emit_insn (gen_rtx_SET (VOIDmode, res,
33682 gen_rtx_MULT (mode, a, x1)));
33683 }
33684
33685 /* Output code to perform a Newton-Raphson approximation of a
33686 single precision floating point [reciprocal] square root. */
33687
33688 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
33689 bool recip)
33690 {
33691 rtx x0, e0, e1, e2, e3, mthree, mhalf;
33692 REAL_VALUE_TYPE r;
33693
33694 x0 = gen_reg_rtx (mode);
33695 e0 = gen_reg_rtx (mode);
33696 e1 = gen_reg_rtx (mode);
33697 e2 = gen_reg_rtx (mode);
33698 e3 = gen_reg_rtx (mode);
33699
33700 real_from_integer (&r, VOIDmode, -3, -1, 0);
33701 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
33702
33703 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
33704 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
33705
33706 if (VECTOR_MODE_P (mode))
33707 {
33708 mthree = ix86_build_const_vector (mode, true, mthree);
33709 mhalf = ix86_build_const_vector (mode, true, mhalf);
33710 }
33711
33712 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
33713 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
33714
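/* One Newton-Raphson step for rsqrt, x1 = x0 * (3 - a * x0 * x0) / 2,
written with the negated constants -3 and -0.5 so that only multiplies
and adds are needed; as for rcp, the step roughly squares the relative
error of the approximately 12-bit rsqrt estimate. */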
33715 a = force_reg (mode, a);
33716
33717 /* x0 = rsqrt(a) estimate */
33718 emit_insn (gen_rtx_SET (VOIDmode, x0,
33719 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
33720 UNSPEC_RSQRT)));
33721
33722 /* If a == 0.0, mask out the infinite rsqrt estimate so that sqrt (0.0) does not produce a NaN. */
33723 if (!recip)
33724 {
33725 rtx zero, mask;
33726
33727 zero = gen_reg_rtx (mode);
33728 mask = gen_reg_rtx (mode);
33729
33730 zero = force_reg (mode, CONST0_RTX(mode));
33731 emit_insn (gen_rtx_SET (VOIDmode, mask,
33732 gen_rtx_NE (mode, zero, a)));
33733
33734 emit_insn (gen_rtx_SET (VOIDmode, x0,
33735 gen_rtx_AND (mode, x0, mask)));
33736 }
33737
33738 /* e0 = x0 * a */
33739 emit_insn (gen_rtx_SET (VOIDmode, e0,
33740 gen_rtx_MULT (mode, x0, a)));
33741 /* e1 = e0 * x0 */
33742 emit_insn (gen_rtx_SET (VOIDmode, e1,
33743 gen_rtx_MULT (mode, e0, x0)));
33744
33745 /* e2 = e1 - 3. */
33746 mthree = force_reg (mode, mthree);
33747 emit_insn (gen_rtx_SET (VOIDmode, e2,
33748 gen_rtx_PLUS (mode, e1, mthree)));
33749
33750 mhalf = force_reg (mode, mhalf);
33751 if (recip)
33752 /* e3 = -.5 * x0 */
33753 emit_insn (gen_rtx_SET (VOIDmode, e3,
33754 gen_rtx_MULT (mode, x0, mhalf)));
33755 else
33756 /* e3 = -.5 * e0 */
33757 emit_insn (gen_rtx_SET (VOIDmode, e3,
33758 gen_rtx_MULT (mode, e0, mhalf)));
33759 /* ret = e2 * e3 */
33760 emit_insn (gen_rtx_SET (VOIDmode, res,
33761 gen_rtx_MULT (mode, e2, e3)));
33762 }
33763
33764 #ifdef TARGET_SOLARIS
33765 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
33766
33767 static void
33768 i386_solaris_elf_named_section (const char *name, unsigned int flags,
33769 tree decl)
33770 {
33771 /* With Binutils 2.15, the "@unwind" marker must be specified on
33772 every occurrence of the ".eh_frame" section, not just the first
33773 one. */
33774 if (TARGET_64BIT
33775 && strcmp (name, ".eh_frame") == 0)
33776 {
33777 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
33778 flags & SECTION_WRITE ? "aw" : "a");
33779 return;
33780 }
33781
33782 #ifndef USE_GAS
33783 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
33784 {
33785 solaris_elf_asm_comdat_section (name, flags, decl);
33786 return;
33787 }
33788 #endif
33789
33790 default_elf_asm_named_section (name, flags, decl);
33791 }
33792 #endif /* TARGET_SOLARIS */
33793
33794 /* Return the mangling of TYPE if it is an extended fundamental type. */
33795
33796 static const char *
33797 ix86_mangle_type (const_tree type)
33798 {
33799 type = TYPE_MAIN_VARIANT (type);
33800
33801 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
33802 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
33803 return NULL;
33804
33805 switch (TYPE_MODE (type))
33806 {
33807 case TFmode:
33808 /* __float128 is "g". */
33809 return "g";
33810 case XFmode:
33811 /* "long double" or __float80 is "e". */
33812 return "e";
33813 default:
33814 return NULL;
33815 }
33816 }
33817
33818 /* For 32-bit code we can save PIC register setup by using
33819 the __stack_chk_fail_local hidden function instead of calling
33820 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
33821 register, so it is better to call __stack_chk_fail directly. */
33822
33823 static tree ATTRIBUTE_UNUSED
33824 ix86_stack_protect_fail (void)
33825 {
33826 return TARGET_64BIT
33827 ? default_external_stack_protect_fail ()
33828 : default_hidden_stack_protect_fail ();
33829 }
33830
33831 /* Select a format to encode pointers in exception handling data. CODE
33832 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
33833 true if the symbol may be affected by dynamic relocations.
33834
33835 ??? All x86 object file formats are capable of representing this.
33836 After all, the relocation needed is the same as for the call insn.
33837 Whether or not a particular assembler allows us to enter such, I
33838 guess we'll have to see. */
33839 int
33840 asm_preferred_eh_data_format (int code, int global)
33841 {
33842 if (flag_pic)
33843 {
33844 int type = DW_EH_PE_sdata8;
33845 if (!TARGET_64BIT
33846 || ix86_cmodel == CM_SMALL_PIC
33847 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
33848 type = DW_EH_PE_sdata4;
33849 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
33850 }
33851 if (ix86_cmodel == CM_SMALL
33852 || (ix86_cmodel == CM_MEDIUM && code))
33853 return DW_EH_PE_udata4;
33854 return DW_EH_PE_absptr;
33855 }
33856 \f
33857 /* Expand copysign from SIGN to the positive value ABS_VALUE
33858 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
33859 the sign-bit. */
33860 static void
33861 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
33862 {
33863 enum machine_mode mode = GET_MODE (sign);
33864 rtx sgn = gen_reg_rtx (mode);
33865 if (mask == NULL_RTX)
33866 {
33867 enum machine_mode vmode;
33868
33869 if (mode == SFmode)
33870 vmode = V4SFmode;
33871 else if (mode == DFmode)
33872 vmode = V2DFmode;
33873 else
33874 vmode = mode;
33875
33876 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
33877 if (!VECTOR_MODE_P (mode))
33878 {
33879 /* We need to generate a scalar mode mask in this case. */
33880 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
33881 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
33882 mask = gen_reg_rtx (mode);
33883 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
33884 }
33885 }
33886 else
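/* A caller-supplied MASK clears the sign bit (it is the mask produced
by ix86_expand_sse_fabs), so complement it here to get a mask that
extracts just the sign bit. */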
33887 mask = gen_rtx_NOT (mode, mask);
33888 emit_insn (gen_rtx_SET (VOIDmode, sgn,
33889 gen_rtx_AND (mode, mask, sign)));
33890 emit_insn (gen_rtx_SET (VOIDmode, result,
33891 gen_rtx_IOR (mode, abs_value, sgn)));
33892 }
33893
33894 /* Expand fabs (OP0) and return a new rtx that holds the result. The
33895 mask for masking out the sign-bit is stored in *SMASK, if that is
33896 non-null. */
33897 static rtx
33898 ix86_expand_sse_fabs (rtx op0, rtx *smask)
33899 {
33900 enum machine_mode vmode, mode = GET_MODE (op0);
33901 rtx xa, mask;
33902
33903 xa = gen_reg_rtx (mode);
33904 if (mode == SFmode)
33905 vmode = V4SFmode;
33906 else if (mode == DFmode)
33907 vmode = V2DFmode;
33908 else
33909 vmode = mode;
33910 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
33911 if (!VECTOR_MODE_P (mode))
33912 {
33913 /* We need to generate a scalar mode mask in this case. */
33914 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
33915 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
33916 mask = gen_reg_rtx (mode);
33917 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
33918 }
33919 emit_insn (gen_rtx_SET (VOIDmode, xa,
33920 gen_rtx_AND (mode, op0, mask)));
33921
33922 if (smask)
33923 *smask = mask;
33924
33925 return xa;
33926 }
33927
33928 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
33929 swapping the operands if SWAP_OPERANDS is true. The expanded
33930 code is a forward jump to a newly created label in case the
33931 comparison is true. The generated label rtx is returned. */
33932 static rtx
33933 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
33934 bool swap_operands)
33935 {
33936 rtx label, tmp;
33937
33938 if (swap_operands)
33939 {
33940 tmp = op0;
33941 op0 = op1;
33942 op1 = tmp;
33943 }
33944
33945 label = gen_label_rtx ();
33946 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
33947 emit_insn (gen_rtx_SET (VOIDmode, tmp,
33948 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
33949 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
33950 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
33951 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
33952 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
33953 JUMP_LABEL (tmp) = label;
33954
33955 return label;
33956 }
33957
33958 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
33959 using comparison code CODE. Operands are swapped for the comparison if
33960 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
33961 static rtx
33962 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
33963 bool swap_operands)
33964 {
33965 rtx (*insn)(rtx, rtx, rtx, rtx);
33966 enum machine_mode mode = GET_MODE (op0);
33967 rtx mask = gen_reg_rtx (mode);
33968
33969 if (swap_operands)
33970 {
33971 rtx tmp = op0;
33972 op0 = op1;
33973 op1 = tmp;
33974 }
33975
33976 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
33977
33978 emit_insn (insn (mask, op0, op1,
33979 gen_rtx_fmt_ee (code, mode, op0, op1)));
33980 return mask;
33981 }
33982
33983 /* Generate and return a rtx of mode MODE for 2**n where n is the number
33984 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
33985 static rtx
33986 ix86_gen_TWO52 (enum machine_mode mode)
33987 {
33988 REAL_VALUE_TYPE TWO52r;
33989 rtx TWO52;
33990
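/* DFmode has a 52-bit explicit mantissa and SFmode a 23-bit one, so
2**52 resp. 2**23 is the magnitude at and above which every
representable value of the mode is already an integer; the callers'
"xa + TWO52 - TWO52" trick and the isless (xa, TWO52) guards rely on
this. */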
33991 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
33992 TWO52 = const_double_from_real_value (TWO52r, mode);
33993 TWO52 = force_reg (mode, TWO52);
33994
33995 return TWO52;
33996 }
33997
33998 /* Expand SSE sequence for computing lround from OP1 storing
33999 into OP0. */
34000 void
34001 ix86_expand_lround (rtx op0, rtx op1)
34002 {
34003 /* C code for the stuff we're doing below:
34004 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
34005 return (long)tmp;
34006 */
34007 enum machine_mode mode = GET_MODE (op1);
34008 const struct real_format *fmt;
34009 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
34010 rtx adj;
34011
34012 /* load nextafter (0.5, 0.0) */
34013 fmt = REAL_MODE_FORMAT (mode);
34014 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
34015 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
34016
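/* nextafter (0.5, 0.0) is used instead of 0.5 itself: for the largest
value just below 0.5 the sum x + 0.5 would round up to 1.0 and lround
would then return 1 instead of 0. */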
34017 /* adj = copysign (0.5, op1) */
34018 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
34019 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
34020
34021 /* adj = op1 + adj */
34022 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
34023
34024 /* op0 = (imode)adj */
34025 expand_fix (op0, adj, 0);
34026 }
34027
34028 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
34029 into OPERAND0. */
34030 void
34031 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
34032 {
34033 /* C code for the stuff we're doing below (for do_floor):
34034 xi = (long)op1;
34035 xi -= (double)xi > op1 ? 1 : 0;
34036 return xi;
34037 */
34038 enum machine_mode fmode = GET_MODE (op1);
34039 enum machine_mode imode = GET_MODE (op0);
34040 rtx ireg, freg, label, tmp;
34041
34042 /* reg = (long)op1 */
34043 ireg = gen_reg_rtx (imode);
34044 expand_fix (ireg, op1, 0);
34045
34046 /* freg = (double)reg */
34047 freg = gen_reg_rtx (fmode);
34048 expand_float (freg, ireg, 0);
34049
34050 /* ireg = (freg > op1) ? ireg - 1 : ireg */
34051 label = ix86_expand_sse_compare_and_jump (UNLE,
34052 freg, op1, !do_floor);
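/* The UNLE jump skips the adjustment when none is needed: for floor
when freg <= op1 (or the operands are unordered); for ceil the
operands are swapped, so the adjustment is skipped when op1 <= freg. */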
34053 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
34054 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
34055 emit_move_insn (ireg, tmp);
34056
34057 emit_label (label);
34058 LABEL_NUSES (label) = 1;
34059
34060 emit_move_insn (op0, ireg);
34061 }
34062
34063 /* Expand rint, rounding OPERAND1 to the nearest integer in the current
34064 rounding mode, and storing the result in OPERAND0. */
34065 void
34066 ix86_expand_rint (rtx operand0, rtx operand1)
34067 {
34068 /* C code for the stuff we're doing below:
34069 xa = fabs (operand1);
34070 if (!isless (xa, 2**52))
34071 return operand1;
34072 xa = xa + 2**52 - 2**52;
34073 return copysign (xa, operand1);
34074 */
34075 enum machine_mode mode = GET_MODE (operand0);
34076 rtx res, xa, label, TWO52, mask;
34077
34078 res = gen_reg_rtx (mode);
34079 emit_move_insn (res, operand1);
34080
34081 /* xa = abs (operand1) */
34082 xa = ix86_expand_sse_fabs (res, &mask);
34083
34084 /* if (!isless (xa, TWO52)) goto label; */
34085 TWO52 = ix86_gen_TWO52 (mode);
34086 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34087
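/* For |xa| < TWO52 adding TWO52 leaves no fractional bits in the
significand, so the add/subtract pair below rounds xa to an integer
using the current (by default round-to-nearest) rounding mode. */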
34088 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
34089 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
34090
34091 ix86_sse_copysign_to_positive (res, xa, res, mask);
34092
34093 emit_label (label);
34094 LABEL_NUSES (label) = 1;
34095
34096 emit_move_insn (operand0, res);
34097 }
34098
34099 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
34100 into OPERAND0. */
34101 void
34102 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
34103 {
34104 /* C code for the stuff we expand below.
34105 double xa = fabs (x), x2;
34106 if (!isless (xa, TWO52))
34107 return x;
34108 xa = xa + TWO52 - TWO52;
34109 x2 = copysign (xa, x);
34110 Compensate. Floor:
34111 if (x2 > x)
34112 x2 -= 1;
34113 Compensate. Ceil:
34114 if (x2 < x)
34115 x2 -= -1;
34116 return x2;
34117 */
34118 enum machine_mode mode = GET_MODE (operand0);
34119 rtx xa, TWO52, tmp, label, one, res, mask;
34120
34121 TWO52 = ix86_gen_TWO52 (mode);
34122
34123 /* Temporary for holding the result, initialized to the input
34124 operand to ease control flow. */
34125 res = gen_reg_rtx (mode);
34126 emit_move_insn (res, operand1);
34127
34128 /* xa = abs (operand1) */
34129 xa = ix86_expand_sse_fabs (res, &mask);
34130
34131 /* if (!isless (xa, TWO52)) goto label; */
34132 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34133
34134 /* xa = xa + TWO52 - TWO52; */
34135 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
34136 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
34137
34138 /* xa = copysign (xa, operand1) */
34139 ix86_sse_copysign_to_positive (xa, xa, res, mask);
34140
34141 /* generate 1.0 or -1.0 */
34142 one = force_reg (mode,
34143 const_double_from_real_value (do_floor
34144 ? dconst1 : dconstm1, mode));
34145
34146 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
34147 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
34148 emit_insn (gen_rtx_SET (VOIDmode, tmp,
34149 gen_rtx_AND (mode, one, tmp)));
34150 /* We always need to subtract here to preserve signed zero. */
34151 tmp = expand_simple_binop (mode, MINUS,
34152 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
34153 emit_move_insn (res, tmp);
34154
34155 emit_label (label);
34156 LABEL_NUSES (label) = 1;
34157
34158 emit_move_insn (operand0, res);
34159 }
34160
34161 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
34162 into OPERAND0. */
34163 void
34164 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
34165 {
34166 /* C code for the stuff we expand below.
34167 double xa = fabs (x), x2;
34168 if (!isless (xa, TWO52))
34169 return x;
34170 x2 = (double)(long)x;
34171 Compensate. Floor:
34172 if (x2 > x)
34173 x2 -= 1;
34174 Compensate. Ceil:
34175 if (x2 < x)
34176 x2 += 1;
34177 if (HONOR_SIGNED_ZEROS (mode))
34178 return copysign (x2, x);
34179 return x2;
34180 */
34181 enum machine_mode mode = GET_MODE (operand0);
34182 rtx xa, xi, TWO52, tmp, label, one, res, mask;
34183
34184 TWO52 = ix86_gen_TWO52 (mode);
34185
34186 /* Temporary for holding the result, initialized to the input
34187 operand to ease control flow. */
34188 res = gen_reg_rtx (mode);
34189 emit_move_insn (res, operand1);
34190
34191 /* xa = abs (operand1) */
34192 xa = ix86_expand_sse_fabs (res, &mask);
34193
34194 /* if (!isless (xa, TWO52)) goto label; */
34195 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34196
34197 /* xa = (double)(long)x */
34198 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
34199 expand_fix (xi, res, 0);
34200 expand_float (xa, xi, 0);
34201
34202 /* generate 1.0 */
34203 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
34204
34205 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
34206 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
34207 emit_insn (gen_rtx_SET (VOIDmode, tmp,
34208 gen_rtx_AND (mode, one, tmp)));
34209 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
34210 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
34211 emit_move_insn (res, tmp);
34212
34213 if (HONOR_SIGNED_ZEROS (mode))
34214 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
34215
34216 emit_label (label);
34217 LABEL_NUSES (label) = 1;
34218
34219 emit_move_insn (operand0, res);
34220 }
34221
34222 /* Expand SSE sequence for computing round from OPERAND1 storing
34223 into OPERAND0. Sequence that works without relying on DImode truncation
34224 via cvttsd2siq, which is only available on 64-bit targets. */
34225 void
34226 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
34227 {
34228 /* C code for the stuff we expand below.
34229 double xa = fabs (x), xa2, x2;
34230 if (!isless (xa, TWO52))
34231 return x;
34232 Using the absolute value and copying back sign makes
34233 -0.0 -> -0.0 correct.
34234 xa2 = xa + TWO52 - TWO52;
34235 Compensate.
34236 dxa = xa2 - xa;
34237 if (dxa <= -0.5)
34238 xa2 += 1;
34239 else if (dxa > 0.5)
34240 xa2 -= 1;
34241 x2 = copysign (xa2, x);
34242 return x2;
34243 */
34244 enum machine_mode mode = GET_MODE (operand0);
34245 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
34246
34247 TWO52 = ix86_gen_TWO52 (mode);
34248
34249 /* Temporary for holding the result, initialized to the input
34250 operand to ease control flow. */
34251 res = gen_reg_rtx (mode);
34252 emit_move_insn (res, operand1);
34253
34254 /* xa = abs (operand1) */
34255 xa = ix86_expand_sse_fabs (res, &mask);
34256
34257 /* if (!isless (xa, TWO52)) goto label; */
34258 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34259
34260 /* xa2 = xa + TWO52 - TWO52; */
34261 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
34262 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
34263
34264 /* dxa = xa2 - xa; */
34265 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
34266
34267 /* generate 0.5, 1.0 and -0.5 */
34268 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
34269 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
34270 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
34271 0, OPTAB_DIRECT);
34272
34273 /* Compensate. */
34274 tmp = gen_reg_rtx (mode);
34275 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
34276 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
34277 emit_insn (gen_rtx_SET (VOIDmode, tmp,
34278 gen_rtx_AND (mode, one, tmp)));
34279 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
34280 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
34281 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
34282 emit_insn (gen_rtx_SET (VOIDmode, tmp,
34283 gen_rtx_AND (mode, one, tmp)));
34284 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
34285
34286 /* res = copysign (xa2, operand1) */
34287 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
34288
34289 emit_label (label);
34290 LABEL_NUSES (label) = 1;
34291
34292 emit_move_insn (operand0, res);
34293 }
34294
34295 /* Expand SSE sequence for computing trunc from OPERAND1 storing
34296 into OPERAND0. */
34297 void
34298 ix86_expand_trunc (rtx operand0, rtx operand1)
34299 {
34300 /* C code for SSE variant we expand below.
34301 double xa = fabs (x), x2;
34302 if (!isless (xa, TWO52))
34303 return x;
34304 x2 = (double)(long)x;
34305 if (HONOR_SIGNED_ZEROS (mode))
34306 return copysign (x2, x);
34307 return x2;
34308 */
34309 enum machine_mode mode = GET_MODE (operand0);
34310 rtx xa, xi, TWO52, label, res, mask;
34311
34312 TWO52 = ix86_gen_TWO52 (mode);
34313
34314 /* Temporary for holding the result, initialized to the input
34315 operand to ease control flow. */
34316 res = gen_reg_rtx (mode);
34317 emit_move_insn (res, operand1);
34318
34319 /* xa = abs (operand1) */
34320 xa = ix86_expand_sse_fabs (res, &mask);
34321
34322 /* if (!isless (xa, TWO52)) goto label; */
34323 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34324
34325 /* x = (double)(long)x */
34326 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
34327 expand_fix (xi, res, 0);
34328 expand_float (res, xi, 0);
34329
34330 if (HONOR_SIGNED_ZEROS (mode))
34331 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
34332
34333 emit_label (label);
34334 LABEL_NUSES (label) = 1;
34335
34336 emit_move_insn (operand0, res);
34337 }
34338
34339 /* Expand SSE sequence for computing trunc from OPERAND1 storing
34340 into OPERAND0. */
34341 void
34342 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
34343 {
34344 enum machine_mode mode = GET_MODE (operand0);
34345 rtx xa, mask, TWO52, label, one, res, smask, tmp;
34346
34347 /* C code for SSE variant we expand below.
34348 double xa = fabs (x), x2;
34349 if (!isless (xa, TWO52))
34350 return x;
34351 xa2 = xa + TWO52 - TWO52;
34352 Compensate:
34353 if (xa2 > xa)
34354 xa2 -= 1.0;
34355 x2 = copysign (xa2, x);
34356 return x2;
34357 */
34358
34359 TWO52 = ix86_gen_TWO52 (mode);
34360
34361 /* Temporary for holding the result, initialized to the input
34362 operand to ease control flow. */
34363 res = gen_reg_rtx (mode);
34364 emit_move_insn (res, operand1);
34365
34366 /* xa = abs (operand1) */
34367 xa = ix86_expand_sse_fabs (res, &smask);
34368
34369 /* if (!isless (xa, TWO52)) goto label; */
34370 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34371
34372 /* res = xa + TWO52 - TWO52; */
34373 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
34374 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
34375 emit_move_insn (res, tmp);
34376
34377 /* generate 1.0 */
34378 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
34379
34380 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
34381 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
34382 emit_insn (gen_rtx_SET (VOIDmode, mask,
34383 gen_rtx_AND (mode, mask, one)));
34384 tmp = expand_simple_binop (mode, MINUS,
34385 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
34386 emit_move_insn (res, tmp);
34387
34388 /* res = copysign (res, operand1) */
34389 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
34390
34391 emit_label (label);
34392 LABEL_NUSES (label) = 1;
34393
34394 emit_move_insn (operand0, res);
34395 }
34396
34397 /* Expand SSE sequence for computing round from OPERAND1 storing
34398 into OPERAND0. */
34399 void
34400 ix86_expand_round (rtx operand0, rtx operand1)
34401 {
34402 /* C code for the stuff we're doing below:
34403 double xa = fabs (x);
34404 if (!isless (xa, TWO52))
34405 return x;
34406 xa = (double)(long)(xa + nextafter (0.5, 0.0));
34407 return copysign (xa, x);
34408 */
34409 enum machine_mode mode = GET_MODE (operand0);
34410 rtx res, TWO52, xa, label, xi, half, mask;
34411 const struct real_format *fmt;
34412 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
34413
34414 /* Temporary for holding the result, initialized to the input
34415 operand to ease control flow. */
34416 res = gen_reg_rtx (mode);
34417 emit_move_insn (res, operand1);
34418
34419 TWO52 = ix86_gen_TWO52 (mode);
34420 xa = ix86_expand_sse_fabs (res, &mask);
34421 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34422
34423 /* load nextafter (0.5, 0.0) */
34424 fmt = REAL_MODE_FORMAT (mode);
34425 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
34426 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
34427
34428 /* xa = xa + 0.5 */
34429 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
34430 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
34431
34432 /* xa = (double)(int64_t)xa */
34433 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
34434 expand_fix (xi, xa, 0);
34435 expand_float (xa, xi, 0);
34436
34437 /* res = copysign (xa, operand1) */
34438 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
34439
34440 emit_label (label);
34441 LABEL_NUSES (label) = 1;
34442
34443 emit_move_insn (operand0, res);
34444 }
34445
34446 /* Expand SSE sequence for computing round
34447 from OP1 storing into OP0 using sse4 round insn. */
34448 void
34449 ix86_expand_round_sse4 (rtx op0, rtx op1)
34450 {
34451 enum machine_mode mode = GET_MODE (op0);
34452 rtx e1, e2, res, half;
34453 const struct real_format *fmt;
34454 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
34455 rtx (*gen_copysign) (rtx, rtx, rtx);
34456 rtx (*gen_round) (rtx, rtx, rtx);
34457
34458 switch (mode)
34459 {
34460 case SFmode:
34461 gen_copysign = gen_copysignsf3;
34462 gen_round = gen_sse4_1_roundsf2;
34463 break;
34464 case DFmode:
34465 gen_copysign = gen_copysigndf3;
34466 gen_round = gen_sse4_1_rounddf2;
34467 break;
34468 default:
34469 gcc_unreachable ();
34470 }
34471
34472 /* round (a) = trunc (a + copysign (0.5, a)) */
34473
34474 /* load nextafter (0.5, 0.0) */
34475 fmt = REAL_MODE_FORMAT (mode);
34476 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
34477 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
34478 half = const_double_from_real_value (pred_half, mode);
34479
34480 /* e1 = copysign (0.5, op1) */
34481 e1 = gen_reg_rtx (mode);
34482 emit_insn (gen_copysign (e1, half, op1));
34483
34484 /* e2 = op1 + e1 */
34485 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
34486
34487 /* res = trunc (e2) */
34488 res = gen_reg_rtx (mode);
34489 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
34490
34491 emit_move_insn (op0, res);
34492 }
34493 \f
34494
34495 /* Table of valid machine attributes. */
34496 static const struct attribute_spec ix86_attribute_table[] =
34497 {
34498 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
34499 affects_type_identity } */
34500 /* Stdcall attribute says callee is responsible for popping arguments
34501 if they are not variable. */
34502 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
34503 true },
34504 /* Fastcall attribute says callee is responsible for popping arguments
34505 if they are not variable. */
34506 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
34507 true },
34508 /* Thiscall attribute says callee is responsible for popping arguments
34509 if they are not variable. */
34510 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
34511 true },
34512 /* Cdecl attribute says the callee is a normal C declaration */
34513 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
34514 true },
34515 /* Regparm attribute specifies how many integer arguments are to be
34516 passed in registers. */
34517 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
34518 true },
34519 /* Sseregparm attribute says we are using x86_64 calling conventions
34520 for FP arguments. */
34521 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
34522 true },
34523 /* force_align_arg_pointer says this function realigns the stack at entry. */
34524 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
34525 false, true, true, ix86_handle_cconv_attribute, false },
34526 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
34527 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
34528 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
34529 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
34530 false },
34531 #endif
34532 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
34533 false },
34534 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
34535 false },
34536 #ifdef SUBTARGET_ATTRIBUTE_TABLE
34537 SUBTARGET_ATTRIBUTE_TABLE,
34538 #endif
34539 /* ms_abi and sysv_abi calling convention function attributes. */
34540 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
34541 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
34542 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
34543 false },
34544 { "callee_pop_aggregate_return", 1, 1, false, true, true,
34545 ix86_handle_callee_pop_aggregate_return, true },
34546 /* End element. */
34547 { NULL, 0, 0, false, false, false, NULL, false }
34548 };
34549
34550 /* Implement targetm.vectorize.builtin_vectorization_cost. */
34551 static int
34552 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
34553 tree vectype ATTRIBUTE_UNUSED,
34554 int misalign ATTRIBUTE_UNUSED)
34555 {
34556 switch (type_of_cost)
34557 {
34558 case scalar_stmt:
34559 return ix86_cost->scalar_stmt_cost;
34560
34561 case scalar_load:
34562 return ix86_cost->scalar_load_cost;
34563
34564 case scalar_store:
34565 return ix86_cost->scalar_store_cost;
34566
34567 case vector_stmt:
34568 return ix86_cost->vec_stmt_cost;
34569
34570 case vector_load:
34571 return ix86_cost->vec_align_load_cost;
34572
34573 case vector_store:
34574 return ix86_cost->vec_store_cost;
34575
34576 case vec_to_scalar:
34577 return ix86_cost->vec_to_scalar_cost;
34578
34579 case scalar_to_vec:
34580 return ix86_cost->scalar_to_vec_cost;
34581
34582 case unaligned_load:
34583 case unaligned_store:
34584 return ix86_cost->vec_unalign_load_cost;
34585
34586 case cond_branch_taken:
34587 return ix86_cost->cond_taken_branch_cost;
34588
34589 case cond_branch_not_taken:
34590 return ix86_cost->cond_not_taken_branch_cost;
34591
34592 case vec_perm:
34593 return 1;
34594
34595 default:
34596 gcc_unreachable ();
34597 }
34598 }
34599
34600
34601 /* Return a vector mode with twice as many elements as VMODE. */
34602 /* ??? Consider moving this to a table generated by genmodes.c. */
34603
34604 static enum machine_mode
34605 doublesize_vector_mode (enum machine_mode vmode)
34606 {
34607 switch (vmode)
34608 {
34609 case V2SFmode: return V4SFmode;
34610 case V1DImode: return V2DImode;
34611 case V2SImode: return V4SImode;
34612 case V4HImode: return V8HImode;
34613 case V8QImode: return V16QImode;
34614
34615 case V2DFmode: return V4DFmode;
34616 case V4SFmode: return V8SFmode;
34617 case V2DImode: return V4DImode;
34618 case V4SImode: return V8SImode;
34619 case V8HImode: return V16HImode;
34620 case V16QImode: return V32QImode;
34621
34622 case V4DFmode: return V8DFmode;
34623 case V8SFmode: return V16SFmode;
34624 case V4DImode: return V8DImode;
34625 case V8SImode: return V16SImode;
34626 case V16HImode: return V32HImode;
34627 case V32QImode: return V64QImode;
34628
34629 default:
34630 gcc_unreachable ();
34631 }
34632 }
34633
34634 /* Construct (set target (vec_select op0 (parallel perm))) and
34635 return true if that's a valid instruction in the active ISA. */
34636
34637 static bool
34638 expand_vselect (rtx target, rtx op0, const unsigned char *perm, unsigned nelt)
34639 {
34640 rtx rperm[MAX_VECT_LEN], x;
34641 unsigned i;
34642
34643 for (i = 0; i < nelt; ++i)
34644 rperm[i] = GEN_INT (perm[i]);
34645
34646 x = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (nelt, rperm));
34647 x = gen_rtx_VEC_SELECT (GET_MODE (target), op0, x);
34648 x = gen_rtx_SET (VOIDmode, target, x);
34649
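/* Emit the insn tentatively; if no pattern in the active ISA
recognizes this particular vec_select, back it out again. */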
34650 x = emit_insn (x);
34651 if (recog_memoized (x) < 0)
34652 {
34653 remove_insn (x);
34654 return false;
34655 }
34656 return true;
34657 }
34658
34659 /* Similar, but generate a vec_concat from op0 and op1 as well. */
34660
34661 static bool
34662 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
34663 const unsigned char *perm, unsigned nelt)
34664 {
34665 enum machine_mode v2mode;
34666 rtx x;
34667
34668 v2mode = doublesize_vector_mode (GET_MODE (op0));
34669 x = gen_rtx_VEC_CONCAT (v2mode, op0, op1);
34670 return expand_vselect (target, x, perm, nelt);
34671 }
34672
34673 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
34674 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
34675
34676 static bool
34677 expand_vec_perm_blend (struct expand_vec_perm_d *d)
34678 {
34679 enum machine_mode vmode = d->vmode;
34680 unsigned i, mask, nelt = d->nelt;
34681 rtx target, op0, op1, x;
34682 rtx rperm[32], vperm;
34683
34684 if (d->op0 == d->op1)
34685 return false;
34686 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
34687 ;
34688 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
34689 ;
34690 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
34691 ;
34692 else
34693 return false;
34694
34695 /* This is a blend, not a permute. Elements must stay in their
34696 respective lanes. */
34697 for (i = 0; i < nelt; ++i)
34698 {
34699 unsigned e = d->perm[i];
34700 if (!(e == i || e == i + nelt))
34701 return false;
34702 }
34703
34704 if (d->testing_p)
34705 return true;
34706
34707 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
34708 decision should be extracted elsewhere, so that we only try that
34709 sequence once all budget==3 options have been tried. */
34710 target = d->target;
34711 op0 = d->op0;
34712 op1 = d->op1;
34713 mask = 0;
34714
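/* MASK collects one bit per element of the mode finally used for the
blend: a set bit selects the element from OP1, a clear bit from OP0. */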
34715 switch (vmode)
34716 {
34717 case V4DFmode:
34718 case V8SFmode:
34719 case V2DFmode:
34720 case V4SFmode:
34721 case V8HImode:
34722 case V8SImode:
34723 for (i = 0; i < nelt; ++i)
34724 mask |= (d->perm[i] >= nelt) << i;
34725 break;
34726
34727 case V2DImode:
34728 for (i = 0; i < 2; ++i)
34729 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
34730 vmode = V8HImode;
34731 goto do_subreg;
34732
34733 case V4SImode:
34734 for (i = 0; i < 4; ++i)
34735 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
34736 vmode = V8HImode;
34737 goto do_subreg;
34738
34739 case V16QImode:
34740 /* See if bytes move in pairs so we can use pblendw with
34741 an immediate argument, rather than pblendvb with a vector
34742 argument. */
34743 for (i = 0; i < 16; i += 2)
34744 if (d->perm[i] + 1 != d->perm[i + 1])
34745 {
34746 use_pblendvb:
34747 for (i = 0; i < nelt; ++i)
34748 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
34749
34750 finish_pblendvb:
34751 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
34752 vperm = force_reg (vmode, vperm);
34753
34754 if (GET_MODE_SIZE (vmode) == 16)
34755 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
34756 else
34757 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
34758 return true;
34759 }
34760
34761 for (i = 0; i < 8; ++i)
34762 mask |= (d->perm[i * 2] >= 16) << i;
34763 vmode = V8HImode;
34764 /* FALLTHRU */
34765
34766 do_subreg:
34767 target = gen_lowpart (vmode, target);
34768 op0 = gen_lowpart (vmode, op0);
34769 op1 = gen_lowpart (vmode, op1);
34770 break;
34771
34772 case V32QImode:
34773 /* See if bytes move in pairs. If not, vpblendvb must be used. */
34774 for (i = 0; i < 32; i += 2)
34775 if (d->perm[i] + 1 != d->perm[i + 1])
34776 goto use_pblendvb;
34777 /* See if bytes move in quadruplets. If yes, vpblendd
34778 with immediate can be used. */
34779 for (i = 0; i < 32; i += 4)
34780 if (d->perm[i] + 2 != d->perm[i + 2])
34781 break;
34782 if (i < 32)
34783 {
34784 /* See if bytes move the same in both lanes. If yes,
34785 vpblendw with immediate can be used. */
34786 for (i = 0; i < 16; i += 2)
34787 if (d->perm[i] + 16 != d->perm[i + 16])
34788 goto use_pblendvb;
34789
34790 /* Use vpblendw. */
34791 for (i = 0; i < 16; ++i)
34792 mask |= (d->perm[i * 2] >= 32) << i;
34793 vmode = V16HImode;
34794 goto do_subreg;
34795 }
34796
34797 /* Use vpblendd. */
34798 for (i = 0; i < 8; ++i)
34799 mask |= (d->perm[i * 4] >= 32) << i;
34800 vmode = V8SImode;
34801 goto do_subreg;
34802
34803 case V16HImode:
34804 /* See if words move in pairs. If yes, vpblendd can be used. */
34805 for (i = 0; i < 16; i += 2)
34806 if (d->perm[i] + 1 != d->perm[i + 1])
34807 break;
34808 if (i < 16)
34809 {
34810 /* See if words move the same in both lanes. If not,
34811 vpblendvb must be used. */
34812 for (i = 0; i < 8; i++)
34813 if (d->perm[i] + 8 != d->perm[i + 8])
34814 {
34815 /* Use vpblendvb. */
34816 for (i = 0; i < 32; ++i)
34817 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
34818
34819 vmode = V32QImode;
34820 nelt = 32;
34821 target = gen_lowpart (vmode, target);
34822 op0 = gen_lowpart (vmode, op0);
34823 op1 = gen_lowpart (vmode, op1);
34824 goto finish_pblendvb;
34825 }
34826
34827 /* Use vpblendw. */
34828 for (i = 0; i < 16; ++i)
34829 mask |= (d->perm[i] >= 16) << i;
34830 break;
34831 }
34832
34833 /* Use vpblendd. */
34834 for (i = 0; i < 8; ++i)
34835 mask |= (d->perm[i * 2] >= 16) << i;
34836 vmode = V8SImode;
34837 goto do_subreg;
34838
34839 case V4DImode:
34840 /* Use vpblendd. */
34841 for (i = 0; i < 4; ++i)
34842 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
34843 vmode = V8SImode;
34844 goto do_subreg;
34845
34846 default:
34847 gcc_unreachable ();
34848 }
34849
34850 /* This matches five different patterns with the different modes. */
34851 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
34852 x = gen_rtx_SET (VOIDmode, target, x);
34853 emit_insn (x);
34854
34855 return true;
34856 }
34857
34858 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
34859 in terms of the variable form of vpermilps.
34860
34861 Note that we will have already failed the immediate input vpermilps,
34862 which requires that the high and low part shuffle be identical; the
34863 variable form doesn't require that. */
34864
34865 static bool
34866 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
34867 {
34868 rtx rperm[8], vperm;
34869 unsigned i;
34870
34871 if (!TARGET_AVX || d->vmode != V8SFmode || d->op0 != d->op1)
34872 return false;
34873
34874 /* We can only permute within the 128-bit lane. */
34875 for (i = 0; i < 8; ++i)
34876 {
34877 unsigned e = d->perm[i];
34878 if (i < 4 ? e >= 4 : e < 4)
34879 return false;
34880 }
34881
34882 if (d->testing_p)
34883 return true;
34884
34885 for (i = 0; i < 8; ++i)
34886 {
34887 unsigned e = d->perm[i];
34888
34889 /* Within each 128-bit lane, the elements of op0 are numbered
34890 from 0 and the elements of op1 are numbered from 4. */
34891 if (e >= 8 + 4)
34892 e -= 8;
34893 else if (e >= 4)
34894 e -= 4;
34895
34896 rperm[i] = GEN_INT (e);
34897 }
34898
34899 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
34900 vperm = force_reg (V8SImode, vperm);
34901 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
34902
34903 return true;
34904 }
34905
34906 /* Return true if permutation D can be performed as VMODE permutation
34907 instead. */
34908
34909 static bool
34910 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
34911 {
34912 unsigned int i, j, chunk;
34913
34914 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
34915 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
34916 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
34917 return false;
34918
34919 if (GET_MODE_NUNITS (vmode) >= d->nelt)
34920 return true;
34921
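/* Each element of VMODE covers CHUNK consecutive elements of d->vmode;
the permutation can be done in VMODE only if every such chunk moves as
a block, i.e. starts chunk-aligned and keeps its elements consecutive. */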
34922 chunk = d->nelt / GET_MODE_NUNITS (vmode);
34923 for (i = 0; i < d->nelt; i += chunk)
34924 if (d->perm[i] & (chunk - 1))
34925 return false;
34926 else
34927 for (j = 1; j < chunk; ++j)
34928 if (d->perm[i] + j != d->perm[i + j])
34929 return false;
34930
34931 return true;
34932 }
34933
34934 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
34935 in terms of pshufb, vpperm, vpermq, vpermd or vperm2i128. */
34936
34937 static bool
34938 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
34939 {
34940 unsigned i, nelt, eltsz, mask;
34941 unsigned char perm[32];
34942 enum machine_mode vmode = V16QImode;
34943 rtx rperm[32], vperm, target, op0, op1;
34944
34945 nelt = d->nelt;
34946
34947 if (d->op0 != d->op1)
34948 {
34949 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
34950 {
34951 if (TARGET_AVX2
34952 && valid_perm_using_mode_p (V2TImode, d))
34953 {
34954 if (d->testing_p)
34955 return true;
34956
34957 /* Use vperm2i128 insn. The pattern uses
34958 V4DImode instead of V2TImode. */
34959 target = gen_lowpart (V4DImode, d->target);
34960 op0 = gen_lowpart (V4DImode, d->op0);
34961 op1 = gen_lowpart (V4DImode, d->op1);
34962 rperm[0]
34963 = GEN_INT ((d->perm[0] / (nelt / 2))
34964 | ((d->perm[nelt / 2] / (nelt / 2)) << 4));
34965 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
34966 return true;
34967 }
34968 return false;
34969 }
34970 }
34971 else
34972 {
34973 if (GET_MODE_SIZE (d->vmode) == 16)
34974 {
34975 if (!TARGET_SSSE3)
34976 return false;
34977 }
34978 else if (GET_MODE_SIZE (d->vmode) == 32)
34979 {
34980 if (!TARGET_AVX2)
34981 return false;
34982
34983 /* V4DImode should be already handled through
34984 expand_vselect by vpermq instruction. */
34985 gcc_assert (d->vmode != V4DImode);
34986
34987 vmode = V32QImode;
34988 if (d->vmode == V8SImode
34989 || d->vmode == V16HImode
34990 || d->vmode == V32QImode)
34991 {
34992 /* First see if vpermq can be used for
34993 V8SImode/V16HImode/V32QImode. */
34994 if (valid_perm_using_mode_p (V4DImode, d))
34995 {
34996 for (i = 0; i < 4; i++)
34997 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
34998 if (d->testing_p)
34999 return true;
35000 return expand_vselect (gen_lowpart (V4DImode, d->target),
35001 gen_lowpart (V4DImode, d->op0),
35002 perm, 4);
35003 }
35004
35005 /* Next see if vpermd can be used. */
35006 if (valid_perm_using_mode_p (V8SImode, d))
35007 vmode = V8SImode;
35008 }
35009
35010 if (vmode == V32QImode)
35011 {
35012 /* vpshufb only works within 128-bit lanes; it is not
35013 possible to shuffle bytes between the lanes. */
35014 for (i = 0; i < nelt; ++i)
35015 if ((d->perm[i] ^ i) & (nelt / 2))
35016 return false;
35017 }
35018 }
35019 else
35020 return false;
35021 }
35022
35023 if (d->testing_p)
35024 return true;
35025
35026 if (vmode == V8SImode)
35027 for (i = 0; i < 8; ++i)
35028 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
35029 else
35030 {
35031 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
35032 if (d->op0 != d->op1)
35033 mask = 2 * nelt - 1;
35034 else if (vmode == V16QImode)
35035 mask = nelt - 1;
35036 else
35037 mask = nelt / 2 - 1;
35038
35039 for (i = 0; i < nelt; ++i)
35040 {
35041 unsigned j, e = d->perm[i] & mask;
35042 for (j = 0; j < eltsz; ++j)
35043 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
35044 }
35045 }
35046
35047 vperm = gen_rtx_CONST_VECTOR (vmode,
35048 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
35049 vperm = force_reg (vmode, vperm);
35050
35051 target = gen_lowpart (vmode, d->target);
35052 op0 = gen_lowpart (vmode, d->op0);
35053 if (d->op0 == d->op1)
35054 {
35055 if (vmode == V16QImode)
35056 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
35057 else if (vmode == V32QImode)
35058 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
35059 else
35060 emit_insn (gen_avx2_permvarv8si (target, vperm, op0));
35061 }
35062 else
35063 {
35064 op1 = gen_lowpart (vmode, d->op1);
35065 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
35066 }
35067
35068 return true;
35069 }
35070
35071 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
35072 in a single instruction. */
35073
35074 static bool
35075 expand_vec_perm_1 (struct expand_vec_perm_d *d)
35076 {
35077 unsigned i, nelt = d->nelt;
35078 unsigned char perm2[MAX_VECT_LEN];
35079
35080 /* Check plain VEC_SELECT first, because AVX has instructions that could
35081 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
35082 input where SEL+CONCAT may not. */
35083 if (d->op0 == d->op1)
35084 {
35085 int mask = nelt - 1;
35086 bool identity_perm = true;
35087 bool broadcast_perm = true;
35088
35089 for (i = 0; i < nelt; i++)
35090 {
35091 perm2[i] = d->perm[i] & mask;
35092 if (perm2[i] != i)
35093 identity_perm = false;
35094 if (perm2[i])
35095 broadcast_perm = false;
35096 }
35097
35098 if (identity_perm)
35099 {
35100 if (!d->testing_p)
35101 emit_move_insn (d->target, d->op0);
35102 return true;
35103 }
35104 else if (broadcast_perm && TARGET_AVX2)
35105 {
35106 /* Use vpbroadcast{b,w,d}. */
35107 rtx op = d->op0, (*gen) (rtx, rtx) = NULL;
35108 switch (d->vmode)
35109 {
35110 case V32QImode:
35111 op = gen_lowpart (V16QImode, op);
35112 gen = gen_avx2_pbroadcastv32qi;
35113 break;
35114 case V16HImode:
35115 op = gen_lowpart (V8HImode, op);
35116 gen = gen_avx2_pbroadcastv16hi;
35117 break;
35118 case V8SImode:
35119 op = gen_lowpart (V4SImode, op);
35120 gen = gen_avx2_pbroadcastv8si;
35121 break;
35122 case V16QImode:
35123 gen = gen_avx2_pbroadcastv16qi;
35124 break;
35125 case V8HImode:
35126 gen = gen_avx2_pbroadcastv8hi;
35127 break;
35128 /* For other modes, prefer the other shuffles this function creates. */
35129 default: break;
35130 }
35131 if (gen != NULL)
35132 {
35133 if (!d->testing_p)
35134 emit_insn (gen (d->target, op));
35135 return true;
35136 }
35137 }
35138
35139 if (expand_vselect (d->target, d->op0, perm2, nelt))
35140 return true;
35141
35142 /* There are plenty of patterns in sse.md that are written for
35143 SEL+CONCAT and are not replicated for a single op. Perhaps
35144 that should be changed, to avoid the nastiness here. */
35145
35146 /* Recognize interleave style patterns, which means incrementing
35147 every other permutation operand. */
35148 for (i = 0; i < nelt; i += 2)
35149 {
35150 perm2[i] = d->perm[i] & mask;
35151 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
35152 }
35153 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
35154 return true;
35155
35156 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
35157 if (nelt >= 4)
35158 {
35159 for (i = 0; i < nelt; i += 4)
35160 {
35161 perm2[i + 0] = d->perm[i + 0] & mask;
35162 perm2[i + 1] = d->perm[i + 1] & mask;
35163 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
35164 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
35165 }
35166
35167 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
35168 return true;
35169 }
35170 }
35171
35172 /* Finally, try the fully general two operand permute. */
35173 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt))
35174 return true;
35175
35176 /* Recognize interleave style patterns with reversed operands. */
35177 if (d->op0 != d->op1)
35178 {
35179 for (i = 0; i < nelt; ++i)
35180 {
35181 unsigned e = d->perm[i];
35182 if (e >= nelt)
35183 e -= nelt;
35184 else
35185 e += nelt;
35186 perm2[i] = e;
35187 }
35188
35189 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt))
35190 return true;
35191 }
35192
35193 /* Try the SSE4.1 blend variable merge instructions. */
35194 if (expand_vec_perm_blend (d))
35195 return true;
35196
35197 /* Try one of the AVX vpermil variable permutations. */
35198 if (expand_vec_perm_vpermil (d))
35199 return true;
35200
35201 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
35202 vpshufb, vpermd or vpermq variable permutation. */
35203 if (expand_vec_perm_pshufb (d))
35204 return true;
35205
35206 return false;
35207 }
35208
35209 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35210 in terms of a pair of pshuflw + pshufhw instructions. */
35211
35212 static bool
35213 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
35214 {
35215 unsigned char perm2[MAX_VECT_LEN];
35216 unsigned i;
35217 bool ok;
35218
35219 if (d->vmode != V8HImode || d->op0 != d->op1)
35220 return false;
35221
35222 /* The two permutations only operate in 64-bit lanes. */
35223 for (i = 0; i < 4; ++i)
35224 if (d->perm[i] >= 4)
35225 return false;
35226 for (i = 4; i < 8; ++i)
35227 if (d->perm[i] < 4)
35228 return false;
35229
35230 if (d->testing_p)
35231 return true;
35232
35233 /* Emit the pshuflw. */
35234 memcpy (perm2, d->perm, 4);
35235 for (i = 4; i < 8; ++i)
35236 perm2[i] = i;
35237 ok = expand_vselect (d->target, d->op0, perm2, 8);
35238 gcc_assert (ok);
35239
35240 /* Emit the pshufhw. */
35241 memcpy (perm2 + 4, d->perm + 4, 4);
35242 for (i = 0; i < 4; ++i)
35243 perm2[i] = i;
35244 ok = expand_vselect (d->target, d->target, perm2, 8);
35245 gcc_assert (ok);
35246
35247 return true;
35248 }
35249
35250 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
35251 the permutation using the SSSE3 palignr instruction. This succeeds
35252 when all of the elements in PERM fit within one vector and we merely
35253 need to shift them down so that a single vector permutation has a
35254 chance to succeed. */
35255
35256 static bool
35257 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
35258 {
35259 unsigned i, nelt = d->nelt;
35260 unsigned min, max;
35261 bool in_order, ok;
35262 rtx shift;
35263
35264 /* Even with AVX, palignr only operates on 128-bit vectors. */
35265 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
35266 return false;
35267
35268 min = nelt, max = 0;
35269 for (i = 0; i < nelt; ++i)
35270 {
35271 unsigned e = d->perm[i];
35272 if (e < min)
35273 min = e;
35274 if (e > max)
35275 max = e;
35276 }
35277 if (min == 0 || max - min >= nelt)
35278 return false;
35279
35280 /* Given that we have SSSE3, we know we'll be able to implement the
35281 single operand permutation after the palignr with pshufb. */
35282 if (d->testing_p)
35283 return true;
35284
35285 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
35286 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
35287 gen_lowpart (TImode, d->op1),
35288 gen_lowpart (TImode, d->op0), shift));
35289
35290 d->op0 = d->op1 = d->target;
35291
35292 in_order = true;
35293 for (i = 0; i < nelt; ++i)
35294 {
35295 unsigned e = d->perm[i] - min;
35296 if (e != i)
35297 in_order = false;
35298 d->perm[i] = e;
35299 }
35300
35301 /* Test for the degenerate case where the alignment by itself
35302 produces the desired permutation. */
35303 if (in_order)
35304 return true;
35305
35306 ok = expand_vec_perm_1 (d);
35307 gcc_assert (ok);
35308
35309 return ok;
35310 }
35311
35312 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
35313 a two vector permutation into a single vector permutation by using
35314 an interleave operation to merge the vectors. */
35315
35316 static bool
35317 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
35318 {
35319 struct expand_vec_perm_d dremap, dfinal;
35320 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
35321 unsigned HOST_WIDE_INT contents;
35322 unsigned char remap[2 * MAX_VECT_LEN];
35323 rtx seq;
35324 bool ok, same_halves = false;
35325
35326 if (GET_MODE_SIZE (d->vmode) == 16)
35327 {
35328 if (d->op0 == d->op1)
35329 return false;
35330 }
35331 else if (GET_MODE_SIZE (d->vmode) == 32)
35332 {
35333 if (!TARGET_AVX)
35334 return false;
35335 /* For 32-byte modes allow even d->op0 == d->op1.
35336 The lack of cross-lane shuffling in some instructions
35337 might prevent a single insn shuffle. */
35338 }
35339 else
35340 return false;
35341
35342 /* Examine from whence the elements come. */
35343 contents = 0;
35344 for (i = 0; i < nelt; ++i)
35345 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
35346
35347 memset (remap, 0xff, sizeof (remap));
35348 dremap = *d;
35349
35350 if (GET_MODE_SIZE (d->vmode) == 16)
35351 {
35352 unsigned HOST_WIDE_INT h1, h2, h3, h4;
35353
35354 /* Split the two input vectors into 4 halves. */
35355 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
35356 h2 = h1 << nelt2;
35357 h3 = h2 << nelt2;
35358 h4 = h3 << nelt2;
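/* E.g. for V4SImode (nelt == 4) the masks h1 .. h4 are 0x3, 0xc, 0x30
   and 0xc0; a permutation such as { 1 4 0 5 } yields contents == 0x33
   == (h1 | h3), so every element comes from the low halves and the
   punpckl* case below applies.  */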
35359
35360 /* If all the elements are from the low halves, use interleave low;
35361 similarly use interleave high for the high halves. If the elements are
35362 from mismatched halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
35363 if ((contents & (h1 | h3)) == contents)
35364 {
35365 /* punpckl* */
35366 for (i = 0; i < nelt2; ++i)
35367 {
35368 remap[i] = i * 2;
35369 remap[i + nelt] = i * 2 + 1;
35370 dremap.perm[i * 2] = i;
35371 dremap.perm[i * 2 + 1] = i + nelt;
35372 }
35373 }
35374 else if ((contents & (h2 | h4)) == contents)
35375 {
35376 /* punpckh* */
35377 for (i = 0; i < nelt2; ++i)
35378 {
35379 remap[i + nelt2] = i * 2;
35380 remap[i + nelt + nelt2] = i * 2 + 1;
35381 dremap.perm[i * 2] = i + nelt2;
35382 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
35383 }
35384 }
35385 else if ((contents & (h1 | h4)) == contents)
35386 {
35387 /* shufps */
35388 for (i = 0; i < nelt2; ++i)
35389 {
35390 remap[i] = i;
35391 remap[i + nelt + nelt2] = i + nelt2;
35392 dremap.perm[i] = i;
35393 dremap.perm[i + nelt2] = i + nelt + nelt2;
35394 }
35395 if (nelt != 4)
35396 {
35397 /* shufpd */
35398 dremap.vmode = V2DImode;
35399 dremap.nelt = 2;
35400 dremap.perm[0] = 0;
35401 dremap.perm[1] = 3;
35402 }
35403 }
35404 else if ((contents & (h2 | h3)) == contents)
35405 {
35406 /* shufps */
35407 for (i = 0; i < nelt2; ++i)
35408 {
35409 remap[i + nelt2] = i;
35410 remap[i + nelt] = i + nelt2;
35411 dremap.perm[i] = i + nelt2;
35412 dremap.perm[i + nelt2] = i + nelt;
35413 }
35414 if (nelt != 4)
35415 {
35416 /* shufpd */
35417 dremap.vmode = V2DImode;
35418 dremap.nelt = 2;
35419 dremap.perm[0] = 1;
35420 dremap.perm[1] = 2;
35421 }
35422 }
35423 else
35424 return false;
35425 }
35426 else
35427 {
35428 unsigned int nelt4 = nelt / 4, nzcnt = 0;
35429 unsigned HOST_WIDE_INT q[8];
35430 unsigned int nonzero_halves[4];
35431
35432 /* Split the two input vectors into 8 quarters. */
35433 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
35434 for (i = 1; i < 8; ++i)
35435 q[i] = q[0] << (nelt4 * i);
35436 for (i = 0; i < 4; ++i)
35437 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
35438 {
35439 nonzero_halves[nzcnt] = i;
35440 ++nzcnt;
35441 }
35442
35443 if (nzcnt == 1)
35444 {
35445 gcc_assert (d->op0 == d->op1);
35446 nonzero_halves[1] = nonzero_halves[0];
35447 same_halves = true;
35448 }
35449 else if (d->op0 == d->op1)
35450 {
35451 gcc_assert (nonzero_halves[0] == 0);
35452 gcc_assert (nonzero_halves[1] == 1);
35453 }
35454
35455 if (nzcnt <= 2)
35456 {
35457 if (d->perm[0] / nelt2 == nonzero_halves[1])
35458 {
35459 /* Attempt to increase the likelihood that the dfinal
35460 shuffle will be intra-lane. */
35461 char tmph = nonzero_halves[0];
35462 nonzero_halves[0] = nonzero_halves[1];
35463 nonzero_halves[1] = tmph;
35464 }
35465
35466 /* vperm2f128 or vperm2i128. */
35467 for (i = 0; i < nelt2; ++i)
35468 {
35469 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
35470 remap[i + nonzero_halves[0] * nelt2] = i;
35471 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
35472 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
35473 }
35474
35475 if (d->vmode != V8SFmode
35476 && d->vmode != V4DFmode
35477 && d->vmode != V8SImode)
35478 {
35479 dremap.vmode = V8SImode;
35480 dremap.nelt = 8;
35481 for (i = 0; i < 4; ++i)
35482 {
35483 dremap.perm[i] = i + nonzero_halves[0] * 4;
35484 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
35485 }
35486 }
35487 }
35488 else if (d->op0 == d->op1)
35489 return false;
35490 else if (TARGET_AVX2
35491 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
35492 {
35493 /* vpunpckl* */
35494 for (i = 0; i < nelt4; ++i)
35495 {
35496 remap[i] = i * 2;
35497 remap[i + nelt] = i * 2 + 1;
35498 remap[i + nelt2] = i * 2 + nelt2;
35499 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
35500 dremap.perm[i * 2] = i;
35501 dremap.perm[i * 2 + 1] = i + nelt;
35502 dremap.perm[i * 2 + nelt2] = i + nelt2;
35503 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
35504 }
35505 }
35506 else if (TARGET_AVX2
35507 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
35508 {
35509 /* vpunpckh* */
35510 for (i = 0; i < nelt4; ++i)
35511 {
35512 remap[i + nelt4] = i * 2;
35513 remap[i + nelt + nelt4] = i * 2 + 1;
35514 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
35515 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
35516 dremap.perm[i * 2] = i + nelt4;
35517 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
35518 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
35519 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
35520 }
35521 }
35522 else
35523 return false;
35524 }
35525
35526 /* Use the remapping array set up above to move the elements from their
35527 swizzled locations into their final destinations. */
35528 dfinal = *d;
35529 for (i = 0; i < nelt; ++i)
35530 {
35531 unsigned e = remap[d->perm[i]];
35532 gcc_assert (e < nelt);
35533 /* If same_halves is true, both halves of the remapped vector are the
35534 same. Avoid cross-lane accesses if possible. */
35535 if (same_halves && i >= nelt2)
35536 {
35537 gcc_assert (e < nelt2);
35538 dfinal.perm[i] = e + nelt2;
35539 }
35540 else
35541 dfinal.perm[i] = e;
35542 }
35543 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
35544 dfinal.op1 = dfinal.op0;
35545 dremap.target = dfinal.op0;
35546
35547 /* Test if the final remap can be done with a single insn. For V4SFmode or
35548 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
35549 start_sequence ();
35550 ok = expand_vec_perm_1 (&dfinal);
35551 seq = get_insns ();
35552 end_sequence ();
35553
35554 if (!ok)
35555 return false;
35556
35557 if (d->testing_p)
35558 return true;
35559
35560 if (dremap.vmode != dfinal.vmode)
35561 {
35562 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
35563 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
35564 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
35565 }
35566
35567 ok = expand_vec_perm_1 (&dremap);
35568 gcc_assert (ok);
35569
35570 emit_insn (seq);
35571 return true;
35572 }
35573
35574 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
35575 a single vector cross-lane permutation into vpermq followed
35576 by any of the single insn permutations. */
35577
35578 static bool
35579 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
35580 {
35581 struct expand_vec_perm_d dremap, dfinal;
35582 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
35583 unsigned contents[2];
35584 bool ok;
35585
35586 if (!(TARGET_AVX2
35587 && (d->vmode == V32QImode || d->vmode == V16HImode)
35588 && d->op0 == d->op1))
35589 return false;
35590
35591 contents[0] = 0;
35592 contents[1] = 0;
35593 for (i = 0; i < nelt2; ++i)
35594 {
35595 contents[0] |= 1u << (d->perm[i] / nelt4);
35596 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
35597 }
35598
35599 for (i = 0; i < 2; ++i)
35600 {
35601 unsigned int cnt = 0;
35602 for (j = 0; j < 4; ++j)
35603 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
35604 return false;
35605 }
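/* At this point each half of the result uses at most two of the four
   64-bit quarters of the operand, so the V4DImode permutation (dremap)
   below can gather those quarters into the right 128-bit lane, leaving
   an in-lane shuffle (dfinal) to place the individual elements.  */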
35606
35607 if (d->testing_p)
35608 return true;
35609
35610 dremap = *d;
35611 dremap.vmode = V4DImode;
35612 dremap.nelt = 4;
35613 dremap.target = gen_reg_rtx (V4DImode);
35614 dremap.op0 = gen_lowpart (V4DImode, d->op0);
35615 dremap.op1 = dremap.op0;
35616 for (i = 0; i < 2; ++i)
35617 {
35618 unsigned int cnt = 0;
35619 for (j = 0; j < 4; ++j)
35620 if ((contents[i] & (1u << j)) != 0)
35621 dremap.perm[2 * i + cnt++] = j;
35622 for (; cnt < 2; ++cnt)
35623 dremap.perm[2 * i + cnt] = 0;
35624 }
35625
35626 dfinal = *d;
35627 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
35628 dfinal.op1 = dfinal.op0;
35629 for (i = 0, j = 0; i < nelt; ++i)
35630 {
35631 if (i == nelt2)
35632 j = 2;
35633 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
35634 if ((d->perm[i] / nelt4) == dremap.perm[j])
35635 ;
35636 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
35637 dfinal.perm[i] |= nelt4;
35638 else
35639 gcc_unreachable ();
35640 }
35641
35642 ok = expand_vec_perm_1 (&dremap);
35643 gcc_assert (ok);
35644
35645 ok = expand_vec_perm_1 (&dfinal);
35646 gcc_assert (ok);
35647
35648 return true;
35649 }
35650
35651 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
35652 a two vector permutation using 2 intra-lane interleave insns
35653 and cross-lane shuffle for 32-byte vectors. */
35654
35655 static bool
35656 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
35657 {
35658 unsigned i, nelt;
35659 rtx (*gen) (rtx, rtx, rtx);
35660
35661 if (d->op0 == d->op1)
35662 return false;
35663 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
35664 ;
35665 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
35666 ;
35667 else
35668 return false;
35669
35670 nelt = d->nelt;
35671 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
35672 return false;
35673 for (i = 0; i < nelt; i += 2)
35674 if (d->perm[i] != d->perm[0] + i / 2
35675 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
35676 return false;
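/* I.e. the permutation must be a whole-vector interleave, e.g. for
   V8SImode either { 0 8 1 9 2 10 3 11 } (low) or
   { 4 12 5 13 6 14 7 15 } (high).  */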
35677
35678 if (d->testing_p)
35679 return true;
35680
35681 switch (d->vmode)
35682 {
35683 case V32QImode:
35684 if (d->perm[0])
35685 gen = gen_vec_interleave_highv32qi;
35686 else
35687 gen = gen_vec_interleave_lowv32qi;
35688 break;
35689 case V16HImode:
35690 if (d->perm[0])
35691 gen = gen_vec_interleave_highv16hi;
35692 else
35693 gen = gen_vec_interleave_lowv16hi;
35694 break;
35695 case V8SImode:
35696 if (d->perm[0])
35697 gen = gen_vec_interleave_highv8si;
35698 else
35699 gen = gen_vec_interleave_lowv8si;
35700 break;
35701 case V4DImode:
35702 if (d->perm[0])
35703 gen = gen_vec_interleave_highv4di;
35704 else
35705 gen = gen_vec_interleave_lowv4di;
35706 break;
35707 case V8SFmode:
35708 if (d->perm[0])
35709 gen = gen_vec_interleave_highv8sf;
35710 else
35711 gen = gen_vec_interleave_lowv8sf;
35712 break;
35713 case V4DFmode:
35714 if (d->perm[0])
35715 gen = gen_vec_interleave_highv4df;
35716 else
35717 gen = gen_vec_interleave_lowv4df;
35718 break;
35719 default:
35720 gcc_unreachable ();
35721 }
35722
35723 emit_insn (gen (d->target, d->op0, d->op1));
35724 return true;
35725 }
35726
35727 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
35728 permutation with two pshufb insns and an ior. We should have already
35729 failed all two instruction sequences. */
35730
35731 static bool
35732 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
35733 {
35734 rtx rperm[2][16], vperm, l, h, op, m128;
35735 unsigned int i, nelt, eltsz;
35736
35737 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
35738 return false;
35739 gcc_assert (d->op0 != d->op1);
35740
35741 nelt = d->nelt;
35742 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
35743
35744 /* Generate two permutation masks. If the required element is within
35745 the given vector it is shuffled into the proper lane. If the required
35746 element is in the other vector, force a zero into the lane by setting
35747 bit 7 in the permutation mask. */
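/* For example, extracting the even elements of two V16QImode operands
   uses the masks { 0 2 4 ... 14 -128 ... -128 } for op0 and
   { -128 ... -128 0 2 4 ... 14 } for op1; the ior of the two pshufb
   results then yields the requested vector.  */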
35748 m128 = GEN_INT (-128);
35749 for (i = 0; i < nelt; ++i)
35750 {
35751 unsigned j, e = d->perm[i];
35752 unsigned which = (e >= nelt);
35753 if (e >= nelt)
35754 e -= nelt;
35755
35756 for (j = 0; j < eltsz; ++j)
35757 {
35758 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
35759 rperm[1-which][i*eltsz + j] = m128;
35760 }
35761 }
35762
35763 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
35764 vperm = force_reg (V16QImode, vperm);
35765
35766 l = gen_reg_rtx (V16QImode);
35767 op = gen_lowpart (V16QImode, d->op0);
35768 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
35769
35770 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
35771 vperm = force_reg (V16QImode, vperm);
35772
35773 h = gen_reg_rtx (V16QImode);
35774 op = gen_lowpart (V16QImode, d->op1);
35775 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
35776
35777 op = gen_lowpart (V16QImode, d->target);
35778 emit_insn (gen_iorv16qi3 (op, l, h));
35779
35780 return true;
35781 }
35782
35783 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
35784 with two vpshufb insns, vpermq and vpor. We should have already failed
35785 all two or three instruction sequences. */
35786
35787 static bool
35788 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
35789 {
35790 rtx rperm[2][32], vperm, l, h, hp, op, m128;
35791 unsigned int i, nelt, eltsz;
35792
35793 if (!TARGET_AVX2
35794 || d->op0 != d->op1
35795 || (d->vmode != V32QImode && d->vmode != V16HImode))
35796 return false;
35797
35798 if (d->testing_p)
35799 return true;
35800
35801 nelt = d->nelt;
35802 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
35803
35804 /* Generate two permutation masks. If the required element is within
35805 the same lane, it is shuffled in. If the required element is from the
35806 other lane, force a zero by setting bit 7 in the permutation mask.
35807 The other mask has non-negative entries for elements that are requested
35808 from the other lane; those entries are also placed in the other lane,
35809 so that the result of that vpshufb can have its two V2TImode halves
35810 swapped. */
35811 m128 = GEN_INT (-128);
35812 for (i = 0; i < nelt; ++i)
35813 {
35814 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
35815 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
35816
35817 for (j = 0; j < eltsz; ++j)
35818 {
35819 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
35820 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
35821 }
35822 }
35823
35824 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
35825 vperm = force_reg (V32QImode, vperm);
35826
35827 h = gen_reg_rtx (V32QImode);
35828 op = gen_lowpart (V32QImode, d->op0);
35829 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
35830
35831 /* Swap the 128-bit lanes of h into hp. */
35832 hp = gen_reg_rtx (V4DImode);
35833 op = gen_lowpart (V4DImode, h);
35834 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
35835 const1_rtx));
35836
35837 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
35838 vperm = force_reg (V32QImode, vperm);
35839
35840 l = gen_reg_rtx (V32QImode);
35841 op = gen_lowpart (V32QImode, d->op0);
35842 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
35843
35844 op = gen_lowpart (V32QImode, d->target);
35845 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
35846
35847 return true;
35848 }
35849
35850 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
35851 and extract-odd permutations of two V32QImode or V16HImode operands
35852 with two vpshufb insns, vpor and vpermq. We should have already
35853 failed all two or three instruction sequences. */
35854
35855 static bool
35856 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
35857 {
35858 rtx rperm[2][32], vperm, l, h, ior, op, m128;
35859 unsigned int i, nelt, eltsz;
35860
35861 if (!TARGET_AVX2
35862 || d->op0 == d->op1
35863 || (d->vmode != V32QImode && d->vmode != V16HImode))
35864 return false;
35865
35866 for (i = 0; i < d->nelt; ++i)
35867 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
35868 return false;
35869
35870 if (d->testing_p)
35871 return true;
35872
35873 nelt = d->nelt;
35874 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
35875
35876 /* Generate two permutation masks. In the first permutation mask
35877 the first quarter will contain indexes for the first half
35878 of the op0, the second quarter will contain bit 7 set, third quarter
35879 will contain indexes for the second half of the op0 and the
35880 last quarter bit 7 set. In the second permutation mask
35881 the first quarter will contain bit 7 set, the second quarter
35882 indexes for the first half of the op1, the third quarter bit 7 set
35883 and last quarter indexes for the second half of the op1.
35884 I.e. the first mask e.g. for V32QImode extract even will be:
35885 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
35886 (all values masked with 0xf except for -128) and second mask
35887 for extract even will be
35888 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
35889 m128 = GEN_INT (-128);
35890 for (i = 0; i < nelt; ++i)
35891 {
35892 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
35893 unsigned which = d->perm[i] >= nelt;
35894 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
35895
35896 for (j = 0; j < eltsz; ++j)
35897 {
35898 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
35899 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
35900 }
35901 }
35902
35903 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
35904 vperm = force_reg (V32QImode, vperm);
35905
35906 l = gen_reg_rtx (V32QImode);
35907 op = gen_lowpart (V32QImode, d->op0);
35908 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
35909
35910 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
35911 vperm = force_reg (V32QImode, vperm);
35912
35913 h = gen_reg_rtx (V32QImode);
35914 op = gen_lowpart (V32QImode, d->op1);
35915 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
35916
35917 ior = gen_reg_rtx (V32QImode);
35918 emit_insn (gen_iorv32qi3 (ior, l, h));
35919
35920 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
35921 op = gen_lowpart (V4DImode, d->target);
35922 ior = gen_lowpart (V4DImode, ior);
35923 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
35924 const1_rtx, GEN_INT (3)));
35925
35926 return true;
35927 }
35928
35929 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
35930 and extract-odd permutations. */
35931
35932 static bool
35933 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
35934 {
35935 rtx t1, t2, t3;
35936
35937 switch (d->vmode)
35938 {
35939 case V4DFmode:
35940 t1 = gen_reg_rtx (V4DFmode);
35941 t2 = gen_reg_rtx (V4DFmode);
35942
35943 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
35944 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
35945 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
35946
35947 /* Now an unpck[lh]pd will produce the result required. */
35948 if (odd)
35949 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
35950 else
35951 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
35952 emit_insn (t3);
35953 break;
35954
35955 case V8SFmode:
35956 {
35957 int mask = odd ? 0xdd : 0x88;
35958
35959 t1 = gen_reg_rtx (V8SFmode);
35960 t2 = gen_reg_rtx (V8SFmode);
35961 t3 = gen_reg_rtx (V8SFmode);
35962
35963 /* Shuffle within the 128-bit lanes to produce:
35964 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
35965 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
35966 GEN_INT (mask)));
35967
35968 /* Shuffle the lanes around to produce:
35969 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
35970 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
35971 GEN_INT (0x3)));
35972
35973 /* Shuffle within the 128-bit lanes to produce:
35974 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
35975 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
35976
35977 /* Shuffle within the 128-bit lanes to produce:
35978 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
35979 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
35980
35981 /* Shuffle the lanes around to produce:
35982 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
35983 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
35984 GEN_INT (0x20)));
35985 }
35986 break;
35987
35988 case V2DFmode:
35989 case V4SFmode:
35990 case V2DImode:
35991 case V4SImode:
35992 /* These are always directly implementable by expand_vec_perm_1. */
35993 gcc_unreachable ();
35994
35995 case V8HImode:
35996 if (TARGET_SSSE3)
35997 return expand_vec_perm_pshufb2 (d);
35998 else
35999 {
36000 /* We need 2*log2(N)-1 operations to achieve odd/even
36001 with interleave. */
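/* With op0 == { 0 ... 7 } and op1 == { 8 ... 15 } the five interleaves
   below compute successively
     t1     = { 4 12 5 13 6 14 7 15 }
     target = { 0 8 1 9 2 10 3 11 }
     t2     = { 2 6 10 14 3 7 11 15 }
     target = { 0 4 8 12 1 5 9 13 }
   and the final low/high interleave of target with t2 produces
   { 0 2 4 6 8 10 12 14 } resp. { 1 3 5 7 9 11 13 15 }.  */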
36002 t1 = gen_reg_rtx (V8HImode);
36003 t2 = gen_reg_rtx (V8HImode);
36004 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
36005 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
36006 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
36007 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
36008 if (odd)
36009 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
36010 else
36011 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
36012 emit_insn (t3);
36013 }
36014 break;
36015
36016 case V16QImode:
36017 if (TARGET_SSSE3)
36018 return expand_vec_perm_pshufb2 (d);
36019 else
36020 {
36021 t1 = gen_reg_rtx (V16QImode);
36022 t2 = gen_reg_rtx (V16QImode);
36023 t3 = gen_reg_rtx (V16QImode);
36024 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
36025 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
36026 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
36027 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
36028 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
36029 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
36030 if (odd)
36031 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
36032 else
36033 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
36034 emit_insn (t3);
36035 }
36036 break;
36037
36038 case V16HImode:
36039 case V32QImode:
36040 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
36041
36042 case V4DImode:
36043 if (!TARGET_AVX2)
36044 {
36045 struct expand_vec_perm_d d_copy = *d;
36046 d_copy.vmode = V4DFmode;
36047 d_copy.target = gen_lowpart (V4DFmode, d->target);
36048 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
36049 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
36050 return expand_vec_perm_even_odd_1 (&d_copy, odd);
36051 }
36052
36053 t1 = gen_reg_rtx (V4DImode);
36054 t2 = gen_reg_rtx (V4DImode);
36055
36056 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
36057 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
36058 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
36059
36060 /* Now a vpunpck[lh]qdq will produce the result required. */
36061 if (odd)
36062 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
36063 else
36064 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
36065 emit_insn (t3);
36066 break;
36067
36068 case V8SImode:
36069 if (!TARGET_AVX2)
36070 {
36071 struct expand_vec_perm_d d_copy = *d;
36072 d_copy.vmode = V8SFmode;
36073 d_copy.target = gen_lowpart (V8SFmode, d->target);
36074 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
36075 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
36076 return expand_vec_perm_even_odd_1 (&d_copy, odd);
36077 }
36078
36079 t1 = gen_reg_rtx (V8SImode);
36080 t2 = gen_reg_rtx (V8SImode);
36081
36082 /* Shuffle the lanes around into
36083 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
36084 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
36085 gen_lowpart (V4DImode, d->op0),
36086 gen_lowpart (V4DImode, d->op1),
36087 GEN_INT (0x20)));
36088 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
36089 gen_lowpart (V4DImode, d->op0),
36090 gen_lowpart (V4DImode, d->op1),
36091 GEN_INT (0x31)));
36092
36093 /* Swap the 2nd and 3rd position in each lane into
36094 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
36095 emit_insn (gen_avx2_pshufdv3 (t1, t1,
36096 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
36097 emit_insn (gen_avx2_pshufdv3 (t2, t2,
36098 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
36099
36100 /* Now a vpunpck[lh]qdq will produce
36101 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
36102 if (odd)
36103 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
36104 gen_lowpart (V4DImode, t1),
36105 gen_lowpart (V4DImode, t2));
36106 else
36107 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
36108 gen_lowpart (V4DImode, t1),
36109 gen_lowpart (V4DImode, t2));
36110 emit_insn (t3);
36111 break;
36112
36113 default:
36114 gcc_unreachable ();
36115 }
36116
36117 return true;
36118 }
36119
36120 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
36121 extract-even and extract-odd permutations. */
36122
36123 static bool
36124 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
36125 {
36126 unsigned i, odd, nelt = d->nelt;
36127
36128 odd = d->perm[0];
36129 if (odd != 0 && odd != 1)
36130 return false;
36131
36132 for (i = 1; i < nelt; ++i)
36133 if (d->perm[i] != 2 * i + odd)
36134 return false;
36135
36136 return expand_vec_perm_even_odd_1 (d, odd);
36137 }
36138
36139 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
36140 permutations. We assume that expand_vec_perm_1 has already failed. */
36141
36142 static bool
36143 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
36144 {
36145 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
36146 enum machine_mode vmode = d->vmode;
36147 unsigned char perm2[4];
36148 rtx op0 = d->op0;
36149 bool ok;
36150
36151 switch (vmode)
36152 {
36153 case V4DFmode:
36154 case V8SFmode:
36155 /* These are special-cased in sse.md so that we can optionally
36156 use the vbroadcast instruction. They expand to two insns
36157 if the input happens to be in a register. */
36158 gcc_unreachable ();
36159
36160 case V2DFmode:
36161 case V2DImode:
36162 case V4SFmode:
36163 case V4SImode:
36164 /* These are always implementable using standard shuffle patterns. */
36165 gcc_unreachable ();
36166
36167 case V8HImode:
36168 case V16QImode:
36169 /* These can be implemented via interleave. We save one insn by
36170 stopping once we have promoted to V4SImode and then use pshufd. */
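/* E.g. broadcasting element 5 of a V8HImode vector: since 5 >= 4 the
   high interleave is used, giving { a4 a4 a5 a5 a6 a6 a7 a7 }; viewed
   as V4SImode, a pshufd replicating element 1 then broadcasts a5 into
   every position.  */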
36171 do
36172 {
36173 optab otab = vec_interleave_low_optab;
36174
36175 if (elt >= nelt2)
36176 {
36177 otab = vec_interleave_high_optab;
36178 elt -= nelt2;
36179 }
36180 nelt2 /= 2;
36181
36182 op0 = expand_binop (vmode, otab, op0, op0, NULL, 0, OPTAB_DIRECT);
36183 vmode = get_mode_wider_vector (vmode);
36184 op0 = gen_lowpart (vmode, op0);
36185 }
36186 while (vmode != V4SImode);
36187
36188 memset (perm2, elt, 4);
36189 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4);
36190 gcc_assert (ok);
36191 return true;
36192
36193 case V32QImode:
36194 case V16HImode:
36195 case V8SImode:
36196 case V4DImode:
36197 /* For AVX2, broadcasts of the first element should have been handled
36198 by expand_vec_perm_1 via vpbroadcast* or vpermq. */
36199 gcc_assert (!TARGET_AVX2 || d->perm[0]);
36200 return false;
36201
36202 default:
36203 gcc_unreachable ();
36204 }
36205 }
36206
36207 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
36208 broadcast permutations. */
36209
36210 static bool
36211 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
36212 {
36213 unsigned i, elt, nelt = d->nelt;
36214
36215 if (d->op0 != d->op1)
36216 return false;
36217
36218 elt = d->perm[0];
36219 for (i = 1; i < nelt; ++i)
36220 if (d->perm[i] != elt)
36221 return false;
36222
36223 return expand_vec_perm_broadcast_1 (d);
36224 }
36225
36226 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
36227 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
36228 all the shorter instruction sequences. */
36229
36230 static bool
36231 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
36232 {
36233 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
36234 unsigned int i, nelt, eltsz;
36235 bool used[4];
36236
36237 if (!TARGET_AVX2
36238 || d->op0 == d->op1
36239 || (d->vmode != V32QImode && d->vmode != V16HImode))
36240 return false;
36241
36242 if (d->testing_p)
36243 return true;
36244
36245 nelt = d->nelt;
36246 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36247
36248 /* Generate 4 permutation masks. If the required element is within
36249 the same lane, it is shuffled in. If the required element is from the
36250 other lane, force a zero by setting bit 7 in the permutation mask.
36251 The other mask has non-negative entries for elements that are requested
36252 from the other lane; those entries are also placed in the other lane,
36253 so that the result of that vpshufb can have its two V2TImode halves
36254 swapped. */
36255 m128 = GEN_INT (-128);
36256 for (i = 0; i < 32; ++i)
36257 {
36258 rperm[0][i] = m128;
36259 rperm[1][i] = m128;
36260 rperm[2][i] = m128;
36261 rperm[3][i] = m128;
36262 }
36263 used[0] = false;
36264 used[1] = false;
36265 used[2] = false;
36266 used[3] = false;
36267 for (i = 0; i < nelt; ++i)
36268 {
36269 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
36270 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
36271 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
36272
36273 for (j = 0; j < eltsz; ++j)
36274 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
36275 used[which] = true;
36276 }
36277
36278 for (i = 0; i < 2; ++i)
36279 {
36280 if (!used[2 * i + 1])
36281 {
36282 h[i] = NULL_RTX;
36283 continue;
36284 }
36285 vperm = gen_rtx_CONST_VECTOR (V32QImode,
36286 gen_rtvec_v (32, rperm[2 * i + 1]));
36287 vperm = force_reg (V32QImode, vperm);
36288 h[i] = gen_reg_rtx (V32QImode);
36289 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
36290 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
36291 }
36292
36293 /* Swap the 128-bit lanes of h[X]. */
36294 for (i = 0; i < 2; ++i)
36295 {
36296 if (h[i] == NULL_RTX)
36297 continue;
36298 op = gen_reg_rtx (V4DImode);
36299 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
36300 const2_rtx, GEN_INT (3), const0_rtx,
36301 const1_rtx));
36302 h[i] = gen_lowpart (V32QImode, op);
36303 }
36304
36305 for (i = 0; i < 2; ++i)
36306 {
36307 if (!used[2 * i])
36308 {
36309 l[i] = NULL_RTX;
36310 continue;
36311 }
36312 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
36313 vperm = force_reg (V32QImode, vperm);
36314 l[i] = gen_reg_rtx (V32QImode);
36315 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
36316 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
36317 }
36318
36319 for (i = 0; i < 2; ++i)
36320 {
36321 if (h[i] && l[i])
36322 {
36323 op = gen_reg_rtx (V32QImode);
36324 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
36325 l[i] = op;
36326 }
36327 else if (h[i])
36328 l[i] = h[i];
36329 }
36330
36331 gcc_assert (l[0] && l[1]);
36332 op = gen_lowpart (V32QImode, d->target);
36333 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
36334 return true;
36335 }
36336
36337 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
36338 With all of the interface bits taken care of, perform the expansion
36339 in D and return true on success. */
36340
36341 static bool
36342 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
36343 {
36344 /* Try a single instruction expansion. */
36345 if (expand_vec_perm_1 (d))
36346 return true;
36347
36348 /* Try sequences of two instructions. */
36349
36350 if (expand_vec_perm_pshuflw_pshufhw (d))
36351 return true;
36352
36353 if (expand_vec_perm_palignr (d))
36354 return true;
36355
36356 if (expand_vec_perm_interleave2 (d))
36357 return true;
36358
36359 if (expand_vec_perm_broadcast (d))
36360 return true;
36361
36362 if (expand_vec_perm_vpermq_perm_1 (d))
36363 return true;
36364
36365 /* Try sequences of three instructions. */
36366
36367 if (expand_vec_perm_pshufb2 (d))
36368 return true;
36369
36370 if (expand_vec_perm_interleave3 (d))
36371 return true;
36372
36373 /* Try sequences of four instructions. */
36374
36375 if (expand_vec_perm_vpshufb2_vpermq (d))
36376 return true;
36377
36378 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
36379 return true;
36380
36381 /* ??? Look for narrow permutations whose element orderings would
36382 allow the promotion to a wider mode. */
36383
36384 /* ??? Look for sequences of interleave or a wider permute that place
36385 the data into the correct lanes for a half-vector shuffle like
36386 pshuf[lh]w or vpermilps. */
36387
36388 /* ??? Look for sequences of interleave that produce the desired results.
36389 The combinatorics of punpck[lh] get pretty ugly... */
36390
36391 if (expand_vec_perm_even_odd (d))
36392 return true;
36393
36394 /* Even longer sequences. */
36395 if (expand_vec_perm_vpshufb4_vpermq2 (d))
36396 return true;
36397
36398 return false;
36399 }
36400
36401 bool
36402 ix86_expand_vec_perm_const (rtx operands[4])
36403 {
36404 struct expand_vec_perm_d d;
36405 unsigned char perm[MAX_VECT_LEN];
36406 int i, nelt, which;
36407 rtx sel;
36408
36409 d.target = operands[0];
36410 d.op0 = operands[1];
36411 d.op1 = operands[2];
36412 sel = operands[3];
36413
36414 d.vmode = GET_MODE (d.target);
36415 gcc_assert (VECTOR_MODE_P (d.vmode));
36416 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
36417 d.testing_p = false;
36418
36419 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
36420 gcc_assert (XVECLEN (sel, 0) == nelt);
36421 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
36422
36423 for (i = which = 0; i < nelt; ++i)
36424 {
36425 rtx e = XVECEXP (sel, 0, i);
36426 int ei = INTVAL (e) & (2 * nelt - 1);
36427
36428 which |= (ei < nelt ? 1 : 2);
36429 d.perm[i] = ei;
36430 perm[i] = ei;
36431 }
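/* WHICH is now 1 if only elements of the first input vector were
   selected, 2 if only elements of the second one, and 3 if elements
   of both were used.  */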
36432
36433 switch (which)
36434 {
36435 default:
36436 gcc_unreachable ();
36437
36438 case 3:
36439 if (!rtx_equal_p (d.op0, d.op1))
36440 break;
36441
36442 /* The elements of PERM refer to both operands, but the two operands
36443 are in fact identical. Allow easier matching of the permutation
36444 by folding the references to the second operand into the single
36445 input vector. */
36446 for (i = 0; i < nelt; ++i)
36447 if (d.perm[i] >= nelt)
36448 d.perm[i] -= nelt;
36449 /* FALLTHRU */
36450
36451 case 1:
36452 d.op1 = d.op0;
36453 break;
36454
36455 case 2:
36456 for (i = 0; i < nelt; ++i)
36457 d.perm[i] -= nelt;
36458 d.op0 = d.op1;
36459 break;
36460 }
36461
36462 if (ix86_expand_vec_perm_const_1 (&d))
36463 return true;
36464
36465 /* If the mask says both arguments are needed, but they are the same,
36466 the above tried to expand with d.op0 == d.op1. If that didn't work,
36467 retry with d.op0 != d.op1 as that is what testing has been done with. */
36468 if (which == 3 && d.op0 == d.op1)
36469 {
36470 rtx seq;
36471 bool ok;
36472
36473 memcpy (d.perm, perm, sizeof (perm));
36474 d.op1 = gen_reg_rtx (d.vmode);
36475 start_sequence ();
36476 ok = ix86_expand_vec_perm_const_1 (&d);
36477 seq = get_insns ();
36478 end_sequence ();
36479 if (ok)
36480 {
36481 emit_move_insn (d.op1, d.op0);
36482 emit_insn (seq);
36483 return true;
36484 }
36485 }
36486
36487 return false;
36488 }
36489
36490 /* Implement targetm.vectorize.vec_perm_const_ok. */
36491
36492 static bool
36493 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
36494 const unsigned char *sel)
36495 {
36496 struct expand_vec_perm_d d;
36497 unsigned int i, nelt, which;
36498 bool ret, one_vec;
36499
36500 d.vmode = vmode;
36501 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
36502 d.testing_p = true;
36503
36504 /* Given sufficient ISA support we can just return true here
36505 for selected vector modes. */
36506 if (GET_MODE_SIZE (d.vmode) == 16)
36507 {
36508 /* All implementable with a single vpperm insn. */
36509 if (TARGET_XOP)
36510 return true;
36511 /* All implementable with 2 pshufb + 1 ior. */
36512 if (TARGET_SSSE3)
36513 return true;
36514 /* All implementable with shufpd or unpck[lh]pd. */
36515 if (d.nelt == 2)
36516 return true;
36517 }
36518
36519 /* Extract the values from the vector CST into the permutation
36520 array in D. */
36521 memcpy (d.perm, sel, nelt);
36522 for (i = which = 0; i < nelt; ++i)
36523 {
36524 unsigned char e = d.perm[i];
36525 gcc_assert (e < 2 * nelt);
36526 which |= (e < nelt ? 1 : 2);
36527 }
36528
36529 /* For all elements from second vector, fold the elements to first. */
36530 if (which == 2)
36531 for (i = 0; i < nelt; ++i)
36532 d.perm[i] -= nelt;
36533
36534 /* Check whether the mask can be applied to the vector type. */
36535 one_vec = (which != 3);
36536
36537 /* Implementable with shufps or pshufd. */
36538 if (one_vec && (d.vmode == V4SFmode || d.vmode == V4SImode))
36539 return true;
36540
36541 /* Otherwise we have to go through the motions and see if we can
36542 figure out how to generate the requested permutation. */
36543 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
36544 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
36545 if (!one_vec)
36546 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
36547
36548 start_sequence ();
36549 ret = ix86_expand_vec_perm_const_1 (&d);
36550 end_sequence ();
36551
36552 return ret;
36553 }
36554
36555 void
36556 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
36557 {
36558 struct expand_vec_perm_d d;
36559 unsigned i, nelt;
36560
36561 d.target = targ;
36562 d.op0 = op0;
36563 d.op1 = op1;
36564 d.vmode = GET_MODE (targ);
36565 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
36566 d.testing_p = false;
36567
36568 for (i = 0; i < nelt; ++i)
36569 d.perm[i] = i * 2 + odd;
36570
36571 /* We'll either be able to implement the permutation directly... */
36572 if (expand_vec_perm_1 (&d))
36573 return;
36574
36575 /* ... or we use the special-case patterns. */
36576 expand_vec_perm_even_odd_1 (&d, odd);
36577 }
36578
36579 /* Expand an insert into a vector register through pinsr insn.
36580 Return true if successful. */
36581
36582 bool
36583 ix86_expand_pinsr (rtx *operands)
36584 {
36585 rtx dst = operands[0];
36586 rtx src = operands[3];
36587
36588 unsigned int size = INTVAL (operands[1]);
36589 unsigned int pos = INTVAL (operands[2]);
36590
36591 if (GET_CODE (dst) == SUBREG)
36592 {
36593 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
36594 dst = SUBREG_REG (dst);
36595 }
36596
36597 if (GET_CODE (src) == SUBREG)
36598 src = SUBREG_REG (src);
36599
36600 switch (GET_MODE (dst))
36601 {
36602 case V16QImode:
36603 case V8HImode:
36604 case V4SImode:
36605 case V2DImode:
36606 {
36607 enum machine_mode srcmode, dstmode;
36608 rtx (*pinsr)(rtx, rtx, rtx, rtx);
36609
36610 srcmode = mode_for_size (size, MODE_INT, 0);
36611
36612 switch (srcmode)
36613 {
36614 case QImode:
36615 if (!TARGET_SSE4_1)
36616 return false;
36617 dstmode = V16QImode;
36618 pinsr = gen_sse4_1_pinsrb;
36619 break;
36620
36621 case HImode:
36622 if (!TARGET_SSE2)
36623 return false;
36624 dstmode = V8HImode;
36625 pinsr = gen_sse2_pinsrw;
36626 break;
36627
36628 case SImode:
36629 if (!TARGET_SSE4_1)
36630 return false;
36631 dstmode = V4SImode;
36632 pinsr = gen_sse4_1_pinsrd;
36633 break;
36634
36635 case DImode:
36636 gcc_assert (TARGET_64BIT);
36637 if (!TARGET_SSE4_1)
36638 return false;
36639 dstmode = V2DImode;
36640 pinsr = gen_sse4_1_pinsrq;
36641 break;
36642
36643 default:
36644 return false;
36645 }
36646
36647 dst = gen_lowpart (dstmode, dst);
36648 src = gen_lowpart (srcmode, src);
36649
36650 pos /= size;
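/* For example, inserting a 16-bit value at bit position 32 of a V8HImode
   destination gives pos == 2; the 1 << pos merge mask passed below then
   makes the pinsrw replace element 2.  */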
36651
36652 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
36653 return true;
36654 }
36655
36656 default:
36657 return false;
36658 }
36659 }
36660 \f
36661 /* This function returns the calling abi specific va_list type node.
36662 It returns the FNDECL specific va_list type. */
36663
36664 static tree
36665 ix86_fn_abi_va_list (tree fndecl)
36666 {
36667 if (!TARGET_64BIT)
36668 return va_list_type_node;
36669 gcc_assert (fndecl != NULL_TREE);
36670
36671 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
36672 return ms_va_list_type_node;
36673 else
36674 return sysv_va_list_type_node;
36675 }
36676
36677 /* Returns the canonical va_list type specified by TYPE. If there
36678 is no valid TYPE provided, it returns NULL_TREE. */
36679
36680 static tree
36681 ix86_canonical_va_list_type (tree type)
36682 {
36683 tree wtype, htype;
36684
36685 /* Resolve references and pointers to va_list type. */
36686 if (TREE_CODE (type) == MEM_REF)
36687 type = TREE_TYPE (type);
36688 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
36689 type = TREE_TYPE (type);
36690 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
36691 type = TREE_TYPE (type);
36692
36693 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
36694 {
36695 wtype = va_list_type_node;
36696 gcc_assert (wtype != NULL_TREE);
36697 htype = type;
36698 if (TREE_CODE (wtype) == ARRAY_TYPE)
36699 {
36700 /* If va_list is an array type, the argument may have decayed
36701 to a pointer type, e.g. by being passed to another function.
36702 In that case, unwrap both types so that we can compare the
36703 underlying records. */
36704 if (TREE_CODE (htype) == ARRAY_TYPE
36705 || POINTER_TYPE_P (htype))
36706 {
36707 wtype = TREE_TYPE (wtype);
36708 htype = TREE_TYPE (htype);
36709 }
36710 }
36711 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
36712 return va_list_type_node;
36713 wtype = sysv_va_list_type_node;
36714 gcc_assert (wtype != NULL_TREE);
36715 htype = type;
36716 if (TREE_CODE (wtype) == ARRAY_TYPE)
36717 {
36718 /* If va_list is an array type, the argument may have decayed
36719 to a pointer type, e.g. by being passed to another function.
36720 In that case, unwrap both types so that we can compare the
36721 underlying records. */
36722 if (TREE_CODE (htype) == ARRAY_TYPE
36723 || POINTER_TYPE_P (htype))
36724 {
36725 wtype = TREE_TYPE (wtype);
36726 htype = TREE_TYPE (htype);
36727 }
36728 }
36729 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
36730 return sysv_va_list_type_node;
36731 wtype = ms_va_list_type_node;
36732 gcc_assert (wtype != NULL_TREE);
36733 htype = type;
36734 if (TREE_CODE (wtype) == ARRAY_TYPE)
36735 {
36736 /* If va_list is an array type, the argument may have decayed
36737 to a pointer type, e.g. by being passed to another function.
36738 In that case, unwrap both types so that we can compare the
36739 underlying records. */
36740 if (TREE_CODE (htype) == ARRAY_TYPE
36741 || POINTER_TYPE_P (htype))
36742 {
36743 wtype = TREE_TYPE (wtype);
36744 htype = TREE_TYPE (htype);
36745 }
36746 }
36747 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
36748 return ms_va_list_type_node;
36749 return NULL_TREE;
36750 }
36751 return std_canonical_va_list_type (type);
36752 }
36753
36754 /* Iterate through the target-specific builtin types for va_list.
36755 IDX denotes the iterator, *PTREE is set to the result type of
36756 the va_list builtin, and *PNAME to its internal type.
36757 Returns zero if there is no element for this index, otherwise
36758 IDX should be increased upon the next call.
36759 Note, do not iterate a base builtin's name like __builtin_va_list.
36760 Used from c_common_nodes_and_builtins. */
36761
36762 static int
36763 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
36764 {
36765 if (TARGET_64BIT)
36766 {
36767 switch (idx)
36768 {
36769 default:
36770 break;
36771
36772 case 0:
36773 *ptree = ms_va_list_type_node;
36774 *pname = "__builtin_ms_va_list";
36775 return 1;
36776
36777 case 1:
36778 *ptree = sysv_va_list_type_node;
36779 *pname = "__builtin_sysv_va_list";
36780 return 1;
36781 }
36782 }
36783
36784 return 0;
36785 }
36786
36787 #undef TARGET_SCHED_DISPATCH
36788 #define TARGET_SCHED_DISPATCH has_dispatch
36789 #undef TARGET_SCHED_DISPATCH_DO
36790 #define TARGET_SCHED_DISPATCH_DO do_dispatch
36791 #undef TARGET_SCHED_REASSOCIATION_WIDTH
36792 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
36793
36794 /* The size of the dispatch window is the total number of bytes of
36795 object code allowed in a window. */
36796 #define DISPATCH_WINDOW_SIZE 16
36797
36798 /* Number of dispatch windows considered for scheduling. */
36799 #define MAX_DISPATCH_WINDOWS 3
36800
36801 /* Maximum number of instructions in a window. */
36802 #define MAX_INSN 4
36803
36804 /* Maximum number of immediate operands in a window. */
36805 #define MAX_IMM 4
36806
36807 /* Maximum number of immediate bits allowed in a window. */
36808 #define MAX_IMM_SIZE 128
36809
36810 /* Maximum number of 32 bit immediates allowed in a window. */
36811 #define MAX_IMM_32 4
36812
36813 /* Maximum number of 64 bit immediates allowed in a window. */
36814 #define MAX_IMM_64 2
36815
36816 /* Maximum total of loads or prefetches allowed in a window. */
36817 #define MAX_LOAD 2
36818
36819 /* Maximum total of stores allowed in a window. */
36820 #define MAX_STORE 1
36821
36822 #undef BIG
36823 #define BIG 100
36824
36825
36826 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
36827 enum dispatch_group {
36828 disp_no_group = 0,
36829 disp_load,
36830 disp_store,
36831 disp_load_store,
36832 disp_prefetch,
36833 disp_imm,
36834 disp_imm_32,
36835 disp_imm_64,
36836 disp_branch,
36837 disp_cmp,
36838 disp_jcc,
36839 disp_last
36840 };
36841
36842 /* Number of allowable groups in a dispatch window. It is an array
36843 indexed by the dispatch_group enum. 100 is used as a big number,
36844 because the number of these kinds of operations does not have any
36845 effect in a dispatch window, but we need entries for them in
36846 the table for other reasons. */
36847 static unsigned int num_allowable_groups[disp_last] = {
36848 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
36849 };
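/* I.e. per 16-byte window: two loads, one store, one combined load/store,
   two prefetches, four immediate operands (at most four 32-bit or two
   64-bit ones), a single branch, and effectively no limit on compares
   and conditional jumps.  */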
36850
36851 char group_name[disp_last + 1][16] = {
36852 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
36853 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
36854 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
36855 };
36856
36857 /* Instruction path. */
36858 enum insn_path {
36859 no_path = 0,
36860 path_single, /* Single micro op. */
36861 path_double, /* Double micro op. */
36862 path_multi, /* Instructions with more than 2 micro ops. */
36863 last_path
36864 };
36865
36866 /* sched_insn_info describes one instruction scheduled into a dispatch
36867 window: the insn itself, its dispatch group and decode path, and its
36868 byte and immediate sizes.
36869
36870 Windows are allocated for each basic block and are linked
36871 together. */
36872 typedef struct sched_insn_info_s {
36873 rtx insn;
36874 enum dispatch_group group;
36875 enum insn_path path;
36876 int byte_len;
36877 int imm_bytes;
36878 } sched_insn_info;
36879
36880 /* Linked list of dispatch windows. This is a two way list of
36881 dispatch windows of a basic block. It contains information about
36882 the number of uops in the window and the total number of
36883 instructions and of bytes in the object code for this dispatch
36884 window. */
36885 typedef struct dispatch_windows_s {
36886 int num_insn; /* Number of insns in the window. */
36887 int num_uops; /* Number of uops in the window. */
36888 int window_size; /* Number of bytes in the window. */
36889 int window_num; /* Window number, either 0 or 1. */
36890 int num_imm; /* Number of immediates in an insn. */
36891 int num_imm_32; /* Number of 32 bit immediates in an insn. */
36892 int num_imm_64; /* Number of 64 bit immediates in an insn. */
36893 int imm_size; /* Total immediates in the window. */
36894 int num_loads; /* Total memory loads in the window. */
36895 int num_stores; /* Total memory stores in the window. */
36896 int violation; /* Violation exists in window. */
36897 sched_insn_info *window; /* Pointer to the window. */
36898 struct dispatch_windows_s *next;
36899 struct dispatch_windows_s *prev;
36900 } dispatch_windows;
36901
36902 /* Immediate values used in an insn. */
36903 typedef struct imm_info_s
36904 {
36905 int imm;
36906 int imm32;
36907 int imm64;
36908 } imm_info;
36909
36910 static dispatch_windows *dispatch_window_list;
36911 static dispatch_windows *dispatch_window_list1;
36912
36913 /* Get dispatch group of insn. */
36914
36915 static enum dispatch_group
36916 get_mem_group (rtx insn)
36917 {
36918 enum attr_memory memory;
36919
36920 if (INSN_CODE (insn) < 0)
36921 return disp_no_group;
36922 memory = get_attr_memory (insn);
36923 if (memory == MEMORY_STORE)
36924 return disp_store;
36925
36926 if (memory == MEMORY_LOAD)
36927 return disp_load;
36928
36929 if (memory == MEMORY_BOTH)
36930 return disp_load_store;
36931
36932 return disp_no_group;
36933 }
36934
36935 /* Return true if insn is a compare instruction. */
36936
36937 static bool
36938 is_cmp (rtx insn)
36939 {
36940 enum attr_type type;
36941
36942 type = get_attr_type (insn);
36943 return (type == TYPE_TEST
36944 || type == TYPE_ICMP
36945 || type == TYPE_FCMP
36946 || GET_CODE (PATTERN (insn)) == COMPARE);
36947 }
36948
36949 /* Return true if a dispatch violation was encountered. */
36950
36951 static bool
36952 dispatch_violation (void)
36953 {
36954 if (dispatch_window_list->next)
36955 return dispatch_window_list->next->violation;
36956 return dispatch_window_list->violation;
36957 }
36958
36959 /* Return true if insn is a branch instruction. */
36960
36961 static bool
36962 is_branch (rtx insn)
36963 {
36964 return (CALL_P (insn) || JUMP_P (insn));
36965 }
36966
36967 /* Return true if insn is a prefetch instruction. */
36968
36969 static bool
36970 is_prefetch (rtx insn)
36971 {
36972 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
36973 }
36974
36975 /* This function initializes a dispatch window and the list container holding a
36976 pointer to the window. */
36977
36978 static void
36979 init_window (int window_num)
36980 {
36981 int i;
36982 dispatch_windows *new_list;
36983
36984 if (window_num == 0)
36985 new_list = dispatch_window_list;
36986 else
36987 new_list = dispatch_window_list1;
36988
36989 new_list->num_insn = 0;
36990 new_list->num_uops = 0;
36991 new_list->window_size = 0;
36992 new_list->next = NULL;
36993 new_list->prev = NULL;
36994 new_list->window_num = window_num;
36995 new_list->num_imm = 0;
36996 new_list->num_imm_32 = 0;
36997 new_list->num_imm_64 = 0;
36998 new_list->imm_size = 0;
36999 new_list->num_loads = 0;
37000 new_list->num_stores = 0;
37001 new_list->violation = false;
37002
37003 for (i = 0; i < MAX_INSN; i++)
37004 {
37005 new_list->window[i].insn = NULL;
37006 new_list->window[i].group = disp_no_group;
37007 new_list->window[i].path = no_path;
37008 new_list->window[i].byte_len = 0;
37009 new_list->window[i].imm_bytes = 0;
37010 }
37011 return;
37012 }
37013
37014 /* This function allocates and initializes a dispatch window and the
37015 list container holding a pointer to the window. */
37016
37017 static dispatch_windows *
37018 allocate_window (void)
37019 {
37020 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
37021 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
37022
37023 return new_list;
37024 }
37025
37026 /* This routine initializes the dispatch scheduling information. It
37027 initiates building dispatch scheduler tables and constructs the
37028 first dispatch window. */
37029
37030 static void
37031 init_dispatch_sched (void)
37032 {
37033 /* Allocate a dispatch list and a window. */
37034 dispatch_window_list = allocate_window ();
37035 dispatch_window_list1 = allocate_window ();
37036 init_window (0);
37037 init_window (1);
37038 }
37039
37040 /* This function returns true if a branch is detected. The end of a basic
37041 block does not have to be a branch, but here we assume only branches
37042 end a window. */
37043
37044 static bool
37045 is_end_basic_block (enum dispatch_group group)
37046 {
37047 return group == disp_branch;
37048 }
37049
37050 /* This function is called when the end of a window processing is reached. */
37051
37052 static void
37053 process_end_window (void)
37054 {
37055 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
37056 if (dispatch_window_list->next)
37057 {
37058 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
37059 gcc_assert (dispatch_window_list->window_size
37060 + dispatch_window_list1->window_size <= 48);
37061 init_window (1);
37062 }
37063 init_window (0);
37064 }
37065
37066 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
37067 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
37068 for 48 bytes of instructions. Note that these windows are not dispatch
37069 windows whose sizes are DISPATCH_WINDOW_SIZE. */
37070
37071 static dispatch_windows *
37072 allocate_next_window (int window_num)
37073 {
37074 if (window_num == 0)
37075 {
37076 if (dispatch_window_list->next)
37077 init_window (1);
37078 init_window (0);
37079 return dispatch_window_list;
37080 }
37081
37082 dispatch_window_list->next = dispatch_window_list1;
37083 dispatch_window_list1->prev = dispatch_window_list;
37084
37085 return dispatch_window_list1;
37086 }
37087
37088 /* Increment the number of immediate operands of an instruction. */
37089
37090 static int
37091 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
37092 {
37093 if (*in_rtx == 0)
37094 return 0;
37095
37096 switch (GET_CODE (*in_rtx))
37097 {
37098 case CONST:
37099 case SYMBOL_REF:
37100 case CONST_INT:
37101 (imm_values->imm)++;
37102 if (x86_64_immediate_operand (*in_rtx, SImode))
37103 (imm_values->imm32)++;
37104 else
37105 (imm_values->imm64)++;
37106 break;
37107
37108 case CONST_DOUBLE:
37109 (imm_values->imm)++;
37110 (imm_values->imm64)++;
37111 break;
37112
37113 case CODE_LABEL:
37114 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
37115 {
37116 (imm_values->imm)++;
37117 (imm_values->imm32)++;
37118 }
37119 break;
37120
37121 default:
37122 break;
37123 }
37124
37125 return 0;
37126 }
37127
37128 /* Compute number of immediate operands of an instruction. */
37129
37130 static void
37131 find_constant (rtx in_rtx, imm_info *imm_values)
37132 {
37133 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
37134 (rtx_function) find_constant_1, (void *) imm_values);
37135 }
37136
37137 /* Return the total size in bytes of the immediate operands of an instruction
37138 along with the number of corresponding immediate operands. It initializes
37139 its parameters to zero before calling FIND_CONSTANT.
37140 INSN is the input instruction. IMM is the total number of immediates.
37141 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
37142 bit immediates. */
37143
37144 static int
37145 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
37146 {
37147 imm_info imm_values = {0, 0, 0};
37148
37149 find_constant (insn, &imm_values);
37150 *imm = imm_values.imm;
37151 *imm32 = imm_values.imm32;
37152 *imm64 = imm_values.imm64;
37153 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
37154 }
37155
37156 /* This function indicates whether an instruction has any immediate
37157 operands. */
37158
37159 static bool
37160 has_immediate (rtx insn)
37161 {
37162 int num_imm_operand;
37163 int num_imm32_operand;
37164 int num_imm64_operand;
37165
37166 if (insn)
37167 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
37168 &num_imm64_operand);
37169 return false;
37170 }
37171
37172 /* Return the decode path (single, double, or multi) for INSN. */
37173
37174 static enum insn_path
37175 get_insn_path (rtx insn)
37176 {
37177 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
37178
37179 if ((int)path == 0)
37180 return path_single;
37181
37182 if ((int)path == 1)
37183 return path_double;
37184
37185 return path_multi;
37186 }
37187
37188 /* Return insn dispatch group. */
37189
37190 static enum dispatch_group
37191 get_insn_group (rtx insn)
37192 {
37193 enum dispatch_group group = get_mem_group (insn);
37194 if (group)
37195 return group;
37196
37197 if (is_branch (insn))
37198 return disp_branch;
37199
37200 if (is_cmp (insn))
37201 return disp_cmp;
37202
37203 if (has_immediate (insn))
37204 return disp_imm;
37205
37206 if (is_prefetch (insn))
37207 return disp_prefetch;
37208
37209 return disp_no_group;
37210 }
37211
37212 /* Count the number of GROUP-restricted slots that INSN needs in dispatch
37213 window WINDOW_LIST; return BIG if adding INSN would exceed a limit. */
37214
37215 static int
37216 count_num_restricted (rtx insn, dispatch_windows *window_list)
37217 {
37218 enum dispatch_group group = get_insn_group (insn);
37219 int imm_size;
37220 int num_imm_operand;
37221 int num_imm32_operand;
37222 int num_imm64_operand;
37223
37224 if (group == disp_no_group)
37225 return 0;
37226
37227 if (group == disp_imm)
37228 {
37229 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
37230 &num_imm64_operand);
37231 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
37232 || num_imm_operand + window_list->num_imm > MAX_IMM
37233 || (num_imm32_operand > 0
37234 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
37235 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
37236 || (num_imm64_operand > 0
37237 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
37238 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
37239 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
37240 && num_imm64_operand > 0
37241 && ((window_list->num_imm_64 > 0
37242 && window_list->num_insn >= 2)
37243 || window_list->num_insn >= 3)))
37244 return BIG;
37245
37246 return 1;
37247 }
37248
37249 if ((group == disp_load_store
37250 && (window_list->num_loads >= MAX_LOAD
37251 || window_list->num_stores >= MAX_STORE))
37252 || ((group == disp_load
37253 || group == disp_prefetch)
37254 && window_list->num_loads >= MAX_LOAD)
37255 || (group == disp_store
37256 && window_list->num_stores >= MAX_STORE))
37257 return BIG;
37258
37259 return 1;
37260 }
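/* The BIG return value above acts as a sentinel: it is assumed to exceed
   every entry of num_allowable_groups, so when a per-window limit would be
   exceeded the comparison in fits_dispatch_window below rejects INSN.  */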
37261
37262 /* This function returns true if INSN satisfies the dispatch rules of the
37263 last scheduled window. */
37264
37265 static bool
37266 fits_dispatch_window (rtx insn)
37267 {
37268 dispatch_windows *window_list = dispatch_window_list;
37269 dispatch_windows *window_list_next = dispatch_window_list->next;
37270 unsigned int num_restrict;
37271 enum dispatch_group group = get_insn_group (insn);
37272 enum insn_path path = get_insn_path (insn);
37273 int sum;
37274
37275 /* Make disp_cmp and disp_jcc get scheduled as late as possible. These
37276 instructions should be given the lowest priority in the
37277 scheduling process in the Haifa scheduler to make sure they will be
37278 scheduled in the same dispatch window as the reference to them. */
37279 if (group == disp_jcc || group == disp_cmp)
37280 return false;
37281
37282 /* Check nonrestricted. */
37283 if (group == disp_no_group || group == disp_branch)
37284 return true;
37285
37286 /* Get last dispatch window. */
37287 if (window_list_next)
37288 window_list = window_list_next;
37289
37290 if (window_list->window_num == 1)
37291 {
37292 sum = window_list->prev->window_size + window_list->window_size;
37293
37294 if (sum == 32
37295 || (min_insn_size (insn) + sum) >= 48)
37296 /* Window 1 is full. Go for next window. */
37297 return true;
37298 }
37299
37300 num_restrict = count_num_restricted (insn, window_list);
37301
37302 if (num_restrict > num_allowable_groups[group])
37303 return false;
37304
37305 /* See if it fits in the first window. */
37306 if (window_list->window_num == 0)
37307 {
37308 /* The first window should have only single and double path
37309 uops. */
37310 if (path == path_double
37311 && (window_list->num_uops + 2) > MAX_INSN)
37312 return false;
37313 else if (path != path_single)
37314 return false;
37315 }
37316 return true;
37317 }
37318
37319 /* Add an instruction INSN with NUM_UOPS micro-operations to the
37320 dispatch window WINDOW_LIST. */
37321
37322 static void
37323 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
37324 {
37325 int byte_len = min_insn_size (insn);
37326 int num_insn = window_list->num_insn;
37327 int imm_size;
37328 sched_insn_info *window = window_list->window;
37329 enum dispatch_group group = get_insn_group (insn);
37330 enum insn_path path = get_insn_path (insn);
37331 int num_imm_operand;
37332 int num_imm32_operand;
37333 int num_imm64_operand;
37334
37335 if (!window_list->violation && group != disp_cmp
37336 && !fits_dispatch_window (insn))
37337 window_list->violation = true;
37338
37339 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
37340 &num_imm64_operand);
37341
37342 /* Initialize window with new instruction. */
37343 window[num_insn].insn = insn;
37344 window[num_insn].byte_len = byte_len;
37345 window[num_insn].group = group;
37346 window[num_insn].path = path;
37347 window[num_insn].imm_bytes = imm_size;
37348
37349 window_list->window_size += byte_len;
37350 window_list->num_insn = num_insn + 1;
37351 window_list->num_uops = window_list->num_uops + num_uops;
37352 window_list->imm_size += imm_size;
37353 window_list->num_imm += num_imm_operand;
37354 window_list->num_imm_32 += num_imm32_operand;
37355 window_list->num_imm_64 += num_imm64_operand;
37356
37357 if (group == disp_store)
37358 window_list->num_stores += 1;
37359 else if (group == disp_load
37360 || group == disp_prefetch)
37361 window_list->num_loads += 1;
37362 else if (group == disp_load_store)
37363 {
37364 window_list->num_stores += 1;
37365 window_list->num_loads += 1;
37366 }
37367 }
37368
37369 /* Add a scheduled instruction, INSN, to the current dispatch window.
37370 If the total bytes of instructions or the number of instructions in
37371 the window exceeds the allowable limit, allocate a new window. */
37372
37373 static void
37374 add_to_dispatch_window (rtx insn)
37375 {
37376 int byte_len;
37377 dispatch_windows *window_list;
37378 dispatch_windows *next_list;
37379 dispatch_windows *window0_list;
37380 enum insn_path path;
37381 enum dispatch_group insn_group;
37382 bool insn_fits;
37383 int num_insn;
37384 int num_uops;
37385 int window_num;
37386 int insn_num_uops;
37387 int sum;
37388
37389 if (INSN_CODE (insn) < 0)
37390 return;
37391
37392 byte_len = min_insn_size (insn);
37393 window_list = dispatch_window_list;
37394 next_list = window_list->next;
37395 path = get_insn_path (insn);
37396 insn_group = get_insn_group (insn);
37397
37398 /* Get the last dispatch window. */
37399 if (next_list)
37400 window_list = dispatch_window_list->next;
37401
37402 if (path == path_single)
37403 insn_num_uops = 1;
37404 else if (path == path_double)
37405 insn_num_uops = 2;
37406 else
37407 insn_num_uops = (int) path;
37408
37409 /* If the current window is full, get a new window.
37410 Window number zero is full if MAX_INSN uops are scheduled in it.
37411 Window number one is full if window zero's bytes plus window
37412 one's bytes equal 32, if adding the bytes of the new instruction
37413 would bring the total to 48 or more, or if it already has MAX_INSN
37414 instructions in it. */
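/* For example, if window zero already holds 16 bytes and window one holds
   another 16 bytes, their combined size is 32 and window one is treated as
   full regardless of the size of the incoming instruction.  */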
37415 num_insn = window_list->num_insn;
37416 num_uops = window_list->num_uops;
37417 window_num = window_list->window_num;
37418 insn_fits = fits_dispatch_window (insn);
37419
37420 if (num_insn >= MAX_INSN
37421 || num_uops + insn_num_uops > MAX_INSN
37422 || !(insn_fits))
37423 {
37424 window_num = ~window_num & 1;
37425 window_list = allocate_next_window (window_num);
37426 }
37427
37428 if (window_num == 0)
37429 {
37430 add_insn_window (insn, window_list, insn_num_uops);
37431 if (window_list->num_insn >= MAX_INSN
37432 && insn_group == disp_branch)
37433 {
37434 process_end_window ();
37435 return;
37436 }
37437 }
37438 else if (window_num == 1)
37439 {
37440 window0_list = window_list->prev;
37441 sum = window0_list->window_size + window_list->window_size;
37442 if (sum == 32
37443 || (byte_len + sum) >= 48)
37444 {
37445 process_end_window ();
37446 window_list = dispatch_window_list;
37447 }
37448
37449 add_insn_window (insn, window_list, insn_num_uops);
37450 }
37451 else
37452 gcc_unreachable ();
37453
37454 if (is_end_basic_block (insn_group))
37455 {
37456 /* The end of the basic block has been reached; finish processing the window. */
37457 process_end_window ();
37458 return;
37459 }
37460 }
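/* To illustrate the flow above: when window zero fills up (MAX_INSN uops
   or an instruction that does not fit), window_num flips from 0 to 1 and
   scheduling continues in window one; once the pair reaches the 32/48 byte
   limits, or a branch ends the basic block, process_end_window resets both
   windows.  */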
37461
37462 /* Print the dispatch window, WINDOW_NUM, to FILE. */
37463
37464 DEBUG_FUNCTION static void
37465 debug_dispatch_window_file (FILE *file, int window_num)
37466 {
37467 dispatch_windows *list;
37468 int i;
37469
37470 if (window_num == 0)
37471 list = dispatch_window_list;
37472 else
37473 list = dispatch_window_list1;
37474
37475 fprintf (file, "Window #%d:\n", list->window_num);
37476 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
37477 list->num_insn, list->num_uops, list->window_size);
37478 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
37479 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
37480
37481 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
37482 list->num_stores);
37483 fprintf (file, " insn info:\n");
37484
37485 for (i = 0; i < MAX_INSN; i++)
37486 {
37487 if (!list->window[i].insn)
37488 break;
37489 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
37490 i, group_name[list->window[i].group],
37491 i, (void *)list->window[i].insn,
37492 i, list->window[i].path,
37493 i, list->window[i].byte_len,
37494 i, list->window[i].imm_bytes);
37495 }
37496 }
37497
37498 /* Print dispatch window WINDOW_NUM to stdout. */
37499
37500 DEBUG_FUNCTION void
37501 debug_dispatch_window (int window_num)
37502 {
37503 debug_dispatch_window_file (stdout, window_num);
37504 }
37505
37506 /* Print INSN dispatch information to FILE. */
37507
37508 DEBUG_FUNCTION static void
37509 debug_insn_dispatch_info_file (FILE *file, rtx insn)
37510 {
37511 int byte_len;
37512 enum insn_path path;
37513 enum dispatch_group group;
37514 int imm_size;
37515 int num_imm_operand;
37516 int num_imm32_operand;
37517 int num_imm64_operand;
37518
37519 if (INSN_CODE (insn) < 0)
37520 return;
37521
37522 byte_len = min_insn_size (insn);
37523 path = get_insn_path (insn);
37524 group = get_insn_group (insn);
37525 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
37526 &num_imm64_operand);
37527
37528 fprintf (file, " insn info:\n");
37529 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
37530 group_name[group], path, byte_len);
37531 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
37532 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
37533 }
37534
37535 /* Print to stdout the status of the ready list with respect to
37536 dispatch windows. */
37537
37538 DEBUG_FUNCTION void
37539 debug_ready_dispatch (void)
37540 {
37541 int i;
37542 int no_ready = number_in_ready ();
37543
37544 fprintf (stdout, "Number of ready: %d\n", no_ready);
37545
37546 for (i = 0; i < no_ready; i++)
37547 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
37548 }
37549
37550 /* This routine is the driver of the dispatch scheduler. */
37551
37552 static void
37553 do_dispatch (rtx insn, int mode)
37554 {
37555 if (mode == DISPATCH_INIT)
37556 init_dispatch_sched ();
37557 else if (mode == ADD_TO_DISPATCH_WINDOW)
37558 add_to_dispatch_window (insn);
37559 }
37560
37561 /* Answer the dispatch-scheduler query ACTION for INSN; return FALSE if dispatch scheduling is not enabled. */
37562
37563 static bool
37564 has_dispatch (rtx insn, int action)
37565 {
37566 if ((ix86_tune == PROCESSOR_BDVER1 || ix86_tune == PROCESSOR_BDVER2)
37567 && flag_dispatch_scheduler)
37568 switch (action)
37569 {
37570 default:
37571 return false;
37572
37573 case IS_DISPATCH_ON:
37574 return true;
37576
37577 case IS_CMP:
37578 return is_cmp (insn);
37579
37580 case DISPATCH_VIOLATION:
37581 return dispatch_violation ();
37582
37583 case FITS_DISPATCH_WINDOW:
37584 return fits_dispatch_window (insn);
37585 }
37586
37587 return false;
37588 }
37589
37590 /* Implementation of the reassociation_width target hook used by the
37591 reassoc phase to identify the parallelism level in a reassociated
37592 tree. The statement's tree_code is passed in OPC. The arguments' type
37593 is passed in MODE.
37594
37595 Currently parallel reassociation is enabled only for Atom
37596 processors, and we set the reassociation width to 2
37597 because Atom may issue up to 2 instructions per cycle.
37598
37599 The return value should be adjusted if parallel reassociation is
37600 enabled for other processors. */
37601
37602 static int
37603 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
37604 enum machine_mode mode)
37605 {
37606 int res = 1;
37607
37608 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
37609 res = 2;
37610 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
37611 res = 2;
37612
37613 return res;
37614 }
37615
37616 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
37617 place emms and femms instructions. */
37618
37619 static enum machine_mode
37620 ix86_preferred_simd_mode (enum machine_mode mode)
37621 {
37622 if (!TARGET_SSE)
37623 return word_mode;
37624
37625 switch (mode)
37626 {
37627 case QImode:
37628 return TARGET_AVX2 ? V32QImode : V16QImode;
37629 case HImode:
37630 return TARGET_AVX2 ? V16HImode : V8HImode;
37631 case SImode:
37632 return TARGET_AVX2 ? V8SImode : V4SImode;
37633 case DImode:
37634 return TARGET_AVX2 ? V4DImode : V2DImode;
37635
37636 case SFmode:
37637 if (TARGET_AVX && !TARGET_PREFER_AVX128)
37638 return V8SFmode;
37639 else
37640 return V4SFmode;
37641
37642 case DFmode:
37643 if (!TARGET_VECTORIZE_DOUBLE)
37644 return word_mode;
37645 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
37646 return V4DFmode;
37647 else if (TARGET_SSE2)
37648 return V2DFmode;
37649 /* FALLTHRU */
37650
37651 default:
37652 return word_mode;
37653 }
37654 }
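/* For instance, with AVX2 enabled a SImode element maps to V8SImode (eight
   32-bit lanes in a 256-bit register), while plain SSE falls back to
   V4SImode; DFmode similarly prefers V4DFmode under AVX unless 128-bit
   vectors are preferred.  */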
37655
37656 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
37657 vectors. */
37658
37659 static unsigned int
37660 ix86_autovectorize_vector_sizes (void)
37661 {
37662 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
37663 }
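/* The value is a bitmask of vector sizes in bytes to try (32 and 16 here);
   returning zero is taken to mean that only the preferred SIMD mode chosen
   above is tried.  */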
37664
37665 /* Initialize the GCC target structure. */
37666 #undef TARGET_RETURN_IN_MEMORY
37667 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
37668
37669 #undef TARGET_LEGITIMIZE_ADDRESS
37670 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
37671
37672 #undef TARGET_ATTRIBUTE_TABLE
37673 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
37674 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
37675 # undef TARGET_MERGE_DECL_ATTRIBUTES
37676 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
37677 #endif
37678
37679 #undef TARGET_COMP_TYPE_ATTRIBUTES
37680 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
37681
37682 #undef TARGET_INIT_BUILTINS
37683 #define TARGET_INIT_BUILTINS ix86_init_builtins
37684 #undef TARGET_BUILTIN_DECL
37685 #define TARGET_BUILTIN_DECL ix86_builtin_decl
37686 #undef TARGET_EXPAND_BUILTIN
37687 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
37688
37689 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
37690 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
37691 ix86_builtin_vectorized_function
37692
37693 #undef TARGET_VECTORIZE_BUILTIN_CONVERSION
37694 #define TARGET_VECTORIZE_BUILTIN_CONVERSION ix86_vectorize_builtin_conversion
37695
37696 #undef TARGET_BUILTIN_RECIPROCAL
37697 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
37698
37699 #undef TARGET_ASM_FUNCTION_EPILOGUE
37700 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
37701
37702 #undef TARGET_ENCODE_SECTION_INFO
37703 #ifndef SUBTARGET_ENCODE_SECTION_INFO
37704 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
37705 #else
37706 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
37707 #endif
37708
37709 #undef TARGET_ASM_OPEN_PAREN
37710 #define TARGET_ASM_OPEN_PAREN ""
37711 #undef TARGET_ASM_CLOSE_PAREN
37712 #define TARGET_ASM_CLOSE_PAREN ""
37713
37714 #undef TARGET_ASM_BYTE_OP
37715 #define TARGET_ASM_BYTE_OP ASM_BYTE
37716
37717 #undef TARGET_ASM_ALIGNED_HI_OP
37718 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
37719 #undef TARGET_ASM_ALIGNED_SI_OP
37720 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
37721 #ifdef ASM_QUAD
37722 #undef TARGET_ASM_ALIGNED_DI_OP
37723 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
37724 #endif
37725
37726 #undef TARGET_PROFILE_BEFORE_PROLOGUE
37727 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
37728
37729 #undef TARGET_ASM_UNALIGNED_HI_OP
37730 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
37731 #undef TARGET_ASM_UNALIGNED_SI_OP
37732 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
37733 #undef TARGET_ASM_UNALIGNED_DI_OP
37734 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
37735
37736 #undef TARGET_PRINT_OPERAND
37737 #define TARGET_PRINT_OPERAND ix86_print_operand
37738 #undef TARGET_PRINT_OPERAND_ADDRESS
37739 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
37740 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
37741 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
37742 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
37743 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
37744
37745 #undef TARGET_SCHED_INIT_GLOBAL
37746 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
37747 #undef TARGET_SCHED_ADJUST_COST
37748 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
37749 #undef TARGET_SCHED_ISSUE_RATE
37750 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
37751 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
37752 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
37753 ia32_multipass_dfa_lookahead
37754
37755 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
37756 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
37757
37758 #ifdef HAVE_AS_TLS
37759 #undef TARGET_HAVE_TLS
37760 #define TARGET_HAVE_TLS true
37761 #endif
37762 #undef TARGET_CANNOT_FORCE_CONST_MEM
37763 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
37764 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
37765 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
37766
37767 #undef TARGET_DELEGITIMIZE_ADDRESS
37768 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
37769
37770 #undef TARGET_MS_BITFIELD_LAYOUT_P
37771 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
37772
37773 #if TARGET_MACHO
37774 #undef TARGET_BINDS_LOCAL_P
37775 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
37776 #endif
37777 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
37778 #undef TARGET_BINDS_LOCAL_P
37779 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
37780 #endif
37781
37782 #undef TARGET_ASM_OUTPUT_MI_THUNK
37783 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
37784 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
37785 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
37786
37787 #undef TARGET_ASM_FILE_START
37788 #define TARGET_ASM_FILE_START x86_file_start
37789
37790 #undef TARGET_OPTION_OVERRIDE
37791 #define TARGET_OPTION_OVERRIDE ix86_option_override
37792
37793 #undef TARGET_REGISTER_MOVE_COST
37794 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
37795 #undef TARGET_MEMORY_MOVE_COST
37796 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
37797 #undef TARGET_RTX_COSTS
37798 #define TARGET_RTX_COSTS ix86_rtx_costs
37799 #undef TARGET_ADDRESS_COST
37800 #define TARGET_ADDRESS_COST ix86_address_cost
37801
37802 #undef TARGET_FIXED_CONDITION_CODE_REGS
37803 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
37804 #undef TARGET_CC_MODES_COMPATIBLE
37805 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
37806
37807 #undef TARGET_MACHINE_DEPENDENT_REORG
37808 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
37809
37810 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
37811 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
37812
37813 #undef TARGET_BUILD_BUILTIN_VA_LIST
37814 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
37815
37816 #undef TARGET_ENUM_VA_LIST_P
37817 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
37818
37819 #undef TARGET_FN_ABI_VA_LIST
37820 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
37821
37822 #undef TARGET_CANONICAL_VA_LIST_TYPE
37823 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
37824
37825 #undef TARGET_EXPAND_BUILTIN_VA_START
37826 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
37827
37828 #undef TARGET_MD_ASM_CLOBBERS
37829 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
37830
37831 #undef TARGET_PROMOTE_PROTOTYPES
37832 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
37833 #undef TARGET_STRUCT_VALUE_RTX
37834 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
37835 #undef TARGET_SETUP_INCOMING_VARARGS
37836 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
37837 #undef TARGET_MUST_PASS_IN_STACK
37838 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
37839 #undef TARGET_FUNCTION_ARG_ADVANCE
37840 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
37841 #undef TARGET_FUNCTION_ARG
37842 #define TARGET_FUNCTION_ARG ix86_function_arg
37843 #undef TARGET_FUNCTION_ARG_BOUNDARY
37844 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
37845 #undef TARGET_PASS_BY_REFERENCE
37846 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
37847 #undef TARGET_INTERNAL_ARG_POINTER
37848 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
37849 #undef TARGET_UPDATE_STACK_BOUNDARY
37850 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
37851 #undef TARGET_GET_DRAP_RTX
37852 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
37853 #undef TARGET_STRICT_ARGUMENT_NAMING
37854 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
37855 #undef TARGET_STATIC_CHAIN
37856 #define TARGET_STATIC_CHAIN ix86_static_chain
37857 #undef TARGET_TRAMPOLINE_INIT
37858 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
37859 #undef TARGET_RETURN_POPS_ARGS
37860 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
37861
37862 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
37863 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
37864
37865 #undef TARGET_SCALAR_MODE_SUPPORTED_P
37866 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
37867
37868 #undef TARGET_VECTOR_MODE_SUPPORTED_P
37869 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
37870
37871 #undef TARGET_C_MODE_FOR_SUFFIX
37872 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
37873
37874 #ifdef HAVE_AS_TLS
37875 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
37876 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
37877 #endif
37878
37879 #ifdef SUBTARGET_INSERT_ATTRIBUTES
37880 #undef TARGET_INSERT_ATTRIBUTES
37881 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
37882 #endif
37883
37884 #undef TARGET_MANGLE_TYPE
37885 #define TARGET_MANGLE_TYPE ix86_mangle_type
37886
37887 #ifndef TARGET_MACHO
37888 #undef TARGET_STACK_PROTECT_FAIL
37889 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
37890 #endif
37891
37892 #undef TARGET_FUNCTION_VALUE
37893 #define TARGET_FUNCTION_VALUE ix86_function_value
37894
37895 #undef TARGET_FUNCTION_VALUE_REGNO_P
37896 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
37897
37898 #undef TARGET_PROMOTE_FUNCTION_MODE
37899 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
37900
37901 #undef TARGET_SECONDARY_RELOAD
37902 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
37903
37904 #undef TARGET_CLASS_MAX_NREGS
37905 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
37906
37907 #undef TARGET_PREFERRED_RELOAD_CLASS
37908 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
37909 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
37910 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
37911 #undef TARGET_CLASS_LIKELY_SPILLED_P
37912 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
37913
37914 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
37915 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
37916 ix86_builtin_vectorization_cost
37917 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
37918 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
37919 ix86_vectorize_vec_perm_const_ok
37920 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
37921 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
37922 ix86_preferred_simd_mode
37923 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
37924 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
37925 ix86_autovectorize_vector_sizes
37926
37927 #undef TARGET_SET_CURRENT_FUNCTION
37928 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
37929
37930 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
37931 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
37932
37933 #undef TARGET_OPTION_SAVE
37934 #define TARGET_OPTION_SAVE ix86_function_specific_save
37935
37936 #undef TARGET_OPTION_RESTORE
37937 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
37938
37939 #undef TARGET_OPTION_PRINT
37940 #define TARGET_OPTION_PRINT ix86_function_specific_print
37941
37942 #undef TARGET_CAN_INLINE_P
37943 #define TARGET_CAN_INLINE_P ix86_can_inline_p
37944
37945 #undef TARGET_EXPAND_TO_RTL_HOOK
37946 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
37947
37948 #undef TARGET_LEGITIMATE_ADDRESS_P
37949 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
37950
37951 #undef TARGET_LEGITIMATE_CONSTANT_P
37952 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
37953
37954 #undef TARGET_FRAME_POINTER_REQUIRED
37955 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
37956
37957 #undef TARGET_CAN_ELIMINATE
37958 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
37959
37960 #undef TARGET_EXTRA_LIVE_ON_ENTRY
37961 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
37962
37963 #undef TARGET_ASM_CODE_END
37964 #define TARGET_ASM_CODE_END ix86_code_end
37965
37966 #undef TARGET_CONDITIONAL_REGISTER_USAGE
37967 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
37968
37969 #if TARGET_MACHO
37970 #undef TARGET_INIT_LIBFUNCS
37971 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
37972 #endif
37973
37974 struct gcc_target targetm = TARGET_INITIALIZER;
37975 \f
37976 #include "gt-i386.h"